Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
codes.py
Go to the documentation of this file.
1 ## @package data_structs.codes
2 
3 from data_structs.base_objects import BLLObject
4 from parsers.errors import *
5 from utils.enum import Enum
6 
7 import re
8 
9 ## This class provides information about/validation for a particular code. This could be a transcriber code, a LENA speaker code, a LENA notes code, or other potential types of codes.
10 # A code can have any number of options. Each option is a specific string value that the code can take on (eg. a LENA speaker code has options like 'MAN', 'FAN', etc.).
11 # This class also provides the ability to validate code options that have been read from a TRS file.
12 class Code(object):
13  ## Constructor
14  # @param self
15  # @param options_dict (dictionary) this is a dictionary with one key for each possible option. The value for each key must be a CodeInfo object for that particular option.
16  def __init__(self, options_dict):
17  self.options_dict = options_dict
18 
19  ## Retrieves a CodeInfo object for a specified option.
20  # @param self
21  # @param code_str (string) a string representing the option to lookup (eg. 'MAN' or 'FAN' for a LENA speaker code, 'M' or 'F' for transcriber code 1, etc.)
22  # @returns (CodeInfo) a CodeInfo object that can be used to retrieve further details about the individual option
def get_option(self, code_str):
    """Return the entry stored under ``code_str``, or None if there is none."""
    # dict.get yields None for a missing key, which is the same fallback the
    # explicit membership test used to produce.
    return self.options_dict.get(code_str)
29 
30  ## Retrieves a list of all possible options for this code.
31  # @param self
32  # @returns (list) list of strings, one for each possible option
34  return self.options_dict.keys()
35 
36  ## This function returns a list of functions that are called, one by one, upon validation.
37  # Each function should accept a single argument, a string containing the option text. They should return an array containing any error messages to present to the UI (or empty list if none).
38  # @param self
39  # @returns (list) list of functions
def get_tests(self):
    """Return the validation functions for this code; none at this level.

    Extension points:
      * Validation that applies to every type of code: define the test
        function here and add it to the returned list.
      * Validation that applies to one type of code: subclass and override
        this method.

    Each test function takes one argument (the code string being validated)
    and returns a list of error messages, empty when there are none.
    """
    # A new list is built on every call, so callers may extend it freely.
    base_tests = []
    return base_tests
46 
47  ## Validates an option string for this Code object.
48  # To do so, it retrieves a list of test functions from get_tests() and executes them one by one.
49  # The resulting lists are concatenated and the final list is returned.
50  # @param self
51  # @param cd_str (string) string for the code option being validated.
52  # @returns (list) list of error messages, or empty list if no errors were encountered.
def is_valid(self, cd_str):
    """Run every test from get_tests() on ``cd_str`` and pool the messages.

    Despite the name, the return value is a list of error messages; an empty
    list means no test reported a problem.
    """
    collected = []
    for check in self.get_tests():
        # A falsy result (empty list, None) contributes nothing.
        collected += check(cd_str) or []
    return collected
61 
62 ## Transcriber codes 1, 2, and 4 have some things in common. For example, they are all single-character codes (unlike
63 # code 3, which may contain multiple characters). This class encapsulates common validation tests for codes 1, 2, and 4.
65  ## See superclass description.
def __init__(self, options_dict):
    """See superclass description."""
    # Nothing is added at this level; construction is handed straight to Code.
    Code.__init__(self, options_dict)
68 
69  ## See superclass description.
def get_tests(self):
    """Return the superclass tests plus the checks for single-character codes.

    Two tests are appended:
      * char_test - every character of the code string must occur in one of
        the option strings (the keys of self.options_dict).
      * len_test  - the code string may not be longer than one character.

    Each test takes the code string and returns a list of error messages
    (empty when the string passes).
    """
    # grab tests from superclass
    tests = Code.get_tests(self)

    def char_test(cd_str):
        # Build the set of permitted characters from the option strings.
        # str.join replaces the builtin reduce(), which does not exist on
        # Python 3 and raised TypeError for an empty options dict. Keys are
        # escaped so an option such as '-', '^' or ']' cannot alter or break
        # the character class.
        allowed = ''.join(re.escape(key) for key in self.options_dict)
        if allowed:
            invalid_chars = re.findall('[^%s]' % allowed, cd_str)
        else:
            # With no options defined nothing is permitted; an empty
            # character class ('[^]') would not even compile.
            invalid_chars = list(cd_str)

        if not invalid_chars:
            return []

        quoted = ', '.join('"%s"' % ch for ch in invalid_chars)
        return ['Invalid code character(s): %s' % quoted]

    # transcriber codes 1, 2, and 4 must only consist of a single character
    def len_test(cd_str):
        if len(cd_str) > 1:
            return ['This code should not contain more than one character.']
        return []

    tests.extend([char_test, len_test])

    return tests
102 
103 ## The third transcriber code is slightly different from the others because it can contain multiple characters. This subclass holds data about the code and provides some special overridden methods that work on multiple characters.
105  ## See superclass description.
def __init__(self, options_dict):
    """See superclass description."""
    # Nothing is added at this level; construction is handed straight to Code.
    Code.__init__(self, options_dict)
108 
109  ## See superclass description.
def get_tests(self):
    """Return the superclass tests plus the checks for multi-character codes.

    Three tests are appended:
      * char_test - every character must be a digit or occur in one of the
        option strings (the keys of self.options_dict), and a number is only
        accepted directly after an I or a C.
      * freq_test - no non-digit character may occur more than once.
      * link_test - a code containing U or F may not also contain C or I.

    Each test takes the code string and returns a list of error messages
    (empty when the string passes).
    """
    # grab tests from superclass
    tests = Code.get_tests(self)

    def char_test(cd_str):
        # Permitted characters: the option strings plus the digits. str.join
        # replaces the builtin reduce() (absent on Python 3), and the keys
        # are escaped so that they cannot alter the character class.
        allowed = ''.join(re.escape(key) for key in self.options_dict)
        invalid = re.findall(r'[^%s0-9]' % allowed, cd_str)

        # A number must directly follow an I or a C. The pattern matches a
        # whole run of digits that either starts the string or follows some
        # other character, and keeps that character in the reported text.
        # NOTE(review): the previous pattern, '([^IC][0-9]+)', missed a
        # number at the very start of the string (e.g. '5') and flagged the
        # tail of a multi-digit number (e.g. '12' in 'I12'). Confirm that
        # multi-digit numbers after I/C are meant to be legal.
        invalid.extend(re.findall(r'(?:^|[^IC0-9])[0-9]+', cd_str))

        if not invalid:
            return []

        quoted = ', '.join('"%s"' % text for text in invalid)
        return ['Invalid code character(s): %s' % quoted]

    # since we can have multiple characters in a single code here, ensure there are no duplicate characters in the code string
    def freq_test(cd_str):
        errors = []
        seen = set()
        reported = set()
        for c in cd_str:
            # digits may repeat freely
            if c.isdigit():
                continue
            if c not in seen:
                seen.add(c)
            elif c not in reported:
                # Report each duplicated character exactly once. Forgetting
                # the character after reporting it (as the old dict.pop did)
                # caused a new error on every second repeat.
                errors.append('Code contains more than one "%c".' % c)
                reported.add(c)

        return errors

    # according to the transcriber manual, if code 3 contains a U or F, it may not contain a C or I.
    def link_test(cd_str):
        # NOTE(review): the previous condition required U *and* F to both be
        # present, which contradicts the rule quoted above; it now triggers
        # on either one. The message wording was aligned with the rule.
        has_u_or_f = 'U' in cd_str or 'F' in cd_str
        has_c_or_i = 'I' in cd_str or 'C' in cd_str
        if has_u_or_f and has_c_or_i:
            return ['Codes containing U or F may not contain C or I']
        return []

    # append the subclass tests
    tests.extend([char_test, freq_test, link_test])

    return tests
169 
170 ## This class holds information about a particular code option. A 'code option' is a particular value that a code can have. For example, transcriber code 1 may be set to one of the following options: ('M', 'F', 'T', 'O', 'C', 'U'). See the transcriber manual (or the transcriber_codes db table) for details about the options available for each transcriber code. Similarly, a LENA speaker code can have many different options: ('FAN', 'MAN', 'NON', 'TVF', etc.). See the speaker_codes db table or the LENA documentation for more info about these codes. Other types of codes also exist.
172  ## Constructor
173  # @param self
174  # @param db_id (int) database id for this code option. This is from one of the code tables like transcriber_codes, lena_nodes_codes, etc.
175  # @param code (string) the option text
176  # @param desc (string) a description for this option that can be used for display purposes in the UI
177  # @param is_linkable (boolean integer) If an option is linkable (param is 1), then segments containing that code option will be considered when linking C/I transcriber codes. If an option is not linkable (param is 0), then segments containing that code option will be skipped when linking C/I transcriber codes (i.e. the segments can exist between I and C coded segments without causing errors).
178  # @param distance (int) one of the options from the Enum DBConstants.SPEAKER_DISTANCES (if this class is not being instantiated for a speaker code, this can be set to DBConstants.SPEAKER_CODES.NA).
179  # @param speaker_type (int) one of the options from the Enum DBConstants.SPEAKER_TYPES (if this class is not being instantiated for a speaker code, this can be set to DBConstants.SPEAKER_TYPES.NA)
180  # @param props (list) list of ints - each should be an option from DBConstants.SPEAKER_PROPS. Pass empty list if properties are not applicable or needed for this code.
def __init__(self,
             db_id,
             code,
             desc,
             is_linkable,
             distance,
             speaker_type,
             props=[]):
    """Record the attributes of a single code option.

    Every argument is stored unchanged on the instance, except ``props``,
    which is turned into a lookup dictionary (``props_dict``) that maps each
    given property to True.
    """
    self.db_id = db_id
    self.code = code
    self.desc = desc
    # NOTE(review): a method called is_linkable is defined further down in
    # this listing; if it belongs to the same class, this instance attribute
    # takes precedence on instances, so obj.is_linkable is the stored value.
    self.is_linkable = is_linkable
    self.distance = distance
    self.speaker_type = speaker_type
    # The default list is only read here, never modified or stored.
    self.props_dict = dict.fromkeys(props, True)
196 
197  ## Checks if this option has a given property from DBConstants.SPEAKER_PROPS.
198  # These properties record secondary characteristics like whether or not the option represents media noise, or overlapping speech.
199  # @param self
200  # @param prop (int) one of the options from DBConstants.SPEAKER_PROPS
201  # @returns (boolean) True if this option has the specified property, False otherwise.
def has_property(self, prop):
    """Tell whether ``prop`` is one of the keys of ``self.props_dict``."""
    known_props = self.props_dict
    return prop in known_props
204 
205  ## Checks if this option is linkable.
206  # If an option is linkable, then segments containing that code option will be considered when linking C/I transcriber codes. If an option is not linkable, then segments containing that code option will be skipped when linking C/I transcriber codes (i.e. the segments can exist between I and C coded segments without causing errors).
207  # @param self
208  # @returns (boolean) True if this option is linkable, False otherwise
def is_linkable(self):
    """Return True if this option is linkable, False otherwise.

    The flag is read from the ``is_linkable`` attribute of the instance and
    converted to a real boolean (the constructor comments describe it as a
    boolean integer, 1 or 0).
    """
    # The body used to return the bare name `is_linkable`, which is not
    # defined in this scope and raised NameError whenever the function ran.
    return bool(self.is_linkable)
211 
212  ## Checks if this option has a particular distance property (eg. NEAR, FAR, NA)
213  # @param self
214  # @param distance (int) one of the options from DBConstants.SPEAKER_DISTANCES (pass the NA option if distance is not applicable for this code option).
215  # @returns (boolean) True if this option has the distance property, False otherwise.
def is_distance(self, distance):
    """Compare the stored distance value with ``distance`` for equality."""
    same_distance = (self.distance == distance)
    return same_distance
218 
219  ## Checks if this option has a given speaker type (where speaker types are defined as for transcriber code 1 in the transcriber manual).
220  # @param self
221  # @param speaker_type (int) one of the options from DBConstants.SPEAKER_TYPES (pass the NA option if speaker type is not applicable for this code option)
222  # @returns (boolean) True if this option has the specified speaker type, False otherwise.
def is_speaker_type(self, speaker_type):
    """Compare the stored speaker type with ``speaker_type`` for equality."""
    same_type = (self.speaker_type == speaker_type)
    return same_type
225 
226  ## Returns the description for this option.
227  # @param self
228  # @returns (string) description text
def get_code_desc(self):
    """Return the description text stored for this option."""
    description = self.desc
    return description
231 
232  ## Returns the code string for this option.
233  # @param self
234  # @returns (string) an options string (like 'MAN' or 'FAN')
def get_code(self):
    """Return the option string stored for this option (e.g. 'MAN')."""
    option_text = self.code
    return option_text