Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
state_machines.py
Go to the documentation of this file.
1 ## @package parsers.state_machines
2 
3 from data_structs.utterance import Utterance
4 from db.bll_database import DBConstants
5 from utils.backend_utils import BackendUtils
6 from utils.enum import Enum
7 from errors import *
8 
9 import re
10 import logging
11 
12 ## This class handles the task of generating Utterance objects from the data within a \verbatim<Turn></Turn>\endverbatim element.
13 #
14 # Tags of interest here:
15 # \verbatim<Sync time="8.32" />\endverbatim This is a single tag that indicates a boundary in the audio that LENA has found.
16 #
17 # \verbatim<Who nb="1" />\endverbatim This is a single tag that appears directly after a sync tag, or directly after another who tag.
18 # LENA generates who tags when there are multiple speakers within the same turn. The nb attribute
19 # is the index of the speaker (starting at 1, not 0) in the (space-delimited) speaker list contained within the turn tag's 'speaker' attribute.
20 #
21 # Raw data appears immediately after either of these tags. This data appears as one or more lines in the following form:
22 # \verbatim
23 # <optional LENA code like VOC, SIL, etc.> <optional transcribed phrase> <optional LENA-generated codes separated by pipes - eg. |E|1|0|XM|> <optional transcriber codes separated by pipes - eg. |U|T|I|Q|>
24 # \endverbatim
25 #
26 # Since this data appears <em>after</em> sync or who tags (as opposed to <em>between</em> tags), it's tricky to pull out the information we want. Hence this state machine.
27 #
28 # Comment: If you need to interact with/modify this class, it would be helpful to read the transcriber manual first.
29 #
30 # Usage: after instantiating the class, call drive() with each element up to (but not including) the last one. Then, call finish() with the last element.
32  ## Constructor
33  # @param self
34  # @param trs_parser (TRSParser) this state machine must be driven by an instance of the TRSParser class - this is a pointer to that instance
35  # @param seg (Segment) this is a pointer to the Segment object for which the TRSParser is using this class to generate Utterances for.
def __init__(self, trs_parser, seg, remove_bad_trans_codes):
    """Create an idle state machine bound to one parser and one segment.

    trs_parser -- object whose _parse_speech_data() method drive() calls.
    seg -- segment passed through to _parse_speech_data().
    remove_bad_trans_codes -- flag passed through to _parse_speech_data().
    """
    # Collaborators supplied by the caller; drive() forwards all three
    # to trs_parser._parse_speech_data().
    self.trs_parser = trs_parser
    self.seg = seg
    self.remove_bad_trans_codes = remove_bad_trans_codes

    # This file holds several classes, so the logger name carries the
    # class name to show where each message comes from.
    logger_name = '.'.join((__name__, self.__class__.__name__))
    self.logger = logging.getLogger(logger_name)

    # Possible states - drive() documents what each one means.
    state_names = ['INITIAL', 'INITIAL_SYNC_TAG', 'WHO_TAG']
    self.States = Enum(state_names)
    self.cur_state = self.States.INITIAL

    # Utterances parsed so far. They may still lack an end time until
    # finish() has run.
    self.utter_list = []
47 
48  ## Pushes the state machine forward a single step.
49  # @param self
50  # @param next_obj (Element) this is an Element object (defined in the Python ElementTree library) corresponding to the next node of the XML file that has been encountered.
def drive(self, next_obj):
    """Advance the state machine by one XML node.

    Only nodes whose tag is 'Sync' or 'Who' are acted upon; any other node
    leaves the machine unchanged.

    next_obj -- node exposing .tag and .attrib. A 'Sync' node must carry a
        'time' attribute that float() accepts (KeyError / ValueError
        otherwise).
    """
    def parse_utters():
        # Ask the parser for the Utterances that follow this node.
        return self.trs_parser._parse_speech_data(self.seg, next_obj, self.remove_bad_trans_codes)

    def append_with_start(utters, start_time):
        # An explicit loop is required here: map() is lazy on Python 3, so
        # using it for its side effects would never assign the start times.
        for utter in utters:
            utter.start = start_time
        self.utter_list.extend(utters)

    def close_open_utters(end_time):
        # Walk backwards from the newest Utterance and stop at the first one
        # that already has an end time (or at the beginning of the list).
        for utter in reversed(self.utter_list):
            if utter.end is not None:
                break
            utter.end = end_time

    # INITIAL: nothing seen yet - wait for the first sync tag.
    if self.cur_state == self.States.INITIAL:
        if next_obj.tag == 'Sync':
            # Per the original notes: when who tags follow immediately, the
            # sync's tail is just '\n' and the parser hands back an empty
            # utterance, which the next state pops again.
            utters = parse_utters()
            append_with_start(utters, float(next_obj.attrib['time']))
            self.cur_state = self.States.INITIAL_SYNC_TAG

    # INITIAL_SYNC_TAG: a sync tag has been seen. Wait for either the next
    # sync tag, or a who tag (several speakers follow the previous sync).
    elif self.cur_state == self.States.INITIAL_SYNC_TAG:
        if next_obj.tag == 'Sync':
            # A new sync ends everything opened by the previous one.
            sync_time = float(next_obj.attrib['time'])
            close_open_utters(sync_time)
            append_with_start(parse_utters(), sync_time)
            # State is unchanged; keep waiting for another sync/who tag.

        elif next_obj.tag == 'Who':
            # Drop the utterance created from the preceding sync tag and
            # keep the parser's running total in step with the list.
            old_utter = self.utter_list.pop()
            self.trs_parser.total_utters -= 1

            # The first who tag inherits the start time of that sync.
            append_with_start(parse_utters(), old_utter.start)
            self.cur_state = self.States.WHO_TAG

    # WHO_TAG: at least one who tag has been seen. Who tags carry no time
    # data, so each additional one inherits the start time of the
    # Utterance that precedes it.
    elif self.cur_state == self.States.WHO_TAG:
        if next_obj.tag == 'Sync':
            sync_time = float(next_obj.attrib['time'])
            close_open_utters(sync_time)
            append_with_start(parse_utters(), sync_time)
            self.cur_state = self.States.INITIAL_SYNC_TAG

        elif next_obj.tag == 'Who':
            new_utters = parse_utters()
            if new_utters:
                # Only look at the list when there is something to stamp,
                # mirroring the per-element lookup of the original code.
                append_with_start(new_utters, self.utter_list[-1].start)
            # State is unchanged; keep waiting for another sync/who tag.
133 
134  ## Completes any unfinished Utterances that may be waiting for additional information (eg. end times).
135  # @param self
136  # @param final_obj (Element) a Python ElementTree library XML node object, representing the last tag encountered in the \verbatim<Turn></Turn>\endverbatim
def finish(self, final_obj):
    """Give an end time to every Utterance that is still open.

    final_obj -- node exposing .attrib with an 'endTime' entry that float()
        accepts. It is read unconditionally, so a missing key raises
        KeyError in every state.
    """
    final_end_time = float(final_obj.attrib['endTime'])

    # In the INITIAL state nothing has been parsed, so only the two
    # "something seen" states need any work.
    if self.cur_state in (self.States.INITIAL_SYNC_TAG, self.States.WHO_TAG):
        # Walk backwards from the newest Utterance and stop at the first one
        # that already has an end time; older entries are left untouched.
        for utter in reversed(self.utter_list):
            if utter.end is not None:
                break
            utter.end = final_end_time
149 
150  ## Retrieves the list of Utterances that this state machine has constructed.
151  # This method should only be called after finish() has been called (Otherwise there may be incomplete Utterances in the returned list)
152  # @param self
153  # @returns (list) list of Utterance objects
def get_result(self):
    """Return the accumulated Utterance list (the list itself, not a copy)."""
    utterances = self.utter_list
    return utterances
156 
157 ## This class handles the job of linking together Utterances (via their next/prev pointers) that are marked with I/C codes (transcriber code 3) by the transcribers.
158 # Utterances are 'linkable' if an I/C code can be used to tie them to another utterance. They are 'unlinkable' if they cannot be. Unlinkable Utterances usually consist of silence (speaker is SIL),
159 # or non-verbal noise (and have no transcription phrase). The 'is_linkable' column of the 'speaker_codes' database table defines the particular speakers that make an utterance linkable/unlinkable.
160 # In addition to linking Utterances, this class also provides error detection for problems like 'I without C', 'C without I', 'ambiguous I/C codes', etc. These errors are appended to the TRSParser's ErrorCollector.
161 # Utterances are only linked across segments (not within segments).
162 # The transcriber manual dictates that when ambiguous I/C codes are used, they must be numbered
163 # (eg. I, I1, and I2, link to C, C1, and C2, respectively). These numbers are referred to as 'link numbers' (not to be confused with 'segment numbers', which refer to Segment objects' 'num' attributes).
164 # In order to simplify the linking process, the state machine considers the initial I or C codes to have the number 0 (eg. I links to C becomes I0 links to C0).
165 # Usage: after instantiating the class, call drive() with every Utterance. Then call finish().
167  ## Constructor
168  # @param self
169  # @param trs_parser (TRSParser) This class must be driven from an instance of a TRSParser
def __init__(self, trs_parser):
    """Create a linking state machine that reports to the given parser.

    trs_parser -- object stored as self.trs_parser for use by the other
        methods.
    """
    # Logger name carries the class name so messages can be traced back
    # to this class.
    self.logger = logging.getLogger('%s.%s' % (__name__, self.__class__.__name__))

    # Segment whose utterances were being processed before the current
    # one, and the segment currently being processed.
    self.last_linkable_seg = None
    self.cur_linkable_seg = None

    # Link numbers of I/C-coded utterances met in the current and the
    # previous segment; each bucket gets its own empty dict.
    self.link_dict = {bucket: {} for bucket in ('cur_links', 'prev_links')}

    self.trs_parser = trs_parser
185 
186  ##Constructs a string that is formatted as indented list of information about all utterances in a given segment. This is useful to tack on to error messages.
187  # @param self
188  # @param seg (Segment) segment object for which to generate a list of utterance info
189  # @returns (string) an indented string containing speaker, start/end times, lena notes, transcription phrase, transcriber codes, etc. for each utterance in the segment.
def _get_seg_contents(self, seg):
    """Build an indented, one-line-per-utterance summary of a segment.

    seg -- object exposing an iterable .utters attribute.

    Returns a string with one tab-indented, newline-terminated line per
    utterance: speaker code (or '-' when there is no speaker), start/end
    times formatted by BackendUtils.get_time_str(), LENA notes,
    transcription phrase, and the pipe-delimited codes. Returns '' when the
    segment has no utterances.
    """
    lines = []
    for utter in seg.utters:
        # Collect both code groups. The previous implementation assigned
        # (instead of appended) the transcriber codes, which silently
        # discarded the LENA codes whenever both were present.
        code_groups = []
        if utter.lena_codes:
            code_groups.append('|%s|' % '|'.join(utter.lena_codes))
        if utter.trans_codes:
            code_groups.append('|%s|' % '|'.join(utter.trans_codes))
        # Space-separated, LENA codes first, then transcriber codes.
        codes_str = ' '.join(code_groups)

        speaker_code = utter.speaker.speaker_codeinfo.code if utter.speaker else '-'
        lines.append('\t%s [%s - %s] %s%s%s\n' % (
            speaker_code,
            BackendUtils.get_time_str(utter.start),
            BackendUtils.get_time_str(utter.end),
            (utter.lena_notes or '') + ' ',
            (utter.trans_phrase or '') + ' ',
            codes_str,
        ))

    return ''.join(lines)
208 
209  ## Drives the state machine ahead one step.
210  # @param self
211  # @param next_obj (Utterance) the next Utterance to consider in the linking process
def drive(self, next_obj):
    """Consider one Utterance for I/C linking.

    Utterances without a transcription phrase are ignored. For the others,
    a 'C' (or 'IC') code is matched against a pending 'I' with the same
    link number - first in the previous segment's bucket, then in the
    current one - and the two Utterances are joined through their
    next/prev attributes. An 'I' (or 'IC') code is then registered as
    pending. Problems (I without C, C without I, ambiguous I) are added to
    self.trs_parser.error_collector as ParserError objects.

    next_obj -- Utterance-like object exposing trans_phrase, trans_codes
        and seg (with a .num attribute).
    """
    # Only transcribed utterances take part in linking.
    if not next_obj.trans_phrase:
        return

    # Crossing a segment boundary (or seeing the very first utterance)
    # requires some housekeeping on the link buckets.
    if self.cur_linkable_seg is None or self.cur_linkable_seg.num != next_obj.seg.num:
        # Anything still in 'prev_links' is now two segments old and can no
        # longer be linked, so each entry becomes an 'I without C' error.
        for link_num, stale_utter in self.link_dict['prev_links'].items():
            err_msg = 'Encountered I%s with no C%s in next linkable segment.\n' % tuple([str(link_num or '')] * 2)
            err_msg += 'Expected a \'C\' in either the current segment, or one of the these (following) segments:\n'
            err_msg += self._get_seg_contents(self.cur_linkable_seg)

            self.trs_parser.error_collector.add(ParserError(err_msg, stale_utter))

        # Update the last/cur linkable segments. Remembering the outgoing
        # segment is what makes the "previous segments" part of the
        # 'C without I' error below reachable.
        self.last_linkable_seg = self.cur_linkable_seg
        self.cur_linkable_seg = next_obj.seg

        # The bucket for the segment just finished becomes 'prev_links';
        # a fresh bucket is started for the new segment.
        self.link_dict['prev_links'] = self.link_dict['cur_links']
        self.link_dict['cur_links'] = {}

    continued = False       # True when next_obj carries an I code (possibly IC)
    continuation = False    # True when next_obj carries a C code (possibly IC)
    continued_num = ''      # link number of the I code, when present
    continuation_num = ''   # link number of the C code, when present

    # Only utterances with a full set of transcriber codes are inspected;
    # the I/C code lives in the third code (index 2).
    if len(next_obj.trans_codes) == len(DBConstants.TRANS_CODES):
        link_code = next_obj.trans_codes[2]
        continued_match = re.search(r'(?:I(\d+)?)', link_code)
        continuation_match = re.search(r'(?:C(\d+)?)', link_code)

        # A code without digits (plain 'I', 'C' or 'IC') gets link number 0.
        if continued_match is not None:
            continued = True
            continued_num = continued_match.group(1) or 0
        if continuation_match is not None:
            continuation = True
            continuation_num = continuation_match.group(1) or 0

    # Note: for an 'IC' code both flags may be True at this point.

    # A C (or IC) code needs a matching pending I code.
    if continuation:
        # Look in the previous segment's bucket first, then in the current
        # one (which allows links within the same segment).
        for bucket in (self.link_dict['prev_links'], self.link_dict['cur_links']):
            if continuation_num in bucket:
                # Consume the pending I and tie the two utterances together.
                prev_obj = bucket.pop(continuation_num)
                prev_obj.next = next_obj
                next_obj.prev = prev_obj
                break
        else:
            # No pending I with this link number anywhere.
            err_msg = 'Encountered C%s code with no I%s code in previous linkable segment.\n' % tuple([str(continuation_num or '')] * 2)
            if self.last_linkable_seg:
                err_msg += 'Expected an \'I\' either previously in the current segment, or in one of these (previous) segments:\n'
                err_msg += self._get_seg_contents(self.last_linkable_seg)

            self.trs_parser.error_collector.add(ParserError(err_msg, next_obj))

    # An I (or IC) code is registered as pending for the current segment.
    if continued:
        if continued_num in self.link_dict['cur_links']:
            # Same link number already pending in this segment.
            self.trs_parser.error_collector.add(ParserError('Ambiguous I%s in segment.' % (str(continued_num or '')), next_obj))
        else:
            self.link_dict['cur_links'][continued_num] = next_obj
299 
300  ## This method ensures that errors are generated for any utterances that are still waiting around for a link.
301  # This routine must be called after drive() - otherwise, you'll (potentially) be missing some errors.
302  # @param self
def finish(self):
    """Report every I code that is still waiting for a matching C code.

    No further Utterances will arrive, so each entry left in either link
    bucket is added to self.trs_parser.error_collector as a ParserError.
    Entries from the current segment are reported first, then those from
    the previous segment.
    """
    leftovers = (
        # Pending I codes of the current segment: nothing follows them.
        ('cur_links', 'Encountered I%s with no C%s in next segment.'),
        # Pending I codes of the previous segment that were never popped.
        ('prev_links', 'Encountered I%s with no following C%s code.'),
    )
    for bucket_name, template in leftovers:
        for link_num, utter in self.link_dict[bucket_name].items():
            # Link number 0 (a bare 'I') is shown without a number.
            label = str(link_num or '')
            self.trs_parser.error_collector.add(ParserError(template % (label, label), utter))