Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
trs_parser.py
Go to the documentation of this file.
1 ## @package parsers.trs_parser
2 
3 import xml.etree.ElementTree
4 import logging
5 import re
6 import traceback
7 import random
8 
9 from data_structs.segment import Segment
10 from data_structs.utterance import Utterance
11 from data_structs.speaker import Speaker
12 from db.bll_database import DBConstants
13 from parsers.state_machines import *
14 from parsers.errors import *
15 from data_structs.error_collector import ErrorCollector
16 from parsers.parser_tools import *
17 
18 ## This class parses transcribed (or untranscribed) TRS files, producing output in the form of Segment objects (which contain Utterance objects).
19 # The tasks of assigning Utterance start/end times and linking segments into chains are passed off to the parsers.state_machines.StateMachines class.
class TRSParser(object):
    """Parses a TRS file into Segment objects (each holding Utterance objects)."""

    # Every correctly transcribed line should match this regex.
    # General format: "<transcription phrase> <lena notes> <pipe-delimited LENA codes><pipe-delimited transcriber codes>"
    # Capture groups: 1 = transcription phrase, 2 = optional LENA notes code,
    # 3 = optional text found between the outermost pipes.
    # Raw string literals are used so that "\s" and "\|" reach the regex engine
    # without being treated as (invalid) Python string escapes.
    # NOTE(review): the notes codes are spliced into the pattern unescaped -
    # presumably none of them contain regex metacharacters; confirm.
    TRANS_LINE_REGEX = (
        r'^\s*([^\|]*?)\s*('
        + '|'.join(DBConstants.LENA_NOTES_CODES.get_all_options_codes())
        + r')?\s*(?:\|(.*)\|)?\s*$'
    )

    # Used to check for angle brackets, to see if a particular transcription
    # should be marked as 'overlapping'.
    TRANS_OVERLAP_REGEX = r'\s*<.*>\s*'

27 
28  ## Constructor
29  # @param self
30  # @param filename (string) full path to TRS file to parse
31  def __init__(self, filename):
  # Stores a module-named logger and the path of the TRS file, then (as the
  # statements appear in this listing) initialises the parser's containers and
  # loads the XML tree from self.filename.
  # NOTE(review): this listing skips its own lines 36, 40-41 and 46. The Doxygen
  # header further down ("Sets up data structures...") has no visible def line,
  # so the statements after it presumably belong to a separate setup method that
  # __init__ calls - TODO confirm against the original file.
32  self.logger = logging.getLogger(__name__)
33  self.filename = filename
34 
35  #perform setup of data structures used in the parsing process
37 
38  ## Sets up data structures used to iterate through the XML and track segments, utterances, errors, etc.
39  # @param self
  # NOTE(review): self.error_collector and self.link_sm are read by other methods
  # of this class, but no assignment to either is visible in this listing -
  # presumably they are created on one of the missing lines; verify.
42  self.segments = []
43  self.speakers = {}
44  self.utter_index = {} #for lookup by utter_id, only build on demand
45  self.parsed = False
47  self.total_utters = 0
48  self.tree = None
49 
50  #Use Python ElementTree library to parse the XML in the TRS file.
  # On any failure the exception is logged (message plus stack trace) and
  # self.tree keeps the None assigned above; parse() and _parse() test self.tree
  # before doing any work.
51  try:
52  self.tree = xml.etree.ElementTree.parse(self.filename)
53 
54  except Exception as e:
55  self.logger.error("Unable to open TRS file. Exception: %s" % e)
56  self.logger.error("Stack trace: %s" % (traceback.format_exc()))
57 
58  ## Retrieves the errors and warnings found by this parser in the form of an ErrorCollector object.
59  # It provides methods to look up various errors/warnings by type.
60  # @param self
61  # @returns (ErrorCollector) - this object can be used to lookup errors/warnings by type (see errors.ErrorCollector class)
def get_errors(self):
    """Return the ErrorCollector that holds the errors/warnings recorded by this parser."""
    collector = self.error_collector
    return collector
64 
65  ## Resets internal data structures and parses the TRS file a second time. Useful if the file has changed since the last parse.
66  # All cached segments/utterances from the last parse are cleared.
67  # @param self
68  # @param progress_update_fcn (function=None) function accepting a value in [0,1] to display as a progress bar - see utils.ProgressDialog. This value is used to indicate the level of completeness <em>of the current phase</em>
69  # @param progress_next_phase_fcn(function=None) - moves the progress bar to the next phase, which causes new text to be displayed in the bar - see utils.ProgressDialog
70  # @param validate (boolean=True) set to True if you want the parser to check for errors (can be retrieved with get_errors()), False otherwise
71  # @param seg_filters (list=[]) list of SegFilter objects. These filters are applied to the segments list in a permanent manner (i.e. anything they filter out will not be returned by this parser)
72  # @returns (list) list of Segment objects
73  def re_parse(self, progress_update_fcn=None, progress_next_phase_fcn=None, validate=True, seg_filters=[], remove_bad_trans_codes=True):
  # As shown in this listing, the method only forwards its arguments to parse()
  # and returns whatever parse() returns.
  # NOTE(review): nothing visible here clears the cached state. parse() skips
  # its work once self.parsed is set and also drops self.tree, so without a
  # reset this call would just return the cached segments. The listing skips its
  # own line 74, so a reset call was presumably lost there - TODO confirm
  # against the original file.
  # NOTE(review): seg_filters=[] is a mutable default shared between calls; it
  # is not mutated in this method, only passed on.
75 
76  return self.parse(progress_update_fcn=progress_update_fcn,
77  progress_next_phase_fcn=progress_next_phase_fcn,
78  validate=validate,
79  seg_filters=seg_filters,
80  remove_bad_trans_codes=remove_bad_trans_codes)
81 
82  ## Parses the TRS file, returning a list of Segments.
83  # @param self
84  # @param progress_update_fcn (function=None) function accepting a value in [0,1] to display as a progress bar - see utils.ProgressDialog. This value is used to indicate the level of completeness <em>of the current phase</em>
85  # @param progress_next_phase_fcn(function=None) - moves the progress bar to the next phase, which causes new text to be displayed in the bar - see utils.ProgressDialog
86  # @param validate (boolean=True) set to True if you want the parser to check for errors (can be retrieved with get_errors()), False otherwise
87  # @param seg_filters (list=[]) list of SegFilter objects. These filters are applied to the internal segments list in a permanent manner (i.e. anything they filter out will not be returned by this parser)
88  # @returns (list) list of Segment objects
def parse(self, progress_update_fcn=None, progress_next_phase_fcn=None, validate=True, seg_filters=None, remove_bad_trans_codes=True):
    """Parse the TRS file (once) and return the list of Segment objects.

    The work is only done when the XML tree was loaded and no earlier call has
    already parsed it; otherwise the cached self.segments list is returned.

    progress_update_fcn -- optional callable given a completion fraction
    progress_next_phase_fcn -- optional callable invoked before validation starts
    validate -- when true, every utterance's validate() is run against
        self.error_collector
    seg_filters -- list of filter objects handed on to _parse(); None (the
        default) means "no filters"
    remove_bad_trans_codes -- handed on to _parse()
    """
    # A fresh list per call instead of a shared mutable default argument.
    if seg_filters is None:
        seg_filters = []

    # Make sure the xml was readable, and results have not yet been cached.
    if self.tree and not self.parsed:
        # Parse the file, driving the linking state machine as we go.
        self._parse(progress_update_fcn, seg_filters, remove_bad_trans_codes)
        self.link_sm.finish()

        # Validate the utterances, if requested.
        if validate:
            if progress_next_phase_fcn:
                progress_next_phase_fcn()

            validated = 0
            for seg in self.segments:
                for utter in seg.utters:
                    # Errors are logged to the collector object.
                    utter.validate(self.error_collector)
                    validated += 1

                    if progress_update_fcn:
                        progress_update_fcn(float(validated) / float(self.total_utters))

        self.parsed = True
        # No need to keep this huge object in memory anymore.
        self.tree = None

    return self.segments
115 
116  ## Retrieves an Utterance object (residing in one of this trs parser's segments) by its utterance id attribute.
117  # @param self
118  # @param utter_id (int) utterance id to search for
119  # @returns (Utterance) the requested Utterance object, or None if not found
def get_utter_by_id(self, utter_id):
    """Look up an Utterance by its id; returns None when no utterance has that id.

    The first lookup (or any lookup while the index is still empty) triggers
    parse() and builds a dictionary keyed by utterance id, so later accesses
    are plain dictionary lookups. Only references are stored, not copies.
    """
    if not self.utter_index:
        self.parse()
        index = {}
        for seg in self.segments:
            index.update((utter.id, utter) for utter in seg.utters)
        self.utter_index = index

    return self.utter_index.get(utter_id)
136 
137  ## This method performs the actual parsing work that produces a list of Segment objects from the XML.
138  # @param self
139  # @param progress_update_fcn (function) see identical parameter description in parse()
140  # @param seg_filters (list) list of SegFilter objects to apply to the segments as they are created. Anything that these filters exclude will not be in the returned list (their changes are made permanent).
141  # @returns (list) list of Segment objects
def _parse(self, progress_update_fcn, seg_filters, remove_bad_trans_codes):
    """Build Segment objects from the <Turn> elements and store them in self.segments.

    Does nothing when the XML tree could not be loaded. Segments rejected by
    ParserTools.include_seg() are not stored.

    progress_update_fcn -- optional callable given the fraction of turns processed
    seg_filters -- filter list passed to ParserTools.include_seg()
    remove_bad_trans_codes -- passed on to _parse_utters()
    """
    # Make sure the xml was readable.
    if not self.tree:
        return

    # Fill self.speakers from the xml file before resolving speaker ids.
    self._parse_speakers()

    turn_list = list(self.tree.iter('Turn'))
    total_turns = len(turn_list)

    # Iterate through turns, pulling out utterances.
    for turn_num, turn in enumerate(turn_list):
        # Some turns have no speakers. split() without an argument also copes
        # with an empty attribute or repeated whitespace between ids.
        speaker_ids = turn.attrib.get('speaker', '').split()
        # A real list (not a lazy map object), because the speakers are later
        # indexed and measured with len().
        seg_speakers = [self.speakers[spkr_id] for spkr_id in speaker_ids]

        seg = Segment(turn_num,
                      float(turn.attrib['startTime']),
                      float(turn.attrib['endTime']),
                      seg_speakers)

        # Parse the utterances out of this turn element and store them in the segment.
        seg.utters = self._parse_utters(seg, turn, remove_bad_trans_codes)

        # Keep the segment only if the user's filters do not exclude it.
        if ParserTools.include_seg(seg, seg_filters):
            self.segments.append(seg)

        # Update any progress bars, if present.
        if progress_update_fcn:
            progress_update_fcn(float(turn_num + 1) / float(total_turns))
177 
178  ## Creates Utterance objects for a given XML turn element.
179  # @param self
180  # @param seg (Segment) the parent Segment object
181  # @param turn (Element) an etree.Element object representing the XML node
182  # @returns (list) list of Utterance objects
def _parse_utters(self, seg, turn, remove_bad_trans_codes):
    """Return the Utterance list that a ParseUttersStateMachine produces for one turn element.

    seg -- the parent Segment object
    turn -- the XML element for the turn
    remove_bad_trans_codes -- passed to the state machine's constructor
    """
    # The state machine handles Utterance creation and start/end time assignment.
    machine = ParseUttersStateMachine(self, seg, remove_bad_trans_codes)

    # Feed it every element yielded by turn.iter(), in order.
    for node in turn.iter():
        machine.drive(node)

    # Let it perform any final end time assignments before collecting the result.
    machine.finish(turn)
    return machine.get_result()
196 
197  ## Extracts Utterance attributes (eg. transcriber codes, transcription phrase, etc.) from the text following a <sync> element.
198  # @param self
199  # @param seg (Segment) A Segment object that Utterances created from this text should appear within.
200  # @param el (etree Element) An XML "text element" from the etree library, containing the data immediately following a <sync> tag. This may span multiple lines.
201  # @returns (list) List of Utterance objects with their attributes set. Multiple Utterance objects are created from the text if it spans multiple lines (different speakers) or uses the '.' operator (see the transcriber manual).
def _parse_speech_data(self, seg, el, remove_bad_trans_codes):
    """Create Utterance objects from the text that follows an XML element.

    The element's tail text is split into lines, and each line is split again
    at the '.' operator; one Utterance is created per resulting piece. When
    there is no tail text, a single empty Utterance is created instead.
    self.total_utters is incremented for every Utterance created.

    seg -- the Segment the new Utterances belong to
    el -- the XML element whose tail text holds the transcription
    remove_bad_trans_codes -- passed on to _assign_utter_attribs()

    Returns the list of Utterance objects created.
    """
    utter_list = []

    # ElementTree sets tail to None when no text follows the element, so fall
    # back to an empty string instead of calling strip() on None.
    text = (el.tail or '').strip()

    # If the element was not transcribed, just create an empty Utterance for it.
    if not text:
        utter = Utterance()
        utter.seg = seg
        if seg.speakers:
            # Assume first speaker for now.
            utter.speaker = seg.speakers[0]
        self.total_utters += 1
        utter_list.append(utter)
        return utter_list

    # Each line is treated as a separate utterance, since (according to the
    # transcriber manual) new lines are used for different speakers.
    for line_num, line in enumerate(re.split(r'\s*\n\s*', text)):
        # Transcribers use '.' to split apart segments that LENA has
        # mistakenly put together.
        pieces = re.split(r'\s*\.\s*', line)
        # More than one piece means the line contained at least one '.'.
        is_dot_split = len(pieces) > 1

        for piece in pieces:
            utter = Utterance()
            utter.seg = seg
            utter.is_dot_split = is_dot_split
            self.total_utters += 1

            # First line has the speaker indicated by LENA. Subsequent lines
            # are considered to be other speakers.
            if line_num == 0:
                self._assign_speaker(el, utter)

            self._assign_utter_attribs(utter, piece, remove_bad_trans_codes)
            utter_list.append(utter)

    return utter_list
238 
239  ## Determines the speaker for an Utterance, and sets the Utterance speaker attribute to an appropriate Speaker object.
240  # @param self
241  # @param el (etree Element object) The XML element (with either a "sync" or a "who" tag) that corresponds to utter
242  # @param utter (Utterance) The Utterance object to assign a speaker to
def _assign_speaker(self, el, utter):
    """Set utter.speaker from the XML element that the utterance came from.

    el -- XML element; only the tags 'Sync' and 'Who' are handled
    utter -- Utterance to update; its seg.speakers list supplies the candidates

    A 'Sync' element yields the segment's first speaker (None if there is
    none). A 'Who' element selects by its 1-based 'nb' attribute; when that
    index falls outside the segment's speaker list, utter.speaker is left
    untouched.
    """
    if el.tag == 'Sync':
        speakers = utter.seg.speakers
        utter.speaker = speakers[0] if len(speakers) > 0 else None
    elif el.tag == 'Who':
        speakers = utter.seg.speakers
        speaker_index = int(el.attrib['nb']) - 1
        # Sometimes there's human error and 'nb' does not match the segment's
        # speakers. The lower bound matters too: nb="0" would otherwise become
        # index -1 and silently pick the last speaker.
        if 0 <= speaker_index < len(speakers):
            utter.speaker = speakers[speaker_index]
253 
254  ## Performs the actual assignment of utterance attributes (like transcription phrase, codes, etc.), based upon a line from the TRS file.
255  # @param self
256  # @param utter (Utterance) the object we are assigning attributes to
257  # @param line (string) the text following a "sync" or "who" element. This contains LENA codes, plus transcriber added data (and more)
258  def _assign_utter_attribs(self, utter, line, remove_bad_trans_codes):
  # Fills in utter.trans_phrase, utter.is_trans_overlap, utter.lena_notes,
  # utter.lena_codes and utter.trans_codes from one line of transcription text,
  # and passes the utterance to self.link_sm.drive().
  # remove_bad_trans_codes: when true, transcriber codes that fail their
  # validation pattern are replaced with '' (see the loop below).
  # NOTE(review): self.link_sm is not assigned anywhere in the visible listing -
  # presumably it is created on one of the lines the listing dropped; verify.
259  #sometimes there is no data...
260  if (line):
261  #grab the data using regex capturing groups
262  match = re.search(TRSParser.TRANS_LINE_REGEX, line)
263  #the above match "should" never fail (the regex will match anything), but it may not capture groups if the corresponding text isn't present. Therefore we assign them carefully.
264  utter.trans_phrase = ''
265  utter.lena_notes = ''
266  codes = ''
  # Group 0 is the transcription phrase, group 1 the LENA notes code, group 2
  # the text between the outermost pipes. If match were None, the .groups()
  # call would raise and be caught by the broad except below, which only logs.
267  try:
268  utter.trans_phrase = match.groups()[0] or '' #change None into empty string
269  utter.is_trans_overlap = re.search(TRSParser.TRANS_OVERLAP_REGEX, utter.trans_phrase) != None
270  utter.lena_notes = match.groups()[1] or ''
271  codes = match.groups()[2] or ''
272  except Exception as err:
273  self.logger.error('Found invalid transcription line in TRS file: %s' % line)
274 
275  #assign any codes using another regex
276  if codes:
  # Split the captured text on '|' (empty fields are dropped by the '+').
277  codes_list = re.findall('[^\|]+', codes)
278 
279  #These have been verified through TRANS_LINE_REGEX.
  # NOTE(review): with exactly three codes, len(codes_list) - 4 is -1, so the
  # slices below put the first two codes in lena_codes and only the last one in
  # trans_codes; with one or two codes everything lands in trans_codes. Using
  # codes_list[:-4] / codes_list[-4:] would avoid the wrap-around - confirm the
  # intended behaviour for short code lists.
280  lena_codes = codes_list[0: len(codes_list) - 4]
281  #These have not.
282  #We assume last 4 codes are transcriber codes, setting invalid ones to empty
283  #string if the remove_bad_trans_codes flag is set.
284  trans_codes = codes_list[len(codes_list) - 4:]
285 
286  if remove_bad_trans_codes:
287  for i in range(len(trans_codes)):
288  code = trans_codes[i]
  # Default check: the code must be exactly one character taken from the option
  # codes of DBConstants.TRANS_CODES[i] (they are joined into a character class).
289  pattern = '^[%s]$' % (''.join(DBConstants.TRANS_CODES[i].get_all_options_codes()))
290 
291  #code 2 can have multiple chars, with numbers (I1, C1, etc.)
292  if i == 2:
293  pattern = '^([%s][1-9]?)+$' % (''.join(DBConstants.TRANS_CODES[i].get_all_options_codes()))
294 
295  if not re.match(pattern, code):
296  trans_codes[i] = ''
297 
298  utter.lena_codes = lena_codes
299  utter.trans_codes = trans_codes
300 
301  #let the state machine know that we've got this data (this method should really be refactored so that this line can be moved to state_machines.py, where it belongs...)
  # NOTE(review): the nesting level of this call (inside "if codes", inside
  # "if (line)", or at method level) cannot be recovered from this flattened
  # listing - TODO confirm against the original file.
302  self.link_sm.drive(utter)
303 
304  ## Grabs a list of all of the speakers in the TRS file, from the <Speakers> tag (which appears near the top). Creates Speaker objects for them and stores them in the self.speakers list.
305  # @param self
def _parse_speakers(self):
    """Populate self.speakers (keyed by speaker id) from the <Speakers> element.

    Does nothing when self.speakers already has entries. A missing <Speakers>
    element is logged and leaves self.speakers empty. Speaker codes for which
    DBConstants.SPEAKER_CODES.get_option() yields no code info are logged as
    unrecognized, but the Speaker is still stored.
    """
    if self.speakers:
        return

    # find() returns None when the element is absent; guard before findall().
    speakers_el = self.tree.getroot().find('Speakers')
    if speakers_el is None:
        self.logger.error('No <Speakers> element found in TRS file.')
        return

    for person in speakers_el.findall('Speaker'):
        speaker_id = person.attrib['id'].strip()
        speaker_code = person.attrib['name'].strip()
        speaker = Speaker(speaker_id, DBConstants.SPEAKER_CODES.get_option(speaker_code))
        self.speakers[speaker_id] = speaker

        # A missing codeinfo indicates the speaker code is not in the DB table.
        if speaker.speaker_codeinfo is None:
            self.logger.error('Unrecognized speaker code: %s' % (speaker_code))