3 import xml.etree.ElementTree
# Pattern for one transcription line. Capture groups:
#   1 - the leading text up to the first '|' (lazy match, surrounding
#       whitespace excluded),
#   2 - optionally, one of the LENA notes option codes,
#   3 - optionally, everything between the outermost pair of '|' characters.
# Raw strings are used so that '\s' and '\|' reach the regex engine without
# relying on Python's handling of unrecognized string escapes (which emits a
# DeprecationWarning/SyntaxWarning on modern interpreters). The resulting
# pattern text is unchanged.
# NOTE(review): the option codes are joined into the alternation unescaped -
# this assumes they contain no regex metacharacters; confirm against
# DBConstants.
TRANS_LINE_REGEX = (
    r'^\s*([^\|]*?)\s*('
    + '|'.join(DBConstants.LENA_NOTES_CODES.get_all_options_codes())
    + r')?\s*(?:\|(.*)\|)?\s*$'
)
# Pattern that matches text containing an angle-bracketed span ('<' ... '>'),
# with optional surrounding whitespace. Written as a raw string so '\s' is not
# treated as an (invalid) Python string escape; the pattern text is unchanged.
TRANS_OVERLAP_REGEX = r'\s*<.*>\s*'
32 self.
logger = logging.getLogger(__name__)
54 except Exception
as e:
55 self.logger.error(
"Unable to open TRS file. Exception: %s" % e)
56 self.logger.error(
"Stack trace: %s" % (traceback.format_exc()))
73 def re_parse(self, progress_update_fcn=None, progress_next_phase_fcn=None, validate=True, seg_filters=[], remove_bad_trans_codes=True):
76 return self.
parse(progress_update_fcn=progress_update_fcn,
77 progress_next_phase_fcn=progress_next_phase_fcn,
79 seg_filters=seg_filters,
80 remove_bad_trans_codes=remove_bad_trans_codes)
89 def parse(self, progress_update_fcn=None, progress_next_phase_fcn=None, validate=True, seg_filters=[], remove_bad_trans_codes=True):
93 self.
_parse(progress_update_fcn, seg_filters, remove_bad_trans_codes)
98 if progress_next_phase_fcn:
99 progress_next_phase_fcn()
103 for utter
in seg.utters:
108 if progress_update_fcn:
109 progress_update_fcn(float(cur_utter) / float(self.
total_utters))
129 for utter
in seg.utters:
142 def _parse(self, progress_update_fcn, seg_filters, remove_bad_trans_codes):
149 turn_list = list(self.tree.iter(
'Turn'))
150 total_turns = len(turn_list)
153 for turn_num
in range(total_turns):
155 if not 'speaker' in turn_list[turn_num].attrib:
158 seg_speakers = map(
lambda spkr_id: self.
speakers[spkr_id], turn_list[turn_num].attrib[
'speaker'].split(
' '))
162 float(turn_list[turn_num].attrib[
'startTime']),
163 float(turn_list[turn_num].attrib[
'endTime']),
167 seg.utters = self.
_parse_utters(seg, turn_list[turn_num], remove_bad_trans_codes)
170 if ParserTools.include_seg(seg, seg_filters):
172 self.segments.append(seg)
175 if progress_update_fcn:
176 progress_update_fcn(float(turn_num + 1) / float(total_turns))
188 for el
in turn.iter():
195 return sm.get_result()
204 text = el.tail.strip()
207 speaker_utters = re.split(
'\s*\n\s*', text)
210 for i
in range(len(speaker_utters)):
212 have_multi_utters = re.search(
r'\s*\.\s*', speaker_utters[i]) !=
None
213 multi_utters = re.split(
r'\s*\.\s*', speaker_utters[i])
214 for j
in range(len(multi_utters)):
217 utter.is_dot_split = have_multi_utters
226 utter_list.append(utter)
233 utter.speaker = seg.speakers[0]
235 utter_list.append(utter)
246 utter.speaker = utter.seg.speakers[0]
if len(utter.seg.speakers) > 0
else None
248 elif el.tag ==
'Who':
249 speaker_index = int(el.attrib[
'nb']) - 1
251 if speaker_index < len(utter.seg.speakers):
252 utter.speaker = utter.seg.speakers[speaker_index]
262 match = re.search(TRSParser.TRANS_LINE_REGEX, line)
264 utter.trans_phrase =
''
265 utter.lena_notes =
''
268 utter.trans_phrase = match.groups()[0]
or ''
269 utter.is_trans_overlap = re.search(TRSParser.TRANS_OVERLAP_REGEX, utter.trans_phrase) !=
None
270 utter.lena_notes = match.groups()[1]
or ''
271 codes = match.groups()[2]
or ''
272 except Exception
as err:
273 self.logger.error(
'Found invalid transcription line in TRS file: %s' % line)
277 codes_list = re.findall(
'[^\|]+', codes)
280 lena_codes = codes_list[0: len(codes_list) - 4]
284 trans_codes = codes_list[len(codes_list) - 4:]
286 if remove_bad_trans_codes:
287 for i
in range(len(trans_codes)):
288 code = trans_codes[i]
289 pattern =
'^[%s]$' % (
''.join(DBConstants.TRANS_CODES[i].get_all_options_codes()))
293 pattern =
'^([%s][1-9]?)+$' % (
''.join(DBConstants.TRANS_CODES[i].get_all_options_codes()))
295 if not re.match(pattern, code):
298 utter.lena_codes = lena_codes
299 utter.trans_codes = trans_codes
302 self.link_sm.drive(utter)
308 for person
in self.tree.getroot().find(
'Speakers').findall(
'Speaker'):
309 speaker_id = person.attrib[
'id'].strip()
310 speaker_code = person.attrib[
'name'].strip()
311 self.
speakers[speaker_id] =
Speaker(speaker_id, DBConstants.SPEAKER_CODES.get_option(speaker_code))
312 if self.
speakers[speaker_id].speaker_codeinfo ==
None:
313 self.logger.error(
'Unrecognized speaker code: %s' % (speaker_code))