Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
its_parser.py
Go to the documentation of this file.
1 import xml.etree.ElementTree as etree
2 import logging
3 import traceback
4 import re
5 
6 from data_structs.segment import Segment
7 from data_structs.utterance import Utterance
8 from data_structs.speaker import Speaker
9 from db.bll_database import DBConstants
10 
11 #note: there is no transcriber information or validation to worry about here
class ITSParser(object):
    """Parse an ITS (XML) file into a list of Segment objects.

    Each child element of every "Recording" element becomes one Segment, and
    each "Segment" element nested inside it becomes one Utterance attached to
    that Segment.

    The file is loaded in the constructor; if loading fails, the error is
    logged and parse() simply returns an empty list.
    """

    # Time attributes are strings of the form "PT<seconds>S"; the captured
    # group is converted to a float number of seconds.
    _TIME_RE = re.compile(r'^PT(.*)S$')

    # Element attributes that carry '|'-separated codes for an utterance.
    _CODE_ATTRIBS = ('recordingInfo', 'conversationInfo')

    def __init__(self, filename):
        """Load the XML tree from ``filename``.

        Args:
            filename: Anything accepted by ``xml.etree.ElementTree.parse``.

        Loading errors are logged rather than raised; in that case
        ``self.tree`` stays ``None``.
        """
        self.logger = logging.getLogger(__name__)
        self.filename = filename
        self.segments = []
        self.parsed = False
        # Define the attribute up front so parse() can safely test it even
        # when loading the file fails below.
        self.tree = None

        try:
            self.tree = etree.parse(self.filename)
        except Exception as e:
            # Deliberately best-effort: report the problem and leave the
            # parser in an "empty" state instead of propagating.
            self.logger.error('Unable to parse ITS file. Exception: %s', e)
            self.logger.error('Stack trace: %s', traceback.format_exc())

    def parse(self):
        """Build (once) and return the list of Segment objects.

        Repeated calls return the same cached list. If the file could not be
        loaded, the returned list is empty.

        Returns:
            list: ``self.segments``.
        """
        if self.tree is not None and not self.parsed:
            # Collect into a local list first so that an exception part-way
            # through does not leave self.segments half filled (a retry would
            # otherwise append duplicates).
            built = []
            for rec in self.tree.iter('Recording'):
                # Iterating an Element yields its direct children
                # (Element.getchildren() was removed in Python 3.9).
                # Segment numbers restart at 0 for every Recording.
                for i, child in enumerate(rec):
                    built.append(self._build_seg(i, child))

            self.segments.extend(built)
            self.parsed = True

        return self.segments

    def _parse_time(self, time_str):
        """Convert a "PT<seconds>S" string into a float number of seconds.

        Raises:
            ValueError: if ``time_str`` does not have the expected form or
                the enclosed value is not a valid float.
        """
        match = self._TIME_RE.match(time_str)
        if match is None:
            raise ValueError('Unrecognized ITS time string: %r' % (time_str,))
        return float(match.group(1))

    #el is either a "Conversation" or "Pause" element
    def _build_seg(self, num, el):
        """Create a Segment for ``el`` and attach its utterances and speakers.

        Args:
            num: Segment number passed through to the Segment constructor.
            el: Element with 'startTime' and 'endTime' attributes.

        Returns:
            Segment with ``utters`` and ``speakers`` populated.
        """
        start = self._parse_time(el.attrib['startTime'])
        end = self._parse_time(el.attrib['endTime'])
        seg = Segment(num, start, end)

        seg_utters = []
        # Keyed by speaker code so each code is listed once per segment
        # (the last utterance's Speaker object wins).
        seg_spkrs = {}
        for child_el in el.iter('Segment'):
            utter = self._build_utter(child_el, seg)
            seg_utters.append(utter)
            seg_spkrs[utter.speaker.speaker_codeinfo.code] = utter.speaker

        seg.utters = seg_utters
        # Materialize as a real list; on Python 3 dict.values() is a view.
        seg.speakers = list(seg_spkrs.values())

        return seg

    #el is a "Segment" element
    def _build_utter(self, el, seg):
        """Create an Utterance from ``el`` and link it to ``seg``.

        Args:
            el: Element with 'startTime', 'endTime' and 'spkr' attributes,
                and optionally 'recordingInfo' / 'conversationInfo'.
            seg: The Segment this utterance belongs to.

        Returns:
            Utterance with start, end, speaker, lena_codes and seg set.
        """
        codeinfo = DBConstants.SPEAKER_CODES.get_option(el.attrib['spkr'])

        utter = Utterance()
        utter.start = self._parse_time(el.attrib['startTime'])
        utter.end = self._parse_time(el.attrib['endTime'])
        #ITS files have no speaker_id (e.g. 'spk1') like TRS files do
        utter.speaker = Speaker(None, codeinfo)
        utter.lena_notes = None #don't have access to this info in ITS

        for attrib_name in self._CODE_ATTRIBS:
            info = el.attrib.get(attrib_name, '')
            if len(info) > 1:
                # Drop the first and last characters (presumably enclosing
                # delimiters - TODO confirm) and split the rest on '|'.
                utter.lena_codes.extend(info[1:-1].split('|'))

        utter.seg = seg

        return utter
91