4 from xml.etree
import ElementTree
17 self.
logger = logging.getLogger(__name__)
20 self.
filename_base = re.match(
'.*[\\\/]([^\\\/]+)\.[Tt][Rr][Ss]$', filename).groups()[0]
26 except Exception
as e:
27 self.logger.error(
"Unable to open TRS file. Exception: %s" % e)
35 for person
in self.tree.getroot().find(
'Speakers').findall(
'Speaker'):
36 cur_speaker_num = int( re.match(
'^spk(\d+)$', person.attrib[
'id']).groups()[0] )
37 max_speaker_num = max(max_speaker_num, cur_speaker_num)
39 return max_speaker_num + 1
46 speakers = tree.getroot().find(
'Speakers')
47 speakers.append(ElementTree.Element(
'Speaker', attrib={
'accent':
'',
50 'id':
'spk%d' % (speaker_num),
60 hours, mins, sec = BackendUtils.break_time(total_sec)
62 return '%02d_%02d_%s%0.2f' % (hours, mins,
'0' if sec < 10
else '', sec)
68 def split(self, win_len, progress_update_fcn=None):
70 sections = list(self.tree.iter(
'Section'))
74 while i < len(sections):
75 new_episode, i, start_time, end_time = self.
_build_episode(sections, i, win_len, void_speaker_num, progress_update_fcn)
77 temp_tree = ElementTree.parse(self.
filename)
80 trans = temp_tree.getroot()
81 old_episode = trans.find(
'Episode')
82 trans.remove(old_episode)
83 trans.append(new_episode)
85 start_str = str(start_time).replace(
'.',
'_')
86 end_str = str(end_time).replace(
'.',
'_')
90 dest_file = open(dest_filename,
'wb')
91 dest_file.write(
'<?xml version="1.0" encoding="ISO-8859-1"?>\n')
92 dest_file.write(
'<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
94 dest_file = open(dest_filename,
'a')
95 temp_tree.write(dest_file, xml_declaration=
False)
106 void_section = ElementTree.Element(
'Section', attrib={
'startTime': str(start_time),
'endTime': str(end_time),
'type':
'report'})
107 void_turn = ElementTree.SubElement(void_section,
'Turn', attrib={
'startTime': str(start_time),
'endTime': str(end_time),
'speaker':
'spk%d' % (speaker_num)})
118 def _build_episode(self, sections, start_offset, win_len, void_speaker_num, progress_update_fcn):
119 new_episode = ElementTree.Element(
'Episode')
122 start_seg_time = float(sections[i].attrib[
'startTime'])
123 end_seg_time = float(sections[i].attrib[
'endTime'])
124 limit_end_time = start_seg_time + win_len
127 if start_seg_time > 0:
129 new_episode.append(void_section)
131 while i < len(sections)
and float(sections[i].attrib[
'endTime']) < limit_end_time:
132 end_seg_time = float(sections[i].attrib[
'endTime'])
133 new_episode.append(sections[i])
136 if progress_update_fcn:
137 progress_update_fcn(float(i) / float(len(sections)))
139 if (i - start_offset) == 0
and i < len(sections)
and float(sections[i].attrib[
'endTime']) >= limit_end_time:
140 new_episode.append(sections[i])
141 end_seg_time = float(sections[i].attrib[
'endTime'])
144 if progress_update_fcn:
145 progress_update_fcn(float(i) / float(len(sections)))
147 src_file_end_time = float(sections[-1].attrib[
'endTime'])
148 if end_seg_time < src_file_end_time:
149 void_section = self.
_get_void_section(end_seg_time, src_file_end_time, void_speaker_num)
150 new_episode.append(void_section)
152 return new_episode, i, start_seg_time, end_seg_time
163 self.
split_files = filter(
lambda name: re.match(
'.+?-\[\d+_\d+_\d+\.\d+-\d+_\d+_\d+\.\d+\]-\d+\.[Tt][Rr][Ss]$', name), self.
split_files)
173 speakers = tree.getroot().find(
'Speakers').findall(
'Speaker')
177 while not void_id
and i < len(speakers):
178 if speakers[i].attrib[
'name'] ==
'VOID':
179 void_id = speakers[i].attrib[
'id']
189 speakers_el = tree.getroot().find(
'Speakers')
190 speakers = speakers_el.findall(
'Speaker')
194 while not found
and i < len(speakers):
195 if speakers[i].attrib[
'name'] ==
'VOID':
196 speakers_el.remove(speakers[i])
204 def merge(self, progress_update_fcn=None):
205 new_episode = ElementTree.Element(
'Episode')
207 void_speaker_id =
None
209 for i
in range(num_split_files):
213 if not void_speaker_id:
216 episode = tree.getroot().find(
'Episode')
217 for child
in episode:
218 turn = child.find(
'Turn')
219 if not (child.tag ==
'Section' and turn
is not None and turn.attrib[
'speaker'] == void_speaker_id):
220 new_episode.append(child)
222 if progress_update_fcn:
223 progress_update_fcn(float(i + 1) / float(num_split_files))
228 trans = merged_tree.getroot()
229 trans.remove(trans.find(
'Episode'))
230 trans.append(new_episode)
234 dest_file = open(dest_filename,
'wb')
235 dest_file.write(
'<?xml version="1.0" encoding="ISO-8859-1"?>\n')
236 dest_file.write(
'<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
238 dest_file = open(dest_filename,
'a')
239 merged_tree.write(dest_file, xml_declaration=
False)
250 match = re.match(
'.*?-(\d+)\.[Tt][Rr][Ss]$', x)
251 x_num = int(match.groups()[0])
253 match = re.match(
'.*?-(\d+)\.[Tt][Rr][Ss]$', y)
254 y_num = int(match.groups()[0])