Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
trs_splitter.py
Go to the documentation of this file.
1 ## @package parsers.trs_splitter
2 
3 import traceback
4 from xml.etree import ElementTree
5 import logging
6 import re
7 import os
8 from utils.backend_utils import BackendUtils
9 
## This class splits a TRS file into chunks of a given length, writing the split files to a user-specified directory.
class TRSSplitter(object):
    # Captures the part of a TRS filename between the (optional) directory prefix and the '.trs' extension.
    _FILENAME_RE = re.compile(r'(?:.*[\\/])?([^\\/]+)\.[Tt][Rr][Ss]$')
    # Captures the integer part of speaker ids of the form 'spk0', 'spk1', etc.
    _SPEAKER_ID_RE = re.compile(r'^spk(\d+)$')

    ## Constructor
    # If the TRS file cannot be parsed, the error is logged (and the traceback printed) and self.tree is left as None.
    # @param self
    # @param filename (string) name of the TRS file to split (absolute path)
    # @param dest_path (string) directory in which to store the split TRS files (absolute path)
    # @throws ValueError if filename does not end in a '.trs' extension (case-insensitive)
    def __init__(self, filename, dest_path):
        self.logger = logging.getLogger(__name__)
        self.dest_path = dest_path

        #grab the portion of the filename between the path prefix and the extension
        match = self._FILENAME_RE.match(filename)
        if match is None:
            raise ValueError('Not a TRS filename: "%s"' % (filename))
        self.filename_base = match.group(1)
        self.filename = filename

        #make sure the attribute exists even when parsing fails, so that later calls can report a clear error
        self.tree = None
        try:
            self.tree = ElementTree.parse(self.filename)

        except Exception as e:
            self.logger.error("Unable to open TRS file. Exception: %s" % e)
            traceback.print_exc()

    ## Speakers are given string ids 'spk0', 'spk1', etc. This method retrieves the integer from the next available id.
    # Speaker ids that do not follow the 'spk<number>' pattern are ignored.
    # @param self
    # @returns (int) next available id for a Speaker
    def _get_next_speaker_num(self):
        max_speaker_num = -1
        for person in self.tree.getroot().find('Speakers').findall('Speaker'):
            match = self._SPEAKER_ID_RE.match(person.attrib.get('id', ''))
            if match is not None:
                max_speaker_num = max(max_speaker_num, int(match.group(1)))

        return max_speaker_num + 1

    ## Inserts a speaker with code 'VOID' into the xml file. This speaker is used to pad the start and end of the file (from time 0 to start of first segment, and from end of last segment to end of file time). This is done so that when the split file is opened in transcriber, the wav file will still sync up. This method modifies the "Speakers" tag at the top of a TRS file. This tag contains a list of all the speakers in the file. Nothing is returned - instead, the tree param is directly modified.
    # @param self
    # @param tree (etree ElementTree) The XML tree in which to search for the speakers tag
    # @param speaker_num (int) the number for the new VOID speaker (should be unused by other speakers already present) - see _get_next_speaker_num()
    def _insert_void_speaker(self, tree, speaker_num):
        speakers = tree.getroot().find('Speakers')
        speakers.append(ElementTree.Element('Speaker', attrib={
            'accent': '',
            'check': 'no',
            'dialect': 'native',
            'id': 'spk%d' % (speaker_num),
            'name': 'VOID',
            'scope': 'local',
        }))

    ## Constructs a string in the format "hh_mm_ss.ss" from a total seconds count. Underscores are used as separators because the result is embedded in a filename.
    # @param self
    # @param total_sec (float) The total second count to convert to the specified format
    # @returns (string) the formatted result, as indicated above
    def _get_time_str(self, total_sec):
        #NOTE(review): break_time() is defined elsewhere; it is assumed to return an (hours, minutes, seconds) tuple - confirm
        hours, mins, sec = BackendUtils.break_time(total_sec)

        #%05.2f zero-pads the seconds to two integer digits and two decimals (e.g. 3.5 -> '03.50')
        return '%02d_%02d_%05.2f' % (hours, mins, sec)

    ## Writes an XML tree to a file, preceded by the XML declaration and the Transcriber DOCTYPE line.
    # The declaration is written by hand because ElementTree is asked not to emit one (it would not include the DOCTYPE).
    # A single binary handle is used for both the header and the tree, and it is always closed, even on error.
    # @param self
    # @param tree (etree ElementTree) the tree to serialize
    # @param dest_filename (string) full path of the file to (over)write
    def _write_tree(self, tree, dest_filename):
        with open(dest_filename, 'wb') as dest_file:
            #the tree is serialized with ElementTree's default encoding, while the header declares ISO-8859-1 (kept as in the original output)
            dest_file.write(b'<?xml version="1.0" encoding="ISO-8859-1"?>\n')
            dest_file.write(b'<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
            tree.write(dest_file, xml_declaration=False)

    ## Splits the TRS file. One file per chunk is written to the destination directory, named "<base>-[<start>-<end>]-<index>.trs".
    # @param self
    # @param win_len (float) The size of the chunks we want to split this file into (specified in seconds)
    # @param progress_update_fcn (function=None) function that updates the progress bar, accepting a single parameter, a real number in [0.0, 1.0]
    # @throws RuntimeError if the TRS file could not be parsed by the constructor
    def split(self, win_len, progress_update_fcn=None):
        if self.tree is None:
            raise RuntimeError('Cannot split "%s": the TRS file could not be parsed.' % (self.filename))

        void_speaker_num = self._get_next_speaker_num()
        sections = list(self.tree.iter('Section'))
        file_num = 0
        i = 0

        while i < len(sections):
            new_episode, i, start_time, end_time = self._build_episode(sections, i, win_len, void_speaker_num, progress_update_fcn)

            #re-parse the source so that every output file starts from an unmodified copy of the header
            temp_tree = ElementTree.parse(self.filename)
            self._insert_void_speaker(temp_tree, void_speaker_num)

            trans = temp_tree.getroot()
            trans.remove(trans.find('Episode'))
            trans.append(new_episode)

            dest_name = '%s-[%s-%s]-%d.trs' % (self.filename_base, self._get_time_str(start_time), self._get_time_str(end_time), file_num)
            #os.path.join picks the separator of the current platform
            self._write_tree(temp_tree, os.path.join(self.dest_path, dest_name))

            file_num += 1

    ## Constructs a section element (with a turn subelement) for the void speaker, for the specified time period.
    # @param self
    # @param start_time (float) section start time, in seconds (specified as offset from beginning of file)
    # @param end_time (float) section end time, in seconds (specified as offset from beginning of file)
    # @param speaker_num (int) next available speaker integer - see _get_next_speaker_num()
    # @returns (Element) etree "section" Element for the void speaker
    def _get_void_section(self, start_time, end_time, speaker_num):
        void_section = ElementTree.Element('Section', attrib={'startTime': str(start_time), 'endTime': str(end_time), 'type': 'report'})
        ElementTree.SubElement(void_section, 'Turn', attrib={'startTime': str(start_time), 'endTime': str(end_time), 'speaker': 'spk%d' % (speaker_num)})

        return void_section

    ## Each TRS file contains a single <episode> tag that encloses all <turn> tags. This method constructs a new <episode> tag containing as many sections as will fit into the time period specified by win_len. This can be used to write a new TRS file. If a single section is bigger than win_len, the section is appended separately as a single file.
    # The episode is padded with void-speaker sections from time 0 to the start of the first section, and from the end of the last section to the end of the source file.
    # @param self
    # @param sections (list) all "Section" Elements of the source file, in document order
    # @param start_offset (int) section index at which to start building the episode
    # @param win_len (float) The size of the chunks we want to split this file into (specified in seconds)
    # @param void_speaker_num (int) next available speaker integer - see _get_next_speaker_num()
    # @param progress_update_fcn (function=None) function that updates the progress bar, accepting a single parameter, a real number in [0.0, 1.0]
    # @returns (Element, int, float, float) New Episode XML element, index of the first section that was NOT put into it (i.e. where the next episode starts), start time of the first section in the episode, end time of the last section in the episode
    def _build_episode(self, sections, start_offset, win_len, void_speaker_num, progress_update_fcn): #win_len is the length of time (in sec) for each window
        new_episode = ElementTree.Element('Episode')
        num_sections = len(sections)

        i = start_offset
        start_seg_time = float(sections[i].attrib['startTime'])
        end_seg_time = float(sections[i].attrib['endTime'])
        limit_end_time = start_seg_time + win_len

        #add 'void' speaker between 0 and start_time of current section. This ensures the wav file is aligned when the split file is opened in transcriber.
        if start_seg_time > 0:
            new_episode.append(self._get_void_section(0, start_seg_time, void_speaker_num))

        #take every section that ends before the window limit
        while i < num_sections and float(sections[i].attrib['endTime']) < limit_end_time:
            end_seg_time = float(sections[i].attrib['endTime'])
            new_episode.append(sections[i])
            i += 1

        #nothing fit: the first section alone is longer than the window, so it gets an episode of its own (this also guarantees progress)
        if i == start_offset and i < num_sections:
            end_seg_time = float(sections[i].attrib['endTime'])
            new_episode.append(sections[i])
            i += 1

        if progress_update_fcn:
            progress_update_fcn(float(i) / float(num_sections))

        #pad from the end of this episode to the end of the source file
        src_file_end_time = float(sections[-1].attrib['endTime'])
        if end_seg_time < src_file_end_time:
            new_episode.append(self._get_void_section(end_seg_time, src_file_end_time, void_speaker_num))

        return new_episode, i, start_seg_time, end_seg_time
153 
## Merges split files back into a single TRS file.
class TRSMerger(object):
    # Matches the names given to split files by the splitter: "<base>-[hh_mm_ss.ss-hh_mm_ss.ss]-<index>.trs". Group 1 is <base>.
    _SPLIT_NAME_RE = re.compile(r'^(.+?)-\[\d+_\d+_\d+\.\d+-\d+_\d+_\d+\.\d+\]-\d+\.[Tt][Rr][Ss]$')
    # Captures the numeric index that precedes the '.trs' extension.
    _SPLIT_INDEX_RE = re.compile(r'.*?-(\d+)\.[Tt][Rr][Ss]$')

    ## Constructor
    # Collects the split files found in src_dir (other files are ignored) and orders them by their numeric index.
    # @param self
    # @param src_dir (string) full path to the directory containing the split files
    # @throws ValueError if src_dir contains no split files
    def __init__(self, src_dir):
        self.src_dir = src_dir

        #sort numerically on the index (a plain string sort would put '-10' before '-2')
        self.split_files = sorted(
            (name for name in os.listdir(self.src_dir) if self._SPLIT_NAME_RE.match(name)),
            key=self._get_split_index
        )
        if not self.split_files:
            raise ValueError('No split TRS files found in "%s"' % (self.src_dir))

        self.filename_base = self._SPLIT_NAME_RE.match(self.split_files[0]).group(1)

    ## Extracts the index appended to the end of a split filename by the splitter app.
    # @param self
    # @param name (string) a filename ending in "-<index>.trs"
    # @returns (int) the index
    def _get_split_index(self, name):
        return int(self._SPLIT_INDEX_RE.match(name).group(1))

    ## Finds the id of the VOID speaker that was inserted when the original TRS file was split (see TRSSplitter class)
    # @param self
    # @param tree (etree ElementTree) The XML tree object to search.
    # @returns (string) the id of the first speaker named 'VOID' in the "Speakers" tag (which lists all speakers in the file), or None if there is no such speaker
    def _get_void_speaker_id(self, tree):
        for speaker in tree.getroot().find('Speakers').findall('Speaker'):
            if speaker.get('name') == 'VOID':
                return speaker.get('id')

        return None

    ## Removes the void speaker (the first speaker named 'VOID', if any) from the given XML tree.
    # @param self
    # @param tree (etree ElementTree) the XML tree to search
    def _remove_void_speaker(self, tree):
        speakers_el = tree.getroot().find('Speakers')
        for speaker in speakers_el.findall('Speaker'):
            if speaker.get('name') == 'VOID':
                speakers_el.remove(speaker)
                break

    ## Performs the actual merger, writing the results to a destination file with the name "<old name>-merged.trs" inside the source directory.
    # @param self
    # @param progress_update_fcn (function=None) function that updates the progress bar, accepting a single parameter, a real number in [0.0, 1.0]
    def merge(self, progress_update_fcn=None):
        new_episode = ElementTree.Element('Episode')
        num_split_files = len(self.split_files)
        void_speaker_id = None

        for i, split_name in enumerate(self.split_files):
            tree = ElementTree.parse(os.path.join(self.src_dir, split_name))

            #void speaker id is the same for all split files, so only retrieve it once
            if void_speaker_id is None:
                void_speaker_id = self._get_void_speaker_id(tree)

            for child in tree.getroot().find('Episode'):
                turn = child.find('Turn') #grab the first turn element - if the void speaker is present, it will always be in the first turn
                #a Turn may have no speaker attribute at all, so use get(); never treat such a turn as void padding
                is_void_section = (
                    child.tag == 'Section'
                    and turn is not None
                    and void_speaker_id is not None
                    and turn.get('speaker') == void_speaker_id
                )
                if not is_void_section:
                    new_episode.append(child)

            if progress_update_fcn:
                progress_update_fcn(float(i + 1) / float(num_split_files))

        #use the first split file as the template for everything outside of the episode
        merged_tree = ElementTree.parse(os.path.join(self.src_dir, self.split_files[0]))
        self._remove_void_speaker(merged_tree)

        trans = merged_tree.getroot()
        trans.remove(trans.find('Episode'))
        trans.append(new_episode)

        #the xml declaration and DOCTYPE are written by hand; a single binary handle is used so that it is always closed
        dest_filename = os.path.join(self.src_dir, '%s-merged.trs' % (self.filename_base))
        with open(dest_filename, 'wb') as dest_file:
            #the tree is serialized with ElementTree's default encoding, while the header declares ISO-8859-1 (kept as in the original output)
            dest_file.write(b'<?xml version="1.0" encoding="ISO-8859-1"?>\n')
            dest_file.write(b'<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
            merged_tree.write(dest_file, xml_declaration=False)

    ## This comparison function orders split files by the index appended to the end of the filename by the splitter app. It is kept for callers that need a cmp-style function; the constructor sorts with _get_split_index() as a key instead.
    # @param self
    # @param x (string) filename number one
    # @param y (string) filename number two
    # @returns (int) 0 if the indices are equal, -1 if filename x should come before filename y, 1 if filename x should come after filename y
    def _cmp_filenames(self, x, y):
        x_num = self._get_split_index(x)
        y_num = self._get_split_index(y)

        return (x_num > y_num) - (x_num < y_num)
262