Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
output_calcs.py
Go to the documentation of this file.
1 ## @package data_structs.output_calcs
2 
3 import logging
4 from db.bll_database import DBConstants
5 from data_structs.base_objects import BLLObject
6 from parsers.filter_manager import FilterManager
7 from utils.backend_utils import BackendUtils
8 from collections import OrderedDict
9 
10 import re
11 
12 ## An OutputCalc encapsulates all of the calculations for a particular output.
13 # There is a one-to-one relationship between Outputs and OutputCalcs.
14 # This is an abstract base class - all subclasses should implement the methods defined here.
15 # Idea: the add_seg() and add_chain() methods take in segs to be included in the calculation. Finally, write_segs() is called, which writes the results to a spreadsheet file.
16 # Since output calcs can deal with either segments or chains, it is convenient to refer to them collectively as 'items' in this documentation.
## Constructor
# This is an abstract base class - subclasses implement the methods declared below.
# The base class itself holds no state.
# @param self
def __init__(self):
    pass
22 
## Retrieves an ordered list of args needed to instantiate this object; these will be inserted into the DB.
# (this list is passed directly to the constructor when the object is reinstantiated from the DB)
# @param self
# @returns (list) list of args to pass to the constructor when the object is recreated from the DB at a later date
def get_db_args(self):
    pass
28 
## Retrieves a short description string that indicates what type of calculation this object performs
# (eg. 'Count', 'Time Period', etc.)
# @param self
# @returns (string) short type description string to display in the UI
def get_calc_type_str(self):
    pass
34 
## Adds an unlinked segment to be considered in the calculations that this object is performing.
# @param self
# @param seg (Segment) a segment to consider in this object's calculations
def add_seg(self, seg):
    pass
40 
## Adds a linked segment (i.e. an Utterance chain) to be considered in the calculations that this object is performing.
# @param self
# @param head (Utterance) the head of a chain of potentially linked Utterances; the whole chain is considered
def add_chain(self, head):
    pass
46 
## Writes a short description of the calculation, along with the calculation results, to a csv file.
# @param self
# @param chained (boolean) True when the calc holds linked segments (chains), False for unlinked segments.
#        Added here for consistency: every subclass in this module implements
#        write_csv_rows(self, chained, csv_writer), so the abstract signature now matches.
# @param csv_writer (CSVWriter) a Python csv library writer object, configured to write to the appropriate csv file
def write_csv_rows(self, chained, csv_writer):
    pass
52 
## Clears any cached data from a previous run (anything added by add_seg() or add_chain()),
# in preparation for a new run.
# @param self
def reset(self):
    pass
57 
## This OutputCalc searches for a regex match in a segment/chain phrase and counts the number of matches.
# The counting can be done on a per-item basis, as an average across items, or as a sum across items.
class CountOutputCalc(OutputCalc):
    #enum containing combo_options from the combo group COUNT_OUTPUT_CALC_TYPE -
    #populated at module load time by the constants-initialization code at the bottom of this file
    COUNT_TYPES = None

    ## Constructor
    # @param self
    # @param search_term (string) a Python regular expression to search for in the phrase
    # @param count_type (int) one of the values from the CountOutputCalc.COUNT_TYPES enum -
    #        specifies how to count (per item, avg across items, or sum across items)
    # @param max_count (int=-1) a maximum threshold for the matches (-1 means no threshold)
    #        that an individual item can have
    def __init__(self, search_term, count_type, max_count=-1):
        self.logger = logging.getLogger(__name__)
        self.search_term = search_term
        self.count_type = count_type
        self.max_count = max_count
        self._init_data_structs()

    ## Initializes internal data structures used to record items added to this OutputCalc.
    # Both dicts map item -> match count; insertion order is preserved for output.
    # @param self
    def _init_data_structs(self):
        self.utter_dict = OrderedDict()
        self.chain_dict = OrderedDict()

    ## See superclass description.
    def reset(self):
        #clear any saved items from this instance
        self._init_data_structs()

    ## See superclass description.
    def get_db_args(self):
        return [self.search_term, self.count_type, self.max_count]

    ## See superclass description.
    def get_calc_type_str(self):
        return 'Count'

    ## Clamps a raw match count to the configured maximum.
    # The clamp condition previously appeared twice (add_seg/add_chain) with the operands
    # in different orders; it is factored out here so the behavior is obviously identical.
    # @param self
    # @param count (int) a raw match count
    # @returns (int) the count, limited to self.max_count unless max_count is -1
    def _clamp_count(self, count):
        if self.max_count != -1 and count > self.max_count:
            return self.max_count
        return count

    ## See superclass description.
    def add_seg(self, seg):
        for utter in (seg.utters or []):
            #search the phrase for regex matches (untranscribed utterances are skipped)
            if utter.trans_phrase:
                count = self._clamp_count(len(re.findall(self.search_term, utter.trans_phrase)))

                #track the counts for each utterance in a dictionary;
                #if the same utterance is added twice, its counts accumulate
                if utter in self.utter_dict:
                    self.utter_dict[utter] += count
                else:
                    self.utter_dict[utter] = count

    ## See superclass description.
    def add_chain(self, head):
        if head.trans_phrase: #filter out untranscribed utterances
            #build the full chain phrase, separating utterances with a space (for the search below)
            full_phrase = ''
            cur = head
            while cur:
                if cur.trans_phrase:
                    full_phrase += cur.trans_phrase
                    if cur.next:
                        full_phrase += ' '
                cur = cur.next

            count = self._clamp_count(len(re.findall(self.search_term, full_phrase)))

            #track chain counts in a dictionary, keyed by their head utterance;
            #if the same chain is added twice, its counts accumulate
            if head in self.chain_dict:
                self.chain_dict[head] += count
            else:
                self.chain_dict[head] = count

    ## Writes out the count results for a 'per item' count.
    # One row per item with a positive count, followed by a total row at the bottom.
    # @param self
    # @param chained (boolean) True if we are considering linked segments, False if unlinked.
    # @param csv_writer (CSVWriter) Python csv library writer object, set to write to the appropriate csv file.
    def _write_per_seg(self, chained, csv_writer):
        #write table headers
        csv_writer.writerow(['Start Time', 'End Time', 'Phrase', 'Count'])

        utter_list = self.chain_dict if chained else self.utter_dict
        total = 0
        #go through items, writing out each one with its count
        for utter in utter_list:
            if chained:
                #linked segments: the phrase and end time come from the whole chain
                phrase, tail = FilterManager.get_chain_phrase(utter)
                phrase = phrase.replace('\n', '').replace('\r', '')
                start = BackendUtils.get_time_str(utter.start)
                end = BackendUtils.get_time_str(tail.end)
                count = self.chain_dict[utter]
            else:
                #unlinked segments
                phrase = utter.trans_phrase
                start = BackendUtils.get_time_str(utter.start)
                end = BackendUtils.get_time_str(utter.end)
                count = self.utter_dict[utter]

            #only items that actually contained a match are listed
            if count > 0:
                total += count
                csv_writer.writerow([start, end, phrase, count])

        #append the total count in a row at the bottom
        csv_writer.writerow([''])
        csv_writer.writerow(['Total:', '', '', total])

    ## Writes out the count results for an 'average across items' count.
    # This consists of a single row with the average: (sum of counts) / (number of items added).
    # @param self
    # @param chained (boolean) True if we are considering linked segments, False if unlinked.
    # @param csv_writer (CSVWriter) Python csv library writer object, set to write to the appropriate csv file.
    def _write_avg_across_segs(self, chained, csv_writer):
        counts = list(self.chain_dict.values() if chained else self.utter_dict.values())
        if counts:
            #sum() replaces the Python-2-only builtin reduce()
            avg = float(sum(counts)) / float(len(counts))
            csv_writer.writerow(['Avg:', '%0.3f' % (avg)])
        else:
            csv_writer.writerow(['No matches found in TRS file.'])

    ## Writes out the count results for a 'sum across items' count.
    # This consists of a single row with the total sum.
    # @param self
    # @param chained (boolean) True if we are considering linked segments, False if unlinked.
    # @param csv_writer (CSVWriter) Python csv library writer object, set to write to the appropriate csv file.
    def _write_sum_across_segs(self, chained, csv_writer):
        counts = self.chain_dict.values() if chained else self.utter_dict.values()
        #sum() replaces the Python-2-only builtin reduce()
        csv_writer.writerow(['Sum:', sum(counts)])

    ## See superclass description.
    def write_csv_rows(self, chained, csv_writer):
        combo_option = DBConstants.COMBOS[DBConstants.COMBO_GROUPS.COUNT_OUTPUT_CALC_TYPES][self.count_type]
        csv_writer.writerow(['Count:', combo_option.disp_desc])
        csv_writer.writerow(['Search Term:', self.search_term])

        #call the appropriate method based on the type setting
        #(an unrecognized count_type raises KeyError, as before)
        {CountOutputCalc.COUNT_TYPES.PER_SEG: self._write_per_seg,
         CountOutputCalc.COUNT_TYPES.AVG_ACROSS_SEGS: self._write_avg_across_segs,
         CountOutputCalc.COUNT_TYPES.SUM_ACROSS_SEGS: self._write_sum_across_segs,
         }[self.count_type](chained, csv_writer)
222 
223 ## This OutputCalc searches for a regex match in a segment/chain phrase and counts the number of matches, then divides the count by the length of the segment in seconds.
224 # The output is a measure of count/sec. This can be computed for each item individually ('per item'), or as an average across items.
#enum containing options from combo group RATE_OUTPUT_CALC_TYPE - populated at
#module load time by the constants-initialization code at the bottom of this file
RATE_TYPES = None
228 
## Constructor
# @param self
# @param search_term (string) a Python regular expression to search for in the item phrases
# @param rate_type (int) one of the options from the enum RateOutputCalc.RATE_TYPES -
#        indicates how the rate is to be calculated (per item, or average across items)
def __init__(self, search_term, rate_type):
    self.logger = logging.getLogger(__name__)
    self.search_term = search_term
    self.rate_type = rate_type
    self._init_data_structs()
238 
239  ## Initializes internal data structures used to record items added to this OutputCalc.
240  # @param self
242  self.chain_dict = OrderedDict()
243  self.utter_dict = OrderedDict()
244 
## See superclass description.
# Drops all recorded items.
def reset(self):
    self._init_data_structs()
248 
## See superclass description.
# @returns (list) [search_term, rate_type] - matches the constructor's parameter order
def get_db_args(self):
    return [self.search_term, self.rate_type]
252 
## See superclass description.
def get_calc_type_str(self):
    return 'Rate'
256 
## See superclass description.
# Each utter_dict entry is a (match_count, elapsed_seconds) tuple.
# Bug fixed: the original attempted in-place mutation of these tuples when the
# same utterance was added twice (tuple[0] += count), which raises TypeError
# because tuples are immutable; the entry is now rebuilt instead.
def add_seg(self, seg):
    for utter in (seg.utters or []):
        #search the phrase for regex matches (untranscribed utterances are skipped)
        if utter.trans_phrase:
            count = len(re.findall(self.search_term, utter.trans_phrase))

            #skip utters with no end/start time recorded
            if utter.end is not None and utter.start is not None:
                time = utter.end - utter.start

                #record the result, keyed by utterance; accumulate if the same
                #utterance is added twice
                if utter in self.utter_dict:
                    prev_count, prev_time = self.utter_dict[utter]
                    self.utter_dict[utter] = (prev_count + count, prev_time + time)
                else:
                    self.utter_dict[utter] = (count, time)
276 
## See superclass description.
# Each chain_dict entry is a (match_count, elapsed_seconds) tuple, keyed by the head utterance.
# Bugs fixed: (1) the duplicate-insertion check tested membership in utter_dict
# but stored into chain_dict, so duplicates were silently overwritten instead of
# accumulated; (2) the accumulation branch mutated a tuple in place
# (tuple[0] += count), which raises TypeError - the entry is now rebuilt.
def add_chain(self, head):
    if head.trans_phrase: #filter out untranscribed utterances
        count = 0
        cur = head
        tail = head
        #count the number of matches in the whole chain, tracking the tail node
        while cur:
            if cur.trans_phrase:
                count += len(re.findall(self.search_term, cur.trans_phrase))
            tail = cur
            cur = cur.next

        #skip chains with no end/start time recorded
        if tail.end is not None and head.start is not None:
            time = tail.end - head.start

            #record the result, keyed by the head utterance; accumulate if the
            #same chain is added twice
            if head in self.chain_dict:
                prev_count, prev_time = self.chain_dict[head]
                self.chain_dict[head] = (prev_count + count, prev_time + time)
            else:
                self.chain_dict[head] = (count, time)
300 
## Writes the results for a 'per item' rate calculation to a spreadsheet file.
# This consists of a single row for each item, with its corresponding rate (matches/sec).
# @param self
# @param chained (boolean) True if we are considering linked segments, False if unlinked.
# @param csv_writer (CSVWriter) Python csv library writer object, set to write to the appropriate csv file.
# NOTE(review): raises ZeroDivisionError if an item's elapsed time is 0 - confirm
# that upstream guarantees a positive duration for every recorded item.
def _write_per_seg(self, chained, csv_writer):
    #write headers row (the 'Occurances'/'occurrances' spellings are kept as-is:
    #they are part of the emitted csv output)
    csv_writer.writerow(['Start Time', 'End Time', 'Phrase', 'Occurances', 'Time Elapsed(sec)', 'Rate (occurrances/sec)'])

    #go through all items, performing the rate calculation and writing it to the spreadsheet file
    utter_list = self.chain_dict if chained else self.utter_dict
    for utter in utter_list:
        start = None
        end = None
        phrase = None
        rate = None
        count = None
        time = None

        if chained:
            #linked segments: phrase and end time come from the whole chain
            phrase, tail = FilterManager.get_chain_phrase(utter)
            start = utter.start
            end = tail.end
            count, time = self.chain_dict[utter]
            rate = float(count) / float(time)

        else:
            #unlinked segments
            start = utter.start
            end = utter.end
            phrase = utter.trans_phrase
            count, time = self.utter_dict[utter]
            rate = float(count) / float(time)

        #newlines are stripped so the phrase stays on a single csv row
        csv_writer.writerow([BackendUtils.get_time_str(start),
                             BackendUtils.get_time_str(end),
                             phrase.replace('\n', '').replace('\r', ''),
                             count,
                             time,
                             rate,
                             ])
342 
343  ## Writes the results for a 'average across items' rate calculation to a spreadsheet file.
344  # This consists of a single row containing the average.
345  # The average is calculated as (sum of number of matches across all items) / (sum of lengths of <em>all</em> items, in seconds)
346  def _write_avg_across_segs(self, chained, csv_writer):
347  pairs = self.chain_dict.values() if chained else self.utter_dict.values()
348  total_time = 0.0
349  total_count = 0
350 
351  for cur_pair in pairs:
352  count, time = cur_pair
353  total_count += count
354  total_time += time
355 
356  avg = float(total_count) / total_time
357 
358  csv_writer.writerow(['Avg:', avg])
359 
## See superclass description.
# Emits the rate-type and search-term header rows, then delegates to the writer
# routine matching the configured rate type.
def write_csv_rows(self, chained, csv_writer):
    combo_option = DBConstants.COMBOS[DBConstants.COMBO_GROUPS.RATE_OUTPUT_CALC_TYPES][self.rate_type]
    csv_writer.writerow(['Rate:', combo_option.disp_desc])
    csv_writer.writerow(['Search Term:', self.search_term])

    #look up the handler for this rate type (an unknown type raises KeyError, as before)
    handlers = {
        RateOutputCalc.RATE_TYPES.PER_SEG: self._write_per_seg,
        RateOutputCalc.RATE_TYPES.AVG_ACROSS_SEGS: self._write_avg_across_segs,
    }
    handler = handlers[self.rate_type]
    handler(chained, csv_writer)
370 
371 ## This OutputCalc searches for a regex match in the items' phrases, and calculates the total length of the items (in seconds) that contain a match.
## Constructor
# @param self
# @param search_term (string) a Python regular expression to search for in the item phrases
def __init__(self, search_term):
    self.logger = logging.getLogger(__name__)
    self.search_term = search_term
    self._init_data_structs()
380 
381  ## Initializes the data structures used to record the items added to this OutputCalc.
383  self.utters_dict = {}
384  self.chains_dict = {}
385 
388 
## See superclass description.
# Drops all recorded items and zeroes the time accumulators.
def reset(self):
    self._init_data_structs()
392 
## See superclass description.
# @returns (list) [search_term] - matches the constructor's parameter order
def get_db_args(self):
    return [self.search_term]
396 
## See superclass description.
def get_calc_type_str(self):
    return 'Time Period'
400 
401  ## Searches through the internal dictionary for a key that intersects with the specified start and end times.
402  # The dictionary is keyed by tuples of the form (start_time, end_time).
403  # This routine checks to see if the specified start and end range intersects with any key tuple already in the dictionary.
404  # @param self
405  # @param start (float) an Utterance start time
406  # @param end (float) an Utterance end time
407  # @returns (tuple) if an intersecting tuple is found, that tuple is returned. Else None is returned.
408  def _get_chain_intersection(self, start, end):
409  found_key = None
410  keys = self.chains_dict.keys()
411 
412  i = 0
413  while not found_key and i < len(keys):
414  if ( (start >= keys[i][0] and start <= keys[i][1]) or
415  (end <= keys[i][1] and end >= keys[i][0]) ):
416  found_key = keys[i]
417  i += 1
418 
419  return found_key
420 
421  ## Searched through the phrases of Utterances in a chain, for a match against the regex.
422  # @param self
423  # @param head (Utterance) head of the chain to search
424  # @returns (boolean) True if a match was found anywhere in the chain phrase, False otherwise.
425  def _search_chain_phrase(self, head):
426  cur = head
427  found = False
428 
429  while not found and cur:
430  if cur.trans_phrase:
431  found = re.search(self.search_term, cur.trans_phrase) != None
432  cur = cur.next
433 
434  return found
435 
## See superclass description.
# We may have segments that start and end at exactly the same time.
# We are guaranteed never to have segments that intersect in other ways.
def add_seg(self, seg):
    for utter in seg.utters:
        #need a start and end time to compute a length, and a transcription to search
        if utter.end is None or utter.start is None or not utter.trans_phrase:
            continue

        #store the (start, end) key if an equivalent one isn't already present -
        #the value is irrelevant, the dict is only used for fast key lookup
        key = (utter.start, utter.end)
        if key not in self.utters_dict:
            self.utters_dict[key] = True

        #accumulate the length of every utterance whose phrase contains a match
        if re.search(self.search_term, utter.trans_phrase):
            self.utters_filtered_time += (utter.end - utter.start)
450 
## See superclass description.
# Here things are more complicated than in add_seg() because chains can intersect in time (in any way).
# To keep chains_filtered_time free of double-counted spans, overlapping (start, end)
# keys are merged into a single widest interval before being stored.
def add_chain(self, head):
    #only proceed if the first Utterance is transcribed
    if head.trans_phrase:
        tail = FilterManager.get_endpoint(FilterManager.ENDPOINT_TYPES.TAIL, head)
        #make sure both start and end times on the ends of the chain are present
        if tail.end != None and head.start != None:
            #if we find a match, check for intersection in the internal dictionary
            if self._search_chain_phrase(head):
                key_tuple = self._get_chain_intersection(head.start, tail.end)
                new_key_tuple = None

                #if an intersection occurred we need to figure out how to adjust things...
                if key_tuple:
                    #remove the intersecting key and subtract its length from the sum
                    #(this length was previously added)
                    self.chains_dict.pop(key_tuple)
                    self.chains_filtered_time -= (key_tuple[1] - key_tuple[0])

                    #Obtain the widest (start, end) range by comparing the intersecting key
                    #with the start and end times of the chain we're trying to insert.
                    #This new key will be inserted into the dictionary below.
                    #This ensures that we will always detect future intersections.
                    new_key_tuple = (min(key_tuple[0], head.start), max(key_tuple[1], tail.end))

                #if no intersection occurred, we can just add the (start, end) tuple
                #of the head Utterance to the dictionary
                else:
                    new_key_tuple = (head.start, tail.end)

                #factor the length of the new key into the sum, and add the key to the internal dictionary
                self.chains_filtered_time += (new_key_tuple[1] - new_key_tuple[0])
                self.chains_dict[new_key_tuple] = True
482 
## See superclass description.
# Writes the search term and the total (formatted) time of items whose phrases
# contain a regex match.
# Bug fixed: the emitted label read 'Time Containg Matches:' - corrected to
# 'Time Containing Matches:'.
def write_csv_rows(self, chained, csv_writer):
    csv_writer.writerow(['Time Period'])

    #pick the accumulator that matches the item type we were fed
    total_time = self.chains_filtered_time if chained else self.utters_filtered_time

    csv_writer.writerow(['Search Term:', self.search_term])
    csv_writer.writerow(['Time Containing Matches:', BackendUtils.get_time_str(total_time)])
491 
492 ## This type of OutputCalc constructs a table of counts.
493 # The user can select two transcriber codes. The first is the row criteria. All elements of the code are enumerated along the left (vertical axis) side of the table.
494 # The second is the column criteria. All elements of this code are enumerated along the top (horizontal axis) of the table.
495 # Each internal table cell will contain a count of the number of items found that have both the horizontal and vertical codes.
#enum of options from BREAKDOWN_OUTPUT_CALC_CRITERIA - populated at module load
#time by the constants-initialization code at the bottom of this file
BREAKDOWN_CRITERIA = None
499 
## Constructor
# @param self
# @param row_criteria (int) one of the values from the enum BreakdownOutputCalc.BREAKDOWN_CRITERIA - the row code
# @param col_criteria (int) one of the values from the enum BreakdownOutputCalc.BREAKDOWN_CRITERIA - the column code
def __init__(self, row_criteria, col_criteria):
    #NOTE(review): this class logs to 'stats_app' while the sibling classes use
    #logging.getLogger(__name__) - confirm whether the different routing is intentional
    self.logger = logging.getLogger('stats_app')
    self.row_criteria = row_criteria
    self.col_criteria = col_criteria
    self._init_data_structs()
509 
510  ## Initializes data structs used to keep track of items added to this OutputCalc.
511  # @param self
513  self.seg_list = []
514  self.chain_list = []
515 
## See superclass description.
# Drops all recorded items.
def reset(self):
    self._init_data_structs()
519 
## See superclass description.
# @returns (list) [row_criteria, col_criteria] - matches the constructor's parameter order
def get_db_args(self):
    return [self.row_criteria, self.col_criteria]
523 
## See superclass description.
def get_calc_type_str(self):
    return 'Breakdown'
527 
## See superclass description.
# Segments are simply accumulated; the counting happens in write_csv_rows().
def add_seg(self, seg):
    self.seg_list.append(seg)
531 
## See superclass description.
# Chains are simply accumulated (keyed by head); the counting happens in write_csv_rows().
def add_chain(self, head):
    self.chain_list.append(head)
535 
## Given a row or column criteria selected by the user, provides the corresponding transcriber code index.
# @param self
# @param criteria (int) an option from BreakdownOutputCalc.BREAKDOWN_CRITERIA
# @returns (int) the zero-based index of the transcriber code corresponding to the specified criteria
def _get_trans_code_index(self, criteria):
    crit = BreakdownOutputCalc.BREAKDOWN_CRITERIA
    index_map = {
        crit.SPEAKER_TYPE: 0,
        crit.TARGET_LISTENER: 1,
        crit.COMPLETENESS: 2,
        crit.UTTERANCE_TYPE: 3,
    }
    return index_map[criteria]
547 
548  ## Determines the transcriber code index that the specified criteria corresponds to, then returns the code in that index from the specified Utterance.
549  # @self
550  # @utter (Utterance) the utterance who's code you want to grab
551  # @criteria (int) an option from BreakdownOutputCalc.BREAKDOWN_CRITERIA
552  # @returns (string) a transcriber code, or None if the Utterance has no transcriber codes
553  def _get_utter_criteria_code(self, utter, criteria):
554  code = None
555  trans_code_index = self._get_trans_code_index(criteria)
556  if len(utter.trans_codes) > trans_code_index:
557  code = utter.trans_codes[trans_code_index]
558 
559  return code
560 
## Tallies one utterance's row/column codes into the count table.
# Multi-character codes increment one cell per (row char, col char) combination -
# the item is 'counted once for each character'.
# Unrecognized code characters are logged (and echoed to stdout) and skipped.
# This logic previously appeared twice, duplicated in the chained and unchained
# branches of write_csv_rows(); it is factored out here.
# @param self
# @param count_hash (OrderedDict) nested mapping: row code -> (col code -> count)
# @param utter (Utterance) the utterance whose codes should be counted
def _tally_codes(self, count_hash, utter):
    row_code = self._get_utter_criteria_code(utter, self.row_criteria)
    col_code = self._get_utter_criteria_code(utter, self.col_criteria)
    if row_code != None and col_code != None:
        for row_char in row_code: #for multi-char codes, increment count_hash individually for each char
            for col_char in col_code:
                try:
                    count_hash[row_char][col_char] += 1

                except KeyError as err:
                    #note: this logs the specific utterance (the original logged the
                    #enclosing segment when processing unlinked data)
                    self.logger.info('Output Calc encountered unrecognized key: %s' % (err))
                    self.logger.info('row_code: %s, col_code: %s' % (row_code, col_code))
                    self.logger.info('Utterance: %s' % (utter))

                    #also dump to stdout for now to make it obvious
                    #(parenthesized print works under both Python 2 and Python 3)
                    print('Output Calc encountered unrecognized key: %s' % (err))
                    print('row_code: %s, col_code: %s' % (row_code, col_code))

## See superclass description.
# This method writes out the table of counts to a spreadsheet file.
# The user can select two codes. The first is the row criteria: all elements of that code
# are enumerated along the left (vertical) side of the table. The second is the column
# criteria, enumerated along the top (horizontal) of the table. Each internal cell contains
# a count of the items having both the row and column codes.
# In cases where codes can have multiple characters (eg. transcriber code 3), only the
# single-character codes are enumerated in the table headers; a multi-character code
# increments one cell per character.
# @param self
# @param chained (boolean) True if we are considering linked segments, False if unlinked.
# @param csv_writer (CSVWriter) Python csv library writer object, set to write to the appropriate csv file.
def write_csv_rows(self, chained, csv_writer):
    row_combo = DBConstants.COMBOS[DBConstants.COMBO_GROUPS.BREAKDOWN_OUTPUT_CALC_CRITERIA][self.row_criteria]
    col_combo = DBConstants.COMBOS[DBConstants.COMBO_GROUPS.BREAKDOWN_OUTPUT_CALC_CRITERIA][self.col_criteria]

    csv_writer.writerow(['Row Criteria:', row_combo.disp_desc])
    csv_writer.writerow(['Column Criteria:', col_combo.disp_desc])

    #build an array of all possible options for the code corresponding to the row criteria
    row_code_index = self._get_trans_code_index(self.row_criteria)
    row_code_strs = DBConstants.TRANS_CODES[row_code_index].get_all_options_codes()

    #do the same for the column criteria
    col_code_index = self._get_trans_code_index(self.col_criteria)
    col_code_strs = DBConstants.TRANS_CODES[col_code_index].get_all_options_codes()

    csv_writer.writerow([''] + col_code_strs) #headers (note: top left cell is blank)

    #this holds the values of the internal table cells; initialize all cells to 0
    count_hash = OrderedDict()
    for row in row_code_strs:
        count_hash[row] = OrderedDict()
        for col in col_code_strs:
            count_hash[row][col] = 0

    #locate our data source
    data_list = self.chain_list if chained else self.seg_list

    #go through the data, incrementing the appropriate table cells in the count_hash
    for datum in data_list:
        if chained:
            #Each chain contributes one to the count (with the exception of those containing
            #multi-char codes). It is assumed that the head has the same trans codes as the
            #rest (with exception of C code in tail).
            if datum.trans_phrase: #filter out untranscribed utters
                self._tally_codes(count_hash, datum)
        else:
            for utter in (datum.utters or []):
                if utter.trans_phrase: #filter out untranscribed utters
                    self._tally_codes(count_hash, utter)

    #add row headers and cell values to the spreadsheet
    #(a list comprehension replaces 'map', whose Python 3 return value is an
    #iterator and cannot be concatenated to a list)
    for row_key in count_hash:
        csv_writer.writerow([row_key] + [count_hash[row_key][col_key] for col_key in count_hash[row_key]])
648 
649 ## This type of OutputCalc searches for a regex match in the phrase of items. Items containing one or more matches are shown in a list. The list is grouped by a particular (user-selected) transcriber code.
#enum of combo options from the LIST_OUTPUT_CALC_CATS combo group - populated at
#module load time by the constants-initialization code at the bottom of this file
LIST_CATS = None
653 
## Constructor
# @param self
# @param search_term (string) a Python regular expression
# @param cat (int) one of the options from ListOutputCalc.LIST_CATS -
#        indicates which transcriber code to group by
def __init__(self, search_term, cat):
    self.logger = logging.getLogger(__name__)
    self.search_term = search_term
    self.cat = cat
    self._init_data_structs()
663 
664  ## Initializes the data structs used to keep track of the items that have been added to this OutputCalc.
666  self.utter_list = []
667  self.chain_list = []
668 
## See superclass description.
# Drops all recorded items.
def reset(self):
    self._init_data_structs()
672 
## See superclass description.
# @returns (list) [search_term, cat] - matches the constructor's parameter order
def get_db_args(self):
    return [self.search_term, self.cat]
676 
## See superclass description.
def get_calc_type_str(self):
    return 'List'
680 
## See superclass description.
# Filters the segment's utterances with the regex - only transcribed utterances
# containing a match are recorded.
def add_seg(self, seg):
    utters = seg.utters or []
    matching = [u for u in utters
                if u.trans_phrase and re.search(self.search_term, u.trans_phrase)]
    self.utter_list.extend(matching)
689 
## See superclass description.
# A chain is recorded (keyed by its head) iff at least one of its transcribed
# utterances contains a regex match.
def add_chain(self, head):
    node = head
    matched = False
    #walk the chain until a match is found or the chain ends
    while node and not matched:
        if node.trans_phrase:
            matched = re.search(self.search_term, node.trans_phrase)
        node = node.next

    if matched:
        self.chain_list.append(head)
702 
## Retrieves the transcriber code index, given a user-selected category option.
# @param self
# @param cat (int) one of the options from ListOutputCalc.LIST_CATS
# @returns (int) the zero-based index of the corresponding transcriber code
def _get_trans_code_index(self, cat):
    cats = ListOutputCalc.LIST_CATS
    index_map = {
        cats.SPEAKER_TYPE: 0,
        cats.TARGET_LISTENER: 1,
        cats.COMPLETENESS: 2,
        cats.UTTERANCE_TYPE: 3,
    }
    return index_map[cat]
714 
## See superclass description.
# Writes the matching items grouped by the value of the user-selected transcriber code:
# one section per code value, each with its own header row and one row per item.
# With a multi-character code, the same item can appear in several groups.
def write_csv_rows(self, chained, csv_writer):
    combo = DBConstants.COMBOS[DBConstants.COMBO_GROUPS.LIST_OUTPUT_CALC_CATS][self.cat]
    csv_writer.writerow(['List:', combo.disp_desc])
    csv_writer.writerow(['Search Term:', self.search_term])

    #use the selected category option to grab a list of all possible values for the corresponding code
    trans_code_index = self._get_trans_code_index(self.cat)
    group_code = DBConstants.TRANS_CODES[trans_code_index]
    group_code_strs = group_code.get_all_options_codes()

    #This dictionary holds the list items, separating the groups - it's keyed by transcriber code.
    #Each value is another dictionary, keyed by utterance (used as an ordered set).
    #Note that we can add the same utterance to different groups if we're working with a
    #multi-character transcriber code.
    group_dict = OrderedDict()
    data_list = self.chain_list if chained else self.utter_list

    #organize the data into groups based on transcriber code value
    #NOTE(review): indentation was lost in extraction; the group-insert statements are
    #assumed to be nested under the code-length checks - confirm against the original source.
    for utter in data_list:
        code_str = None
        if chained:
            #every utterance in the chain contributes its code, so the chain
            #(keyed by its head) can be a member of several groups
            cur = utter
            while cur:
                if len(cur.trans_codes) > trans_code_index:
                    code_str = cur.trans_codes[trans_code_index]

                    if not code_str in group_dict:
                        group_dict[code_str] = OrderedDict()

                    if not utter in group_dict[code_str]:
                        group_dict[code_str][utter] = True

                cur = cur.next

        else:
            if len(utter.trans_codes) > trans_code_index:
                code_str = utter.trans_codes[trans_code_index]

                if not code_str in group_dict:
                    group_dict[code_str] = OrderedDict()

                if not utter in group_dict[code_str]:
                    group_dict[code_str][utter] = True

    #write out the csv file, one section per group
    for code in group_dict:
        csv_writer.writerow([''])
        csv_writer.writerow(['', 'Code:', code])
        csv_writer.writerow(['', 'Start Time', 'End Time', 'LENA Speakers', 'Phrase', 'Transcriber Codes'])
        for utter in group_dict[code]:
            if chained:
                #chain values are aggregated across the whole chain; newlines are
                #stripped so each value stays on a single csv row
                trans_codes, tail = FilterManager.get_chain_trans_codes(utter)
                trans_codes = trans_codes.replace('\n', '').replace('\r', '')
                speakers, tail = FilterManager.get_chain_lena_speakers(utter)
                speakers = speakers.replace('\n', '').replace('\r', '')
                phrase, tail = FilterManager.get_chain_phrase(utter)
                phrase = phrase.replace('\n', '').replace('\r', '')
                start_str = BackendUtils.get_time_str(utter.start)
                end_str = BackendUtils.get_time_str(tail.end)
                csv_writer.writerow(['', start_str, end_str, speakers, phrase, trans_codes])

            else:
                csv_writer.writerow(['',
                                     BackendUtils.get_time_str(utter.start),
                                     BackendUtils.get_time_str(utter.end),
                                     utter.speaker.speaker_codeinfo.code if utter.speaker and utter.speaker.speaker_codeinfo else '?',
                                     utter.trans_phrase,
                                     '|%s|' % ('|'.join(utter.trans_codes)) if utter.trans_codes else 'None',
                                     ])
784 
## This function fills in the combo_option constants in the above classes.
# These constants exist only for convenience (so you don't have to access them via
# DBConstants using the combo group, which involves a lot of typing).
# NOTE(review): the enclosing function's 'def' line (and its name) was lost in
# extraction - these statements belong to a module-level initialization routine;
# confirm against the original source.
#should really remove the 'EMPTY' option here somehow...
CountOutputCalc.COUNT_TYPES = DBConstants.COMBO_OPTIONS[DBConstants.COMBO_GROUPS.COUNT_OUTPUT_CALC_TYPES]
RateOutputCalc.RATE_TYPES = DBConstants.COMBO_OPTIONS[DBConstants.COMBO_GROUPS.RATE_OUTPUT_CALC_TYPES]
BreakdownOutputCalc.BREAKDOWN_CRITERIA = DBConstants.COMBO_OPTIONS[DBConstants.COMBO_GROUPS.BREAKDOWN_OUTPUT_CALC_CRITERIA]
ListOutputCalc.LIST_CATS = DBConstants.COMBO_OPTIONS[DBConstants.COMBO_GROUPS.LIST_OUTPUT_CALC_CATS]
793