Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
utterance.py
Go to the documentation of this file.
1 ## @package data_structs.utterance
2 
3 import re
4 
5 from data_structs.base_objects import BLLObject
6 from db.bll_database import DBConstants
7 from parsers.errors import ParserError, ParserWarning
8 
9 ## This class represents an Utterance.
10 # For TRS files, an Utterance object corresponds to a single line of text following a "sync" or "who" tag. For CSV files, an Utterance object corresponds to a single row.
11 # Utterances are usually contained within a parent Segment object (corresponding to a "Turn" tag in a TRS file), and contain a pointer back to this parent Segment object.
12 # There may be multiple utterances in a single Segment (see Segment.utters property), but each Utterance may only belong to one Segment.
13 # Utterances may be chained together (via transcriber C/I codes - see transcriber manual). Such a link is represented using the 'next' and 'prev' instance variables,
14 # which point to adjacent Utterances in the chain. These should never be linked circularly (i.e. it is impossible for a transcriber to code a circularly linked utterance).
15 # Unlike Segments, utterances are not stored in the database, and should never be manually constructed. Instead, they are generated by Parser objects (see parsers directory).
16 
18  #This is a unique identifier that is given to each Utterance object. (This provides a simple way to index them within the Parser classes.)
19  next_id = 0
20 
21  ## Constructor. Typically, this object is not constructed manually, so no parameters are provided. Instead, the parser objects create Utterances and set their parameters as they process a CSV/TRS file.
22  # @param self
def __init__(self):
    """Build a blank Utterance.

    Takes no arguments: every field starts out empty and is filled in later
    by the parser objects while they process a CSV/TRS file.
    """
    # Speaker object, taken from the enclosing segment.
    self.speaker = None
    # Transcriber-written text (string) holding the words spoken in this utterance.
    self.trans_phrase = None
    # Absolute start and end times (floats).
    self.start = self.end = None
    # (string) LENA markings like "VOC" - see DBConstants.LENA_NOTES_CODES.
    self.lena_notes = None
    # (list of strings) LENA generated codes.
    self.lena_codes = []
    # (list of strings) transcriber codes (see transcriber manual / DBConstants.TRANS_CODES).
    self.trans_codes = []
    # (Utterance or None) neighbours in the chain: the following and the preceding Utterance, if any.
    self.next = self.prev = None
    # (boolean) True if the transcribers marked this utterance as overlapping
    # using the angle brackets (<, >) - see transcriber manual.
    self.is_trans_overlap = False
    # (Segment) pointer to the enclosing Segment object.
    self.seg = None
    # (boolean) initialised to False; its meaning is not documented in this file.
    self.is_dot_split = False

    # (int) unique id for each object that's created: take the current value
    # of the class-wide counter, then advance the counter for the next object.
    self.id = Utterance.next_id
    Utterance.next_id = self.id + 1
39 
40  ## Gives a string-representation of this utterance. Also see superclass definition.
41  # @param self
42  # @returns (string) a string with formatted info suitable for printing to the console
def __str__(self):
    """Return a multi-line, printable description of this utterance.

    The superclass formats every attribute except 'next', 'prev' and 'seg';
    those three are summarised here by number/id instead of being printed in
    full.

    @returns (string) formatted info suitable for printing to the console
    """
    #print out all attributes, except for these
    output = BLLObject.__str__(self, ['next', 'prev', 'seg'])

    #Just print the segment number (printing out the whole segment would result in
    #infinite recursion, since the segment prints out this utterance).
    #seg is None until a parser attaches the utterance to a Segment, so guard
    #against that the same way next/prev are guarded below.
    output += '-seg: %s\n' % (self.seg.num if self.seg is not None else 'None')

    #print out id of next/prev utterance in the chain, if present
    output += '-next: %s\n' % (str(self.next.id) if self.next is not None else 'None')
    output += '-prev: %s\n' % (str(self.prev.id) if self.prev is not None else 'None')

    return output
54 
55  ## Checks if this Utterance can be linked (chained via the next/prev instance vars) to other Utterances. This is used by the parsers when they go through a file. Utterances that represent silence, for example, should not be linked.
56  # @param self
57  # @returns (boolean) True if it's ok to link this Utterance, False otherwise
def is_linkable(self):
    """Report whether this Utterance may be chained to others via next/prev.

    Used by the parsers while they walk a file; utterances that represent
    silence, for example, should not be linked.

    @returns True if it's ok to link this Utterance, a false value otherwise
    """
    linkable = True

    #Certain speakers (like silence) are not linkable - see the 'is_linkable' column
    #of the db table 'speaker_codes'. Walk the segment's speakers and stop at the
    #first one that is not linkable.
    for speaker in self.seg.speakers:
        linkable = speaker.speaker_codeinfo.is_linkable
        if not linkable:
            break

    #Some lena notes are also used to mark silence. Utterances carrying them are not
    #linkable unless they have been transcribed (i.e. unless lena was wrong to mark
    #them as silence). See DBConstants.LENA_NOTES_CODES for specific details.
    notes_opt = DBConstants.LENA_NOTES_CODES.get_option(self.lena_notes)
    if linkable and notes_opt != None and not self.trans_phrase:
        linkable = notes_opt.is_linkable

    return linkable
75 
76  ## This method checks this Utterance for errors/warnings (e.g. invalid transcriber codes, ambiguous I/C codes, etc.). These are added to the error_collector object.
77  # @param self
78  # @param error_collector (ErrorCollector) An object used to organize and retrieve errors in various ways (see data_structs.error_collector.py), so the UI can present them nicely. This method collects errors and adds them to this object.
def validate(self, error_collector):
    """Check this Utterance for errors and report each one to error_collector.

    Only utterances that have a transcription phrase are examined. For those,
    the following are reported as ParserError objects:
      - characters in the phrase other than letters, quotes, angle brackets,
        whitespace and '^' ('&lt;' / '&gt;' are treated as '<' / '>')
      - a missing start or end time
      - a LENA note that DBConstants.LENA_NOTES_CODES does not recognise
      - a wrong number of transcriber codes, or codes rejected by the matching
        DBConstants.TRANS_CODES entry

    The utterance itself is not modified.

    @param error_collector (ErrorCollector) object whose add() method receives
           every error that is found
    """
    #NOTE(review): the listing this code was recovered from had lost its indentation.
    #All checks are assumed to sit under the outer "has a transcription phrase" guard
    #(which replaced an earlier is_linkable() guard) - confirm against the original file.
    if not self.trans_phrase:
        return

    #note: for now, we don't validate the speaker, since it's not always possible to
    #determine who's speaking with the transcriber multi-line schema

    #Search for invalid characters in the transcription phrase. str.replace() returns a
    #new string, so the decoded text is kept in a local and that is what gets searched
    #(angle brackets may arrive in their alternative '&lt;' / '&gt;' encodings).
    phrase = self.trans_phrase.replace('&lt;', '<').replace('&gt;', '>')

    #remove duplicates, keeping first-occurrence order so the message is deterministic
    bad_chars = []
    for char in re.findall(r'[^A-Za-z\'"<>\s\^]', phrase):
        if char not in bad_chars:
            bad_chars.append(char)

    if bad_chars:
        bad_chars_str = ', '.join('"%s"' % char for char in bad_chars)
        error_collector.add(ParserError('Transcription phrase contains the following invalid characters: %s' % (bad_chars_str), self))

    #this can happen if the TRS file's XML structure gets messed up
    if self.start is None:
        error_collector.add(ParserError('Parser was unable to determine utterance start time.', self))
    if self.end is None:
        error_collector.add(ParserError('Parser was unable to determine utterance end time.', self))

    if self.lena_notes and DBConstants.LENA_NOTES_CODES.get_option(self.lena_notes) is None:
        error_collector.add(ParserError('Unrecognized LENA note.', self))

    #make sure we have the right number of transcriber codes
    expected_count = len(DBConstants.TRANS_CODES)
    if len(self.trans_codes) < expected_count:
        error_collector.add(ParserError('Utterance has less than %d transcriber codes.' % (expected_count), self))
    elif len(self.trans_codes) > expected_count:
        #previously this case fell through to an IndexError in the loop below
        error_collector.add(ParserError('Utterance has more than %d transcriber codes.' % (expected_count), self))

    #Validate each code against the definition at the same position. Surplus codes have
    #no definition to be checked against, so they are skipped (and were reported above).
    code_issues = []
    for i, code in enumerate(self.trans_codes[:expected_count]):
        error_msgs = DBConstants.TRANS_CODES[i].is_valid(code)
        if error_msgs:
            #codes are numbered from 1 in the message shown to the user
            issue_str = ' Code %d:\n' % (i + 1)
            issue_str += ''.join([' -%s\n' % (msg) for msg in error_msgs])
            code_issues.append(issue_str)

    if code_issues:
        err_str = 'Utterance transcriber codes contain the following errors:\n' + ''.join(code_issues)
        error_collector.add(ParserError(err_str, self))