Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
filter_manager.py
Go to the documentation of this file.
1 ## @package parsers.filter_manager
2 
3 from utils.enum import Enum
4 from collections import OrderedDict
5 
## This class impersonates a Segment instance, but maintains an alternate (filtered) utterance list. This filtered utterance <em>list</em> can be set or modified without altering the original Segment's utterance list.
# However, modifying the underlying Utterance instances themselves will affect the original Segment's utterance list (i.e. both lists reference the same Utterance objects).
# An instance of this class will return properties of the underlying Segment object for everything except the 'utters' property. Similarly, setting any attribute other
# than 'utters' will set the attribute in the underlying Segment object.
# Instances of this class can therefore be used in any context where a Segment instance would be used.
class FilteredSeg(object):
    ## Constructor
    # @param self
    # @param orig_seg (Segment) the Segment that's being filtered (this segment will be impersonated)
    # @param filtered_utters (list) a list of utterances representing the contents of orig_seg after it has been filtered.
    def __init__(self, orig_seg, filtered_utters):
        # Note: attributes of this instance are prefixed with an underscore. This indicates that they should not be accessed directly and lessens the confusion between this instance's attributes and the underlying Segment's attributes.
        # Both assignments pass through __setattr__(), which stores these two names directly in the instance dictionary.
        self._filtered_utters = filtered_utters
        self._orig_seg = orig_seg

    ## Retrieves an attribute for this FilteredSeg. You may request any of the attributes of the Segment class.
    # Python calls this method only when normal attribute lookup has failed.
    # This allows us to return the corresponding attribute of the underlying Segment when anything other than 'utters' is requested.
    # When 'utters' is requested, we return this object's list instead of the underlying Segment's list.
    # @param self
    # @param name (string) the name of the attribute being requested.
    # @returns (object/primitive) returns the requested attribute, or throws an AttributeError if it doesn't exist
    def __getattr__(self, name):
        #always go through the instance dictionary here - using 'self.<attr>' could re-enter this method and recurse forever
        state = self.__dict__

        if name in state:
            return state[name]

        try:
            #if they're requesting the 'utters' attribute, give them this FilteredSeg's list instead of the orig_seg's list
            if name == 'utters':
                return state['_filtered_utters']
            orig_seg = state['_orig_seg']
        except KeyError:
            #the instance has not been initialised yet (e.g. copy/pickle probing for special methods on a blank object).
            #Report a normal AttributeError so that hasattr()-style probing keeps working.
            raise AttributeError('FilteredSeg has no attribute "%s"' % (name))

        #this case provides the ability to retrieve the underlying segment's attributes
        if hasattr(orig_seg, name):
            return getattr(orig_seg, name)

        #the attribute doesn't exist in either the underlying segment, or this instance
        raise AttributeError('FilteredSeg has no attribute "%s"' % (name))

    ## Sets an attribute for this FilteredSeg. You may set any of the attributes of the Segment class. However, when the 'utters' attribute is set, the underlying Segment's list is not modified (instead, this instance's '_filtered_utters' attribute is used).
    # This method overrides Python's attribute setting functionality.
    # This allows us to set the corresponding attribute of the underlying Segment when anything other than 'utters' is requested.
    # @param self
    # @param name (string) the name of the attribute to set
    # @param value (object/primitive) the value to set the specified attribute to
    def __setattr__(self, name, value):
        state = self.__dict__

        #intercept requests to set the 'utters' attribute, and instead set this instance's '_filtered_utters' attribute
        if name == 'utters':
            state['_filtered_utters'] = value
            return

        #requests originating inside this instance also hit this method. Writing to the instance dictionary directly prevents infinite recursion.
        if name in ('_orig_seg', '_filtered_utters'):
            state[name] = value
            return

        #forward everything else to the underlying segment, using attribute access (the same way __getattr__() reads it)
        orig_seg = state.get('_orig_seg')
        if orig_seg is not None and hasattr(orig_seg, name):
            setattr(orig_seg, name, value)
            return

        #the attribute doesn't exist in either the underlying segment, or this instance
        raise AttributeError('FilteredSeg has no attribute "%s"' % (name))
66 
67 ## This class provides various ways of looking up segments or utterances (from a list of Segment-like objects passed to the constructor).
68 # It provides lookup methods for both 'Segments' and 'chains'.
69 # A 'chain' is a linked list of Utterances that have been linked using I/C transcriber codes.
70 class FilterManager(object):
71  #Used to indicate the beginning or end of a chain
72  ENDPOINT_TYPES = Enum('HEAD TAIL'.split(), ['prev', 'next'])
73 
74  ## Constructor
75  # @param self
76  # @param segs (list) list of Segment objects
77  def __init__(self, segs):
78  self.segs = segs #list of all segments
79  self.chains = None #list containing the heads of chains that have been parsed from self.segs (see get_chains() for how this is done) - this is build upon request
80  self.seg_index = None #dictionary that allows fast access (via segment number) to individual segments in self.segs - built upon request
81  self.chain_index = None #dictionary that allows fast access (via the head utterance's id) to individual utterances in self.chains - built upon request
82 
83  ## Retreives a list of all of the segments in this FilterManager
84  # @param self
85  # @returns (list) list of Segments
86  def get_segs(self):
87  return self.segs
88 
    ## Retrieves a list of all of the chains in this FilterManager.
    # The list is parsed from self.segs by the static get_chains(segs) method and stored in self.chains for reuse.
    # NOTE(review): a static method that is also named get_chains() is defined later in this class body. The later definition rebinds the
    # name, so this instance method appears to be unreachable through the class (calling get_chains() on an instance reaches the static version instead) - confirm, and rename one of the two.
    # @param self
    # @returns (list) list containing Utterance objects - each element is the head of a chain.
    def get_chains(self):
        #'cache' the chains so we don't have to parse them again if this method is called in the future
        #(an empty list is falsy, so a result with no chains is parsed again on the next call)
        if not self.chains:
            self.chains = FilterManager.get_chains(self.segs)

        return self.chains
98 
99  ## Retreives a segment by segment number (Segment class's 'num' attribute).
100  # Note: this number is unique to all Segments parsed by the TRSParser class <em>for a single file.</em>
101  # @param self
102  # @param num (int) the segment number to lookup
103  # @returns (Segment) the Segment with the requested number, or None if not found.
104  def get_seg_by_num(self, num):
105  #use a dictionary to provide a basic index/cache mechanism to speed up future lookups
106  if not self.seg_index:
107  self.seg_index = {}
108  for seg in self.segs:
109  self.seg_index[seg.num] = seg
110 
111  result = None
112  if num in self.seg_index:
113  result = self.seg_index[num]
114 
115  return result
116 
117  ## Retreives a chain by the utterance id of the head node (head node is the first Utterance object in the chain)
118  # Note: this number is unique to all Utterances parsed by the TRSParser class <em>for a single file.</em>
119  # @param self
120  # @param num (int) the utterance id to lookup.
121  # @returns (Utterance) the head of the chain, or None if no matching chain was found.
122  def get_chain_by_num(self, num):
123  #use a dictionary to provide a basic index/cache mechanism to speed up future lookups
124  if not self.chain_index:
125  self.chain_index = {}
126  for cur_head in self.get_chains():
127  self.chain_index[cur_head.id] = cur_head
128 
129  result = None
130  if num in self.chain_index:
131  result = self.chain_index[num]
132 
133  return result
134 
135  ## Retreives a list of chains using the specified list of segments.
136  # The method iterates over all utterances within the specified segments. On each iteration, it follows the 'prev' pointer back to the start of the utterance chain.
137  # This means that this method returns a list of (the heads of) all chains have a node that is in the utterance list of one of the specified segments.
138  # Steps are taken to ensure that if two utterances lead back to the same head, that head is only included once in the returned list (duplicate heads are discarded).
139  # The list that is returned is sorted ascending by the start time of the head nodes.
140  # @param segs (list) list of Segments
141  # @returns (list) list of the (unique) head nodes (utterance objects) of all chains found. The list is sorted in ascending order by the start time of the head utterances.
142  @staticmethod
143  def get_chains(segs):
144  #Note: we use an OrderedDict here to address cases where we have two identical start times.
145  #If we used a regular dict in these cases, the ordering sometimes swaps. This appears strange to the user because
146  #when clicking the 'Group Linked Segments' checkbox, the ordering changes even when there are no chained utterances
147  #in the list.
148  #The swap that occurs in these cases is not due to the sort (according to the python docs it's guarenteed to be stable) - it's
149  #due to the order that the keys are retreived from the dictionary.
150  #Using an OrderedDict causes the keys to be retreived in the same order they were inserted.
151  heads = OrderedDict()
152  for seg in segs:
153  for utter in seg.utters:
154  cur = utter
155  prev = cur
156  while cur != None:
157  prev = cur
158  cur = cur.prev
159 
160  heads[prev] = True #dictionary weeds out duplicates
161 
162  result = heads.keys()
163  result.sort(key=lambda cur_utter: cur_utter.start)
164 
165  return result
166 
167  ## Constructs a string containing the transcription phrases or all utterances in a given chain.
168  # @param head (Utterance) the Utterance object at the head of the chain
169  # @returns (string) a string containing the transcription phrases of all nodes in the chain, separated by an arrow (' -> ')
170  @staticmethod
171  def get_chain_phrase(head):
172  result = ''
173  cur = head
174  prev = head
175  while cur:
176  result += cur.trans_phrase
177 
178  prev = cur
179  cur = cur.next
180  if cur:
181  result += '\n -> '
182 
183  return result, prev
184 
185  @staticmethod
187  result = ''
188  cur = head
189  prev = head
190  while cur:
191  if cur.speaker and cur.speaker.speaker_codeinfo:
192  result += cur.speaker.speaker_codeinfo.code
193  else:
194  result += '?'
195 
196  prev = cur
197  cur = cur.next
198  if cur:
199  result += '\n -> '
200 
201  return result, prev
202 
203  @staticmethod
205  result = ''
206  cur = head
207  prev = head
208  while cur:
209  result += '|%s|' % ('|'.join(cur.trans_codes)) if cur.trans_codes else 'None'
210 
211  prev = cur
212  cur = cur.next
213  if cur:
214  result += '\n -> '
215 
216  return result, prev
217 
218  ## Finds the start/end node of a chain.
219  # @param endpoint_type (int) a member of the enum FilterManager.ENDPOINT_TYPES, indicating whether we're searching for the start or end of the chain
220  # @param utter (Utterance) any utterance in the chain
221  # @returns (Utterance) the start or end node of the chain, as specified by the 'endpoint_type' parameter
222  @staticmethod
223  def get_endpoint(endpoint_type, utter):
224  cur = utter
225  while getattr(cur, endpoint_type) != None:
226  cur = getattr(cur, endpoint_type)
227 
228  return cur