Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
parser_tools.py
Go to the documentation of this file.
1 ## @package parsers.parser_tools
2 
3 import random
4 
5 ## This class encapsulates some common static methods used by various parsers.
class ParserTools(object):
    # Start times (formatted with '%0.2f') of segments that were already used
    # for specific recordings, keyed by the recording's csv filename.
    # hacked_pick_rand_segs() never picks these segments again.
    _TAKEN_STARTS = {
        'C003_20090708.csv': frozenset([
            #FAN
            '17242.17', '3964.05', '4657.84', '5050.81', '5443.52', '9028.79',
            '16972.12', '3725.36', '3838.99', '9077.36', '5559.10', '4846.05',
            '4624.57', '17524.45', '8713.28', '10557.96', '5081.40', '17250.11',
            '3686.86', '4211.19', '4546.91', '4016.98', '4483.29', '9026.90',
            '16385.44', '4483.92', '4588.89', '4992.00', '8875.75', '4568.56',
            '5033.57', '3719.53', '9365.84',
            #CHN
            '4520.66', '17226.81', '5513.61', '4306.33', '5404.28', '4366.81',
            '3730.20', '16615.44', '3891.41', '7802.87', '8691.22', '10642.64',
            '7812.48', '9884.79', '16555.55', '8827.25', '9700.96', '3979.40',
            '16520.45', '4605.32', '5558.33', '10102.12', '9506.63', '8736.20',
            '17002.30', '17366.97',
        ]),
        'C006_20090827.csv': frozenset([
            #FAN
            '762.46', '12067.15', '2497.60', '2500.56', '4005.93', '7014.08',
            '1605.25', '758.74', '4980.74', '9862.81', '6114.66', '4016.32',
            '1678.45', '9750.89', '492.12', '10341.67', '4036.22', '3141.15',
            '8896.20', '5591.67', '3145.76', '9827.39', '7243.67', '3142.98',
            '11877.85', '372.73', '2711.66', '5754.33', '7824.91', '2297.99',
            '9510.07', '11902.29', '3129.17', '8699.76',
            #CHN
            '5265.42', '1613.54', '3705.76', '6663.56', '5065.69', '7011.00',
            '8115.81', '7865.42', '6878.60', '6315.12', '478.84', '4939.96',
            '2170.33', '6363.28', '10125.46', '1332.36', '10075.21', '10293.21',
            '1624.18', '2687.92', '996.51', '8781.95', '8822.81', '2596.32',
            '2667.29', '10089.00', '10021.41', '4376.08', '6348.46', '10107.56',
            '11251.48', '2499.73', '286.47', '8769.91', '4991.67', '2494.38',
        ]),
        'C023_20100823.csv': frozenset([
            #FAN
            '21740.33', '2845.94', '17096.02', '33020.80', '31704.45', '20926.28',
            '17508.61', '17529.43', '32065.27', '14602.93', '771.12', '17830.76',
            '2328.79', '3512.89', '19930.19', '30807.89', '18001.32', '9981.22',
            '16902.46', '18589.66', '18372.09', '33937.65', '21280.72', '13412.66',
            '21867.95', '17899.45', '20739.55', '6468.05', '18492.06', '32959.28',
            '13252.51', '3779.60', '13298.03', '20507.62', '488.93', '13082.66',
            '22277.63', '16634.54', '18155.61',
            #CHN
            '11766.19', '434.36', '4178.19', '12018.17', '32117.80', '32780.66',
            '3202.78', '21645.76', '22653.66', '32936.19', '31512.64', '1562.17',
            '2538.92', '20384.04', '17322.38', '22500.27', '20650.86', '22929.71',
            '1418.28', '2353.34', '33608.65', '20330.16', '17939.05', '20193.14',
            '14497.25', '5211.13', '22274.30', '33747.42', '32619.74', '21627.21',
            '10561.03', '12331.10', '1160.63', '22190.38',
        ]),
        'C074_20130221.csv': frozenset([
            #FAN
            # The first key was originally written '6705.6', which can never
            # equal a '%0.2f'-formatted start time; it is stored as '6705.60'
            # so that the segment is actually excluded.
            '6705.60', '2984.55', '2855.29', '4023.14', '4734.26', '5638.11',
            '4437.63', '3022.48', '3633.28',
            #CHN
            '1125.92', '5354.19', '5010.58', '23.00', '4511.46', '4645.87',
            '4945.31', '156.59', '5170.13', '1373.89', '4600.50', '4573.14',
            '626.23',
        ]),
        'C085_20130311.csv': frozenset([
            #FAN
            '3982.13', '9272.77', '6761.11', '8251.69', '11734.68', '11199.65',
            '6409.41', '11804.76', '4833.22', '10393.21', '11736.97', '6016.37',
            '11467.62', '9108.90', '12071.67', '970.49', '11234.62', '11415.88',
            '5489.04', '6731.35', '7305.50', '7816.28', '4852.05', '5500.60',
            '5597.07', '5408.10', '6693.79', '6985.12', '6636.05', '7007.05',
            '11579.30', '8186.82', '7605.80', '7057.29', '9360.01', '2942.61',
            '4323.16', '11463.67', '9242.39', '5671.34', '578.54', '12035.77',
            #CHN
            '9824.82', '6833.14', '5266.27', '6015.64', '7262.01', '8063.05',
            '8424.66', '11348.86', '962.98', '8559.17', '7350.70', '8255.04',
            '7362.99', '13102.11', '636.92', '5199.32', '7342.88', '10080.56',
            '5413.87', '4063.53', '8333.67', '9865.08', '11256.24', '4540.65',
            '6381.08', '4899.32', '7507.60', '6465.93', '6942.53', '10046.31',
            '2973.70', '4534.57', '12091.98', '12017.46', '5204.61', '4926.44',
            '4505.40', '5920.17', '9114.44',
        ]),
    }

    ## Checks whether or not a given segment passes all filters in a given list. The filters are run in order, and any changes they make to the segment's utterances are made permanent (seg.utters is overwritten after every filter that passes).
    # Filtering stops at the first filter that rejects the segment; later filters are not run.
    # @param seg (Segment) the segment to put through the filters
    # @param seg_filters (list) list of SegFilter objects
    # @returns (boolean) True if segment passed all filters (or the list is empty), False otherwise
    @staticmethod
    def include_seg(seg, seg_filters):
        for seg_filter in seg_filters:
            filtered_seg = seg_filter.filter_seg(seg)
            # A filter signals rejection by returning None.
            if filtered_seg is None:
                return False

            #make filter exclusions/changes permanent
            seg.utters = filtered_seg.utters

        return True

    ## Picks n random segments from a list (without duplicates).
    # @param n (int) number of segments to pick from the given list
    # @param segs (list) List of Segment objects to pick from
    # @returns (list) List of n picked Segment objects, in random order. The list is empty if n is 0, if n is negative, or if there were fewer than n segments in segs.
    @staticmethod
    def pick_rand_segs(n, segs):
        if n < 0 or n > len(segs):
            return []

        # random.sample() picks n distinct positions and always returns a
        # real list, on both Python 2 and Python 3.
        return random.sample(segs, n)

    ## Picks the first n consecutive elements from a list.
    # @param n (int) number of segments to pick from the given list
    # @param segs (list) List of Segment objects to draw from
    # @returns (list) List of the first n Segment objects. If returned list is empty, there were fewer than n segments in segs (or n was 0).
    @staticmethod
    def pick_contiguous_segs(n, segs):
        return segs[:n] if len(segs) >= n else []

    ## Picks n random segments from a list (without duplicates), skipping segments that are recorded as already taken for the given file.
    # A segment counts as taken when its start time, formatted with '%0.2f', is listed for the filename in _TAKEN_STARTS. Files without an entry are not filtered at all.
    # @param n (int) number of segments to pick from the given list
    # @param segs (list) List of Segment objects to pick from (each must have a numeric 'start' attribute when the file has taken segments)
    # @param filename (string) name of the csv file the segments came from
    # @returns (list) List of n picked Segment objects. The list is empty if fewer than n segments remain after removing the taken ones.
    @staticmethod
    def hacked_pick_rand_segs(n, segs, filename):
        taken_starts = ParserTools._TAKEN_STARTS.get(filename)
        if taken_starts:
            segs = [
                cur_seg for cur_seg in segs
                if '%0.2f' % (cur_seg.start) not in taken_starts
            ]

        return ParserTools.pick_rand_segs(n, segs)