Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
reliability2_parser.py
Go to the documentation of this file.
1 ## @package parsers.reliability2_parser
2 
3 import csv
4 import os
5 import glob
6 import random
7 
8 from data_structs.test2 import Test2
9 from utils.backend_utils import BackendUtils
10 from db.bll_database import BLLDatabase
11 
## This class parses information from an ADEX-exported csv file, for use in the Reliability2 program. It provides tools
# to obtain a list of the possible activity and environment categories present in the file, and to pick a given number of
# blocks that belong to a specific environment-activity group.
# The csv file this class operates upon should contain the usual ADEX columns (but with one 5 minute block per row rather
# than one segment per row - you can set this up using an option in ADEX), in addition to two columns called "Activity"
# and "environment".
# Environment is the type of daycare in which the recording took place (eg. "home", "home daycare", or "daycare centre").
# This generally stays the same for all 5 minute blocks in a csv file (though it need not).
# Activity is something the child is engaged in like "mealtime", "playtime - organized", "naptime", etc., and often differs
# every couple of blocks.
# Instances may be used as context managers ("with Reliability2Parser(path) as parser:"), which closes the csv file on exit.
class Reliability2Parser(object):
    ## Constructor. Opens the csv file and immediately parses it (see parse()).
    # @param self
    # @param filename (string) path to the csv file to read.
    def __init__(self, filename):
        import sys

        #The csv module wants a binary file on Python 2, but a text file opened with newline='' on Python 3.
        if sys.version_info[0] >= 3:
            self.csv_file = open(filename, 'r', newline='')
        else:
            self.csv_file = open(filename, 'rb')
        self.reader = csv.DictReader(self.csv_file)

        #this is a list of all unique environments in the file
        self.envs = []

        #this is a list of all unique activities in the file
        self.acts = []

        #data from the csv file will be pushed into this multi-level dictionary, in the format:
        # self.parse_dict[environment][activity]
        # each element of this 2 level dictionary is another dictionary, with two keys: 'used' and 'unused'
        # Finally, each of the 'used' and 'unused' keys maps to a list of rows from the csv file.
        # Rows in the 'used' list have already been picked for use by the user. 'unused' rows have not.
        self.parse_dict = None

        try:
            self.parse()
        except Exception:
            #don't leak the file handle if the file (or the database lookup) turns out to be unusable
            self.csv_file.close()
            raise

    ## Allows the parser to be used in a "with" statement.
    # @param self
    # @returns (Reliability2Parser) this instance
    def __enter__(self):
        return self

    ## Closes the csv file when a "with" block is left. Exceptions are not suppressed.
    # @param self
    # @param exc_type exception class, or None
    # @param exc_value exception instance, or None
    # @param traceback traceback object, or None
    # @returns (boolean) always False
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    ## This method closes the file that the parser is reading. It should always be called when you are done with a parser instance.
    # @param self
    def close(self):
        self.csv_file.close()

    ## Reads the csv file and organizes the information according to environment, activity, and used/unused. The info is placed into the
    # organized parse_dict dictionary described in the constructor. A row counts as 'used' when the 'tests2' database table already
    # holds a record with the same spreadsheet timestamp and child code.
    # @param self
    def parse(self):
        self.parse_dict = {}
        acts = []
        seen_acts = set()

        db = BLLDatabase()
        try:
            for row in self.reader:
                env = row['environment'].strip()
                act = row['Activity'].strip()

                category = self.parse_dict.setdefault(env, {}).setdefault(act, {'used': [], 'unused': []})

                spreadsheet_timestamp = Reliability2Parser.get_row_timestamp(
                    row,
                    Reliability2Parser._get_row_offset_sec(row),
                )
                child_code = Reliability2Parser.get_child_code(row)
                #note: no need to check wav_file here, since that is derived from child_code
                result_set = db.select(
                    'tests2',
                    ['count(id)'],
                    'spreadsheet_timestamp=? AND child_code=?',
                    [spreadsheet_timestamp, child_code],
                )
                if int(result_set[0][0]) > 0:
                    category['used'].append(row)
                else:
                    category['unused'].append(row)

                if act not in seen_acts:
                    seen_acts.add(act)
                    acts.append(act)
        finally:
            #always release the database connection, even if a row is malformed
            db.close()

        #plain lists, so callers can index/sort them on both Python 2 and Python 3
        self.acts = acts
        self.envs = list(self.parse_dict.keys())

    ## Provides a list of all unique activities in the file.
    # @param self
    # @returns (list) list of strings, one for each unique activity
    def get_acts_list(self):
        return self.acts

    ## Provides a list of all unique environments in the file.
    # @param self
    # @returns (list) list of strings, one for each unique environment
    def get_envs_list(self):
        return self.envs

    ## Attempts to locate a particular wav file in a given folder hierarchy. Returns the path if it's found.
    # The root directory itself is checked first, then a sub-directory named after the wav file (minus its extension), if present,
    # then every other (non-hidden) sub-directory, recursively.
    # @param self
    # @param wav_filename (string) the name of the wav file to search for (this should not contain any path information - just the bare filename, eg. "C001a_20090708.wav")
    # @param root_dir (string) the full path to the root directory of the folder hierarchy to search for the wav file. A trailing path separator is optional.
    # @returns (string) the full path to the located wav file (including the wav filename itself), or None if the wav file could not be found in the hierarchy
    def locate_wavfile(self, wav_filename, root_dir):
        if not os.path.isdir(root_dir):
            return None

        candidate = os.path.join(root_dir, wav_filename)
        if os.path.isfile(candidate):
            return candidate

        #names starting with '.' are skipped, which matches what the '*' glob pattern used previously would match
        subdir_names = sorted(
            name for name in os.listdir(root_dir)
            if not name.startswith('.') and os.path.isdir(os.path.join(root_dir, name))
        )

        #if we have an appropriately named directory, search that first
        preferred = os.path.splitext(wav_filename)[0]
        if preferred in subdir_names:
            subdir_names.remove(preferred)
            subdir_names.insert(0, preferred)

        for name in subdir_names:
            path = self.locate_wavfile(wav_filename, os.path.join(root_dir, name))
            if path:
                return path

        return None

    ## Picks a set of blocks from the csv file, for a given check2 object.
    # The check2 object has properties that tell the parser which environment-activity categories to pick from, and how
    # many blocks to pick for each. It is possible that there are not enough blocks of a particular category in the csv file.
    # You can check this before calling this method, by calling the have_enough_blocks() method. If there are not enough blocks of a given type, and
    # this method is still called anyway, it will return as many blocks as exist for each requested category (possibly none).
    # Note that check2.environments and check2.activities are sorted in place.
    # @param self
    # @param check2 (Check2) the Check2 instance for which we are picking blocks.
    # @param alt_wav_locate_fcn (function) allows you to specify a custom search function that returns the path to the wav file, if the default locator method
    # in this class (locate_wavfile()) cannot find it. This param was added to allow calling code to pass in a function that opens a "browse file" dialog box, so that
    # the user can select the file manually if the code can't find it. The function is passed the name of the wav file we are looking for. If this custom search function
    # also fails to return a path (or is None), an exception is raised.
    # @param include_used (boolean) By default, this method only picks blocks from the parser's 'unused' category (see parse()). If this is set to True, then
    # blocks from the 'used' category are eligible as well, so a block that was picked on an earlier occasion may be picked again.
    # @returns (list) returns a list of Test2 objects, one for each block. If there were not enough blocks of a requested environment-activity type,
    # this list will be too short. Please call the have_enough_blocks() method first to make sure you have enough blocks of each type.
    def pick_rows(self, check2, alt_wav_locate_fcn, include_used):
        sel_test2s = []

        check2_envs = check2.environments
        check2_envs.sort() #sort these so the UI always goes through them in the same order
        check2_acts = check2.activities
        check2_acts.sort()

        for env in check2_envs:
            for act in check2_acts:
                category = self.parse_dict.get(env, {}).get(act)
                if category is None:
                    #this environment-activity pair does not occur in the csv file - nothing to pick
                    continue

                blocks = list(category['unused'])
                if include_used:
                    blocks.extend(category['used'])

                num_to_pick = min(max(check2.blocks_per_activity, 0), len(blocks))
                #random.sample() picks distinct blocks, in random order
                for row in random.sample(blocks, num_to_pick):
                    child_code = Reliability2Parser.get_child_code(row)
                    short_wav_filename = child_code + '.wav'

                    full_wav_filename = self.locate_wavfile(short_wav_filename, check2.wav_foldername + os.path.sep)
                    if not full_wav_filename and alt_wav_locate_fcn is not None:
                        full_wav_filename = alt_wav_locate_fcn(short_wav_filename)
                    if not full_wav_filename:
                        raise Exception('Unable to locate wav file: "%s"' % (short_wav_filename))

                    #must be built exactly as in parse(), otherwise the block would never be recognized as 'used' later on
                    spreadsheet_timestamp = Reliability2Parser.get_row_timestamp(
                        row,
                        Reliability2Parser._get_row_offset_sec(row),
                    )

                    sel_test2s.append(Test2(
                        check2.db_id,
                        full_wav_filename,
                        child_code,
                        spreadsheet_timestamp,
                    ))

        return sel_test2s

    ## Returns the start offset (in seconds) used to build the timestamp of a row. Shared by parse() and pick_rows() so that both
    # always generate identical timestamps for the same row.
    # NOTE(review): parse() used to pass the first row's "Elapsed_Time" for every row, which gave all rows recorded on the same day the
    # same timestamp. This uses each row's own "Elapsed_Time" instead, which assumes that the column does not reset in 5 minute block
    # exports (as the previous comment in parse() stated) - please confirm against existing 'tests2' records.
    # @param row (dict) the row of the csv file
    # @returns (float) the value of the row's "Elapsed_Time" column
    @staticmethod
    def _get_row_offset_sec(row):
        return float(row['Elapsed_Time'])

    ## Returns a timestamp string that identifies a row.
    # @param row (dict) the row of the csv file for which to generate a timestamp string
    # @param row_offset_sec (float) the absolute start time of the row. In files where the "Elapsed_Time" column restarts, you cannot use that column for this -
    # instead, use an accumulator variable that calculates the absolute start time of each row as you loop through the csv file.
    # @returns (string) a timestamp string in the format "dd mm yyyy accum_sec" (accum_sec is printed with six decimal places)
    @staticmethod
    def get_row_timestamp(row, row_offset_sec):
        year = row['year']
        month = BackendUtils.pad_num_str(row['month'])
        day = BackendUtils.pad_num_str(row['day'])

        return '%s %s %s %f' % (day, month, year, row_offset_sec)

    ## Returns the child code for a given row.
    # @param row (dict) the row of the csv file from which to extract the child code
    # @returns (string) a child code in the format "C_yyyymmdd" - for example, "C002_20091231"
    @staticmethod
    def get_child_code(row):
        year = row['year']
        month = BackendUtils.pad_num_str(row['month'])
        day = BackendUtils.pad_num_str(row['day'])
        child_code = row['child_id']

        return '%s_%s%s%s' % (child_code, year, month, day)

    ## Checks to see if we have enough blocks in a csv file to satisfy the requirements for a given Check2 object.
    # The check2 object has properties that tell the parser which environment-activity categories to pick from, and how
    # many blocks to pick for each. Categories that do not occur in the csv file at all are reported with a count of zero.
    # @param self
    # @param check2 (Check2) the Check2 instance for which we are counting blocks.
    # @param include_used (boolean) By default, this method only counts blocks from the parser's 'unused' category (see parse()). If this is set to True, then
    # blocks from the 'used' category are counted as available too.
    # @returns (2-tuple) the first element is a boolean that is True if we have enough blocks to satisfy all requirements for the check2 object, False otherwise. The second
    # element is a string (with newlines such that it is formatted as a three column table) that indicates how many blocks for each type of environment-activity pair are available in the csv file.
    # This is suitable for printing to the UI in the event that there are not enough of one category (lets them know which category is short).
    def have_enough_blocks(self, check2, include_used):
        row_format = '{0:10} {1:25} {2}\n'
        table_str = row_format.format('Env', 'Act', 'Count')
        enough = True

        #no early exit once enough is False, since we want to build the whole table_str
        for env in check2.environments:
            if env not in self.parse_dict:
                enough = False
            env_dict = self.parse_dict.get(env, {})

            for act in check2.activities:
                if act not in env_dict:
                    enough = False
                category = env_dict.get(act, {})

                num_used = len(category.get('used', ()))
                num_unused = len(category.get('unused', ()))
                num_available = num_unused + num_used if include_used else num_unused
                if num_available < check2.blocks_per_activity:
                    enough = False

                table_str += row_format.format(
                    env,
                    act,
                    '%d total - %d used = %d unused' % (num_used + num_unused, num_used, num_unused),
                )

        return enough, table_str