Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
bll_database.py
Go to the documentation of this file.
1 ## package db.bll_database
2 
3 import sqlite3
4 import os
5 import datetime
6 import re
7 import logging
8 
9 from db.database import Database
10 from data_structs.combo_option import ComboOption
11 from data_structs.codes import CodeInfo, Code, TranscriberCode124, TranscriberCode3
12 from utils.enum import Enum
13 from collections import OrderedDict
14 
15 ## Provides BLL-app-specific data retreival and storage routines, building on the base database class.
16 # Constants needed for segment and utterance parsing/manipulation (lena speaker codes, transcriber codes, etc.), application-wide settings (info needed to launch Praat and Excel), common UI combobox options, and any associated information are retreived from database tables and put into Enums by routines in this class.
17 # The DBConstants class (see bottom of file) is populated using these methods. DBConstants' public attributes (Enums) can then be used anywhere in the application code.
19  DB_LOCATION = 'db/bll_db.db'
20  CREATE_TABLES_SCRIPT = 'db/sql/init/create_tables.sql' #this script creates the DB if it doesn't exist
21  INSERT_DATA_SCRIPT = 'db/sql/init/insert_data.sql' #inserts 'constant' data upon DB creation (speaker_codes, transcriber_codes, etc.)
22  UPDATE_SCRIPT_DIR = 'db/sql/updates/' #must include trailing '/'
23  NUM_TRANS_CODES = 4
24 
25  ## Constructor
26  # @param self
27  def __init__(self):
28  #check if the DB is present
29  tables_exist = os.path.exists(BLLDatabase.DB_LOCATION)
30 
31  #this will create the database file if it doesn't exist
32  super(BLLDatabase, self).__init__(BLLDatabase.DB_LOCATION)
33 
34  #create the tables and insert the initial data if the DB was just generated
35  if not tables_exist:
36  logging.info('No database found - creating one...')
37  self.execute_script(BLLDatabase.CREATE_TABLES_SCRIPT)
38  self.execute_script(BLLDatabase.INSERT_DATA_SCRIPT)
39 
40  #apply any previously unapplied update scripts in UPDATE_SCRIPT_DIR
42 
43  ## Runs any sql scripts in UPDATE_SCRIPT_DIR that haven't been run yet.
44  # In order to do this, scripts must follow a naming convention: "update-x.sql", where x is an integer.
45  # The files in UPDATE_SCRIPT_DIR are sorted by this integer before being applied.
46  # If previous update scripts have been applied, the last script number is recorded in the settings table.
47  # Only scripts with numbers greater than this number are applied when this method is called.
48  # @param self
50  settings = self._get_settings_enum() #this is not defined in DBConstants.SETTINGS at the time this method is called...it's an enum of everything in the settings table (attribute names from 'code_name' col, values from 'val' col)
51 
52  #grab the filenames of all the scripts in the directory and insert them into a dictionary, keyed by script number
53  filenames = {}
54  for name in os.listdir(BLLDatabase.UPDATE_SCRIPT_DIR):
55  #filter out any files that don't match the naming convention
56  match = re.match('^update-(\d+)\.sql$', name)
57  if match:
58  script_num = int(match.groups()[0])
59 
60  #filter out scripts that have already been executed
61  if script_num > int(settings.LAST_UPDATE_SCRIPT_NUM):
62  filenames[script_num] = name
63 
64  #sort by script num and execute
65  try:
66  nums = filenames.keys()
67  if nums:
68  nums.sort()
69  for key in nums:
70  logging.info("Executing script: '%s'" % (filenames[key]))
71  sql_file = open('%s%s' % (BLLDatabase.UPDATE_SCRIPT_DIR, filenames[key]), 'r')
72  sql = sql_file.read()
73  sql_file.close()
74  self.cursor.executescript(sql)
75 
76  #update the last script number in the settings table
77  new_update_num = nums[-1]
78  self.cursor.execute('UPDATE settings SET val=? WHERE code_name=?', [new_update_num, 'LAST_UPDATE_SCRIPT_NUM'])
79  DBConstants.SETTINGS = self.select_enum('settings')
80  self.conn.commit()
81 
82  #rollback on error
83  except sqlite3.OperationalError as err:
84  logging.error('Error executing update script:\n%s' % (err))
85  self.conn.rollback()
86 
87  ## Builds an Enum of recognized speaker types from the speaker_types table.
88  # These are derrived from transcriber code 1's possible options (see transcriber manual) like 'MALE_ADULT', 'FEMALE_ADULT', 'OTHER_CHILD', etc.
89  # @param self
90  # @returns (Enum) attributes are from code_name column, values from id column
92  rows = self.select('speaker_types', 'id code_name'.split())
93  ids, code_names = zip(*rows)
94 
95  return Enum(code_names, ids)
96 
97  ## Builds an enum of possible speaker 'distances' (eg. FAN speaker has distance of 'near', FAF speaker has distance of 'far', 'SIL' speaker has distance of 'not applicable').
98  # @param self
99  # @param returns (Enum) attributes are 'NA, NEAR, FAR', values are 0, 1, 2 (respectively)
101  #Note: for now, it doesn't really pay to have a table for this - just don't change the ordering in the enum...the items' values should map directly to the id used the database
102  return Enum(['NA', 'NEAR', 'FAR'])
103 
104  ## Builds an Enum of possible speaker 'properties' (Eg. speakers can be overlapping, be non-verbal noise, or be tv/radio sound)
105  # @param self
106  # @returns (Enum) attributes are 'MEDIA', 'OVERLAPPING', 'NON_VERBAL_NOISE', values are 0, 1, 2, (respectively)
108  #Note: for now, it doesn't really pay to have a table for this - just don't change the ordering in the enum...
109  return Enum(['MEDIA', 'OVERLAPPING', 'NON_VERBAL_NOISE'])
110 
111  ## Builds an Enum of all possible combo groups. Each combo group corresponds to a single dropdown box in the UI, and maps to one or more combo options. The Enum is built from the combo_groups table.
112  # @param self
113  # @returns (Enum) attributes are from the 'code_name' column, values are from the 'id' column
115  rows = self.select('combo_groups', 'id code_name'.split())
116  ids, code_names = zip(*rows)
117 
118  return Enum(code_names, ids)
119 
120  ## Builds an Enum of common regular expressions from the common_regex DB table.
121  # @param self
122  # @returns (Enum) attributes are from the 'code_name' column, values are from the 'id' column
124  rows = self.select('common_regexs', 'code_name regex'.split())
125  code_names, regexs = zip(*rows)
126 
127  return Enum(code_names, regexs)
128 
129  ## Builds a dictionary (of Enums - one for each combo group), containing all prossible combo box options that the UI can use.
130  # @param self
131  # @returns (dictionary) a dictionary of Enums. The dictionary contains a key for each id from the combo_groups table. Each entry contains an Enum of all possible combo options for that group (Retreived from the combo_options table. Enum attributes are from the 'code_name' column, values are from the 'id' column).
133  options = {}
134  groups = self._get_combo_groups_enum()
135  for i in range(len(groups)):
136  rows = self.select('combo_options', 'id code_name'.split(), 'combo_group_id=?', [groups[i]], 'id ASC')
137  ids, code_names = zip(*rows)
138  options[groups[i]] = Enum(code_names, ids)
139 
140  return options
141 
142  ## Builds an Enum of the application settings from the settings table.
143  # @param self
144  # @returns (Enum) Enum attributes are from the 'code_name' column, values are from the 'id' column).
146  settings = None
147  rows = self.select('settings', 'code_name val'.split())
148  (names, vals) = zip(*rows)
149  settings = Enum(names, vals)
150 
151  return settings
152 
153  ## This method creates Code objects containing all the options for a particular type of code (transcriber, lena_notes, speaker).
154  # @param self
155  # @param table (string) name of the table to select from.
156  # @returns (Code/list of Code) a Code object (or list of Code objects, if selecting transcriber codes) that contains code options and properties from the DB table
157  def select_codes(self, table):
158  return {'transcriber_codes': self._select_transcriber_codes,
159  'lena_notes_codes': self._select_lena_notes_codes,
160  'speaker_codes': self._select_speaker_codes,
161  }[table]()
162 
163  ## Builds an Enum based upon a selection from a particular property or table.
164  # @param self
165  # @param table (string) name of the table to select from
166  # @returns (Enum) an Enum with attributes for all of the options within the table (attributes are from the code_name column, values from the id column).
167  def select_enum(self, table):
168  return {'speaker_types': self._get_speaker_types_enum,
169  'speaker_distances': self._get_speaker_distances_enum, #this is not an actual DB table...
170  'speaker_props': self._get_speaker_props_enum, #this is not an actual DB table...
171  'combo_groups': self._get_combo_groups_enum,
172  'combo_options': self._get_combo_options_enum,
173  'common_regexs': self._get_common_regexs_enum,
174  'settings': self._get_settings_enum,
175  }[table]()
176 
177  ## Builds a list of Code objects for transcriber codes. The Code objects contain information from the transcriber_codes database table.
178  # @param self
179  # @returns (list) list of Code objects, one for each transcriber code.
181  #select the data
182  rows = self.select('transcriber_codes', 'id code trans_index display_desc speaker_type_id'.split())
183  distances = self.select_enum('speaker_distances')
184 
185  #construct CodeInfo objects for each option within each of the transcriber codes
186  options_dicts = []
187  for i in range(BLLDatabase.NUM_TRANS_CODES):
188  options_dicts.append(OrderedDict())
189 
190  for cur_row in rows:
191  code_info = CodeInfo(
192  cur_row[0], #db_id
193  cur_row[1], #code
194  cur_row[3], #desc
195  False, #is_linkable
196  distances.NA, #distance
197  cur_row[4], #speaker_type
198  )
199  cur_dict = options_dicts[cur_row[2] - 1]
200  cur_dict[cur_row[1]] = code_info
201 
202  #create the Code objects, using the newly created CodeInfo objects
203  codes = []
204  for i in range(BLLDatabase.NUM_TRANS_CODES):
205  new_code = None
206  #transcriber code 3 can have multiple characters, so there's a special subclass for it...
207  if i in (0, 1, 3,):
208  new_code = TranscriberCode124(options_dicts[i])
209  elif i == 2: #2 because of 0-indexed array
210  new_code = TranscriberCode3(options_dicts[i])
211 
212  codes.append(new_code)
213 
214  return codes
215 
216  ## Creates a Code object containing information about options from the lena_notes_codes table. ('LENA notes codes' look like 'VOC', 'SIL', etc.)
217  # @param self
218  # @returns (Code) Code object that can be used to access info about different lena notes code options.
220  #select the data
221  rows = self.select('lena_notes_codes', 'id code speaker_type_id display_desc'.split())
222  distances = self.select_enum('speaker_distances')
223 
224  #create a dictionary of CodeInfo objects (one for each option)
225  codes = {}
226  for cur_row in rows:
227  code_info = CodeInfo(
228  cur_row[0], #db_id,
229  cur_row[1], #code
230  cur_row[3], #desc
231  cur_row[1] == 'VOC' or cur_row[1] == 'FAN', #is_linkable
232  distances.NA, #distance
233  cur_row[2], #speaker_type
234  )
235  codes[cur_row[1]] = code_info
236 
237  return Code(codes)
238 
239  ## Creates a Code object containing information about options from the speaker_codes table. (LENA speaker codes like 'MAN', 'FAN', etc.)
240  # @param self
241  # returns (Code) a Code object that can be used to access info about different speaker code options.
243  #select the data
244  rows = self.select('speaker_codes', 'id code speaker_type_id display_desc distance is_linkable is_media is_nonverbal_noise is_overlapping'.split())
245  props_enum = self._get_speaker_props_enum()
246 
247  #build a dictionary of CodeInfo objects, one for each option
248  codes = {}
249  for cur_row in rows:
250  #append any special properties recorded in the table
251  props = []
252  if cur_row[6]:
253  props.append(props_enum.MEDIA)
254  if cur_row[7]:
255  props.append(props_enum.NON_VERBAL_NOISE)
256  if cur_row[8]:
257  props.append(props_enum.OVERLAPPING)
258 
259  code_info = CodeInfo(
260  cur_row[0], #db_id
261  cur_row[1], #code
262  cur_row[3], # desc
263  cur_row[5], #is_linkable
264  cur_row[4], #distance
265  cur_row[2], #speaker_type
266  props, #properties
267  )
268  codes[cur_row[1]] = code_info
269 
270  return Code(codes)
271 
272 ## This class does an initial SQL select on database 'lookup tables' (tables containing information that is frequently used, but never updated while the program is running), and provides acess to this information in the form of a number of static data structures.
273 # This way, the lookup table info is only retreived from the database once at the start of the application. In addition, code that needs this data can retrieve it without worrying about building SQL statements and executing them.
274 # See the code for an in-comment description of each data structure.
275 class DBConstants(object):
276  #this is the format that SQLite stores timestamps in. See Python documentation on Time.strftime() for placeholder definitions
277  DB_DATETIME_FMT = '%Y-%m-%d %H:%M:%S'
278 
279  #list of Code objects, one for each transcriber code
280  TRANS_CODES = None
281  #Code object for lena codes (like 'VOC', 'SIL')
282  LENA_NOTES_CODES = None
283  #Code object for lena speakers codes (like 'MAN', 'FAN')
284  SPEAKER_CODES = None
285 
286  #Enum of speaker types (as defined in the transcriber manual for code 1) like 'MALE_ADULT', 'FEMALE_ADULT', 'OTHER_CHILD', etc.
287  SPEAKER_TYPES = None
288  #Enum of speaker distances (like 'NEAR', 'FAR', 'NA')
289  SPEAKER_DISTANCES = None
290  #Enum of speaker properties (like 'MEDIA', 'OVERLAPPING', 'NON_VERBAL_NOISE')
291  SPEAKER_PROPS = None
292 
293  #Enum of all (code_name, val) pairs from the settings table (stuff like path to MS Excel, etc.)
294  SETTINGS = None
295  #Enum of common regular expressions from the common_regexs table
296  COMMON_REGEXS = None
297 
298  #an Enum of all options in the combo_groups table
299  COMBO_GROUPS = None
300  #this is a dictionary. keys are group_ids from the COMBO_GROUPS enum above, and values are enums (enum properties are code_name column values from the combo_options table in the DB)
301  COMBO_OPTIONS = None
302  #this is a dictionary - keys are elements of the enum COMBO_GROUPS, values are sub-dictionaries. Sub-dictionary keys are values of the enum COMBO_OPTIONS[group_id], values are ComboOption objects.
303  COMBOS = None
304 
305  #In other words, to access the option objects for the combo group with code_name LENA_CATEGORIES, you would do the following:
306  #lena_options_dict = COMBOS[COMBO_GROUPS.LENA_CATEGORIES]
307  #lena_options_dict_keys_enum = COMBO_OPTIONS[COMBO_GROUPS.LENA_CATEGORIES]
308  #for i in range(len(lena_options_dict_keys_enum)):
309  # group_option_key = lena_options_dict_keys_enum[i]
310  # option_obj = COMBOS[COMBO_GROUPS.LENA_CATEGORIES][group_option_key]
311 
312 ## This function constructs the data structure used to initialize the DBConstants.COMBOS constant.
313 # This is a dictionary - keys are elements of the enum COMBO_GROUPS, values are sub-dictionaries. Sub-dictionary keys are values of the enum COMBO_OPTIONS[group_id], values are ComboOption objects.
314 # @param db (BLLDatabase) database handle object
315 # @returns (dictionary) a dictionary, as described above
316 def _get_combos(db):
317  groups = db.select_enum('combo_groups')
318  rows = db.select('combo_options', 'id code_name combo_group_id disp_desc hidden'.split(), None, [], 'id ASC')
319  options = {}
320 
321  for cur_row in rows:
322  group_id = cur_row[2]
323  opt_id = cur_row[0]
324  if not group_id in options:
325  options[group_id] = {}
326 
327  options[group_id][opt_id] = ComboOption(*cur_row)
328 
329  return options
330 
331 ## Populates the constants in the DBConstants class.
333  db = BLLDatabase()
334  DBConstants.TRANS_CODES = db.select_codes('transcriber_codes')
335  DBConstants.LENA_NOTES_CODES = db.select_codes('lena_notes_codes')
336  DBConstants.SPEAKER_CODES = db.select_codes('speaker_codes')
337 
338  DBConstants.SPEAKER_TYPES = db.select_enum('speaker_types')
339  DBConstants.SPEAKER_DISTANCES = db.select_enum('speaker_distances')
340  DBConstants.SPEAKER_PROPS = db.select_enum('speaker_props')
341  DBConstants.COMMON_REGEXS = db.select_enum('common_regexs')
342  DBConstants.SETTINGS = db.select_enum('settings')
343 
344  DBConstants.COMBO_GROUPS = db.select_enum('combo_groups')
345  DBConstants.COMBO_OPTIONS = db.select_enum('combo_options')
346  DBConstants.COMBOS = _get_combos(db)
347 
348  db.close()
349 
350 #Note: this is only executed once, the first time the Python interpreter encounters the file