Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
csv_database.py
Go to the documentation of this file.
1 ## @package db.csv_database
2 
3 from collections import OrderedDict
4 from db.mem_database import MemDatabase
5 from datetime import datetime
6 from utils.enum import Enum
7 import csv
8 import os
9 #from db.database import Database
10 
11 ## This class provides a way to convert a csv file into an in-memory database.
12 # This allows you to do SQL-operations on the data, which in some circumstances can be much more convenient than implementing
13 # search (query), sort, or data modification methods in straight Python. The cost for doing this is a little extra time inserting all the data - however, this
14 # cost isn't usually too bad because in-memory databases are up to an order of magnitude faster than file-based ones.
15 #
16 # Sample usage:
17 # \code{.py}
18 #
19 # #create a list of data types corresponding to the format of (a row in) your csv file
20 # data_types = [str, int, float, bool, str, str, str]
21 #
22 # #this will create a single in-memory table with the contents of multiple csv files (files must have the same columns)
23 # db = CSVDatabase.create_with_files(['myFile1.csv', 'myFile2.csv'], data_types)
24 #
25 # #select Age, Gender from table where Age > 12
26 # rows = db.csv_select_by_name(col_names=['Age', 'Gender'],
27 # where_body='%s > ?',
28 # where_cols=['Age'], params=[12])
29 #
30 # #you can append new named columns to the db that do not exist in the csv file
31 # db.add_column('Extra column', bool)
32 #
33 # #See also csv_insert(), csv_delete_by_name(), csv_update_by_name()
34 #
35 # #You can write out the database to a csv file like this:
36 # db.write_to_file('path/to/database.db', omit_col_indices=[0]) #this will omit the first column from the exported spreadsheet
37 #
38 # \endcode
40  TABLE_NAME = 'data'
41  COL_PREFIX = 'col' #column names are like "COL_PREFIX<index>" - e.g. col0, col1, col2, ...
42  DATA_TYPES = {
43  str: 'TEXT',
44  float: 'REAL',
45  int: 'INTEGER',
46  bool: 'INTEGER', #sqlite does not have a separate boolean type. It's no trouble to convert these to integers for DB operations.
47  datetime: 'TEXT', #sqlite doesn't have a datetime type, but provides functions that allows us to interpret strings as dates.
48  #These should be entered in the form 'yyyy-mm-dd hh:mm:ss.sss'
49  }
50  ORDER_DIRS = Enum(['ASC', 'DESC'], ['ASC', 'DESC'])
51 
52  ## Constructor
53  # @param self
54  # @param header_row (list of Strings) List of names of the columns in the csv file (generally this is just the top row of the csv file
55  # @param col_datatypes (list) List of data type functions (these must be keys from the static CSVDatabase.DATA_TYPES dictionary) corresponding to the columns in your csv file.
56  def __init__(self, header_row, col_datatypes):
57  super(CSVDatabase, self).__init__()
58 
59  self.name_lookup = OrderedDict( zip(header_row, range(len(col_datatypes))) )
60  self.header_row = list(header_row)
61  self.col_datatypes = list(col_datatypes)
62  self._create_data_table()
63 
64  ## Convenience method to create a database from a list of rows that have already been read from a csv file.
65  # @param rows (list) List of lists, where inner lists are rows of csv data. First row must be column names.
66  # @param col_datatypes (list) List of data type functions (these must be keys from the static CSVDatabase.DATA_TYPES dictionary) corresponding to the columns in your csv file.
67  @staticmethod
68  def create_with_rows(rows, col_datatypes):
69  db = CSVDatabase(rows[0], col_datatypes)
70 
71  for i in range(1, len(rows)):
72  db.csv_insert(rows[i])
73 
74  return db
75 
76  ## Convenience method to create a database from a bunch of files that all have the same columns (and column order).
77  # Note: This method will complain if it finds that any of the files you pass it have different column names.
78  # @param file_list (List) list of paths (Strings) to the files whose data you want to put into the new database
79  # @param col_datatypes (list) List of data type functions (these must be keys from the static CSVDatabase.DATA_TYPES dictionary) corresponding to the columns in your csv file.
80  # @returns (CSVDatabase) a CSVDatabase object containing the rows from all of the files. This won't have a primary key, so if the data is large you may want to add some indices before you start querying.
81  @staticmethod
82  def create_with_files(file_list, col_datatypes):
83  file_in = open(file_list[0], 'rb')
84  reader = csv.reader(file_in)
85  db = CSVDatabase(reader.next(), col_datatypes)
86  file_in.close()
87 
88  for cur_file in file_list:
89  file_in = open(cur_file, 'rb')
90  reader = csv.reader(file_in)
91  cur_header = reader.next()
92 
93  if db.header_row == cur_header:
94  for row in reader:
95  db.csv_insert(row)
96 
97  file_in.close()
98  else:
99  raise Exception('File %s has different column headers than the other files.' % (cur_file))
100 
101  return db
102 
103  ## Creates the single table used by this in-memory database.
104  # @param self
106  col_sql = 'id INTEGER PRIMARY KEY AUTOINCREMENT'
107  for i in range(len(self.col_datatypes)):
108  col_sql += ', %s%d %s NULL' % (CSVDatabase.COL_PREFIX, i, CSVDatabase.DATA_TYPES[self.col_datatypes[i]])
109 
110  data_sql = 'CREATE TABLE ' + CSVDatabase.TABLE_NAME + '(' + col_sql + ')'
111 
112  self.cursor.execute(data_sql)
113 
114  ## Inserts a row of data straight from a csv file. The data must have the types given in the col_datatypes param that was passed to the constructor or this will cause an exception.
115  # @param self
116  # @param csv_row (list) list of data from a csv file (corresponding to a single row)
117  # @returns (list) list of all the primary key values of the inserted rows
118  def csv_insert(self, csv_row):
119  return super(CSVDatabase, self).insert(
120  'data',
121  map(lambda num: '%s%d' % (CSVDatabase.COL_PREFIX, num), range(len(self.col_datatypes))), [csv_row]
122  )
123 
124  ## Selects data from the database. This method works using the indices of the columns in the CSV file that you'd like to select.
125  # @param col_indices (list=None) a list of the indices of the columns you'd like to select.
126  # @param where_cond (String=None) see database.select()
127  # @param params (list=[]) see database.select()
128  # @param order_by_indices (list=None) list of indices of columns you'd like to order by
129  # @param order_by_dirs (list=None) list of strings (each must be either 'ASC', 'DESC') that indicate the direction in which you want to order the columns specified by order_by_indices.
130  # @param dump_sql (boolean=False) if True, prints the generated SQL statement to stdout
131  # @param fcn_indices (list=None) list of indices of columns you'd like to apply SQL aggregate functions to ('avg', 'sum', etc.).
132  # @param fcns (list=None) list of Strings (SQL functions like 'avg', 'sum', etc.) to apply to the columns specified by fcn_indices
133  # @param group_by_index (int=None) index of column you'd like to group by
134  # @returns (list) list of lists, one sub-list per row. Each sublist contains data in the order specified by col_indices.
136  self,
137  col_indices=None,
138  where_cond=None,
139  params=[],
140  order_by_indices=None,
141  order_by_dirs=None,
142  dump_sql=False,
143  fcn_indices=None,
144  fcns=None,
145  group_by_index=None
146  ):
147  if col_indices == None:
148  col_indices = range(len(self.col_datatypes))
149 
150  cols = []
151  for index in col_indices:
152  name = '%s%d' % (CSVDatabase.COL_PREFIX, index)
153  cols.append(name)
154 
155  #add in any aggregate functions
156  if fcn_indices and fcns:
157  for i in range(len(fcn_indices)):
158  for j in range(len(col_indices)):
159  if fcn_indices[i] == col_indices[j]:
160  cols[j] = '%s(%s)' % (fcns[i], cols[j])
161 
162  group_by = None
163  if group_by_index != None:
164  group_by = '%s%d' % (CSVDatabase.COL_PREFIX, group_by_index)
165 
166  order_by = None
167  if order_by_indices != None:
168  order_by = ''
169  for i in range(len(order_by_indices)):
170  index = order_by_indices[i]
171  order_by += '%s%d %s' % (CSVDatabase.COL_PREFIX, index, order_by_dirs[i])
172  if i < len(order_by_indices) - 1:
173  order_by += ', '
174 
175  return super(CSVDatabase, self).select(
176  CSVDatabase.TABLE_NAME,
177  cols,
178  where_cond=where_cond,
179  params=params,
180  order_by=order_by,
181  dump_sql=dump_sql,
182  group_by=group_by
183  )
184 
185 
186  ## Gets a list of the indices of columns you specify by name.
187  # @param self
188  # @param col_names (list) list of column names (Strings)
189  # @returns (list) list of the indices that correspond to the column names you specified
190  def _get_col_indices(self, col_names):
191  col_indices = None
192  if col_names == None:
193  col_indices = self.name_lookup.values()
194  else:
195  col_indices = [self.name_lookup[name] for name in col_names]
196 
197  return col_indices
198 
199  ## Translates CSV column names to DB column names to build a SQL where clause like 'x = y'. This will presumably be attached to a SQL statement to form '... where x = y'.
200  # @param self
201  # @param where_body (String) body of the where clause, containing placeholders (%s) for the column names (Eg. '%s = ?')
202  # @param where_cols (list) list of csv names of columns. These will be translated to DB column names and placed in the where_body (these go in the %s placeholders).
203  # @returns (String) the clause. This contains db column names.
204  def _build_where_clause(self, where_body, where_cols):
205  where_cond = None
206  if where_body and where_cols:
207  db_col_names = []
208  for col in where_cols:
209  db_col_names.append( '%s%d' % (CSVDatabase.COL_PREFIX, self.name_lookup[col]) )
210  where_cond = where_body % tuple(db_col_names)
211 
212  return where_cond
213 
214  ## Fetches a list of all of the DB column names from the DB table.
215  # @param self
216  # @returns (list)
217  def get_db_col_names(self):
218  return dict(zip(
219  self.header_row,
220  map(lambda i: '%s%d' % (CSVDatabase.COL_PREFIX, i), self._get_col_indices(self.header_row))
221  ))
222 
223  ## Selects data from the database. This method works using the CSV names of the columns you want to select.
224  # @param col_names (list=None) a list of the csv names of the columns you'd like to select.
225  # @param where_cond (String=None) a string like ('%s = ?'). This will be translated into 'where age = 14' using the where_cols and params parameters (see below).
226  # @param where_cols (list) list of csv names of columns to put in the %s placeholders in where_cond.
227  # @param params (list=[]) see database.select(). These values will be placed in the ? placeholders in where_cond.
228  # @param order_by (list=None) list of csv names of columns you'd like to order by
229  # @param order_by_dirs (list=None) list of strings (each must be either 'ASC', 'DESC') that indicate the direction in which you want to order the columns specified by order_by.
230  # @param dump_sql (boolean=False) if True, prints the generated SQL statement to stdout
231  # @param fcn_col_names (list=None) list of csv names of columns you'd like to apply SQL aggregate functions to ('avg', 'sum', etc.).
232  # @param fcns (list=None) list of Strings (SQL functions like 'avg', 'sum', etc.) to apply to the columns specified by fcn_col_names
233  # @param group_by (int=None) csv name of column you'd like to group by
234  # @returns (list) list of lists, one sub-list per row. Each sublist contains data in the order specified by col_names.
235  def csv_select_by_name(
236  self,
237  col_names=None,
238  where_body=None,
239  where_cols=None,
240  params=[],
241  order_by=None,
242  order_by_dirs=None,
243  dump_sql=False,
244  fcn_col_names=None,
245  fcns=None,
246  group_by=None
247  ):
248  col_indices = self._get_col_indices(col_names)
249  where_cond = self._build_where_clause(where_body, where_cols)
250  fcn_indices = self._get_col_indices(fcn_col_names)
251 
252  group_by_index = None
253  if group_by != None:
254  group_by_index = self._get_col_indices([group_by])[0]
255 
256  order_by_indices = None
257  if order_by != None:
258  order_by_indices = self._get_col_indices(order_by)
259  #order_by_index = self._get_col_indices([order_by])[0]
260 
261  return self.csv_select_by_index(
262  col_indices=col_indices,
263  where_cond=where_cond,
264  params=params,
265  order_by_indices=order_by_indices,
266  order_by_dirs=order_by_dirs,
267  dump_sql=dump_sql,
268  fcn_indices=fcn_indices,
269  fcns=fcns,
270  group_by_index=group_by_index
271  )
272 
273  ## Generates and executes a SQL update statement. This method uses the indices of the columns in the csv file. See csv_select_by_index() for a description of the parameters.
274  # @param self
275  # @param col_indices (list)
276  # @param where_cond (String=None)
277  # @param params (list)
278  # @param dump_sql (boolean=False)
279  # @returns (int) number of rows updated
280  def csv_update_by_index(self, col_indices, where_cond=None, params=[], dump_sql=False):
281  cols = map(lambda num: '%s%d' % (CSVDatabase.COL_PREFIX, num), col_indices)
282 
283  return super(CSVDatabase, self).update(
284  CSVDatabase.TABLE_NAME,
285  cols,
286  where_cond=where_cond,
287  params=params,
288  dump_sql=dump_sql
289  )
290 
291  ## Generates and executes a SQL delete statement. This method uses the indices of the columns in the csv file. See csv_select_by_index() for a description of the parameters.
292  # @param self
293  # @param where_cond (String=None)
294  # @param params (list)
295  # @returns (int) number of rows deleted
297  self,
298  where_cond=None,
299  params=[]
300  ):
301  return super(CSVDatabase, self).delete(
302  CSVDatabase.TABLE_NAME,
303  where_cond,
304  params
305  )
306 
307  ## Generates and executes a SQL delete statement. This method uses the names of the columns in the csv file. See csv_select_by_name() for a description of the parameters.
308  # @param self
309  # @param where_body (String=None)
310  # @param where_cols (list=None)
311  # @param params (list)
312  # @returns (int) number of rows deleted
313  def csv_delete_by_name(
314  self,
315  where_body=None,
316  where_cols=None,
317  params=[]
318  ):
319  where_cond = self._build_where_clause(where_body, where_cols)
320 
321  return self.csv_delete_by_index(
322  where_cond,
323  params
324  )
325 
326  ## Generates and executes a SQL update statement. This method uses the names of the columns in the csv file. See csv_select_by_name() for a description of the parameters.
327  #Sample usage:
328  #col_names = ['File_Name', 'Child_Gender']
329  #where_body = '%s > ? and %s > ?'
330  #where_cols = ('FAN, 'CHN')
331  #params = (23.3, 45.5, 20.0, 10.0)
332  #Using these parameters will generate the following SQL:
333  # 'update data set col0=23.3, col3=45.5 where col10 > 20.0 and col11 > 10.0;'
334  # Assuming 'File_Name' -> col0, 'Child_Gender' -> col3, 'FAN' -> col10, 'CHN' -> col11.
335  # @param self
336  # @param where_body (String=None)
337  # @param where_cols (list=None)
338  # @param params (list)
339  # @param dump_sql (boolean=False)
340  def csv_update_by_name(self, col_names, where_body=None, where_cols=None, params=[], dump_sql=False):
341  col_indices = None
342  if col_names == None:
343  col_indices = self.name_lookup.values()
344  else:
345  col_indices = [self.name_lookup[name] for name in col_names]
346 
347  where_cond = None
348  if where_body and where_cols:
349  db_col_names = []
350  for col in where_cols:
351  db_col_names.append( '%s%d' % (CSVDatabase.COL_PREFIX, self.name_lookup[col]) )
352  where_cond = where_body % tuple(db_col_names)
353 
354  return self.csv_update_by_index(col_indices, where_cond, params, dump_sql)
355 
356  def get_db_col_name(self, text_name):
357  return '%s%d' % (CSVDatabase.COL_PREFIX, self.name_lookup[name])
358 
359  ## Adds a named column to the this database's single table.
360  # The values in the newly added column are all set to NULL.
361  # @param self
362  # @param name (string) name of the column to add.
363  # @param datatype (fcn) a data-type function (must be in the keys of the static CSVDatabase.DATA_TYPES dictionary) corresponding to the type of data that this column will hold
364  def add_column(self, name, datatype):
365  col_index = len(self.col_datatypes)
366 
367  self.cursor.execute('ALTER TABLE %s ADD COLUMN %s %s NULL' % (
368  CSVDatabase.TABLE_NAME,
369  '%s%d' % (CSVDatabase.COL_PREFIX, col_index),
370  CSVDatabase.DATA_TYPES[datatype],
371  ))
372 
373  self.col_datatypes.append(datatype)
374  self.header_row.append(name)
375  self.name_lookup[name] = col_index
376 
377  ## Takes a row (list) of data and removes the specified columns from the list. This is a convenience method and does not affect the data in the DB in any way.
378  # @param self
379  # @param row (list) a row of values from the database or csv file
380  # @param col_indices (list) list of the column indices that you want to remove from the row
381  # @returns (list) a new row that does not contain any of the columns specified using col_indices.
382  def _remove_cols(self, row, col_indices):
383  filtered_row = []
384  for i in range(len(row)):
385  if not (i in col_indices):
386  filtered_row.append(row[i])
387 
388  return filtered_row
389 
390  ## Writes out the data in this CSVDatabase contains to a csv file.
391  # A header row (column names) will be written at the top of the file.
392  # @param self
393  # @param path (String) path to the csv file to write to
394  # @param omit_col_indices (list=[]) list of indices of columns that you want to omit from the exported csv file.
395  def write_to_file(self, path, omit_col_indices=[]):
396  file_out = open(path, 'wb')
397  writer = csv.writer(file_out)
398  rows = self.csv_select_by_index() #select *
399 
400  header = self.header_row
401  if omit_col_indices:
402  header = self._remove_cols(header, omit_col_indices)
403 
404  writer.writerow(header)
405  for cur_row in rows:
406  if omit_col_indices:
407  cur_row = self._remove_cols(cur_row, omit_col_indices)
408  writer.writerow(cur_row)
409 
410  file_out.close()
411 
412  ## Sets all values in all columns to NULL if they are equal to the empty string.
413  # @param self
415  for col in self.header_row:
416  self.csv_update_by_name(
417  [col],
418  where_body="%s = ?",
419  where_cols=[col],
420  params=[None, '']
421  )
422