Baby Language Lab Scripts
A collection of data processing tools.
 All Classes Namespaces Files Functions Variables Pages
csv_parser.py
Go to the documentation of this file.
1 ## @package parsers.csv_parser
2 
3 import csv
4 import logging
5 
6 from data_structs.segment import Segment
7 from data_structs.utterance import Utterance
8 from data_structs.speaker import Speaker
9 from db.bll_database import DBConstants
10 from parsers.parser_tools import *
11 
12 ## Parses Segment objects out of a CSV file (one segment per row). Each row must contain the columns "Elapsed_Time", "Segment_Duration" and "Speaker_ID".
class CSVParser(object):
    ## Constructor
    # @param self
    # @param filename (string) full path to csv file to parse
    def __init__(self, filename):
        self.logger = logging.getLogger(__name__)
        self.filename = filename
        self.segments = []
        self.parsed = False

    ## Parses the CSV file row by row, generating Segment objects. The file is only read on the first call; later calls return the cached list (and ignore their arguments).
    # @param self
    # @param progress_update_fcn (function=None) function that updates the progress bar, accepting a single parameter, a real number in [0.0, 1.0]
    # @param seg_filters (list=None) list of SegFilter objects (None means "no filters"). These filters are applied to the internal segments list in a permanent manner (i.e. anything they filter out will not be returned by this parser)
    # @returns (list) List of the segments parsed out of the file - this list is also saved internally as self.segments
    def parse(self, progress_update_fcn=None, seg_filters=None):
        #check if cached - if not, parse and store the results in self.segments. Otherwise, just return the cached list
        if not self.parsed:
            self._parse(progress_update_fcn, seg_filters)
            #note: set even when _parse() bailed out early, so a failed read is not retried
            self.parsed = True

        return self.segments

    ## Opens self.filename in the mode the csv module expects on the running interpreter.
    # @param self
    # @returns (file) open file object positioned at the start of the file
    def _open_csv(self):
        import sys  #local import: only needed here, keeps the module's import block untouched

        #Python 2's csv module wants a binary-mode file; Python 3's wants text mode with newline=''
        if sys.version_info[0] < 3:
            return open(self.filename, 'rb')
        return open(self.filename, 'r', newline='')

    ## Goes through the file row by row, appending a Segment to self.segments for every row that passes the filters.
    # Only the first data row's "Elapsed_Time" is read; every later start time is the previous row's end time (start + "Segment_Duration").
    # @param self
    # @param progress_update_fcn (see parse())
    # @param seg_filters (see parse())
    # @returns (list) An empty list if the file could not be opened or contained no data rows; None otherwise. Results are stored in self.segments.
    def _parse(self, progress_update_fcn=None, seg_filters=None):
        if seg_filters is None:
            seg_filters = []

        #try to open the csv file. Deliberately broad: any failure to open is logged and treated as "no segments"
        try:
            csv_file = self._open_csv()
        except Exception as e:
            self.logger.error('Error opening file: %s', e)
            return []

        #the with-block guarantees the file is closed even if reading raises
        with csv_file:
            #determine delimiter (comma or tab) by peeking at the first line
            first_line = csv_file.readline()
            if not first_line:
                self.logger.error('File is empty: %s', self.filename)
                return []
            delim = '\t' if '\t' in first_line else ','
            csv_file.seek(0)

            #first record is the header row; the rest are data. Blank lines come back from csv.reader as empty lists, so drop them
            reader = csv.reader(csv_file, delimiter=delim)
            headers = next(reader, None)
            rows = [cur_row for cur_row in reader if cur_row]

        if not rows:
            self.logger.warning('No data rows found in file: %s', self.filename)
            return []

        #iterate through the data rows, creating Segment objects for them
        num_rows = float(len(rows))
        start = float(dict(zip(headers, rows[0]))['Elapsed_Time'])
        for i, cur_row in enumerate(rows):
            row_dict = dict(zip(headers, cur_row))  #works as long as all header col names are unique
            end = start + float(row_dict['Segment_Duration'])
            speaker_id = row_dict['Speaker_ID']

            #build the Utterance and put it inside the Segment
            utter = Utterance()
            utter.speaker = Speaker('', DBConstants.SPEAKER_CODES.get_option(speaker_id))
            utter.start = start
            utter.end = end
            utter.lena_codes.append(speaker_id)

            seg = Segment(i, utter.start, utter.end, [utter.speaker], [utter])
            utter.seg = seg

            #check if the segment should be filtered out
            if ParserTools.include_seg(seg, seg_filters):
                self.segments.append(seg)

            if progress_update_fcn:
                progress_update_fcn(float(i + 1) / num_rows)

            #the next segment begins where this one ended
            start = end
94  ## Constructs a Segment object from a single row of the csv file. Each row must contain the columns "Elapsed_Time, Segment_Duration, Speaker_ID".
95  # @param self
96  # @param headers (list) List of strings, corresponding to the column titles given in the first line of the csv file. Currently, these should be unique - if things change in the future, this method will need a touchup.
97  # @param row (list) List of string - the column data from the current row we are processing
98  # @param line_num (int) Zero-based index of row, excluding the header (i.e. the first row after the header is row 0)
99  # @returns (Segment) A Segment object constructed from the data in row
100  # def _parse_row(self, headers, row, line_num, accum_start, accum_end):
101  # row_dict = dict( zip(headers, row) ) #works as long as all header col names are unique
102  # utter = Utterance()
103  # utter.speaker = Speaker( '', DBConstants.SPEAKER_CODES.get_option(row_dict['Speaker_ID']) )
104  # utter.start = accum_start
105  # utter.end = accum_end
106  # utter.lena_codes.append(row_dict['Speaker_ID'])
107 
108  # seg = Segment(line_num, utter.start, utter.end, [utter.speaker], [utter])
109  # utter.seg = seg
110 
111  # return seg
112