"""
csv.py - read/write/investigate CSV files
"""
import re
from functools import reduce
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
# Names exported by "from csv import *"; every entry must be defined or
# imported at module level.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
"Error", "Dialect", "__doc__", "excel", "excel_tab",
"field_size_limit", "reader", "writer",
"register_dialect", "get_dialect", "list_dialects", "Sniffer",
"unregister_dialect", "__version__", "DictReader", "DictWriter" ]
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating a subclass validates its attributes by handing the
    instance to the C-level ``_csv.Dialect``; an invalid combination is
    reported as ``csv.Error``.
    """
    _name = ""
    _valid = False
    # placeholders; concrete dialects override these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; the bare base class carries
        # nothing but placeholders.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Raise Error if the attributes do not form a usable dialect."""
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
# Make the dialect available by name, e.g. reader(f, "excel").
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the excel dialect except for the field separator.
    delimiter = '\t'
# Make the dialect available by name, e.g. reader(f, "excel-tab").
register_dialect("excel-tab", excel_tab)
class DictReader:
    """Iterate over CSV rows as dictionaries keyed by field name.

    f          -- any object accepted by the underlying ``reader``.
    fieldnames -- sequence of keys; when None, the first row read from
                  ``f`` supplies them.
    restkey    -- key under which surplus values of a long row are stored
                  (as a list).
    restval    -- value used for the missing fields of a short row.
    dialect, *args, **kwds -- passed through to ``reader``.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Field names, read lazily from the first row when not supplied."""
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # Empty input: leave the field names as None.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code.  If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next non-blank row as a dict.

        Raises StopIteration when the underlying reader is exhausted.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: collect the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing fields with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Iterator-protocol spelling used by Python 3; harmless on Python 2.
    __next__ = next
class DictWriter:
    """Write dictionaries as CSV rows in a fixed field order.

    f            -- any object accepted by the underlying ``writer``.
    fieldnames   -- sequence of keys giving the column order.
    restval      -- value written for keys missing from a row dict.
    extrasaction -- "raise" (ValueError on keys not in fieldnames) or
                    "ignore"; matched case-insensitively.
    dialect, *args, **kwds -- passed through to ``writer``.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        # Store the normalised form: validation above is case-insensitive,
        # but _dict_to_list compares against the lower-case literal, so
        # keeping e.g. "RAISE" verbatim would silently disable the check.
        self.extrasaction = extrasaction.lower()
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of rowdict ordered by self.fieldnames.

        Raises ValueError when extrasaction is "raise" and rowdict holds
        keys that are not in fieldnames.
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write one dict as a row; returns whatever the writer returns."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict of the iterable rowdicts as a row."""
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect class corresponding to the sample.

        delimiters, when given, restricts the delimiter candidates to the
        characters it contains.  Raises Error when no delimiter can be
        determined.
        """
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields to go by; fall back to frequency analysis.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """
        # Tried in order; the first pattern that matches anything wins.
        # The third pattern used to read "(?P<delim>>..." and therefore
        # demanded a literal '>' before the delimiter, so it never matched
        # ordinary data.
        patterns = (
            r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
            r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',    #  ".*?",
            r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',    # ,".*?"
            r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',                              #  ".*?" (no delim, no space)
        )
        matches = []
        for restr in patterns:
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # The last pattern has no delimiter group.
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # Most frequent quote character; on a tie the later key wins.
        quotechar = reduce(lambda a, b: a if quotes[a] > quotes[b] else b,
                           quotes.keys())

        if delims:
            delim = reduce(lambda a, b: a if delims[a] > delims[b] else b,
                           delims.keys())
            # Every delimiter occurrence was followed by a space.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n':  # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
            {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); the delimiter is '' when none
        could be found.
        """
        # A real list is needed below (len(), slicing, indexing).
        data = list(filter(None, data.split('\n')))

        candidates = [chr(c) for c in range(127)]  # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in candidates:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency:
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a if a[1] > b[1] else b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0],
                                   modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            # Rows analysed so far; capped at len(data) because the last
            # chunk may be shorter than chunkLength.
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modes.items():
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                                (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)
def has_header(self, sample):
# Creates a dictionary of types of data in each column. If any
# column is of a single type (say, integers), *except* for the first
# row, then the first row is presumed to be labels. If the type
# can't be determined, it is assumed to be a string in which case
# the length of the string is the determining factor: if all of the
# rows except for the first are the same length, it's a header.
# Finally, a 'vote' is taken at the end for each column, adding or
# subtracting from the likelihood of the first row being a header.
rdr = reader(StringIO(sample), self.sniff(sample))
header = rdr.next() # assume first row is header
for i in range(columns): columnTypes[i] = None
# arbitrary number of rows to check, to keep it sane
continue # skip rows that have irregular number of columns
for col in columnTypes.keys():
for thisType in [int, long, float, complex]:
except (ValueError, OverflowError):
# fallback to length of string
if thisType != columnTypes[col]:
if columnTypes[col] is None: # add new column type
columnTypes[col] = thisType
# type is inconsistent, remove column from
# finally, compare results against first row and "vote"
# on whether it's a header
for col, colType in columnTypes.items():
if type(colType) == type(0): # it's a length
if len(header[col]) != colType:
except (ValueError, TypeError):