Edit File by line

[0] Fix | Delete

"""

[1] Fix | Delete

csv.py - read/write/investigate CSV files

[2] Fix | Delete

"""

[3] Fix | Delete

[4] Fix | Delete

import re

[5] Fix | Delete

from _csv import Error, __version__, writer, reader, register_dialect, \

[6] Fix | Delete

unregister_dialect, get_dialect, list_dialects, \

[7] Fix | Delete

field_size_limit, \

[8] Fix | Delete

QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \

[9] Fix | Delete

__doc__

[10] Fix | Delete

from _csv import Dialect as _Dialect

[11] Fix | Delete

[12] Fix | Delete

from io import StringIO

[13] Fix | Delete

[14] Fix | Delete

# Names exported by a star-import of this module.
__all__ = [
    "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
    "Error", "Dialect", "__doc__", "excel", "excel_tab",
    "field_size_limit", "reader", "writer",
    "register_dialect", "get_dialect", "list_dialects", "Sniffer",
    "unregister_dialect", "__version__", "DictReader", "DictWriter",
    "unix_dialect",
]

[20] Fix | Delete

[21] Fix | Delete

class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating the class (or a subclass) validates the attribute
    values and raises Error when they are rejected.
    """
    _name = ""
    _valid = False
    # placeholders, meant to be overridden by subclasses
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; the bare base class keeps
        # the class-level ``_valid = False``.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the attributes by building a ``_csv.Dialect`` from self.

        Raises:
            Error: if ``_csv.Dialect`` rejects the attributes with a
                TypeError; the message of that TypeError is reused.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # Callers are expected to catch the module's own Error type;
            # drop the internal TypeError from the traceback chain.
            raise Error(str(e)) from None

[51] Fix | Delete

[52] Fix | Delete

class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL

# Make the dialect available by name.
register_dialect("excel", excel)

[61] Fix | Delete

[62] Fix | Delete

class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'

# Make the dialect available by name.
register_dialect("excel-tab", excel_tab)

[66] Fix | Delete

[67] Fix | Delete

class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL

# Make the dialect available by name.
register_dialect("unix", unix_dialect)

[76] Fix | Delete

[77] Fix | Delete

[78] Fix | Delete

class DictReader:
    """Iterate over CSV rows as dicts keyed by field name.

    Arguments:
        f: passed straight to ``reader()`` as the row source.
        fieldnames: sequence of keys for the dicts.  When None, the first
            row produced by the underlying reader supplies the keys.
        restkey: key under which the surplus values of a row longer than
            ``fieldnames`` are stored (as a list).
        restval: value assigned to the trailing fields of a row shorter
            than ``fieldnames``.
        dialect, *args, **kwds: forwarded to ``reader()``.

    Rows that the underlying reader returns as ``[]`` are skipped.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        # A one-shot iterator cannot serve len(), slicing and repeated
        # zip() in __next__, so materialize it once up front.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Keys of the produced dicts.

        When no field names were supplied, the first access consumes one
        row from the underlying reader and uses it as the field names;
        the result stays None if the reader is already exhausted.
        """
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        """Return the next non-empty row as a dict.

        StopIteration from the underlying reader propagates unchanged.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
            # Keep line_num in step with the reader while skipping blank
            # rows; otherwise it lags behind the row actually returned.
            self.line_num = self.reader.line_num

        fieldnames = self.fieldnames
        d = dict(zip(fieldnames, row))
        lf = len(fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: collect the surplus values under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad the missing fields with restval
            for key in fieldnames[lr:]:
                d[key] = self.restval
        return d

[126] Fix | Delete

[127] Fix | Delete

[128] Fix | Delete

class DictWriter:
    """Write dicts as CSV rows, ordering the values by ``fieldnames``.

    Arguments:
        f: passed straight to ``writer()`` as the output target.
        fieldnames: keys, in output order, looked up in each row dict.
        restval: value written for keys missing from a row dict.
        extrasaction: "raise" or "ignore" (case-insensitive); selects
            whether a row dict containing keys outside ``fieldnames``
            raises ValueError or has those keys dropped.
        dialect, *args, **kwds: forwarded to ``writer()``.

    Raises:
        ValueError: if ``extrasaction`` is neither "raise" nor "ignore".
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        # fieldnames is iterated once per row, so a one-shot iterator
        # would be exhausted after the first write.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        # Store the normalized spelling: _dict_to_list compares against
        # the lowercase literal, so keeping e.g. "RAISE" as given would
        # pass validation and then silently behave like "ignore".
        action = extrasaction.lower()
        if action not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = action
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of *rowdict* in ``fieldnames`` order.

        Raises:
            ValueError: in "raise" mode, if *rowdict* has keys that are
                not in ``fieldnames``.
        """
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        """Write one dict; returns what the underlying writer returns."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict in *rowdicts* through the underlying writer."""
        return self.writer.writerows(map(self._dict_to_list, rowdicts))

[156] Fix | Delete

[157] Fix | Delete

# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # No complex type available: fall back to float for numeric checks.
    complex = float

[162] Fix | Delete

[163] Fix | Delete

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Return a Dialect subclass describing the format of *sample*.

        If *delimiters* is given, only characters contained in it are
        accepted as the delimiter.

        Raises Error when no delimiter can be determined.
        """
        quotechar, doublequote, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields to go by: fall back to character statistics.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns a tuple (quotechar, doublequote, delimiter,
        skipinitialspace); when nothing quoted is found the tuple is
        ('', False, None, 0).
        """
        matches = []
        # Patterns are tried from most to least specific; the first one
        # that matches anything is the only one used.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        quotes = {}
        delims = {}
        spaces = 0
        # findall() results are indexed from 0, group numbers from 1.
        groupindex = regexp.groupindex
        quote_pos = groupindex['quote'] - 1
        delim_group = groupindex.get('delim')
        space_group = groupindex.get('space')
        for m in matches:
            key = m[quote_pos]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            if delim_group is None:
                # the matching pattern has no delimiter group
                continue
            key = m[delim_group - 1]
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            if space_group is None:
                continue
            if m[space_group - 1]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
            {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        doublequote = bool(dq_regexp.search(data))

        return (quotechar, doublequote, delim, skipinitialspace)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns a tuple (delimiter, skipinitialspace); ('', 0) when no
        candidate is found.
        """
        data = list(filter(None, data.split('\n')))

        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunk_length = min(10, len(data))
        iteration = 0
        char_frequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunk_length
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    meta_frequency = char_frequency.setdefault(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    meta_frequency[freq] = meta_frequency.get(freq, 0) + 1

            for char in char_frequency:
                items = list(char_frequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    # character never occurred on any line seen so far
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    mode = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(mode)
                    modes[char] = (mode[0],
                                   mode[1] - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            total = float(min(chunk_length * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modes.items():
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunk_length lines
            start = end
            end += chunk_length

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v, k) for (k, v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

    def has_header(self, sample):
        """Return True if the first row of *sample* looks like a header.

        Errors raised by sniff() for *sample* propagate to the caller.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        column_types = dict.fromkeys(range(columns))

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in list(column_types):
                # A single numeric test: trying int, float and complex
                # one after another would classify "1" and "2.5" as
                # different types and drop the whole column from the vote.
                try:
                    complex(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    this_type = len(row[col])
                else:
                    this_type = complex

                if this_type != column_types[col]:
                    if column_types[col] is None: # add new column type
                        column_types[col] = this_type
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del column_types[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        has_header = 0
        for col, col_type in column_types.items():
            if isinstance(col_type, int): # it's a length
                if len(header[col]) != col_type:
                    has_header += 1
                else:
                    has_header -= 1
            else: # attempt typecast
                try:
                    col_type(header[col])
                except (ValueError, TypeError):
                    has_header += 1
                else:
                    has_header -= 1

        return has_header > 0

[447] Fix | Delete

[448] Fix | Delete