Edit File by line
/home/barbar84/www/wp-conte.../plugins/sujqvwi/AnonR/smanonr..../lib64/python3....
File: csv.py
[0] Fix | Delete
"""
[1] Fix | Delete
csv.py - read/write/investigate CSV files
[2] Fix | Delete
"""
[3] Fix | Delete
[4] Fix | Delete
import re
[5] Fix | Delete
from _csv import Error, __version__, writer, reader, register_dialect, \
[6] Fix | Delete
unregister_dialect, get_dialect, list_dialects, \
[7] Fix | Delete
field_size_limit, \
[8] Fix | Delete
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
[9] Fix | Delete
__doc__
[10] Fix | Delete
from _csv import Dialect as _Dialect
[11] Fix | Delete
[12] Fix | Delete
from io import StringIO
[13] Fix | Delete
[14] Fix | Delete
# Public API of this module: the names exported by ``from csv import *``.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "__doc__", "excel", "excel_tab",
           "field_size_limit", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter",
           "unix_dialect"]
[20] Fix | Delete
[21] Fix | Delete
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Instantiating a subclass validates its attributes through the C-level
    ``_csv.Dialect`` and raises ``Error`` when they are not acceptable.
    """
    _name = ""
    _valid = False
    # placeholders; concrete dialects override these at class level
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are flagged as valid; validation itself always runs.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the attributes by building a ``_csv.Dialect`` from self.

        Raises:
            Error: if ``_csv.Dialect`` rejects the attributes with TypeError.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # Callers historically catch csv.Error, so convert the TypeError.
            # ``from None`` keeps the internal TypeError out of the traceback.
            raise Error(str(e)) from None
[51] Fix | Delete
[52] Fix | Delete
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL


# Make the dialect available by name, e.g. reader(f, dialect="excel").
register_dialect("excel", excel)
[61] Fix | Delete
[62] Fix | Delete
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Everything except the delimiter is inherited from ``excel``.
    delimiter = '\t'


# Make the dialect available by name, e.g. reader(f, dialect="excel-tab").
register_dialect("excel-tab", excel_tab)
[66] Fix | Delete
[67] Fix | Delete
class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL


# Make the dialect available by name, e.g. writer(f, dialect="unix").
register_dialect("unix", unix_dialect)
[76] Fix | Delete
[77] Fix | Delete
[78] Fix | Delete
class DictReader:
    """Iterate over CSV rows, yielding each one as a dict keyed by field name.

    Parameters:
        f: passed straight to ``reader()`` as the row source.
        fieldnames: sequence of keys; when None, the first row read from
            *f* supplies them.
        restkey: key under which surplus values of an over-long row are
            stored (as a list).
        restval: value used for fields missing from a short row.
        dialect, *args, **kwds: forwarded to the underlying ``reader()``.

    Rows that the underlying reader returns as ``[]`` (blank lines) are
    skipped.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        """Field names; read lazily from the first row when not supplied."""
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                # Empty input: leave the field names as None.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        """Return the next non-blank row as a dict.

        Raises:
            StopIteration: when the underlying reader is exhausted.
        """
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
            # Keep line_num in step with the reader while skipping blanks;
            # otherwise it would lag behind the row actually returned.
            self.line_num = self.reader.line_num

        fieldnames = self.fieldnames
        d = dict(zip(fieldnames, row))
        lf = len(fieldnames)
        lr = len(row)
        if lf < lr:
            # Over-long row: collect the surplus values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing fields with restval.
            for key in fieldnames[lr:]:
                d[key] = self.restval
        return d
[126] Fix | Delete
[127] Fix | Delete
[128] Fix | Delete
class DictWriter:
    """Write dictionaries as CSV rows, ordered by ``fieldnames``.

    Parameters:
        f: passed straight to ``writer()`` as the output target.
        fieldnames: keys, in output column order.
        restval: value written for keys missing from a row dict.
        extrasaction: ``"raise"`` (raise ValueError when a row dict has
            keys not in ``fieldnames``) or ``"ignore"`` (drop them).
            Matching is case-insensitive.
        dialect, *args, **kwds: forwarded to the underlying ``writer()``.

    Raises:
        ValueError: if ``extrasaction`` is neither "raise" nor "ignore".
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        # A one-shot iterator would be exhausted after the first row;
        # materialise it so every row sees all field names.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        action = extrasaction.lower()
        if action not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        # Store the normalised value: _dict_to_list compares against the
        # lower-case literal, so e.g. "RAISE" must not degrade to "ignore".
        self.extrasaction = action
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        """Return the values of *rowdict* in ``fieldnames`` order.

        Raises:
            ValueError: if extrasaction is "raise" and *rowdict* has keys
                that are not in ``fieldnames``.
        """
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        """Write one dict as a row; returns what the underlying writer returns."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write every dict in *rowdicts* as a row."""
        return self.writer.writerows(map(self._dict_to_list, rowdicts))
[156] Fix | Delete
[157] Fix | Delete
# Guard Sniffer's type checking against builds that exclude complex():
# when the builtin is missing, fall back to float so the name still resolves.
try:
    complex
except NameError:
    complex = float
[162] Fix | Delete
[163] Fix | Delete
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample, delimiters=None):
        """
        Return a Dialect subclass describing the format of *sample*.

        sample -- text to analyse.
        delimiters -- optional container restricting which characters
        may be chosen as the delimiter.

        Raises Error when no delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields gave the delimiter away; fall back to
            # per-line character frequency analysis.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, doublequote, delimiter, skipinitialspace).
        """

        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                # Use the first (most specific) pattern that matches at all.
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        # findall() yields groups in numeric order; groupindex maps the
        # group names of whichever pattern matched to those positions.
        groupindex = regexp.groupindex
        for m in matches:
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # The matching pattern has no delimiter group.
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            # Only when every counted delimiter was followed by a space.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
            {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        doublequote = bool(dq_regexp.search(data))

        return (quotechar, doublequote, delim, skipinitialspace)

    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace); the delimiter is '' when
        no candidate was found.
        """

        data = list(filter(None, data.split('\n')))

        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    # Character never occurred on any line seen so far.
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

    def has_header(self, sample):
        """Return True if the first row of *sample* looks like a header.

        Raises whatever sniff() raises when the dialect of *sample*
        cannot be determined.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {i: None for i in range(columns)}

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in list(columnTypes.keys()):
                # complex() accepts everything int() and float() accept,
                # so one probe classifies all numeric text alike; a column
                # mixing "1" and "2.5" stays consistent instead of being
                # discarded as int-vs-float.
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
[447] Fix | Delete
[448] Fix | Delete
It is recommended that you use the Edit text format; this type of Fix handles quite a lot in one request.
Function