Edit File by line
/home/barbar84/public_h.../wp-conte.../plugins/sujqvwi/ShExBy/shex_roo.../usr/lib64/python3....
File: tokenize.py
"""Tokenization help for Python programs.
[0] Fix | Delete
[1] Fix | Delete
tokenize(readline) is a generator that breaks a stream of bytes into
[2] Fix | Delete
Python tokens. It decodes the bytes according to PEP-0263 for
[3] Fix | Delete
determining source file encoding.
[4] Fix | Delete
[5] Fix | Delete
It accepts a readline-like method which is called repeatedly to get the
[6] Fix | Delete
next line of input (or b"" for EOF). It generates 5-tuples with these
[7] Fix | Delete
members:
[8] Fix | Delete
[9] Fix | Delete
the token type (see token.py)
[10] Fix | Delete
the token (a string)
[11] Fix | Delete
the starting (row, column) indices of the token (a 2-tuple of ints)
[12] Fix | Delete
the ending (row, column) indices of the token (a 2-tuple of ints)
[13] Fix | Delete
the original line (string)
[14] Fix | Delete
[15] Fix | Delete
It is designed to match the working of the Python tokenizer exactly, except
[16] Fix | Delete
that it produces COMMENT tokens for comments and gives type OP for all
[17] Fix | Delete
operators. Additionally, all token lists start with an ENCODING token
[18] Fix | Delete
which tells you which encoding was used to decode the bytes stream.
[19] Fix | Delete
"""
[20] Fix | Delete
[21] Fix | Delete
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
[22] Fix | Delete
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
[23] Fix | Delete
'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
[24] Fix | Delete
'Michael Foord')
[25] Fix | Delete
from builtins import open as _builtin_open
[26] Fix | Delete
from codecs import lookup, BOM_UTF8
[27] Fix | Delete
import collections
[28] Fix | Delete
from io import TextIOWrapper
[29] Fix | Delete
from itertools import chain
[30] Fix | Delete
import itertools as _itertools
[31] Fix | Delete
import re
[32] Fix | Delete
import sys
[33] Fix | Delete
from token import *
[34] Fix | Delete
[35] Fix | Delete
# PEP 263 encoding declaration: a comment line containing "coding:" or
# "coding=" followed by the encoding name, which is captured in group 1.
# Matched against an already-decoded (str) line.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# A raw (bytes) line that is empty or holds only whitespace and/or a
# comment.  detect_encoding() only looks at a second line for a cookie
# when the first line matches this pattern.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
[37] Fix | Delete
[38] Fix | Delete
# Public API: everything exported by the token module plus the names
# added by this module.  The token module is imported under its own name
# only to read its __all__, then dropped so it is not left bound as a
# module attribute.
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token
[42] Fix | Delete
[43] Fix | Delete
# Token types that exist only in this module, numbered directly after the
# ones imported from token.  Each is registered in tok_name so it can be
# displayed by name, and N_TOKENS is advanced past the three new values.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
[50] Fix | Delete
# Maps the text of an operator/delimiter token to its specific token type.
# The tokenizer reports all of these with the generic type OP;
# TokenInfo.exact_type uses this table to recover the precise type.
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT,
    '@=':  ATEQUAL,
    # The operator patterns in this module also produce '->' and '...'
    # as OP tokens; without these entries their exact type would be
    # reported as plain OP.
    '->':  RARROW,
    '...': ELLIPSIS,
}
[96] Fix | Delete
[97] Fix | Delete
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A single token as a 5-field named tuple.

    Fields, in order: ``type``, ``string``, ``start``, ``end``, ``line``.
    """

    def __repr__(self):
        """Return the tuple repr with the numeric type annotated by name."""
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        # _replace() yields a new tuple whose five fields feed the five
        # placeholders of the format string.
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        """Return the specific operator type for OP tokens.

        For an OP token whose string is listed in EXACT_TOKEN_TYPES the
        mapped type is returned; for every other token this is ``type``.
        """
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        return self.type
[109] Fix | Delete
[110] Fix | Delete
def group(*choices):
    """Return a regex group matching any one of the given patterns."""
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    """Return a regex matching zero or more repetitions of the choices.

    Note: within this module the name shadows the builtin any().
    """
    return group(*choices) + '*'


def maybe(*choices):
    """Return a regex matching zero or one occurrence of the choices."""
    return group(*choices) + '?'
[113] Fix | Delete
[114] Fix | Delete
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Leading material that precedes a token: whitespace, any number of
# backslash-continued line ends, and an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

# Integer literals; "_?" before each digit allows single underscores
# between digits (and after the base prefix).
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Floating point and imaginary literals.
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the longer forms come first.
Number = group(Imagnumber, Floatnumber, Intnumber)
[133] Fix | Delete
[134] Fix | Delete
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    """Return the set of every accepted string prefix, including ''.

    Each base prefix is expanded into all orderings of its characters and
    all upper/lower case combinations of each ordering.
    """
    # The valid string prefixes. Only contain the lower case versions,
    #  and don't contain any permutations (include 'fr', but not
    #  'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
[149] Fix | Delete
[150] Fix | Delete
def _compile(expr):
    """Compile *expr* as a regular expression with the re.UNICODE flag."""
    return re.compile(expr, re.UNICODE)
[152] Fix | Delete
[153] Fix | Delete
# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string (prefix plus the three quotes).
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Extra alternatives of the pseudo-token pattern: a backslash line
# continuation, end of input, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
[192] Fix | Delete
[193] Fix | Delete
# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
[202] Fix | Delete
[203] Fix | Delete
# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
[212] Fix | Delete
[213] Fix | Delete
# Module-level tab width setting.
tabsize = 8


class TokenError(Exception):
    """Exception type defined by this module for tokenizing errors."""


class StopTokenizing(Exception):
    """Exception type defined by this module; carries no extra state."""
[218] Fix | Delete
[219] Fix | Delete
[220] Fix | Delete
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    State kept between tokens:
      tokens    -- list of output string fragments, joined at the end
      prev_row  -- row at which the previously emitted token ended
      prev_col  -- column at which the previously emitted token ended
      encoding  -- value of the last ENCODING token seen, or None
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        """Emit the filler needed to move from the previous end to *start*.

        *start* is a (row, col) pair.  Raises ValueError when it lies
        before the position where the previous token ended.
        """
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Rows skipped without a NEWLINE/NL token are bridged with
            # backslash continuations; the column restarts at 0.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for the tokens in *iterable*.

        Tokens are expected to be 5-tuples (type, string, start, end,
        line).  As soon as a 2-tuple is met, that token and the rest of
        the iterable are handed to compat() and position-based processing
        stops.  An ENCODING token is recorded in self.encoding and emits
        nothing; processing stops at ENDMARKER.
        """
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                # The indent text is emitted later, in front of the first
                # token of each line.
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                # Only re-emit the recorded indent when the token starts
                # at or beyond it.
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                # The next token starts on a fresh row.
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for tokens that carry no position information.

        *token* is the first such token and *iterable* supplies the rest;
        only the first two items (type, string) of each are used.  The
        fragments are appended to self.tokens; nothing is returned.
        """
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            # Without positions, a trailing space keeps these tokens from
            # running into whatever follows.
            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
[313] Fix | Delete
[314] Fix | Delete
[315] Fix | Delete
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.
    If the iterable contains no ENCODING token, the result is
    returned as a str instead.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed per token, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    # The Untokenizer records the encoding only if it saw an ENCODING token.
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
[340] Fix | Delete
[341] Fix | Delete
[342] Fix | Delete
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c.

    Returns "utf-8" or "iso-8859-1" when *orig_enc* is a recognised
    spelling (or a dash-suffixed variant) of one of them; otherwise
    returns *orig_enc* unchanged.
    """
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
[352] Fix | Delete
[353] Fix | Delete
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    # When readline is a bound method of an object with a .name attribute,
    # that name is included in error messages.
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        # A readline that signals the end with StopIteration is treated
        # like one returning an empty bytes object.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        # Return the normalised encoding named by a cookie on this line,
        # or None if the line holds no cookie.
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            # Called only to validate the name; the codec itself is unused.
            lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        # Drop the 3-byte BOM so it is not part of the returned line.
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    # A cookie on the second line only counts when the first line is
    # blank or a comment.
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
[444] Fix | Delete
[445] Fix | Delete
[446] Fix | Delete
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().

    Returns a TextIOWrapper over the file, positioned at the start.  If
    anything fails after the file has been opened, the underlying binary
    file is closed and the exception is re-raised.
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        # detect_encoding() consumed up to two lines; rewind so the
        # wrapper reads the file from the beginning.
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except BaseException:
        # Deliberately catches everything: this is cleanup only, and the
        # exception is always re-raised.
        buffer.close()
        raise
[460] Fix | Delete
[461] Fix | Delete
[462] Fix | Delete
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    # Replay the lines detect_encoding() already read, then the rest of
    # the input up to the first b"", then b"" forever so that the
    # combined readline never raises StopIteration.
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
[488] Fix | Delete
[489] Fix | Delete
[490] Fix | Delete
def _tokenize(readline, encoding):
[491] Fix | Delete
lnum = parenlev = continued = 0
[492] Fix | Delete
numchars = '0123456789'
[493] Fix | Delete
contstr, needcont = '', 0
[494] Fix | Delete
contline = None
[495] Fix | Delete
indents = [0]
[496] Fix | Delete
[497] Fix | Delete
# 'stashed' and 'async_*' are used for async/await parsing
[498] Fix | Delete
stashed = None
[499] Fix | Delete
12
It is recommended that you Edit text format, this type of Fix handles quite a lot in one request
Function