# Secret Labs' Regular Expression Engine
# convert re-style regular expression to sre pattern
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
# See the sre.py file for information on usage and redistribution.
"""Internal support module for sre"""
# XXX: show string offset and offending character for all errors
from sre_constants import *
# Characters collected as "special": . \ [ { ( ) * + ? ^ $ |
# NOTE(review): presumably the pattern metacharacters -- confirm against users.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Character classes as immutable sets, giving O(1) membership tests for
# single characters.
DIGITS = frozenset("0123456789")  # decimal digits 0-9
OCTDIGITS = frozenset("01234567")  # octal digits 0-7
HEXDIGITS = frozenset("0123456789abcdefABCDEF")  # hex digits, both cases
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")  # a-z and A-Z
WHITESPACE = frozenset(" \t\n\r\v\f")  # space, tab, LF, CR, VT, FF
# Opcode groupings built from names supplied by the star import of
# sre_constants.
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})  # the two repeat opcodes
# NOTE(review): presumably the opcodes that match a single unit -- confirm.
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
r"\a": (LITERAL, ord("\a")),
r"\b": (LITERAL, ord("\b")),
r"\f": (LITERAL, ord("\f")),
r"\n": (LITERAL, ord("\n")),
r"\r": (LITERAL, ord("\r")),
r"\t": (LITERAL, ord("\t")),
r"\v": (LITERAL, ord("\v")),
r"\\": (LITERAL, ord("\\"))
r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\Z": (AT, AT_END_STRING), # end of string
"i": SRE_FLAG_IGNORECASE,
# Bitmask with the ASCII, LOCALE and UNICODE flag bits all set.
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
# Bitmask with the DEBUG and TEMPLATE flag bits set.
# NOTE(review): presumably flags that apply to the whole pattern -- confirm.
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
class Verbose(Exception):
# keeps track of state for parsing
self.groupwidths = [None] # group 0
self.lookbehindgroups = None
return len(self.groupwidths)
def opengroup(self, name=None):
self.groupwidths.append(None)
if self.groups > MAXGROUPS:
raise error("too many groups")
ogid = self.groupdict.get(name, None)
raise error("redefinition of group name %r as group %d; "
"was group %d" % (name, gid, ogid))
self.groupdict[name] = gid
def closegroup(self, gid, p):
    """Record the width of group *gid* once its subpattern is known.

    gid -- index into ``self.groupwidths`` of the group being closed
    p   -- object exposing ``getwidth()``; whatever it returns is stored

    After this call the group's ``groupwidths`` entry is no longer the
    ``None`` placeholder, which is what ``checkgroup`` tests for.
    """
    width = p.getwidth()
    self.groupwidths[gid] = width
def checkgroup(self, gid):
    """Return True if *gid* names an existing group with a recorded width.

    A group qualifies only when its id is below ``self.groups`` and its
    ``self.groupwidths`` entry is not ``None``; otherwise the result is
    False.
    """
    if not gid < self.groups:
        # no such group has been allocated
        return False
    return self.groupwidths[gid] is not None
def checklookbehindgroup(self, gid, source):
if self.lookbehindgroups is not None:
if not self.checkgroup(gid):
raise source.error('cannot refer to an open group')
if gid >= self.lookbehindgroups:
raise source.error('cannot refer to group defined in the same '
# a subpattern, in intermediate form
def __init__(self, state, data=None):
print(level*" " + str(op), end='')
print((level+1)*" " + str(op), a)
for i, a in enumerate(av[1]):
elif op is GROUPREF_EXISTS:
condgroup, item_yes, item_no = av
print(level*" " + "ELSE")
elif isinstance(av, seqtypes):
if isinstance(a, SubPattern):
def __delitem__(self, index):
def __getitem__(self, index):
if isinstance(index, slice):
return SubPattern(self.state, self.data[index])
def __setitem__(self, index, code):
def insert(self, index, code):
    """Insert *code* before position *index* in ``self.data``.

    Delegates to the ``insert`` method of the underlying ``self.data``
    container and returns nothing.
    """
    items = self.data
    items.insert(index, code)
# determine the width (min, max) for this subpattern
if self.width is not None:
i, j = self.state.groupwidths[av]
elif op is GROUPREF_EXISTS:
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
def __init__(self, string):
self.istext = isinstance(string, str)
string = str(string, 'latin1')
self.decoded_string = string
char = self.decoded_string[index]
char += self.decoded_string[index]
raise error("bad escape (end of pattern)",
self.string, len(self.string) - 1) from None
def getwhile(self, n, charset):
def getuntil(self, terminator, name):
raise self.error("missing " + name)
raise self.error("missing %s, unterminated name" % terminator,
raise self.error("missing " + name, 1)
return self.index - len(self.next or '')
return self.index - len(self.next or '')
def error(self, msg, offset=0):
    """Build (but do not raise) an ``error`` for the current position.

    msg    -- message passed through to the exception
    offset -- number of positions to step back from ``self.tell()`` when
              computing the reported position (default 0)

    Returns the exception object created by the module-level ``error``
    with ``self.string`` as the pattern; callers are expected to
    ``raise`` it themselves.
    """
    position = self.tell() - offset
    return error(msg, self.string, position)
def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
code = CATEGORIES.get(escape)
if code and code[0] is IN:
# hexadecimal escape (exactly two digits)
escape += source.getwhile(2, HEXDIGITS)
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext:
# unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS)
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext:
# unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS)
raise source.error("incomplete escape %s" % escape, len(escape))
chr(c) # raise ValueError for invalid code
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
if not source.match('{'):
raise source.error("missing {")
charname = source.getuntil('}', 'character name')
c = ord(unicodedata.lookup(charname))
raise source.error("undefined character name %r" % charname,
len(charname) + len(r'\N{}'))
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
raise source.error('octal escape value %s outside of '
'range 0-0o377' % escape, len(escape))
raise source.error('bad escape %s' % escape, len(escape))
return LITERAL, ord(escape[1])
raise source.error("bad escape %s" % escape, len(escape))
def _escape(source, escape, state):
# handle escape code in expression
code = CATEGORIES.get(escape)
code = ESCAPES.get(escape)
escape += source.getwhile(2, HEXDIGITS)
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext:
# unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS)
raise source.error("incomplete escape %s" % escape, len(escape))
return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext:
# unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS)
raise source.error("incomplete escape %s" % escape, len(escape))
chr(c) # raise ValueError for invalid code
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
if not source.match('{'):
raise source.error("missing {")
charname = source.getuntil('}', 'character name')
c = ord(unicodedata.lookup(charname))
raise source.error("undefined character name %r" % charname,
len(charname) + len(r'\N{}'))
escape += source.getwhile(2, OCTDIGITS)
return LITERAL, int(escape[1:], 8)
# octal escape *or* decimal group reference (sigh)
if source.next in DIGITS:
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
source.next in OCTDIGITS):
# got three octal digits; this is an octal escape
raise source.error('octal escape value %s outside of '
'range 0-0o377' % escape,
# not an octal escape, so this is a group reference
if not state.checkgroup(group):
raise source.error("cannot refer to an open group",
state.checklookbehindgroup(group, source)
raise source.error("invalid group reference %d" % group, len(escape) - 1)
raise source.error("bad escape %s" % escape, len(escape))
return LITERAL, ord(escape[1])
raise source.error("bad escape %s" % escape, len(escape))
return list(dict.fromkeys(items))
def _parse_sub(source, state, verbose, nested):
# parse an alternation: a|b|c
itemsappend = items.append
sourcematch = source.match
itemsappend(_parse(source, state, verbose, nested + 1,
not nested and not items))
subpattern = SubPattern(state)
# check if all items share a common prefix
# all subitems start with a common "prefix".
# move it out of the branch
subpattern.append(prefix)
continue # check next one
# check if the branch can be replaced by a character set
elif op is IN and av[0][0] is not NEGATE:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
subpattern.append((IN, _uniq(set)))
subpattern.append((BRANCH, (None, items)))
def _parse(source, state, verbose, nested, first=False):
subpattern = SubPattern(state)
# precompute constants into local variables
subpatternappend = subpattern.append
sourcematch = source.match