# Secret Labs' Regular Expression Engine
# convert re-style regular expression to sre pattern
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
# See the sre.py file for information on usage and redistribution.
"""Internal support module for sre"""
# XXX: show string offset and offending character for all errors
from sre_constants import *
# characters that the parser does not treat as ordinary literals
SPECIAL_CHARS = ".\\[{()*+?^$|"
# digit alphabets, built up from the smallest to the largest
OCTDIGITS = set("01234567")
DIGITS = OCTDIGITS | set("89")
HEXDIGITS = DIGITS | set("abcdefABCDEF")
# ASCII letters, lowercase and uppercase
ASCIILETTERS = (set("abcdefghijklmnopqrstuvwxyz") |
                set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
# space, tab, newline, carriage return, vertical tab, form feed
WHITESPACE = set(" \t\n\r\v\f")
# escape sequences that each stand for one literal character; every entry
# maps the escape text to a (LITERAL, code point) pair
# NOTE(review): the dict header and closing brace that should surround
# these entries are not visible in this excerpt
r"\a": (LITERAL, ord("\a")),
r"\b": (LITERAL, ord("\b")),
r"\f": (LITERAL, ord("\f")),
r"\n": (LITERAL, ord("\n")),
r"\r": (LITERAL, ord("\r")),
r"\t": (LITERAL, ord("\t")),
r"\v": (LITERAL, ord("\v")),
r"\\": (LITERAL, ord("\\"))
# escape sequences that stand for a position assertion (AT, ...) or for a
# one-category character set (IN, [(CATEGORY, ...)])
# NOTE(review): the dict header for these entries is not visible in this
# excerpt; r"\b" here is a boundary assertion, unlike the literal r"\b"
# entry listed with the single-character escapes
r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\Z": (AT, AT_END_STRING), # end of string
# NOTE(review): presumably an entry of a separate flag-letter table whose
# header is not visible here -- confirm against the full file
"i": SRE_FLAG_IGNORECASE,
# master pattern object. keeps track of global attributes
# open a group, optionally registering it under `name`
# NOTE(review): the lines that compute `gid` and the conditions guarding
# the raise below are not visible in this excerpt
def opengroup(self, name=None):
# group id already registered under this name, or None
ogid = self.groupdict.get(name, None)
# the error message reports both the new and the earlier group id
raise error, ("redefinition of group name %s as group %d; "
"was group %d" % (repr(name), gid, ogid))
# remember which group id the name refers to
self.groupdict[name] = gid
# NOTE(review): only the signature of closegroup is visible in this
# excerpt; its body is missing
def closegroup(self, gid):
# true when gid is below self.groups and is not listed in self.open;
# _escape refuses a group reference for which this is false
def checkgroup(self, gid):
return gid < self.groups and gid not in self.open
# a subpattern, in intermediate form
# constructor; callers in this file pass the parser state as `pattern`
# (SubPattern(state)) and, when slicing, an existing data sequence
# NOTE(review): the constructor body is not visible in this excerpt
def __init__(self, pattern, data=None):
# NOTE(review): the lines below appear to belong to a debug-printing
# routine whose def line and loop headers are not visible here
# print the opcode and argument, indented by level+1 spaces
print (level+1)*" " + op, a
# walk the items stored in the second element of the argument
for i, a in enumerate(av[1]):
elif op == GROUPREF_EXISTS:
# a conditional carries the group id plus the yes and no branches
condgroup, item_yes, item_no = av
elif isinstance(av, seqtypes):
# nested subpatterns are distinguished from plain values
if isinstance(a, SubPattern):
# NOTE(review): only the signature of __delitem__ is visible in this
# excerpt; its body is missing
def __delitem__(self, index):
# item access; a slice produces a new SubPattern built from self.pattern
# and the sliced self.data
# NOTE(review): the non-slice return path is not visible in this excerpt
def __getitem__(self, index):
if isinstance(index, slice):
return SubPattern(self.pattern, self.data[index])
# NOTE(review): only the signature of __setitem__ is visible in this
# excerpt; its body is missing
def __setitem__(self, index, code):
# insert `code` at position `index` by delegating to self.data.insert
def insert(self, index, code):
self.data.insert(index, code)
# determine the width (min, max) for this subpattern
# NOTE(review): the def line and the code that accumulates `lo` and `hi`
# are not visible in this excerpt
# opcode groups; presumably UNITCODES are the opcodes that match exactly
# one character -- confirm against the full routine
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
# store the result on the instance; the lower bound is capped at
# MAXREPEAT - 1 and the upper bound at MAXREPEAT
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
# constructor taking the text to scan as `string`
# NOTE(review): the constructor body is not visible in this excerpt
def __init__(self, string):
# NOTE(review): the lines below appear to belong to a separate
# token-advance routine whose def line and branch bodies are not visible
# test for having consumed the whole input
if self.index >= len(self.string):
# character at the current position
char = self.string[self.index]
# character after the current one (the guard that leads here is not
# visible; the error below suggests it follows a backslash)
c = self.string[self.index + 1]
raise error, "bogus escape (end of line)"
# step past the token just read, which may be longer than one character
self.index = self.index + len(char)
# NOTE(review): only the signature of match is visible; callers in this
# file invoke it as source.match(")", 0)
def match(self, char, skip=1):
# NOTE(review): the next two statements appear to belong to position
# save/restore helpers whose def lines are not visible here
# report the current position as an (index, next token) pair
return self.index, self.next
# restore a position from such a pair
self.index, self.next = index
# NOTE(review): the def lines of the helpers these statements belong to
# are not visible in this excerpt
# true for an ASCII letter or an underscore
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
# true for an ASCII decimal digit
return "0" <= char <= "9"
# check that group name is a valid string
# a character that is neither an identifier character nor a digit is
# singled out here (the branch body is not visible)
if not isident(char) and not isdigit(char):
# translate one backslash escape found inside a character class into an
# (opcode, argument) pair; raises error for an escape it rejects.
# `nested` is only used to compute the stacklevel of the warnings below.
# NOTE(review): most branch guards of this function are not visible in
# this excerpt, so the statements below are not a contiguous body
def _class_escape(source, escape, nested):
# handle escape code inside character class
# table lookups: single-character escapes, then category escapes
code = ESCAPES.get(escape)
code = CATEGORIES.get(escape)
# category results are only of interest here when they are IN sets
if code and code[0] == IN:
# hexadecimal escape (exactly two digits)
# keep consuming hex digits while the escape text is under 4 characters
while source.next in HEXDIGITS and len(escape) < 4:
escape = escape + source.get()
raise error, "bogus escape: %s" % repr("\\" + escape)
# only the low 8 bits of the parsed value are kept
return LITERAL, int(escape, 16) & 0xff
# octal escape (up to three digits)
while source.next in OCTDIGITS and len(escape) < 4:
escape = escape + source.get()
# only the low 8 bits of the parsed value are kept
return LITERAL, int(escape, 8) & 0xff
raise error, "bogus escape: %s" % repr(escape)
# warn about ASCII-letter escapes when sys.py3kwarning is set
if sys.py3kwarning and c in ASCIILETTERS:
warnings.warn('bad escape %s; Unicode escapes are '
'supported only since Python 3.3' % escape,
FutureWarning, stacklevel=nested + 6)
warnings.warnpy3k('bad escape %s' % escape,
DeprecationWarning, stacklevel=nested + 6)
# fall back to the literal character that follows the backslash
return LITERAL, ord(escape[1])
raise error, "bogus escape: %s" % repr(escape)
# translate one backslash escape found in an expression (outside a
# character class) into an (opcode, argument) pair; raises error for an
# escape it rejects. `state` is consulted to validate group references and
# `nested` is only used to compute the stacklevel of the warnings below.
# NOTE(review): most branch guards of this function are not visible in
# this excerpt, so the statements below are not a contiguous body
def _escape(source, escape, state, nested):
# handle escape code in expression
# table lookups: category escapes, then single-character escapes
code = CATEGORIES.get(escape)
code = ESCAPES.get(escape)
# hexadecimal escape: consume hex digits while the text is under 4 chars
while source.next in HEXDIGITS and len(escape) < 4:
escape = escape + source.get()
# skip the two-character prefix and keep the low 8 bits of the value
return LITERAL, int(escape[2:], 16) & 0xff
# octal escape: consume octal digits while the text is under 4 chars
while source.next in OCTDIGITS and len(escape) < 4:
escape = escape + source.get()
# skip the backslash and keep the low 8 bits of the value
return LITERAL, int(escape[1:], 8) & 0xff
# octal escape *or* decimal group reference (sigh)
if source.next in DIGITS:
escape = escape + source.get()
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
source.next in OCTDIGITS):
# got three octal digits; this is an octal escape
escape = escape + source.get()
return LITERAL, int(escape[1:], 8) & 0xff
# not an octal escape, so this is a group reference
# the referenced group must pass state.checkgroup
if not state.checkgroup(group):
raise error, "cannot refer to open group"
# NOTE(review): the condition guarding this warning is not visible here
warnings.warn('group references in lookbehind '
'assertions are not supported',
RuntimeWarning, stacklevel=nested + 6)
# warn about ASCII-letter escapes when sys.py3kwarning is set
if sys.py3kwarning and c in ASCIILETTERS:
warnings.warn('bad escape %s; Unicode escapes are '
'supported only since Python 3.3' % escape,
FutureWarning, stacklevel=nested + 6)
warnings.warnpy3k('bad escape %s' % escape,
DeprecationWarning, stacklevel=nested + 6)
# fall back to the literal character that follows the backslash
return LITERAL, ord(escape[1])
raise error, "bogus escape: %s" % repr(escape)
# parse the branches of an alternation from `source`, each branch via
# _parse at nesting depth nested + 1, and assemble them into a SubPattern
# NOTE(review): the loops, several guards and the return statements of
# this function are not visible in this excerpt
def _parse_sub(source, state, nested):
# parse an alternation: a|b|c
# bound methods kept in local names for the parsing loop
itemsappend = items.append
sourcematch = source.match
itemsappend(_parse(source, state, nested + 1))
# end of input, or a ")" seen without consuming it
if not source.next or sourcematch(")", 0):
raise error, "pattern not properly closed"
subpattern = SubPattern(state)
subpatternappend = subpattern.append
# check if all items share a common prefix
# all subitems start with a common "prefix".
# move it out of the branch
continue # check next one
# check if the branch can be replaced by a character set
# a branch qualifies only if it is exactly one LITERAL item
if len(item) != 1 or item[0][0] != LITERAL:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
subpatternappend((IN, set))
# general case: keep the alternatives as a BRANCH node
subpattern.append((BRANCH, (None, items)))
# parse the yes branch and the no branch of a conditional on group
# `condgroup`, and wrap them in a GROUPREF_EXISTS node
# NOTE(review): the guards around the second branch and the raise, and the
# return statement, are not visible in this excerpt
def _parse_sub_cond(source, state, condgroup, nested):
item_yes = _parse(source, state, nested + 1)
item_no = _parse(source, state, nested + 1)
raise error, "conditional backref with more than two branches"
# input remains and the next token is not ")" (tested without consuming)
if source.next and not source.match(")", 0):
raise error, "pattern not properly closed"
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
# lookup sets for _parse, which binds each of them to a local name
_PATTERNENDERS = set(["|", ")"])
_ASSERTCHARS = set(["=", "!", "<"])
_LOOKBEHINDASSERTCHARS = set(["=", "!"])
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))
def _parse(source, state, nested):
subpattern = SubPattern(state)
# precompute constants into local variables
subpatternappend = subpattern.append
sourcematch = source.match
PATTERNENDERS = _PATTERNENDERS
ASSERTCHARS = _ASSERTCHARS
LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
REPEATCODES = _REPEATCODES
if source.next in PATTERNENDERS:
break # end of subpattern
if state.flags & SRE_FLAG_VERBOSE:
# skip whitespace and comments
if this and this[0] not in SPECIAL_CHARS:
subpatternappend((LITERAL, ord(this)))
## pass # handle character classes
setappend((NEGATE, None))
# check remaining characters
if this == "]" and set != start:
elif this and this[0] == "\\":
code1 = _class_escape(source, this, nested + 1)
code1 = LITERAL, ord(this)
raise error, "unexpected end of regular expression"
setappend((LITERAL, ord("-")))
code2 = _class_escape(source, this, nested + 1)
code2 = LITERAL, ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise error, "bad character range"
raise error, "bad character range"
setappend((RANGE, (lo, hi)))
raise error, "unexpected end of regular expression"
# XXX: <fl> should move set optimization to compiler!
if _len(set)==1 and set[0][0] is LITERAL:
subpatternappend(set[0]) # optimization