# Secret Labs' Regular Expression Engine
# convert template to internal format
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
# See the sre.py file for information on usage and redistribution.
"""Internal support module for sre"""
from sre_constants import *
# _sre and the imported constants must carry the same MAGIC value.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode families, grouped so an opcode can be classified with one `in` test.
_LITERAL_CODES = {
    LITERAL,
    NOT_LITERAL,
}
_REPEATING_CODES = {
    REPEAT,
    MIN_REPEAT,
    MAX_REPEAT,
}
_SUCCESS_CODES = {
    SUCCESS,
    FAILURE,
}
_ASSERT_CODES = {
    ASSERT,
    ASSERT_NOT,
}
# The literal opcodes plus ANY and IN.
_UNIT_CODES = {*_LITERAL_CODES, ANY, IN}
# Sets of lowercase characters which have the same uppercase.
# Each tuple lists code points in the order given by the comment above it.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅﬆ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
# Every member of a group becomes a key; its value is the rest of the group
# (the key itself is filtered out by `i != j`), in the group's own order.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
    """Return *flags* with a group's flag additions and removals applied.

    flags     -- the flag bits currently in effect.
    add_flags -- bits to switch on.
    del_flags -- bits to switch off; applied last, so they win over add_flags.
    TYPE_FLAGS -- mask of the "type" flag bits; bound as a default so the
                  lookup happens once, at definition time.

    A combined flag word is returned on every path.
    """
    if add_flags & TYPE_FLAGS:
        # A newly requested type flag replaces the inherited one instead of
        # being OR-ed on top of it.
        flags &= ~TYPE_FLAGS
    return (flags | add_flags) & ~del_flags
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
# Writes into `code` (see the `code[skip] = ...` back-patches below) and
# recurses into nested sub-patterns with the same `code` object.
# NOTE(review): this copy of the function has lost its indentation and many
# statements. `emit`, `_len`, `op`, `av`, `lo`, `tail` and `tailappend` are
# used below but never bound in the visible lines, and several `if`/`elif`
# headers have no body. Restore the missing lines before relying on it.
# Module-level opcode sets rebound to local names.
LITERAL_CODES = _LITERAL_CODES
REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES
# Case-insensitive matching without LOCALE: pick the _sre case helpers.
# The unicode assignments come first and the ascii ones second; the `else:`
# separating them is presumably among the missing lines.
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
if not flags & SRE_FLAG_IGNORECASE:
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOCALE_IGNORE[op])
emit(OP_UNICODE_IGNORE[op])
# Recurring idiom: remember the current end of `code` in `skip`, emit a 0
# placeholder, generate the body, then overwrite the placeholder with the
# distance from the placeholder to the new end of `code`.
# (presumably `emit` appends to `code` and `_len` is `len` - confirm)
skip = _len(code); emit(0)
# Iterates over `lo` followed by its entries in the `fixes` mapping.
for k in (lo,) + fixes[lo]:
code[skip] = _len(code) - skip
# _optimize_charset is unpacked as a (charset, hascased) pair.
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
if flags & SRE_FLAG_DOTALL:
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
# Two placeholder/compile/patch sequences over av[2] follow; the conditions
# that choose between them are not visible here.
skip = _len(code); emit(0)
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
skip = _len(code); emit(0)
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
# `av` is unpacked as (group, add_flags, del_flags, p); `p` is compiled with
# the flags produced by _combine_flags.
group, add_flags, del_flags, p = av
# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
_compile(code, p, _combine_flags(flags, add_flags, del_flags))
elif op in SUCCESS_CODES:
skip = _len(code); emit(0)
# av[1].getwidth() yields a (lo, hi) pair.
lo, hi = av[1].getwidth()
# NOTE(review): the condition guarding this raise is missing; given the
# message it presumably fires when lo != hi - confirm.
raise error("look-behind requires fixed-width pattern")
_compile(code, av[1], flags)
code[skip] = _len(code) - skip
skip = _len(code); emit(0)
_compile(code, av, flags)
code[skip] = _len(code) - skip
# `av` is remapped through a flag-specific table; `.get(av, av)` leaves it
# unchanged when the table has no entry for it.
if flags & SRE_FLAG_MULTILINE:
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE:
av = AT_UNICODE.get(av, av)
skip = _len(code); emit(0)
# _compile_info(code, av, flags)
_compile(code, av, flags)
# The position recorded through `tailappend` is patched later via
# `code[tail] = ...`.
tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip
emit(FAILURE) # end of branch
code[tail] = _len(code) - tail
if flags & SRE_FLAG_LOCALE:
elif flags & SRE_FLAG_UNICODE:
if not flags & SRE_FLAG_IGNORECASE:
elif flags & SRE_FLAG_LOCALE:
emit(GROUPREF_LOC_IGNORE)
emit(GROUPREF_UNI_IGNORE)
elif op is GROUPREF_EXISTS:
# av[1] and av[2] are compiled one after the other; the `skipyes` slot is
# patched with one more than its distance to the end of code, while
# `skipno` uses the plain distance.
skipyes = _len(code); emit(0)
_compile(code, av[1], flags)
skipno = _len(code); emit(0)
code[skipyes] = _len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = _len(code) - skipno
code[skipyes] = _len(code) - skipyes + 1
# Presumably the final `else:` of the opcode dispatch - confirm.
raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code):
# compile charset subprogram
# NOTE(review): only fragments of this function are visible. The loop that
# binds `op`, the leading `if` of the dispatch chain and every branch body
# are missing, and indentation has been lost.
elif op is RANGE or op is RANGE_UNI_IGNORE:
if flags & SRE_FLAG_LOCALE:
elif flags & SRE_FLAG_UNICODE:
# Presumably the final `else:` of the dispatch over `op` - confirm.
raise error("internal: unsupported set operator %r" % (op,))
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
# The caller in _compile unpacks the result as a (charset, hascased) pair.
# NOTE(review): this copy has lost its indentation and many statements.
# `lo`, `av`, `hascased`, `charmap`, `runs`, `out`, `p`, `q`, `mapping`,
# `comps`, `block` and `data` are used below without a visible binding.
if fixes and lo in fixes:
if not hascased and iscased(av):
# Inclusive range av[0]..av[1]; hascased becomes true if any member is cased.
r = range(av[0], av[1]+1)
hascased = any(map(iscased, r))
# character set contains non-UCS1 character codes
# Extends charmap by 0xff00 zero bytes.
charmap += b'\0' * 0xff00
# Character set contains non-BMP character codes.
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_UNI_IGNORE works.
runs.append((p, len(charmap)))
# The RANGE operand is an inclusive pair, hence `q - 1`.
out.append((RANGE, (p, q - 1)))
# if the case was changed or new representation is more compact
if hascased or len(out) < len(charset):
# else original character set is good enough
data = _mk_bitmap(charmap)
out.append((CHARSET, data))
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 32-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (64 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (8 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
charmap = bytes(charmap) # should be hashable
# Walk the 65536-entry map in 256-entry chunks; `comps` is keyed by chunk
# content and `mapping` records a chunk number per chunk position.
for i in range(0, 65536, 256):
chunk = charmap[i: i + 256]
mapping[i // 256] = comps[chunk]
mapping[i // 256] = comps[chunk] = block
# Prepend `block` and the word-converted `mapping` to `data`.
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
# Number of bits in one code word (_sre.CODESIZE is in bytes).
_CODEBITS = _sre.CODESIZE * 8
# All-ones value of a _CODEBITS-bit word.
MAXCODE = (1 << _CODEBITS) - 1
# Translation table: byte 0 -> b'0', every other byte value -> b'1'.
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    """Pack a per-position membership map into a list of integer words.

    bits -- bytes-like object with one byte per position; a zero byte means
            "absent", any non-zero byte means "present".
    _CODEBITS, _int -- defaults bound at definition time; callers normally
            leave them alone.

    Returns a list of ints, one per _CODEBITS positions, in position order.
    Within a word, the lowest position is the least significant bit.

    len(bits) is expected to be a multiple of _CODEBITS; the slicing below
    does not handle a trailing partial word.
    """
    # Turn the map into ASCII binary digits and reverse it, so that each
    # slice reads most-significant-digit first.
    s = bits.translate(_BITS_TRANS)[::-1]
    # Slicing from the end of the reversed string yields the words in
    # position order.
    return [_int(s[i - _CODEBITS: i], 2)
            for i in range(len(s), 0, -_CODEBITS)]
def _bytes_to_codes(b):
    """Reinterpret the bytes-like object *b* as a list of code words.

    b -- a non-empty buffer whose length is a whole number of native
         unsigned ints.

    Returns a list of ints (a real list, so it can be concatenated with
    another list by the caller).
    """
    # Convert block indices to word array
    a = memoryview(b).cast('I')
    # The native unsigned int must be exactly one code word wide, and the
    # cast must account for every byte of the input.
    assert a.itemsize == _sre.CODESIZE
    assert len(a) * a.itemsize == len(b)
    return a.tolist()
# NOTE(review): the `def` header and most of the body of this helper are not
# visible. From the recursive call below it is named `_simple`; the binding
# of `av` and the other branches must be restored before this line can run.
# check if this subpattern is a "simple" operator
# True only when av[0] is None and the last element of `av` is itself simple.
return av[0] is None and _simple(av[-1])
def _generate_overlap_table(prefix):
    """
    Generate an overlap table for the following prefix.
    An overlap table is a table of the same size as the prefix which
    informs about the potential self-overlap for each index in the prefix:
    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
      prefix[0:k]

    prefix -- any indexable sequence of comparable items (str, list, ...).
    Returns a list of ints with len(prefix) entries; an empty prefix gives [].
    """
    table = [0] * len(prefix)
    for i in range(1, len(prefix)):
        # Start from the overlap already established for the previous index.
        idx = table[i - 1]
        while prefix[i] != prefix[idx]:
            if idx == 0:
                # No shorter overlap left to try.
                table[i] = 0
                break
            # Fall back to the next shorter overlap.
            idx = table[idx - 1]
        else:
            # prefix[i] extends an overlap of length idx by one.
            table[i] = idx + 1
    return table
def _get_iscased(flags):
    """Return the is-cased predicate that applies under *flags*.

    Returns None when SRE_FLAG_IGNORECASE is not set, so callers can test
    the result for truth before calling it. Otherwise returns
    _sre.unicode_iscased if SRE_FLAG_UNICODE is set, and
    _sre.ascii_iscased if it is not.
    """
    if not flags & SRE_FLAG_IGNORECASE:
        return None
    elif flags & SRE_FLAG_UNICODE:
        return _sre.unicode_iscased
    else:
        return _sre.ascii_iscased
def _get_literal_prefix(pattern, flags):
# look for literal prefix
# Returns a 3-tuple (prefix, prefix_skip, flag). The flag is True on the
# first return below and False on the second; the recursive call site
# unpacks it as `got_all`.
# NOTE(review): this copy has lost its indentation and several statements.
# `prefix` and `prefix_skip` are used without a visible initial binding and
# the tests on `op` are missing.
prefixappend = prefix.append
# Falsy when no case check applies (see the `iscased and ...` test below).
iscased = _get_iscased(flags)
for op, av in pattern.data:
if iscased and iscased(av):
# `av` is unpacked as (group, add_flags, del_flags, p); the nested pattern
# `p` is scanned recursively under the combined flags.
group, add_flags, del_flags, p = av
flags1 = _combine_flags(flags, add_flags, del_flags)
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
prefix_skip = len(prefix)
elif prefix_skip1 is not None:
# The nested skip is offset by the current length of `prefix`.
prefix_skip = len(prefix) + prefix_skip1
return prefix, prefix_skip, True
return prefix, prefix_skip, False
def _get_charset_prefix(pattern, flags):
group, add_flags, del_flags, pattern = av
flags = _combine_flags(flags, add_flags, del_flags)