* Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
* 2000, 2001, 2002, 2003, 2005, 2006, 2007, by Larry Wall and others
* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
#include "regcharclass.h"
/* Convert branch sequences to more efficient trie ops? */
#define PERL_ENABLE_TRIE_OPTIMISATION 1
/* Be really aggressive about optimising patterns with trie sequences? */
#define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
/* Should the optimiser take positive assertions into account? */
#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0
/* Not for production use: */
#define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0
/* Activate offsets code - set to if 1 to enable */
#define RE_TRACK_PATTERN_OFFSETS
* The "internal use only" fields in regexp.h are present to pass info from
* compile to execute that permits the execute phase to run lots faster on
* simple cases. They are:
* regstart sv that must begin a match; NULL if none obvious
* reganch is the match anchored (at beginning-of-line only)?
* regmust string (pointer into program) that match must include, or NULL
* [regmust changed to SV* for bminstr()--law]
* regmlen length of regmust string
* [regmlen not used currently]
* Regstart and reganch permit very fast decisions on suitable starting points
* for a match, cutting down the work a lot. Regmust permits fast rejection
* of lines that cannot possibly match. The regmust tests are costly enough
* that pregcomp() supplies a regmust only if the r.e. contains something
* potentially expensive (at present, the only such thing detected is * or +
* at the start of the r.e., which can involve a lot of backup). Regmlen is
* supplied because the test in pregexec() needs it and pregcomp() is computing
* [regmust is now supplied always. The tests that use regmust have a
* heuristic that disables the test if it usually matches.]
* [In fact, we now use regmust in many cases to locate where the search
* starts in the string, so if regback is >= 0, the regmust search is never
* wasted effort. The regback variable says how many characters back from
* where regmust matched is the earliest possible start of the match.
* For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
* Structure for regexp "program". This is essentially a linear encoding
* of a nondeterministic finite-state machine (aka syntax charts or
* "railroad normal form" in parsing technology). Each node is an opcode
* plus a "next" pointer, possibly plus an operand. "Next" pointers of
* all nodes except BRANCH implement concatenation; a "next" pointer with
* a BRANCH on both ends of it is connecting two alternatives. (Here we
* have one of the subtle syntax dependencies: an individual BRANCH (as
* opposed to a collection of them) is never concatenated with anything
* because of operator precedence.) The operand of some types of node is
* a literal string; for others, it is a node leading into a sub-FSM. In
* particular, the operand of a BRANCH node is the first node of the branch.
* (NB this is *not* a tree structure: the tail of the branch connects
* to the thing following the set of BRANCHes.) The opcodes are defined
* in regnodes.h which is generated from regcomp.sym by regcomp.pl.
* A node is one char of opcode followed by two chars of "next" pointer.
* "Next" pointers are stored as two 8-bit pieces, high order first. The
* value is a positive offset from the opcode of the node containing it.
* An operand, if any, simply follows the node. (Note that much of the
* code generation knows about this implicit relationship.)
* Using two bytes for the "next" pointer is vast overkill for most things,
* but allows patterns to get big without disasters.
* [The "next" pointer is always aligned on an even
* boundary, and reads the offset directly as a short.]
/* This is the stuff that used to live in regexp.h that was truly
private to the engine itself. It now lives here. */
typedef struct regexp_internal {
int name_list_idx; /* Optional data index of an array of paren names */
U32 *offsets; /* offset annotations 20001228 MJD
data about mapping the program to the
offsets[0] is proglen when this is used
regnode *regstclass; /* Optional startclass as identified or constructed
struct reg_data *data; /* Additional miscellaneous data used by the program.
Used to make it easier to clone and free arbitrary
data that the regops need. Often the ARG field of
a regop is an index into this structure */
struct reg_code_blocks *code_blocks;/* positions of literal (?{}) */
regnode program[1]; /* Unwarranted chumminess with compiler. */
#define RXi_SET(x,y) (x)->pprivate = (void*)(y)
#define RXi_GET(x) ((regexp_internal *)((x)->pprivate))
#define RXi_GET_DECL(r,ri) regexp_internal *ri = RXi_GET(r)
* Flags stored in regexp->intflags
* These are used only internally to the regexp engine
* See regexp.h for flags used externally to the regexp engine
#define RXp_INTFLAGS(rx) ((rx)->intflags)
#define RX_INTFLAGS(prog) RXp_INTFLAGS(ReANY(prog))
#define PREGf_SKIP 0x00000001
#define PREGf_IMPLICIT 0x00000002 /* Converted .* to ^.* */
#define PREGf_NAUGHTY 0x00000004 /* how exponential is this pattern? */
#define PREGf_VERBARG_SEEN 0x00000008
#define PREGf_CUTGROUP_SEEN 0x00000010
#define PREGf_USE_RE_EVAL 0x00000020 /* compiled with "use re 'eval'" */
/* these used to be extflags, but are now intflags */
#define PREGf_NOSCAN 0x00000040
#define PREGf_GPOS_SEEN 0x00000100
#define PREGf_GPOS_FLOAT 0x00000200
#define PREGf_ANCH_MBOL 0x00000400
#define PREGf_ANCH_SBOL 0x00000800
#define PREGf_ANCH_GPOS 0x00001000
#define PREGf_RECURSE_SEEN 0x00002000
( PREGf_ANCH_SBOL | PREGf_ANCH_GPOS | PREGf_ANCH_MBOL )
/* this is where the old regcomp.h started */
/* Argument bearing node - workhorse,
arg1 is often for the data field */
/* Similar to a regnode_1 but with an extra signed argument */
/* 'Two field' -- Two 16 bit unsigned args */
/* This give the number of code points that can be in the bitmap of an ANYOF
* node. The shift number must currently be one of: 8..12. It can't be less
* than 8 (256) because some code relies on it being at least that. Above 12
* (4096), and you start running into warnings that some data structure widths
* have been exceeded, though the test suite as of this writing still passes
* for up through 16, which is as high as anyone would ever want to go,
* encompassing all of the Unicode BMP, and thus including all the economically
* important world scripts. At 12 most of them are: including Arabic,
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
* Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
* the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
* below above 12.) Be sure to benchmark before changing, as larger sizes do
* significantly slow down the test suite */
#define NUM_ANYOF_CODE_POINTS (1 << 8)
#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / 8) /* 8 bits/Byte */
/* Note that these form structs which are supersets of the next smaller one, by
* appending fields. Alignment problems can occur if one of those optional
* fields requires stricter alignment than the base struct. And formal
* parameters that can really be two or more of the structs should be
* declared as the smallest one it could be. See commit message for
* 7dcac5f6a5195002b55c935ee1d67f67e1df280b. Regnode allocation is done
* without regard to alignment, and changing it to would also require changing
* the code that inserts and deletes regnodes. The basic single-argument
* regnode has a U32, which is what reganode() allocates as a unit. Therefore
* no field can require stricter alignment than U32. */
struct regnode_charclass {
U32 arg1; /* set by set_ANYOF_arg() */
char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */
/* has runtime (locale) \d, \w, ..., [:posix:] classes */
struct regnode_charclass_class {
U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */
char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */
U32 classflags; /* and run-time */
/* A synthetic start class (SSC); is a regnode_charclass_posixl_fold, plus an
* extra SV*, used only during its construction and which is not used by
* regexec.c. Note that the 'next_off' field is unused, as the SSC stands
* alone, so there is never a next node. Also, there is no alignment issue,
* because these are declared or allocated as a complete unit so the compiler
* takes care of alignment. This is unlike the other regnodes which are
* allocated in terms of multiples of a single-argument regnode. SSC nodes can
* have a pointer field because there is no alignment issue, and because it is
* set to NULL after construction, before any cloning of the pattern */
U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */
char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */
U32 classflags; /* ... and run-time */
/* Auxiliary, only used during construction; NULL afterwards: list of code
/* We take advantage of 'next_off' not otherwise being used in the SSC by
* actually using it: by setting it to 1. This allows us to test and
* distinguish between an SSC and other ANYOF node types, as 'next_off' cannot
* otherwise be 1, because it is the offset to the next regnode expressed in
* units of regnodes. Since an ANYOF node contains extra fields, it adds up
* to 12 regnode units on 32-bit systems, (hence the minimum this can be (if
* not 0) is 11 there. Even if things get tightly packed on a 64-bit system,
* it still would be more than 1. */
#define set_ANYOF_SYNTHETIC(n) STMT_START{ OP(n) = ANYOF; \
#define is_ANYOF_SYNTHETIC(n) (PL_regkind[OP(n)] == ANYOF && NEXT_OFF(n) == 1)
/* XXX fix this description.
Impose a limit of REG_INFTY on various pattern matching operations
to limit stack growth and to avoid "infinite" recursions.
/* The default size for REG_INFTY is I16_MAX, which is the same as
SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits
(see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is
((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
((1<<63)-1). To limit stack growth to reasonable sizes, supply a
--Andy Dougherty 11 June 1998
# define REG_INFTY ((1<<15)-1)
# define REG_INFTY I16_MAX
#define ARG_VALUE(arg) (arg)
#define ARG__SET(arg,val) ((arg) = (val))
#define ARG(p) ARG_VALUE(ARG_LOC(p))
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
#define NEXT_OFF(p) ((p)->next_off)
/* the following define was set to 0xde in 075abff3
* as part of some linting logic. I have set it to 0
* as otherwise in every place where we /might/ set flags
* we have to set it 0 explicitly, which duplicates
* assignments and IMO adds an unacceptable level of
* surprise to working in the regex engine. If this
* is changed from 0 then at the very least make sure
* that SBOL for /^/ sets the flags to 0 explicitly.
#define NODE_ALIGN_FILL(node) ((node)->flags = 0)
#define SIZE_ALIGN NODE_ALIGN
#define OP(p) ((p)->type)
#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \
regnode types. For some, it's the \
character set of the regnode */
#define OPERAND(p) (((struct regnode_string *)p)->string)
#define MASK(p) ((char*)OPERAND(p))
#define STR_LEN(p) (((struct regnode_string *)p)->str_len)
#define STRING(p) (((struct regnode_string *)p)->string)
#define STR_SZ(l) ((l + sizeof(regnode) - 1) / sizeof(regnode))
#define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p))+1)
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)
#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
#define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
#define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
(ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
#define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
#define FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2) \
FILL_ADVANCE_NODE(ptr, op); \
#define SIZE_ONLY cBOOL(RExC_emit == (regnode *) & RExC_emit_dummy)
#define PASS2 (! SIZE_ONLY)
/* An ANYOF node is basically a bitmap with the index being a code point. If
* the bit for that code point is 1, the code point matches; if 0, it doesn't
* match (complemented if inverted). There is an additional mechanism to deal
* with cases where the bitmap is insufficient in and of itself. This #define
* indicates if the bitmap does fully represent what this ANYOF node can match.
* The ARG is set to this special value (since 0, 1, ... are legal, but will
* never reach this high). */
#define ANYOF_ONLY_HAS_BITMAP ((U32) -1)
/* When the bimap isn't completely sufficient for handling the ANYOF node,
* flags (in node->flags of the ANYOF node) get set to indicate this. These
* are perennially in short supply. Beyond several cases where warnings need
* to be raised under certain circumstances, currently, there are six cases
* where the bitmap alone isn't sufficient. We could use six flags to
* represent the 6 cases, but to save flags bits, we play some games. The
* 1) The bitmap has a compiled-in very finite size. So something else needs
* to be used to specify if a code point that is too large for the bitmap
* actually matches. The mechanism currently is a swash or inversion
* list. ANYOF_ONLY_HAS_BITMAP, described above, being TRUE indicates
* there are no matches of too-large code points. But if it is FALSE,
* then almost certainly there are matches too large for the bitmap. (The
* other cases, described below, either imply this one or are extremely
* rare in practice.) So we can just assume that a too-large code point
* will need something beyond the bitmap if ANYOF_ONLY_HAS_BITMAP is
* FALSE, instead of having a separate flag for this.
* 2) A subset of item 1) is if all possible code points outside the bitmap
* match. This is a common occurrence when the class is complemented,
* like /[^ij]/. Therefore a bit is reserved to indicate this,
* rather than having an expensive swash created,
* ANYOF_MATCHES_ALL_ABOVE_BITMAP.
* 3) Under /d rules, it can happen that code points that are in the upper
* latin1 range (\x80-\xFF or their equivalents on EBCDIC platforms) match
* only if the runtime target string being matched against is UTF-8. For
* example /[\w[:punct:]]/d. This happens only for posix classes (with a
* couple of exceptions, like \d where it doesn't happen), and all such
* ones also have above-bitmap matches. Thus, 3) implies 1) as well.
* Note that /d rules are no longer encouraged; 'use 5.14' or higher
* deselects them. But a flag is required so that they can be properly
* handled. But it can be a shared flag: see 5) below.
* 4) Also under /d rules, something like /[\Wfoo]/ will match everything in
* the \x80-\xFF range, unless the string being matched against is UTF-8.
* A swash could be created for this case, but this is relatively common,
* and it turns out that it's all or nothing: if any one of these code
* points matches, they all do. Hence a single bit suffices. We use a
* shared flag that doesn't take up space by itself:
* ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER.
* This also implies 1), with one exception: [:^cntrl:].
* 5) A user-defined \p{} property may not have been defined by the time the
* regex is compiled. In this case, we don't know until runtime what it
* will match, so we have to assume it could match anything, including
* code points that ordinarily would be in the bitmap. A flag bit is
* necessary to indicate this , though it can be shared with the item 3)
* flag, as that only occurs under /d, and this only occurs under non-d.
* This case is quite uncommon in the field, and the /(?[ ...])/ construct
* is a better way to accomplish what this feature does. This case also
* ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
* 6) /[foo]/il may have folds that are only valid if the runtime locale is a
* UTF-8 one. These are quite rare, so it would be good to avoid the
* expense of looking for them. But /l matching is slow anyway, and we've
* traditionally not worried too much about its performance. And this
* condition requires the ANYOFL_FOLD flag to be set, so testing for
* that flag would be sufficient to rule out most cases of this. So it is
* unclear if this should have a flag or not. But, this flag can be
* shared with another, so it doesn't occupy extra space.
* At the moment, there is one spare bit, but this could be increased by
* If just one more bit is needed, at this writing it seems to khw that the
* best choice would be to make ANYOF_MATCHES_ALL_ABOVE_BITMAP not a flag, but
* #define ANYOF_MATCHES_ALL_ABOVE_BITMAP ((U32) -2)
* and access it through the ARG like ANYOF_ONLY_HAS_BITMAP is. This flag is
* used by all ANYOF node types, and it could be used to avoid calling the
* handler function, as the macro REGINCLASS in regexec.c does now for other
* Another possibility is to instead (or additionally) rename the ANYOF_POSIXL
* flag to be ANYOFL_LARGE, to mean that the ANYOF node has an extra 32 bits
* beyond what a regular one does. That's what it effectively means now, with
* the extra space all for the POSIX class flags. But those classes actually
* only occupy 30 bits, so the ANYOFL_FOLD and
* ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD flags could be moved
* to that extra space. The 30 bits in the extra word would indicate if a
* posix class should be looked up or not. The downside of this is that ANYOFL
* nodes with folding would always have to have the extra space allocated, even
* if they didn't use the 30 posix bits. There isn't an SSC problem as all
* SSCs are this large anyway.
* One could completely remove ANYOFL_LARGE and make all ANYOFL nodes large.
* REGINCLASS would have to be modified so that if the node type were this, it
* would call reginclass(), as the flag bit that indicates to do this now would
* All told, 5 bits could be available for other uses if all of the above were
* Some flags are not used in synthetic start class (SSC) nodes, so could be
* shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING
/* If this is set, the result of the match should be complemented. regexec.c
* is expecting this to be in the low bit. Never in an SSC */
#define ANYOF_INVERT 0x01
/* For the SSC node only, which cannot be inverted, so is shared with that bit.
* This is used only during regex compilation. */
#define SSC_MATCHES_EMPTY_STRING ANYOF_INVERT
/* Set if this is a regnode_charclass_posixl vs a regnode_charclass. This
* is used for runtime \d, \w, [:posix:], ..., which are used only in locale
* and the optimizer's synthetic start class. Non-locale \d, etc are resolved
* at compile-time. Only set under /l; can be in SSC */
#define ANYOF_MATCHES_POSIXL 0x02
/* The fold is calculated and stored in the bitmap where possible at compile
* time. However under locale, the actual folding varies depending on
* what the locale is at the time of execution, so it has to be deferred until
* then. Only set under /l; never in an SSC */