Revision: 422
http://www.exim.org/viewvc/pcre2?view=rev&revision=422
Author: ph10
Date: 2015-11-10 14:33:28 +0000 (Tue, 10 Nov 2015)
Log Message:
-----------
Don't use group caching when (?| is involved; instead use a counter to cap too
much computation.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-11-09 18:45:15 UTC (rev 421)
+++ code/trunk/ChangeLog 2015-11-10 14:33:28 UTC (rev 422)
@@ -270,7 +270,11 @@
79. Adding group information caching improves the speed of compiling when
checking whether a group has a fixed length and/or could match an empty string,
-especially when recursion or subroutine calls are involved.
+especially when recursion or subroutine calls are involved. However, this
+cannot be used when (?| is present in the pattern because the same number may
+be used for groups of different sizes. To catch runaway patterns in this
+situation, counts have been introduced to the functions that scan for empty
+branches or compute fixed lengths.
80. Allow for the possibility of the size of the nest_save structure not being
a factor of the size of the compiling workspace (it currently is).
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-11-09 18:45:15 UTC (rev 421)
+++ code/trunk/src/pcre2_compile.c 2015-11-10 14:33:28 UTC (rev 422)
@@ -616,6 +616,7 @@
ERR25, /* Lookbehind is not fixed length */
ERR36, /* \C in lookbehind is not allowed */
ERR87, /* Lookbehind is too long */
+ ERR86, /* Pattern too complicated */
ERR70 /* Internal error: unknown opcode encountered */
};
@@ -828,11 +829,12 @@
as LOOKBEHIND_MAX.
Arguments:
- code points to the start of the pattern (the bracket)
- utf TRUE in UTF mode
- atend TRUE if called when the pattern is complete
- cb the "compile data" structure
+ code points to the start of the pattern (the bracket)
+ utf TRUE in UTF mode
+ atend TRUE if called when the pattern is complete
+ cb the "compile data" structure
recurses chain of recurse_check to catch mutual recursion
+ countptr pointer to counter, to catch over-complexity
Returns: if non-negative, the fixed length,
or -1 if an OP_RECURSE item was encountered and atend is FALSE
@@ -842,15 +844,16 @@
or -5 if an unknown opcode was encountered (internal error)
*/
-#define FFL_LATER (-1)
-#define FFL_NOTFIXED (-2)
-#define FFL_BACKSLASHC (-3)
-#define FFL_TOOLONG (-4)
-#define FFL_UNKNOWNOP (-5)
+#define FFL_LATER (-1)
+#define FFL_NOTFIXED (-2)
+#define FFL_BACKSLASHC (-3)
+#define FFL_TOOLONG (-4)
+#define FFL_TOOCOMPLICATED (-5)
+#define FFL_UNKNOWNOP (-6)
static int
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
- recurse_check *recurses)
+ recurse_check *recurses, int *countptr)
{
int length = -1;
uint32_t group = 0;
@@ -859,7 +862,9 @@
register int branchlength = 0;
register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;
-/* If this is a capturing group, we may have the answer cached. */
+/* If this is a capturing group, we may have the answer cached, but we can only
+use this information if there are no (?| groups in the pattern, because
+otherwise group numbers are not unique. */
if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
*code == OP_SCBRAPOS)
@@ -867,11 +872,19 @@
group = GET2(cc, 0);
cc += IMM2_SIZE;
groupinfo = cb->groupinfo[group];
- if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
- if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
- return groupinfo & GI_FIXED_LENGTH_MASK;
+ if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
+ {
+ if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
+ if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
+ return groupinfo & GI_FIXED_LENGTH_MASK;
+ }
}
+/* A large and/or complex regex can take too long to process. This can happen
+more often when (?| groups are present in the pattern. */
+
+if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;
+
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
@@ -895,7 +908,7 @@
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
- d = find_fixedlength(cc, utf, atend, cb, recurses);
+ d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -945,7 +958,7 @@
}
this_recurse.prev = recurses;
this_recurse.group = cs;
- d = find_fixedlength(cs, utf, atend, cb, &this_recurse);
+ d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
if (d < 0) return d;
branchlength += d;
cc += 1 + LINK_SIZE;
@@ -1298,13 +1311,21 @@
cb compile data
atend TRUE if being called to check an entire pattern
recurses chain of recurse_check to catch mutual recursion
+ countptr pointer to count to catch over-complicated pattern
-Returns: TRUE if what is matched could be empty
+Returns: 0 if what is matched cannot be empty
+ 1 if what is matched could be empty
+ -1 if the pattern is too complicated
*/
-static BOOL
+#define CBE_NOTEMPTY 0
+#define CBE_EMPTY 1
+#define CBE_TOOCOMPLICATED (-1)
+
+
+static int
could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
- compile_block *cb, BOOL atend, recurse_check *recurses)
+ compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
{
uint32_t group = 0;
uint32_t groupinfo = 0;
@@ -1314,18 +1335,30 @@
/* If what we are checking has already been set as "could be empty", we know
the answer. */
-if (*code >= OP_SBRA && *code <= OP_SCOND) return TRUE;
+if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
-/* If this is a capturing group, we may have the answer cached. */
+/* If this is a capturing group, we may have the answer cached, but we can only
+use this information if there are no (?| groups in the pattern, because
+otherwise group numbers are not unique. */
-if (*code == OP_CBRA || *code == OP_CBRAPOS)
+if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
+ (*code == OP_CBRA || *code == OP_CBRAPOS))
{
group = GET2(code, 1 + LINK_SIZE);
groupinfo = cb->groupinfo[group];
if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
- return (groupinfo & GI_COULD_BE_EMPTY) != 0;
+ return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
+/* A large and/or complex regex can take too long to process. We have to assume
+it can match an empty string. This can happen more often when (?| groups are
+present in the pattern and the caching is disabled. Setting the cap at 1100
+allows the test for more than 1023 capturing patterns to work. */
+
+if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
+
+/* Scan the opcodes for this branch. */
+
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
code < endcode;
code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
@@ -1347,8 +1380,8 @@
/* For a recursion/subroutine call we can scan the recursion when this
function is called at the end, to check a complete pattern. Before then,
recursions just have the group number as their argument and in any case may
- be forward references. In that situation, we return TRUE, just in case. It
- means that unlimited repeats of groups that contain recursions are always
+ be forward references. In that situation, we return CBE_EMPTY, just in case.
+ It means that unlimited repeats of groups that contain recursions are always
treated as "could be empty" - which just adds a bit more processing time
because of the runtime check. */
@@ -1385,7 +1418,10 @@
do
{
- if (could_be_empty_branch(scode, endcode, utf, cb, atend, &this_recurse))
+ int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
+ &this_recurse, countptr);
+ if (rc < 0) return rc;
+ if (rc > 0)
{
empty_branch = TRUE;
break;
@@ -1441,8 +1477,13 @@
empty_branch = FALSE;
do
{
- if (!empty_branch && could_be_empty_branch(code, endcode, utf, cb,
- atend, recurses)) empty_branch = TRUE;
+ if (!empty_branch)
+ {
+ int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
+ recurses, countptr);
+ if (rc < 0) return rc;
+ if (rc > 0) empty_branch = TRUE;
+ }
code += GET(code, 1);
}
while (*code == OP_ALT);
@@ -1663,7 +1704,7 @@
ISFALSE:
if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
-return (groupinfo & GI_COULD_BE_EMPTY) != 0;
+return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
@@ -5480,8 +5521,16 @@
PCRE2_UCHAR *scode = bracode;
do
{
- if (could_be_empty_branch(scode, ketcode, utf, cb, FALSE, NULL))
+ int count = 0;
+ int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
+ NULL, &count);
+ if (rc < 0)
{
+ *errorcodeptr = ERR86;
+ goto FAILED;
+ }
+ if (rc > 0)
+ {
*bracode += OP_SBRA - OP_BRA;
break;
}
@@ -7594,9 +7643,10 @@
if (lookbehind)
{
int fixed_length;
+ int count = 0;
*code = OP_END;
fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0,
- FALSE, cb, NULL);
+ FALSE, cb, NULL, &count);
if (fixed_length == FFL_LATER)
{
cb->check_lookbehind = TRUE;
@@ -8733,10 +8783,11 @@
if (GET(cc, 1) == 0)
{
int fixed_length;
+ int count = 0;
PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
int end_op = *be;
*be = OP_END;
- fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL);
+ fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count);
*be = end_op;
if (fixed_length < 0)
{
@@ -8860,8 +8911,15 @@
do
{
- if (could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL))
+ int count = 0;
+ int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
+ if (rc < 0)
{
+ errorcode = ERR86;
+ goto HAD_ERROR;
+ }
+ if (rc > 0)
+ {
re->flags |= PCRE2_MATCH_EMPTY;
break;
}
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2015-11-09 18:45:15 UTC (rev 421)
+++ code/trunk/testdata/testinput2 2015-11-10 14:33:28 UTC (rev 422)
@@ -4626,6 +4626,20 @@
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
a)"xI
+# When (?| is used and groups of the same number may be different,
+# we have to rely on a count to catch overly complicated patterns.
+
+"(?|()|())(.*)
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))"xI
+
+"(?|()|())(?<=a()
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+a)"xI
+
# Test the use of malloc for caching group information when there are more
# groups than fit into the on-stack workspace.
@@ -4635,4 +4649,14 @@
/(A{65000})\1{65000}/I
+# Test group scans when numbers are not unique
+
+/(?|()+|(a)+)/BI
+
+/(?|(a)+|()+)/BI
+
+/(?|()|(a))/BI
+
+/(?|(a)|())/BI
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2015-11-09 18:45:15 UTC (rev 421)
+++ code/trunk/testdata/testoutput2 2015-11-10 14:33:28 UTC (rev 422)
@@ -14734,6 +14734,22 @@
Options: extended
Subject length lower bound = 0
+# When (?| is used and groups of the same number may be different,
+# we have to rely on a count to catch overly complicated patterns.
+
+"(?|()|())(.*)
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))"xI
+Failed: error 186 at offset 148: regular expression is too complicated
+
+"(?|()|())(?<=a()
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
+a)"xI
+Failed: error 186 at offset 154: regular expression is too complicated
+
# Test the use of malloc for caching group information when there are more
# groups than fit into the on-stack workspace.
@@ -14752,4 +14768,78 @@
Last code unit = 'A'
Subject length lower bound = 65535
+# Test group scans when numbers are not unique
+
+/(?|()+|(a)+)/BI
+------------------------------------------------------------------
+ Bra
+ Bra
+ SCBra 1
+ KetRmax
+ Alt
+ CBra 1
+ a
+ KetRmax
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 0
+
+/(?|(a)+|()+)/BI
+------------------------------------------------------------------
+ Bra
+ Bra
+ CBra 1
+ a
+ KetRmax
+ Alt
+ SCBra 1
+ KetRmax
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 0
+
+/(?|()|(a))/BI
+------------------------------------------------------------------
+ Bra
+ Bra
+ CBra 1
+ Ket
+ Alt
+ CBra 1
+ a
+ Ket
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 0
+
+/(?|(a)|())/BI
+------------------------------------------------------------------
+ Bra
+ Bra
+ CBra 1
+ a
+ Ket
+ Alt
+ CBra 1
+ Ket
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+May match empty string
+Subject length lower bound = 0
+
# End of testinput2