Revision: 331
http://www.exim.org/viewvc/pcre2?view=rev&revision=331
Author: ph10
Date: 2015-08-01 10:11:28 +0100 (Sat, 01 Aug 2015)
Log Message:
-----------
Fix issues with minimum length finding.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_find_bracket.c
code/trunk/src/pcre2_study.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/ChangeLog 2015-08-01 09:11:28 UTC (rev 331)
@@ -85,7 +85,14 @@
class, where both values are literal letters in the same case, omit the
non-letter EBCDIC code points within the range.
+23. Finding the minimum matching length of complex patterns with back
+references and/or recursions can take a long time. There is now a cut-off that
+gives up trying to find a minimum length when things get too complex.
+24. An optimization has been added that speeds up finding the minimum matching
+length for patterns containing repeated capturing groups or recursions.
+
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/src/pcre2_find_bracket.c
===================================================================
--- code/trunk/src/pcre2_find_bracket.c 2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/src/pcre2_find_bracket.c 2015-08-01 09:11:28 UTC (rev 331)
@@ -83,7 +83,7 @@
if (c == OP_XCLASS) code += GET(code, 1);
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
- /* Handle recursion */
+ /* Handle lookbehind */
else if (c == OP_REVERSE)
{
Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c 2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/src/pcre2_study.c 2015-08-01 09:11:28 UTC (rev 331)
@@ -59,7 +59,6 @@
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
-
/*************************************************
* Find the minimum subject length for a group *
*************************************************/
@@ -75,10 +74,12 @@
startcode pointer to start of the whole pattern's code
utf UTF flag
recurses chain of recurse_check to catch mutual recursion
+ countptr pointer to call count (to catch over complexity)
Returns: the minimum length
-1 \C in UTF-8 mode
or (*ACCEPT)
+ or pattern too complicated
-2 internal error (missing capturing bracket)
-3 internal error (opcode not listed)
*/
@@ -85,14 +86,23 @@
static int
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
- PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses)
+ PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
{
int length = -1;
+int prev_recno = -1;
+int prev_d = 0;
+uint32_t once_fudge = 0;
BOOL had_recurse = FALSE;
recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
+/* A large and/or complex regex can take too long to process. */
+
+if ((*countptr)++ > 1000) return -1;
+
+/* Skip over capturing bracket number */
+
if (*code == OP_CBRA || *code == OP_SCBRA ||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
@@ -101,10 +111,10 @@
for (;;)
{
- int d, min;
+ int d, min, recno;
PCRE2_UCHAR *cs, *ce;
register PCRE2_UCHAR op = *cc;
-
+
switch (op)
{
case OP_COND:
@@ -112,7 +122,8 @@
/* If there is only one branch in a condition, the implied branch has zero
length, so we don't add anything. This covers the DEFINE "condition"
- automatically. */
+ automatically. If there are two branches we can treat it the same as any
+ other non-capturing subpattern. */
cs = cc + GET(cc, 1);
if (*cs != OP_ALT)
@@ -120,21 +131,29 @@
cc = cs + 1 + LINK_SIZE;
break;
}
+ goto PROCESS_NON_CAPTURE;
- /* Otherwise we can fall through and treat it the same as any other
- subpattern. */
-
- case OP_CBRA:
- case OP_SCBRA:
+ /* There's a special case of OP_ONCE, when it is wrapped round an
+ OP_RECURSE. We'd like to process the latter at this level so that
+ remembering the value works for repeated cases. So we do nothing, but
+ set a fudge value to skip over the OP_KET after the recurse. */
+
+ case OP_ONCE:
+ if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
+ {
+ once_fudge = 1 + LINK_SIZE;
+ cc += 1 + LINK_SIZE;
+ break;
+ }
+ /* Fall through */
+
+ case OP_ONCE_NC:
case OP_BRA:
case OP_SBRA:
- case OP_CBRAPOS:
- case OP_SCBRAPOS:
case OP_BRAPOS:
case OP_SBRAPOS:
- case OP_ONCE:
- case OP_ONCE_NC:
- d = find_minlength(re, cc, startcode, utf, recurses);
+ PROCESS_NON_CAPTURE:
+ d = find_minlength(re, cc, startcode, utf, recurses, countptr);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -141,6 +160,26 @@
cc += 1 + LINK_SIZE;
break;
+ /* To save time for repeated capturing subpatterns, we remember the
+ length of the previous one. Unfortunately we can't do the same for
+ the unnumbered ones above. */
+
+ case OP_CBRA:
+ case OP_SCBRA:
+ case OP_CBRAPOS:
+ case OP_SCBRAPOS:
+ recno = GET2(cc, 1+LINK_SIZE);
+ if (recno != prev_recno)
+ {
+ prev_recno = recno;
+ prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+ if (prev_d < 0) return prev_d;
+ }
+ branchlength += prev_d;
+ do cc += GET(cc, 1); while (*cc == OP_ALT);
+ cc += 1 + LINK_SIZE;
+ break;
+
/* ACCEPT makes things far too complicated; we have to give up. */
case OP_ACCEPT:
@@ -427,7 +466,7 @@
int dd;
this_recurse.prev = recurses;
this_recurse.group = cs;
- dd = find_minlength(re, cs, startcode, utf, &this_recurse);
+ dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
if (dd < d) d = dd;
}
}
@@ -463,7 +502,7 @@
{
this_recurse.prev = recurses;
this_recurse.group = cs;
- d = find_minlength(re, cs, startcode, utf, &this_recurse);
+ d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
}
}
}
@@ -509,23 +548,36 @@
case OP_RECURSE:
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
- do ce += GET(ce, 1); while (*ce == OP_ALT);
- if (cc > cs && cc < ce) /* Simple recursion */
- had_recurse = TRUE;
+ recno = GET2(cs, 1+LINK_SIZE);
+ if (recno == prev_recno)
+ {
+ branchlength += prev_d;
+ }
else
- {
- recurse_check *r = recurses;
- for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
- if (r != NULL) /* Mutual recursion */
+ {
+ do ce += GET(ce, 1); while (*ce == OP_ALT);
+ if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE;
else
{
- this_recurse.prev = recurses;
- this_recurse.group = cs;
- branchlength += find_minlength(re, cs, startcode, utf, &this_recurse);
+ recurse_check *r = recurses;
+ for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+ if (r != NULL) /* Mutual recursion */
+ had_recurse = TRUE;
+ else
+ {
+ this_recurse.prev = recurses;
+ this_recurse.group = cs;
+ prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
+ countptr);
+ if (prev_d < 0) return prev_d;
+ prev_recno = recno;
+ branchlength += prev_d;
+ }
}
- }
- cc += 1 + LINK_SIZE;
+ }
+ cc += 1 + LINK_SIZE + once_fudge;
+ once_fudge = 0;
break;
/* Anything else does not or need not match a character. We can get the
@@ -1441,6 +1493,7 @@
PRIV(study)(pcre2_real_code *re)
{
int min;
+int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
@@ -1463,9 +1516,9 @@
/* Find the minimum length of subject string. */
-switch(min = find_minlength(re, code, code, utf, NULL))
+switch(min = find_minlength(re, code, code, utf, NULL, &count))
{
- case -1: /* \C in UTF mode or (*ACCEPT) */
+ case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
break; /* Leave minlength unchanged (will be zero) */
case -2:
@@ -1475,6 +1528,7 @@
return 3; /* unrecognized opcode */
default:
+ if (min > UINT16_MAX) min = UINT16_MAX;
re->minlength = min;
break;
}
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/testdata/testinput2 2015-08-01 09:11:28 UTC (rev 331)
@@ -4360,4 +4360,6 @@
/(?(?C{\Q})(?!(?'abc')))/I
+/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/testdata/testoutput2 2015-08-01 09:11:28 UTC (rev 331)
@@ -3942,7 +3942,7 @@
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
-Subject length lower bound = 2
+Subject length lower bound = 3
xyz
0: xyz
1: xyz
@@ -14566,4 +14566,14 @@
May match empty string
Subject length lower bound = 0
+/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
+Capturing subpattern count = 8
+Max back reference = 8
+Named capturing subpatterns:
+ R 7
+ R 8
+Duplicate name status changes
+Last code unit = '0'
+Subject length lower bound = 65535
+
# End of testinput2