Revision: 332
http://www.exim.org/viewvc/pcre2?view=rev&revision=332
Author: ph10
Date: 2015-08-03 14:18:49 +0100 (Mon, 03 Aug 2015)
Log Message:
-----------
Fix incorrect minimum matching length when pattern contains (?| groups.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/src/pcre2_internal.h
code/trunk/src/pcre2_study.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/ChangeLog 2015-08-03 13:18:49 UTC (rev 332)
@@ -92,7 +92,13 @@
24. An optimization has been added that speeds up finding the minimum matching
length for patterns containing repeated capturing groups or recursions.
+25. If a pattern contained a back reference to a group whose number was
+duplicated as a result of appearing in a (?|...) group, the computation of the
+minimum matching length gave a wrong result, which could cause incorrect "no
+match" errors. For such patterns, a minimum matching length cannot at present
+be computed.
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/src/pcre2_compile.c 2015-08-03 13:18:49 UTC (rev 332)
@@ -3215,6 +3215,7 @@
top_nest->reset_group = cb->bracount;
top_nest->max_group = cb->bracount;
top_nest->flags |= NSF_RESET;
+ cb->external_flags |= PCRE2_DUPCAPUSED;
break;
}
Modified: code/trunk/src/pcre2_internal.h
===================================================================
--- code/trunk/src/pcre2_internal.h 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/src/pcre2_internal.h 2015-08-03 13:18:49 UTC (rev 332)
@@ -524,9 +524,10 @@
#define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */
#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */
#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */
-#define PCRE2_DEREF_TABLES 0x00040000 /* Release character tables. */
+#define PCRE2_DEREF_TABLES 0x00040000 /* release character tables */
#define PCRE2_NOJIT 0x00080000 /* (*NOJIT) used */
#define PCRE2_HASBKPORX 0x00100000 /* contains \P, \p, or \X */
+#define PCRE2_DUPCAPUSED 0x00200000 /* contains (?| */
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)
Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/src/pcre2_study.c 2015-08-03 13:18:49 UTC (rev 332)
@@ -74,12 +74,13 @@
startcode pointer to start of the whole pattern's code
utf UTF flag
recurses chain of recurse_check to catch mutual recursion
- countptr pointer to call count (to catch over complexity)
+ countptr pointer to call count (to catch over complexity)
Returns: the minimum length
-1 \C in UTF-8 mode
or (*ACCEPT)
- or pattern too complicated
+ or pattern too complicated
+ or back reference to duplicate name/number
-2 internal error (missing capturing bracket)
-3 internal error (opcode not listed)
*/
@@ -89,10 +90,13 @@
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
{
int length = -1;
-int prev_recno = -1;
-int prev_d = 0;
+int prev_cap_recno = -1;
+int prev_cap_d = 0;
+int prev_recurse_recno = -1;
+int prev_recurse_d = 0;
uint32_t once_fudge = 0;
BOOL had_recurse = FALSE;
+BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
@@ -114,7 +118,7 @@
int d, min, recno;
PCRE2_UCHAR *cs, *ce;
register PCRE2_UCHAR op = *cc;
-
+
switch (op)
{
case OP_COND:
@@ -133,26 +137,26 @@
}
goto PROCESS_NON_CAPTURE;
- /* There's a special case of OP_ONCE, when it is wrapped round an
- OP_RECURSE. We'd like to process the latter at this level so that
+ /* There's a special case of OP_ONCE, when it is wrapped round an
+ OP_RECURSE. We'd like to process the latter at this level so that
remembering the value works for repeated cases. So we do nothing, but
set a fudge value to skip over the OP_KET after the recurse. */
-
+
case OP_ONCE:
if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
{
once_fudge = 1 + LINK_SIZE;
cc += 1 + LINK_SIZE;
- break;
- }
+ break;
+ }
/* Fall through */
-
+
case OP_ONCE_NC:
case OP_BRA:
case OP_SBRA:
case OP_BRAPOS:
case OP_SBRAPOS:
- PROCESS_NON_CAPTURE:
+ PROCESS_NON_CAPTURE:
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
if (d < 0) return d;
branchlength += d;
@@ -162,24 +166,25 @@
/* To save time for repeated capturing subpatterns, we remember the
length of the previous one. Unfortunately we can't do the same for
- the unnumbered ones above. */
+ the unnumbered ones above. Nor can we do this if (?| is present in the
+ pattern because captures with the same number are not then identical. */
case OP_CBRA:
case OP_SCBRA:
case OP_CBRAPOS:
case OP_SCBRAPOS:
- recno = GET2(cc, 1+LINK_SIZE);
- if (recno != prev_recno)
+ recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
+ if (recno != prev_cap_recno)
{
- prev_recno = recno;
- prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
- if (prev_d < 0) return prev_d;
+ prev_cap_recno = recno;
+ prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+ if (prev_cap_d < 0) return prev_cap_d;
}
- branchlength += prev_d;
+ branchlength += prev_cap_d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
-
+
/* ACCEPT makes things far too complicated; we have to give up. */
case OP_ACCEPT:
@@ -427,8 +432,12 @@
matches an empty string (by default it causes a matching failure), so in
that case we must set the minimum length to zero. */
- case OP_DNREF: /* Duplicate named pattern back reference */
+ /* Duplicate named pattern back reference. We cannot reliably find a length
+ for this if duplicate numbers are present in the pattern. */
+
+ case OP_DNREF:
case OP_DNREFI:
+ if (dupcapused) return -1;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
int count = GET2(cc, 1+IMM2_SIZE);
@@ -477,8 +486,12 @@
cc += 1 + 2*IMM2_SIZE;
goto REPEAT_BACK_REFERENCE;
- case OP_REF: /* Single back reference */
+ /* Single back reference. We cannot find a length for this if duplicate
+ numbers are present in the pattern. */
+
+ case OP_REF:
case OP_REFI:
+ if (dupcapused) return -1;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
@@ -546,15 +559,19 @@
branchlength += min * d;
break;
+ /* Recursion always refers to the first occurrence of a subpattern with a
+ given number. Therefore, we can always make use of caching, even when the
+ pattern contains multiple subpatterns with the same number. */
+
case OP_RECURSE:
cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
- recno = GET2(cs, 1+LINK_SIZE);
- if (recno == prev_recno)
+ recno = GET2(cs, 1+LINK_SIZE);
+ if (recno == prev_recurse_recno)
{
- branchlength += prev_d;
- }
+ branchlength += prev_recurse_d;
+ }
else
- {
+ {
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE;
@@ -568,16 +585,16 @@
{
this_recurse.prev = recurses;
this_recurse.group = cs;
- prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
+ prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr);
- if (prev_d < 0) return prev_d;
- prev_recno = recno;
- branchlength += prev_d;
+ if (prev_recurse_d < 0) return prev_recurse_d;
+ prev_recurse_recno = recno;
+ branchlength += prev_recurse_d;
}
}
- }
+ }
cc += 1 + LINK_SIZE + once_fudge;
- once_fudge = 0;
+ once_fudge = 0;
break;
/* Anything else does not or need not match a character. We can get the
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testinput1 2015-08-03 13:18:49 UTC (rev 332)
@@ -5727,4 +5727,26 @@
"(?|(\k'Pm')|(?'Pm'))"
abcd
+/(?|(aaa)|(b))\g{1}/
+ aaaaaa
+ bb
+
+/(?|(aaa)|(b))(?1)/
+ aaaaaa
+ baaa
+ ** Failers
+ bb
+
+/(?|(aaa)|(b))/
+ xaaa
+ xbc
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/
+ aaaaaa
+ bb
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
+ aaaccccaaa
+ bccccb
+
# End of testinput1
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testinput2 2015-08-03 13:18:49 UTC (rev 332)
@@ -4362,4 +4362,14 @@
/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
+/(?|(aaa)|(b))\g{1}/I
+
+/(?|(aaa)|(b))(?1)/I
+
+/(?|(aaa)|(b))/I
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/I
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
+
# End of testinput2
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testoutput1 2015-08-03 13:18:49 UTC (rev 332)
@@ -9463,4 +9463,50 @@
0:
1:
+/(?|(aaa)|(b))\g{1}/
+ aaaaaa
+ 0: aaaaaa
+ 1: aaa
+ bb
+ 0: bb
+ 1: b
+
+/(?|(aaa)|(b))(?1)/
+ aaaaaa
+ 0: aaaaaa
+ 1: aaa
+ baaa
+ 0: baaa
+ 1: b
+ ** Failers
+No match
+ bb
+No match
+
+/(?|(aaa)|(b))/
+ xaaa
+ 0: aaa
+ 1: aaa
+ xbc
+ 0: b
+ 1: b
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/
+ aaaaaa
+ 0: aaaaaa
+ 1: aaa
+ bb
+ 0: bb
+ 1: b
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
+ aaaccccaaa
+ 0: aaaccccaaa
+ 1: aaa
+ 2: cccc
+ bccccb
+ 0: bccccb
+ 1: b
+ 2: cccc
+
# End of testinput1
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testoutput2 2015-08-03 13:18:49 UTC (rev 332)
@@ -14576,4 +14576,39 @@
Last code unit = '0'
Subject length lower bound = 65535
+/(?|(aaa)|(b))\g{1}/I
+Capturing subpattern count = 1
+Max back reference = 1
+Starting code units: a b
+Subject length lower bound = 0
+
+/(?|(aaa)|(b))(?1)/I
+Capturing subpattern count = 1
+Starting code units: a b
+Subject length lower bound = 4
+
+/(?|(aaa)|(b))/I
+Capturing subpattern count = 1
+Starting code units: a b
+Subject length lower bound = 1
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/I
+Capturing subpattern count = 1
+Max back reference = 1
+Named capturing subpatterns:
+ a 1
+Starting code units: a b
+Subject length lower bound = 0
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
+Capturing subpattern count = 2
+Max back reference = 2
+Named capturing subpatterns:
+ a 1
+ a 2
+Options: dupnames
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 0
+
# End of testinput2