[Pcre-svn] [332] code/trunk: Fix incorrect minimum matching …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [332] code/trunk: Fix incorrect minimum matching length when pattern contains (?| groups.
Revision: 332
          http://www.exim.org/viewvc/pcre2?view=rev&revision=332
Author:   ph10
Date:     2015-08-03 14:18:49 +0100 (Mon, 03 Aug 2015)
Log Message:
-----------
Fix incorrect minimum matching length when pattern contains (?| groups.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/src/pcre2_internal.h
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput1
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput1
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/ChangeLog    2015-08-03 13:18:49 UTC (rev 332)
@@ -92,7 +92,13 @@
 24. An optimization has been added that speeds up finding the minimum matching 
 length for patterns containing repeated capturing groups or recursions.


+25. If a pattern contained a back reference to a group whose number was
+duplicated as a result of appearing in a (?|...) group, the computation of the
+minimum matching length gave a wrong result, which could cause incorrect "no
+match" errors. For such patterns, a minimum matching length cannot at present
+be computed.

+
Version 10.20 30-June-2015
--------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/src/pcre2_compile.c    2015-08-03 13:18:49 UTC (rev 332)
@@ -3215,6 +3215,7 @@
         top_nest->reset_group = cb->bracount;
         top_nest->max_group = cb->bracount;
         top_nest->flags |= NSF_RESET;
+        cb->external_flags |= PCRE2_DUPCAPUSED; 
         break;
         }



Modified: code/trunk/src/pcre2_internal.h
===================================================================
--- code/trunk/src/pcre2_internal.h    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/src/pcre2_internal.h    2015-08-03 13:18:49 UTC (rev 332)
@@ -524,9 +524,10 @@
 #define PCRE2_NL_SET        0x00008000  /* newline was set in the pattern */
 #define PCRE2_NOTEMPTY_SET  0x00010000  /* (*NOTEMPTY) used        ) keep */
 #define PCRE2_NE_ATST_SET   0x00020000  /* (*NOTEMPTY_ATSTART) used) together */
-#define PCRE2_DEREF_TABLES  0x00040000  /* Release character tables. */
+#define PCRE2_DEREF_TABLES  0x00040000  /* release character tables */
 #define PCRE2_NOJIT         0x00080000  /* (*NOJIT) used */
 #define PCRE2_HASBKPORX     0x00100000  /* contains \P, \p, or \X */
+#define PCRE2_DUPCAPUSED    0x00200000  /* contains (?| */


 #define PCRE2_MODE_MASK     (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)



Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/src/pcre2_study.c    2015-08-03 13:18:49 UTC (rev 332)
@@ -74,12 +74,13 @@
   startcode       pointer to start of the whole pattern's code
   utf             UTF flag
   recurses        chain of recurse_check to catch mutual recursion
-  countptr        pointer to call count (to catch over complexity) 
+  countptr        pointer to call count (to catch over complexity)


 Returns:   the minimum length
            -1 \C in UTF-8 mode
               or (*ACCEPT)
-              or pattern too complicated 
+              or pattern too complicated
+              or back reference to duplicate name/number
            -2 internal error (missing capturing bracket)
            -3 internal error (opcode not listed)
 */
@@ -89,10 +90,13 @@
   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
 {
 int length = -1;
-int prev_recno = -1;
-int prev_d = 0;
+int prev_cap_recno = -1;
+int prev_cap_d = 0;
+int prev_recurse_recno = -1;
+int prev_recurse_d = 0;
 uint32_t once_fudge = 0;
 BOOL had_recurse = FALSE;
+BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
 recurse_check this_recurse;
 register int branchlength = 0;
 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
@@ -114,7 +118,7 @@
   int d, min, recno;
   PCRE2_UCHAR *cs, *ce;
   register PCRE2_UCHAR op = *cc;
-  
+
   switch (op)
     {
     case OP_COND:
@@ -133,26 +137,26 @@
       }
     goto PROCESS_NON_CAPTURE;


-    /* There's a special case of OP_ONCE, when it is wrapped round an 
-    OP_RECURSE. We'd like to process the latter at this level so that 
+    /* There's a special case of OP_ONCE, when it is wrapped round an
+    OP_RECURSE. We'd like to process the latter at this level so that
     remembering the value works for repeated cases. So we do nothing, but
     set a fudge value to skip over the OP_KET after the recurse. */
-       
+
     case OP_ONCE:
     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
       {
       once_fudge = 1 + LINK_SIZE;
       cc += 1 + LINK_SIZE;
-      break;   
-      }   
+      break;
+      }
     /* Fall through */
- 
+
     case OP_ONCE_NC:
     case OP_BRA:
     case OP_SBRA:
     case OP_BRAPOS:
     case OP_SBRAPOS:
-    PROCESS_NON_CAPTURE: 
+    PROCESS_NON_CAPTURE:
     d = find_minlength(re, cc, startcode, utf, recurses, countptr);
     if (d < 0) return d;
     branchlength += d;
@@ -162,24 +166,25 @@


     /* To save time for repeated capturing subpatterns, we remember the
     length of the previous one. Unfortunately we can't do the same for
-    the unnumbered ones above. */ 
+    the unnumbered ones above. Nor can we do this if (?| is present in the
+    pattern because captures with the same number are not then identical. */


     case OP_CBRA:
     case OP_SCBRA:
     case OP_CBRAPOS:
     case OP_SCBRAPOS:
-    recno = GET2(cc, 1+LINK_SIZE); 
-    if (recno != prev_recno)
+    recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
+    if (recno != prev_cap_recno)
       {
-      prev_recno = recno; 
-      prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
-      if (prev_d < 0) return prev_d;
+      prev_cap_recno = recno;
+      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+      if (prev_cap_d < 0) return prev_cap_d;
       }
-    branchlength += prev_d;
+    branchlength += prev_cap_d;
     do cc += GET(cc, 1); while (*cc == OP_ALT);
     cc += 1 + LINK_SIZE;
     break;
-     
+
     /* ACCEPT makes things far too complicated; we have to give up. */


     case OP_ACCEPT:
@@ -427,8 +432,12 @@
     matches an empty string (by default it causes a matching failure), so in
     that case we must set the minimum length to zero. */


-    case OP_DNREF:     /* Duplicate named pattern back reference */
+    /* Duplicate named pattern back reference. We cannot reliably find a length
+    for this if duplicate numbers are present in the pattern. */
+
+    case OP_DNREF:
     case OP_DNREFI:
+    if (dupcapused) return -1;
     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
       {
       int count = GET2(cc, 1+IMM2_SIZE);
@@ -477,8 +486,12 @@
     cc += 1 + 2*IMM2_SIZE;
     goto REPEAT_BACK_REFERENCE;


-    case OP_REF:      /* Single back reference */
+    /* Single back reference. We cannot find a length for this if duplicate
+    numbers are present in the pattern. */
+
+    case OP_REF:
     case OP_REFI:
+    if (dupcapused) return -1;
     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
       {
       ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
@@ -546,15 +559,19 @@
     branchlength += min * d;
     break;


+    /* Recursion always refers to the first occurrence of a subpattern with a
+    given number. Therefore, we can always make use of caching, even when the
+    pattern contains multiple subpatterns with the same number. */
+
     case OP_RECURSE:
     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
-    recno = GET2(cs, 1+LINK_SIZE); 
-    if (recno == prev_recno)
+    recno = GET2(cs, 1+LINK_SIZE);
+    if (recno == prev_recurse_recno)
       {
-      branchlength += prev_d;
-      }   
+      branchlength += prev_recurse_d;
+      }
     else
-      { 
+      {
       do ce += GET(ce, 1); while (*ce == OP_ALT);
       if (cc > cs && cc < ce)    /* Simple recursion */
         had_recurse = TRUE;
@@ -568,16 +585,16 @@
           {
           this_recurse.prev = recurses;
           this_recurse.group = cs;
-          prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
+          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
             countptr);
-          if (prev_d < 0) return prev_d;   
-          prev_recno = recno; 
-          branchlength += prev_d;   
+          if (prev_recurse_d < 0) return prev_recurse_d;
+          prev_recurse_recno = recno;
+          branchlength += prev_recurse_d;
           }
         }
-      }   
+      }
     cc += 1 + LINK_SIZE + once_fudge;
-    once_fudge = 0; 
+    once_fudge = 0;
     break;


     /* Anything else does not or need not match a character. We can get the


Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testinput1    2015-08-03 13:18:49 UTC (rev 332)
@@ -5727,4 +5727,26 @@
 "(?|(\k'Pm')|(?'Pm'))"
     abcd


+/(?|(aaa)|(b))\g{1}/
+    aaaaaa
+    bb 
+
+/(?|(aaa)|(b))(?1)/
+    aaaaaa
+    baaa 
+    ** Failers 
+    bb 
+
+/(?|(aaa)|(b))/
+    xaaa
+    xbc 
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/
+    aaaaaa
+    bb 
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
+    aaaccccaaa
+    bccccb 
+
 # End of testinput1 


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testinput2    2015-08-03 13:18:49 UTC (rev 332)
@@ -4362,4 +4362,14 @@


/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I

+/(?|(aaa)|(b))\g{1}/I
+
+/(?|(aaa)|(b))(?1)/I
+
+/(?|(aaa)|(b))/I
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/I
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
+
# End of testinput2

Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testoutput1    2015-08-03 13:18:49 UTC (rev 332)
@@ -9463,4 +9463,50 @@
  0: 
  1: 


+/(?|(aaa)|(b))\g{1}/
+    aaaaaa
+ 0: aaaaaa
+ 1: aaa
+    bb 
+ 0: bb
+ 1: b
+
+/(?|(aaa)|(b))(?1)/
+    aaaaaa
+ 0: aaaaaa
+ 1: aaa
+    baaa 
+ 0: baaa
+ 1: b
+    ** Failers 
+No match
+    bb 
+No match
+
+/(?|(aaa)|(b))/
+    xaaa
+ 0: aaa
+ 1: aaa
+    xbc 
+ 0: b
+ 1: b
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/
+    aaaaaa
+ 0: aaaaaa
+ 1: aaa
+    bb 
+ 0: bb
+ 1: b
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/dupnames
+    aaaccccaaa
+ 0: aaaccccaaa
+ 1: aaa
+ 2: cccc
+    bccccb 
+ 0: bccccb
+ 1: b
+ 2: cccc
+
 # End of testinput1 


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2015-08-01 09:11:28 UTC (rev 331)
+++ code/trunk/testdata/testoutput2    2015-08-03 13:18:49 UTC (rev 332)
@@ -14576,4 +14576,39 @@
 Last code unit = '0'
 Subject length lower bound = 65535


+/(?|(aaa)|(b))\g{1}/I
+Capturing subpattern count = 1
+Max back reference = 1
+Starting code units: a b
+Subject length lower bound = 0
+
+/(?|(aaa)|(b))(?1)/I
+Capturing subpattern count = 1
+Starting code units: a b
+Subject length lower bound = 4
+
+/(?|(aaa)|(b))/I
+Capturing subpattern count = 1
+Starting code units: a b
+Subject length lower bound = 1
+
+/(?|(?'a'aaa)|(?'a'b))\k'a'/I
+Capturing subpattern count = 1
+Max back reference = 1
+Named capturing subpatterns:
+ a 1
+Starting code units: a b
+Subject length lower bound = 0
+
+/(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames
+Capturing subpattern count = 2
+Max back reference = 2
+Named capturing subpatterns:
+ a 1
+ a 2
+Options: dupnames
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 0
+
# End of testinput2