[Pcre-svn] [1108] code/trunk: Another extension to minimum l…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1108] code/trunk: Another extension to minimum length calculation.
Revision: 1108
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1108
Author:   ph10
Date:     2019-06-17 17:26:44 +0100 (Mon, 17 Jun 2019)
Log Message:
-----------
Another extension to minimum length calculation.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testinput12
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput12-16
    code/trunk/testdata/testoutput12-32
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/ChangeLog    2019-06-17 16:26:44 UTC (rev 1108)
@@ -42,6 +42,10 @@
      such references as not adding to the minimum length (which it should have 
      done all along).


+   * Furthermore, the above action now happens only if the back reference is to 
+     a group that exists more than once in a pattern instead of any back 
+     reference in a pattern with duplicate numbers.  
+     
 10. A (*MARK) value inside a successful condition was not being returned by the 
 interpretive matcher (it was returned by JIT). This bug has been mended.



Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/src/pcre2_study.c    2019-06-17 16:26:44 UTC (rev 1108)
@@ -134,7 +134,7 @@
   int d, min, recno;
   PCRE2_UCHAR *cs, *ce;
   PCRE2_UCHAR op = *cc;
-  
+
   if (branchlength >= UINT16_MAX) return UINT16_MAX;


switch (op)
@@ -448,11 +448,13 @@

     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
     matches an empty string (by default it causes a matching failure), so in
-    that case we must set the minimum length to zero. */
+    that case we must set the minimum length to zero.


-    /* Duplicate named pattern back reference. We cannot reliably find a length
-    for this if duplicate numbers are present in the pattern, so we set the 
-    length to zero here also. */
+    For backreferenes, if duplicate numbers are present in the pattern we check
+    for a reference to a duplicate. If it is, we don't know which version will
+    be referenced, so we have to set the minimum length to zero. */
+    
+    /* Duplicate named pattern back reference. */ 


     case OP_DNREF:
     case OP_DNREFI:
@@ -479,30 +481,34 @@
           ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
           if (cs == NULL) return -2;
           do ce += GET(ce, 1); while (*ce == OP_ALT);
-          if (cc > cs && cc < ce)    /* Simple recursion */
+          
+          dd = 0;
+          if (!dupcapused ||
+              (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
             {
-            dd = 0;
-            had_recurse = TRUE;
-            }
-          else
-            {
-            recurse_check *r = recurses;
-            for (r = recurses; r != NULL; r = r->prev)
-              if (r->group == cs) break;
-            if (r != NULL)           /* Mutual recursion */
+            if (cc > cs && cc < ce)    /* Simple recursion */
               {
-              dd = 0;
               had_recurse = TRUE;
               }
             else
               {
-              this_recurse.prev = recurses;
-              this_recurse.group = cs;
-              dd = find_minlength(re, cs, startcode, utf, &this_recurse,
-                countptr, backref_cache);
-              if (dd < 0) return dd;
+              recurse_check *r = recurses;
+              for (r = recurses; r != NULL; r = r->prev)
+                if (r->group == cs) break;
+              if (r != NULL)           /* Mutual recursion */
+                {
+                had_recurse = TRUE;
+                }
+              else
+                {
+                this_recurse.prev = recurses;  /* No recursion */
+                this_recurse.group = cs;
+                dd = find_minlength(re, cs, startcode, utf, &this_recurse,
+                  countptr, backref_cache);
+                if (dd < 0) return dd;
+                }
               }
-            }
+            }   


           backref_cache[recno] = dd;
           for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
@@ -518,8 +524,8 @@
     cc += 1 + 2*IMM2_SIZE;
     goto REPEAT_BACK_REFERENCE;


-    /* Single back reference. We cannot find a length for this if duplicate
-    numbers are present in the pattern. */
+    /* Single back reference by number. References by name are converted to by 
+    number when there is no duplication. */


     case OP_REF:
     case OP_REFI:
@@ -529,36 +535,40 @@
     else
       {
       int i;
-      if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
+      d = 0;
+
+      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
         {
         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
         if (cs == NULL) return -2;
         do ce += GET(ce, 1); while (*ce == OP_ALT);
-        if (cc > cs && cc < ce)    /* Simple recursion */
+
+        if (!dupcapused ||
+            (PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
           {
-          d = 0;
-          had_recurse = TRUE;
-          }
-        else
-          {
-          recurse_check *r = recurses;
-          for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
-          if (r != NULL)           /* Mutual recursion */
+          if (cc > cs && cc < ce)    /* Simple recursion */
             {
-            d = 0;
             had_recurse = TRUE;
             }
           else
             {
-            this_recurse.prev = recurses;
-            this_recurse.group = cs;
-            d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
-              backref_cache);
-            if (d < 0) return d;
+            recurse_check *r = recurses;
+            for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+            if (r != NULL)           /* Mutual recursion */
+              {
+              had_recurse = TRUE;
+              }
+            else                     /* No recursion */
+              {
+              this_recurse.prev = recurses;
+              this_recurse.group = cs;
+              d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
+                backref_cache);
+              if (d < 0) return d;
+              }
             }
           }
         }
-      else d = 0;


       backref_cache[recno] = d;
       for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;


Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testinput10    2019-06-17 16:26:44 UTC (rev 1108)
@@ -557,4 +557,6 @@


# -------------------------------------

+/(*UTF)(?=\x{123})/I
+
# End of testinput10

Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testinput12    2019-06-17 16:26:44 UTC (rev 1108)
@@ -447,4 +447,6 @@


# ----------------------------------------------------

+/(*UTF)(?=\x{123})/I
+
# End of testinput12

Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testinput2    2019-06-17 16:26:44 UTC (rev 1108)
@@ -5607,4 +5607,20 @@


/(*:\Q \E){5}/alt_verbnames

+/(?=abc)/I
+
+/(?|(X)|(XY))\1abc/I
+
+/(?|(a)|(bcde))(c)\2/I
+
+/(?|(a)|(bcde))(c)\1/I
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
+
+/((a|)+)+Z/I
+
+/((?=a))[abcd]/I
+
# End of testinput2

Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testoutput10    2019-06-17 16:26:44 UTC (rev 1108)
@@ -1757,4 +1757,13 @@


# -------------------------------------

+/(*UTF)(?=\x{123})/I
+Capture group count = 0
+May match empty string
+Compile options: <none>
+Overall options: utf
+First code unit = \xc4
+Last code unit = \xa3
+Subject length lower bound = 1
+
# End of testinput10

Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testoutput12-16    2019-06-17 16:26:44 UTC (rev 1108)
@@ -1579,4 +1579,12 @@


# ----------------------------------------------------

+/(*UTF)(?=\x{123})/I
+Capture group count = 0
+May match empty string
+Compile options: <none>
+Overall options: utf
+First code unit = \x{123}
+Subject length lower bound = 1
+
# End of testinput12

Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testoutput12-32    2019-06-17 16:26:44 UTC (rev 1108)
@@ -1577,4 +1577,12 @@


# ----------------------------------------------------

+/(*UTF)(?=\x{123})/I
+Capture group count = 0
+May match empty string
+Compile options: <none>
+Overall options: utf
+First code unit = \x{123}
+Subject length lower bound = 1
+
# End of testinput12

Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2019-06-16 15:37:45 UTC (rev 1107)
+++ code/trunk/testdata/testoutput2    2019-06-17 16:26:44 UTC (rev 1108)
@@ -16963,6 +16963,69 @@
 /(*:\Q \E){5}/alt_verbnames
 Failed: error 109 at offset 11: quantifier does not follow a repeatable item


+/(?=abc)/I
+Capture group count = 0
+May match empty string
+First code unit = 'a'
+Last code unit = 'c'
+Subject length lower bound = 2
+
+/(?|(X)|(XY))\1abc/I
+Capture group count = 1
+Max back reference = 1
+First code unit = 'X'
+Last code unit = 'c'
+Subject length lower bound = 4
+
+/(?|(a)|(bcde))(c)\2/I
+Capture group count = 2
+Max back reference = 2
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 3
+
+/(?|(a)|(bcde))(c)\1/I
+Capture group count = 2
+Max back reference = 1
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 2
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'B'(?'A')/I,dupnames
+Capture group count = 3
+Max back reference = 2
+Named capture groups:
+ A 1
+ A 3
+ B 2
+Options: dupnames
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 3
+
+/(?|(?'A'a)|(?'A'bcde))(?'B'c)\k'A'(?'A')/I,dupnames
+Capture group count = 3
+Max back reference = 3
+Named capture groups:
+ A 1
+ A 3
+ B 2
+Options: dupnames
+Starting code units: a b
+Last code unit = 'c'
+Subject length lower bound = 2
+
+/((a|)+)+Z/I
+Capture group count = 2
+Starting code units: Z a
+Last code unit = 'Z'
+Subject length lower bound = 1
+
+/((?=a))[abcd]/I
+Capture group count = 1
+First code unit = 'a'
+Subject length lower bound = 1
+
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data