[Pcre-svn] [331] code/trunk: Fix issues with minimum length …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [331] code/trunk: Fix issues with minimum length finding.
Revision: 331
          http://www.exim.org/viewvc/pcre2?view=rev&revision=331
Author:   ph10
Date:     2015-08-01 10:11:28 +0100 (Sat, 01 Aug 2015)
Log Message:
-----------
Fix issues with minimum length finding.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_find_bracket.c
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/ChangeLog    2015-08-01 09:11:28 UTC (rev 331)
@@ -85,7 +85,14 @@
 class, where both values are literal letters in the same case, omit the 
 non-letter EBCDIC code points within the range.


+23. Finding the minimum matching length of complex patterns with back
+references and/or recursions can take a long time. There is now a cut-off that
+gives up trying to find a minimum length when things get too complex.

+24. An optimization has been added that speeds up finding the minimum matching
+length for patterns containing repeated capturing groups or recursions.
+
+
Version 10.20 30-June-2015
--------------------------


Modified: code/trunk/src/pcre2_find_bracket.c
===================================================================
--- code/trunk/src/pcre2_find_bracket.c    2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/src/pcre2_find_bracket.c    2015-08-01 09:11:28 UTC (rev 331)
@@ -83,7 +83,7 @@
   if (c == OP_XCLASS) code += GET(code, 1);
     else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);


- /* Handle recursion */
+ /* Handle lookbehind */

   else if (c == OP_REVERSE)
     {


Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/src/pcre2_study.c    2015-08-01 09:11:28 UTC (rev 331)
@@ -59,7 +59,6 @@
 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };



-
 /*************************************************
 *   Find the minimum subject length for a group  *
 *************************************************/
@@ -75,10 +74,12 @@
   startcode       pointer to start of the whole pattern's code
   utf             UTF flag
   recurses        chain of recurse_check to catch mutual recursion
+  countptr        pointer to call count (to catch over complexity) 


 Returns:   the minimum length
            -1 \C in UTF-8 mode
               or (*ACCEPT)
+              or pattern too complicated 
            -2 internal error (missing capturing bracket)
            -3 internal error (opcode not listed)
 */
@@ -85,14 +86,23 @@


static int
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
- PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses)
+ PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
{
int length = -1;
+int prev_recno = -1;
+int prev_d = 0;
+uint32_t once_fudge = 0;
BOOL had_recurse = FALSE;
recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;

+/* A large and/or complex regex can take too long to process. */
+
+if ((*countptr)++ > 1000) return -1;
+
+/* Skip over capturing bracket number */
+
 if (*code == OP_CBRA || *code == OP_SCBRA ||
     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;


@@ -101,10 +111,10 @@

 for (;;)
   {
-  int d, min;
+  int d, min, recno;
   PCRE2_UCHAR *cs, *ce;
   register PCRE2_UCHAR op = *cc;
-
+  
   switch (op)
     {
     case OP_COND:
@@ -112,7 +122,8 @@


     /* If there is only one branch in a condition, the implied branch has zero
     length, so we don't add anything. This covers the DEFINE "condition"
-    automatically. */
+    automatically. If there are two branches we can treat it the same as any
+    other non-capturing subpattern. */


     cs = cc + GET(cc, 1);
     if (*cs != OP_ALT)
@@ -120,21 +131,29 @@
       cc = cs + 1 + LINK_SIZE;
       break;
       }
+    goto PROCESS_NON_CAPTURE;


-    /* Otherwise we can fall through and treat it the same as any other
-    subpattern. */
-
-    case OP_CBRA:
-    case OP_SCBRA:
+    /* There's a special case of OP_ONCE, when it is wrapped round an 
+    OP_RECURSE. We'd like to process the latter at this level so that 
+    remembering the value works for repeated cases. So we do nothing, but
+    set a fudge value to skip over the OP_KET after the recurse. */
+       
+    case OP_ONCE:
+    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
+      {
+      once_fudge = 1 + LINK_SIZE;
+      cc += 1 + LINK_SIZE;
+      break;   
+      }   
+    /* Fall through */
+ 
+    case OP_ONCE_NC:
     case OP_BRA:
     case OP_SBRA:
-    case OP_CBRAPOS:
-    case OP_SCBRAPOS:
     case OP_BRAPOS:
     case OP_SBRAPOS:
-    case OP_ONCE:
-    case OP_ONCE_NC:
-    d = find_minlength(re, cc, startcode, utf, recurses);
+    PROCESS_NON_CAPTURE: 
+    d = find_minlength(re, cc, startcode, utf, recurses, countptr);
     if (d < 0) return d;
     branchlength += d;
     do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -141,6 +160,26 @@
     cc += 1 + LINK_SIZE;
     break;


+    /* To save time for repeated capturing subpatterns, we remember the
+    length of the previous one. Unfortunately we can't do the same for
+    the unnumbered ones above. */ 
+
+    case OP_CBRA:
+    case OP_SCBRA:
+    case OP_CBRAPOS:
+    case OP_SCBRAPOS:
+    recno = GET2(cc, 1+LINK_SIZE); 
+    if (recno != prev_recno)
+      {
+      prev_recno = recno; 
+      prev_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+      if (prev_d < 0) return prev_d;
+      }
+    branchlength += prev_d;
+    do cc += GET(cc, 1); while (*cc == OP_ALT);
+    cc += 1 + LINK_SIZE;
+    break;
+     
     /* ACCEPT makes things far too complicated; we have to give up. */


     case OP_ACCEPT:
@@ -427,7 +466,7 @@
             int dd;
             this_recurse.prev = recurses;
             this_recurse.group = cs;
-            dd = find_minlength(re, cs, startcode, utf, &this_recurse);
+            dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
             if (dd < d) d = dd;
             }
           }
@@ -463,7 +502,7 @@
           {
           this_recurse.prev = recurses;
           this_recurse.group = cs;
-          d = find_minlength(re, cs, startcode, utf, &this_recurse);
+          d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
           }
         }
       }
@@ -509,23 +548,36 @@


     case OP_RECURSE:
     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
-    do ce += GET(ce, 1); while (*ce == OP_ALT);
-    if (cc > cs && cc < ce)    /* Simple recursion */
-      had_recurse = TRUE;
+    recno = GET2(cs, 1+LINK_SIZE); 
+    if (recno == prev_recno)
+      {
+      branchlength += prev_d;
+      }   
     else
-      {
-      recurse_check *r = recurses;
-      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
-      if (r != NULL)          /* Mutual recursion */
+      { 
+      do ce += GET(ce, 1); while (*ce == OP_ALT);
+      if (cc > cs && cc < ce)    /* Simple recursion */
         had_recurse = TRUE;
       else
         {
-        this_recurse.prev = recurses;
-        this_recurse.group = cs;
-        branchlength += find_minlength(re, cs, startcode, utf, &this_recurse);
+        recurse_check *r = recurses;
+        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+        if (r != NULL)          /* Mutual recursion */
+          had_recurse = TRUE;
+        else
+          {
+          this_recurse.prev = recurses;
+          this_recurse.group = cs;
+          prev_d = find_minlength(re, cs, startcode, utf, &this_recurse,
+            countptr);
+          if (prev_d < 0) return prev_d;   
+          prev_recno = recno; 
+          branchlength += prev_d;   
+          }
         }
-      }
-    cc += 1 + LINK_SIZE;
+      }   
+    cc += 1 + LINK_SIZE + once_fudge;
+    once_fudge = 0; 
     break;


     /* Anything else does not or need not match a character. We can get the
@@ -1441,6 +1493,7 @@
 PRIV(study)(pcre2_real_code *re)
 {
 int min;
+int count = 0;
 PCRE2_UCHAR *code;
 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;


@@ -1463,9 +1516,9 @@

/* Find the minimum length of subject string. */

-switch(min = find_minlength(re, code, code, utf, NULL))
+switch(min = find_minlength(re, code, code, utf, NULL, &count))
   {
-  case -1:  /* \C in UTF mode or (*ACCEPT) */
+  case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */
   break;    /* Leave minlength unchanged (will be zero) */


case -2:
@@ -1475,6 +1528,7 @@
return 3; /* unrecognized opcode */

default:
+ if (min > UINT16_MAX) min = UINT16_MAX;
re->minlength = min;
break;
}

Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/testdata/testinput2    2015-08-01 09:11:28 UTC (rev 331)
@@ -4360,4 +4360,6 @@


/(?(?C{\Q})(?!(?'abc')))/I

+/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
+
# End of testinput2

Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2015-07-31 09:59:49 UTC (rev 330)
+++ code/trunk/testdata/testoutput2    2015-08-01 09:11:28 UTC (rev 331)
@@ -3942,7 +3942,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
-Subject length lower bound = 2
+Subject length lower bound = 3
     xyz
  0: xyz
  1: xyz
@@ -14566,4 +14566,14 @@
 May match empty string
 Subject length lower bound = 0


+/(?1){3918}(((((0(\k'R'))))(?J)(?'R'(?'R'\3){99})))/I
+Capturing subpattern count = 8
+Max back reference = 8
+Named capturing subpatterns:
+ R 7
+ R 8
+Duplicate name status changes
+Last code unit = '0'
+Subject length lower bound = 65535
+
# End of testinput2