[Pcre-svn] [660] code/trunk: Cache group minima to speed up …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [660] code/trunk: Cache group minima to speed up studying of pathological patterns.
Revision: 660
          http://www.exim.org/viewvc/pcre2?view=rev&revision=660
Author:   ph10
Date:     2017-02-10 16:33:15 +0000 (Fri, 10 Feb 2017)
Log Message:
-----------
Cache group minima to speed up studying of pathological patterns. Fixes 
oss-fuzz #557.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2017-02-08 17:03:30 UTC (rev 659)
+++ code/trunk/ChangeLog    2017-02-10 16:33:15 UTC (rev 660)
@@ -343,7 +343,16 @@
 stack usage for the benefit of clang with -fsanitize-address, which uses huge 
 stack frames. Example pattern: /X?(R||){3335}/. Fixes oss-fuzz issue 553. 


+54. A pattern with very many explicit back references to a group that is a long
+way from the start of the pattern could take a long time to compile because
+searching for the referenced group in order to find the minimum length was
+being done repeatedly. Now up to 128 group minimum lengths are cached and the
+attempt to find a minimum length is abandoned if there is a back reference to a
+group whose number is greater than 128. (In that case, the pattern is so
+complicated that this optimization probably isn't worth it.) This fixes
+oss-fuzz issue 557.

+
Version 10.22 29-July-2016
--------------------------


Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2017-02-08 17:03:30 UTC (rev 659)
+++ code/trunk/src/pcre2_study.c    2017-02-10 16:33:15 UTC (rev 660)
@@ -50,6 +50,10 @@
 #include "pcre2_internal.h"



+/* The maximum remembered capturing brackets minimum. */
+
+#define MAX_CACHE_BACKREF 128
+
/* Set a bit in the starting code unit bit map. */

#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
@@ -71,6 +75,12 @@
pathological), so we give up when we reach that amount. This also means that
integer overflow for really crazy patterns cannot happen.

+Backreference minimum lengths are cached to speed up multiple references. This
+function is called only when the highest back reference in the pattern is less
+than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
+caching vector. The zeroth element contains the number of the highest set
+value.
+
 Arguments:
   re              compiled pattern block
   code            pointer to start of group (the bracket)
@@ -78,6 +88,7 @@
   utf             UTF flag
   recurses        chain of recurse_check to catch mutual recursion
   countptr        pointer to call count (to catch over complexity)
+  backref_cache   vector for caching back references.


 Returns:   the minimum length
            -1 \C in UTF-8 mode
@@ -90,7 +101,8 @@


 static int
 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
-  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
+  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
+  int *backref_cache)
 {
 int length = -1;
 int prev_cap_recno = -1;
@@ -166,7 +178,8 @@
     case OP_BRAPOS:
     case OP_SBRAPOS:
     PROCESS_NON_CAPTURE:
-    d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+    d = find_minlength(re, cc, startcode, utf, recurses, countptr,
+      backref_cache);
     if (d < 0) return d;
     branchlength += d;
     do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -186,7 +199,8 @@
     if (recno != prev_cap_recno)
       {
       prev_cap_recno = recno;
-      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
+      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
+        backref_cache);
       if (prev_cap_d < 0) return prev_cap_d;
       }
     branchlength += prev_cap_d;
@@ -456,39 +470,52 @@


       d = INT_MAX;


-      /* Scan all groups with the same name */
+      /* Scan all groups with the same name; find the shortest. */


       while (count-- > 0)
         {
-        ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
-        if (cs == NULL) return -2;
-        do ce += GET(ce, 1); while (*ce == OP_ALT);
-        if (cc > cs && cc < ce)    /* Simple recursion */
-          {
-          d = 0;
-          had_recurse = TRUE;
-          break;
-          }
+        int dd, i;
+        recno = GET2(slot, 0);
+
+        if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
+          dd = backref_cache[recno];
         else
           {
-          recurse_check *r = recurses;
-          for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
-          if (r != NULL)           /* Mutual recursion */
+          ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
+          if (cs == NULL) return -2;
+          do ce += GET(ce, 1); while (*ce == OP_ALT);
+          if (cc > cs && cc < ce)    /* Simple recursion */
             {
-            d = 0;
+            dd = 0;
             had_recurse = TRUE;
-            break;
             }
           else
             {
-            int dd;
-            this_recurse.prev = recurses;
-            this_recurse.group = cs;
-            dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
-            if (dd < 0) return dd;
-            if (dd < d) d = dd;
+            recurse_check *r = recurses;
+            for (r = recurses; r != NULL; r = r->prev)
+              if (r->group == cs) break;
+            if (r != NULL)           /* Mutual recursion */
+              {
+              dd = 0;
+              had_recurse = TRUE;
+              }
+            else
+              {
+              this_recurse.prev = recurses;
+              this_recurse.group = cs;
+              dd = find_minlength(re, cs, startcode, utf, &this_recurse,
+                countptr, backref_cache);
+              if (dd < 0) return dd;
+              }
             }
+
+          backref_cache[recno] = dd;
+          for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
+          backref_cache[0] = recno;
           }
+
+        if (dd < d) d = dd;
+        if (d <= 0) break;    /* No point looking at any more */
         slot += re->name_entry_size;
         }
       }
@@ -502,21 +529,18 @@
     case OP_REF:
     case OP_REFI:
     if (dupcapused) return -1;
-    if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
+    recno = GET2(cc, 1);
+    if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
+      d = backref_cache[recno];
+    else
       {
-      ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
-      if (cs == NULL) return -2;
-      do ce += GET(ce, 1); while (*ce == OP_ALT);
-      if (cc > cs && cc < ce)    /* Simple recursion */
+      int i;
+      if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
         {
-        d = 0;
-        had_recurse = TRUE;
-        }
-      else
-        {
-        recurse_check *r = recurses;
-        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
-        if (r != NULL)           /* Mutual recursion */
+        ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
+        if (cs == NULL) return -2;
+        do ce += GET(ce, 1); while (*ce == OP_ALT);
+        if (cc > cs && cc < ce)    /* Simple recursion */
           {
           d = 0;
           had_recurse = TRUE;
@@ -523,14 +547,30 @@
           }
         else
           {
-          this_recurse.prev = recurses;
-          this_recurse.group = cs;
-          d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
-          if (d < 0) return d;
+          recurse_check *r = recurses;
+          for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
+          if (r != NULL)           /* Mutual recursion */
+            {
+            d = 0;
+            had_recurse = TRUE;
+            }
+          else
+            {
+            this_recurse.prev = recurses;
+            this_recurse.group = cs;
+            d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
+              backref_cache);
+            if (d < 0) return d;
+            }
           }
         }
+      else d = 0;
+
+      backref_cache[recno] = d;
+      for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
+      backref_cache[0] = recno;
       }
-    else d = 0;
+
     cc += 1 + IMM2_SIZE;


     /* Handle repeated back references */
@@ -603,7 +643,7 @@
           this_recurse.prev = recurses;
           this_recurse.group = cs;
           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
-            countptr);
+            countptr, backref_cache);
           if (prev_recurse_d < 0) return prev_recurse_d;
           prev_recurse_recno = recno;
           branchlength += prev_recurse_d;
@@ -1548,12 +1588,19 @@
   if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
   }


-/* Find the minimum length of subject string. If it can match an empty string,
-the minimum length is already known. */
+/* Find the minimum length of subject string. If the pattern can match an empty
+string, the minimum length is already known. If there are more back references
+than the size of the vector we are going to cache them in, do nothing. A
+pattern that complicated will probably take a long time to analyze and may in
+any case turn out to be too complicated. Note that back reference minima are
+held as 16-bit numbers. */

-if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
+if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
+     re->top_backref <= MAX_CACHE_BACKREF)
   {
-  min = find_minlength(re, code, code, utf, NULL, &count);
+  int backref_cache[MAX_CACHE_BACKREF+1];
+  backref_cache[0] = 0;    /* Highest one that is set */
+  min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
   switch(min)
     {
     case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2017-02-08 17:03:30 UTC (rev 659)
+++ code/trunk/testdata/testinput2    2017-02-10 16:33:15 UTC (rev 660)
@@ -4963,4 +4963,10 @@


/()(\g+65533)/

+/\xC1\x00\x00\x00?\x9A(\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\x00k\d+\x00‎\x00\x00\x00\x00\x00\2*\x00\x00\1*.){36}int^\x00\x00?\xFF\xFF\x00\x9A(\1{50779}?)J\w2/I
+
+/(a)(b)\2\1\1\1\1/I
+
+/(?<a>a)(?<b>b)\g{b}\g{a}\g{a}\g{a}\g{a}(?<a>xx)(?<b>zz)/I,dupnames
+
# End of testinput2

Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2017-02-08 17:03:30 UTC (rev 659)
+++ code/trunk/testdata/testoutput2    2017-02-10 16:33:15 UTC (rev 660)
@@ -2122,7 +2122,7 @@
 Capturing subpattern count = 271
 Max back reference = 270
 Starting code units: 0 1 2 3 4 5 6 7 8 9 
-Subject length lower bound = 272
+Subject length lower bound = 0
      1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC\=ovector=300
  0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC
  1: 1 
@@ -15416,7 +15416,7 @@
 Capturing subpattern count = 108
 Max back reference = 22
 Contains explicit CR or LF match
-Subject length lower bound = 0
+Subject length lower bound = 1


# This checks that new code for handling groups that may match an empty string
# works on a very large number of alternatives. This pattern used to provoke a
@@ -15452,6 +15452,33 @@
/()(\g+65533)/
Failed: error 115 at offset 10: reference to non-existent subpattern

+/\xC1\x00\x00\x00?\x9A(\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\x00k\d+\x00‎\x00\x00\x00\x00\x00\2*\x00\x00\1*.){36}int^\x00\x00?\xFF\xFF\x00\x9A(\1{50779}?)J\w2/I
+Capturing subpattern count = 2
+Max back reference = 2
+First code unit = \xc1
+Last code unit = '2'
+Subject length lower bound = 65535
+
+/(a)(b)\2\1\1\1\1/I
+Capturing subpattern count = 2
+Max back reference = 2
+First code unit = 'a'
+Last code unit = 'b'
+Subject length lower bound = 7
+
+/(?<a>a)(?<b>b)\g{b}\g{a}\g{a}\g{a}\g{a}(?<a>xx)(?<b>zz)/I,dupnames
+Capturing subpattern count = 4
+Max back reference = 4
+Named capturing subpatterns:
+ a 1
+ a 3
+ b 2
+ b 4
+Options: dupnames
+First code unit = 'a'
+Last code unit = 'z'
+Subject length lower bound = 11
+
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data