[Pcre-svn] [1124] code/trunk: Fix bugs in recent patch for s…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1124] code/trunk: Fix bugs in recent patch for setting the maximum lookbehind.
Revision: 1124
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1124
Author:   ph10
Date:     2019-06-28 17:58:08 +0100 (Fri, 28 Jun 2019)
Log Message:
-----------
Fix bugs in recent patch for setting the maximum lookbehind.


Modified Paths:
--------------
    code/trunk/src/pcre2_compile.c
    code/trunk/testdata/testinput15
    code/trunk/testdata/testoutput15


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2019-06-26 16:13:28 UTC (rev 1123)
+++ code/trunk/src/pcre2_compile.c    2019-06-28 16:58:08 UTC (rev 1124)
@@ -128,7 +128,7 @@
     compile_block *, PCRE2_SIZE *);


 static int
-  get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
+  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
     compile_block *);


 static BOOL
@@ -388,6 +388,9 @@
 #define GI_SET_FIXED_LENGTH    0x80000000u
 #define GI_NOT_FIXED_LENGTH    0x40000000u
 #define GI_FIXED_LENGTH_MASK   0x0000ffffu
+#define GI_EXTRA_MASK          0x0fff0000u
+#define GI_EXTRA_MAX                 0xfff  /* NB not unsigned */
+#define GI_EXTRA_SHIFT                  16


 /* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
 and is fast (a good compiler can turn it into a subtraction and unsigned
@@ -8841,6 +8844,7 @@
 Arguments:
   pptrptr     pointer to pointer in the parsed pattern
   isinline    FALSE if a reference or recursion; TRUE for inline group
+  extraptr    pointer to where to return extra lookbehind length
   errcodeptr  pointer to the errorcode
   lcptr       pointer to the loop counter
   group       number of captured group or -1 for a non-capturing group
@@ -8851,11 +8855,13 @@
 */


static int
-get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
- int group, parsed_recurse_check *recurses, compile_block *cb)
+get_grouplength(uint32_t **pptrptr, BOOL isinline, int *extraptr,
+ int *errcodeptr, int *lcptr, int group, parsed_recurse_check *recurses,
+ compile_block *cb)
{
int branchlength;
int grouplength = -1;
+int extra = 0;

 /* The cache can be used only if there is no possibility of there being two
 groups with the same number. We do not need to set the end pointer for a group
@@ -8869,6 +8875,7 @@
   if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
     {
     if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
+    *extraptr = (groupinfo & GI_EXTRA_MASK) >> GI_EXTRA_SHIFT;
     return groupinfo & GI_FIXED_LENGTH_MASK;
     }
   }
@@ -8877,16 +8884,28 @@


 for(;;)
   {
-  branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
+  int branchextra;
+  branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
+    recurses, cb);
   if (branchlength < 0) goto ISNOTFIXED;
-  if (grouplength == -1) grouplength = branchlength;
-    else if (grouplength != branchlength) goto ISNOTFIXED;
+  if (grouplength == -1)
+    {
+    grouplength = branchlength;
+    extra = branchextra;
+    }
+  else if (grouplength != branchlength || extra != branchextra) goto ISNOTFIXED;
   if (**pptrptr == META_KET) break;
   *pptrptr += 1;   /* Skip META_ALT */
   }


-if (group > 0)
-  cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
+/* There are only 12 bits for caching the extra value, but a pattern that
+needs more than that is weird indeed. */
+
+if (group > 0 && extra <= GI_EXTRA_MAX)
+  cb->groupinfo[group] |= (uint32_t)
+    (GI_SET_FIXED_LENGTH | (extra << GI_EXTRA_SHIFT) | grouplength);
+
+*extraptr = extra;
 return grouplength;


ISNOTFIXED:
@@ -8901,13 +8920,17 @@
*************************************************/

/* Return a fixed length for a branch in a lookbehind, giving an error if the
-length is not fixed. If any lookbehinds are encountered on the way, they get
-their length set, and there is a check for them looking further back than the
-current lookbehind. On entry, *pptrptr points to the first element inside the
-branch. On exit it is set to point to the ALT or KET.
+length is not fixed. We also take note of any extra value that is generated
+from a nested lookbehind. For example, for /(?<=a(?<=ba)c)/ each individual
+lookbehind has length 2, but the max_lookbehind setting must be 3 because
+matching inspects 3 characters before the match starting point.

+On entry, *pptrptr points to the first element inside the branch. On exit it is
+set to point to the ALT or KET.
+
 Arguments:
   pptrptr     pointer to pointer in the parsed pattern
+  extraptr    pointer to where to return extra lookbehind length
   errcodeptr  pointer to error code
   lcptr       pointer to loop counter
   recurses    chain of recurse_check to catch mutual recursion
@@ -8917,11 +8940,12 @@
 */


 static int
-get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
+get_branchlength(uint32_t **pptrptr, int *extraptr, int *errcodeptr, int *lcptr,
   parsed_recurse_check *recurses, compile_block *cb)
 {
 int branchlength = 0;
 int grouplength;
+int groupextra;
 int max;
 int extra = 0;   /* Additional lookbehind from nesting */
 uint32_t lastitemlength = 0;
@@ -9070,11 +9094,11 @@
       }
     break;


-    /* A lookbehind does not contribute any length to this lookbehind, but must
-    itself be checked and have its lengths set. If the maximum lookebhind of
-    any branch is greater than the length so far computed for this branch, we
-    must set an extra value for use when setting the maximum overall
-    lookbehind. */
+    /* A nested lookbehind does not contribute any length to this lookbehind,
+    but must itself be checked and have its lengths set. If the maximum
+    lookbehind for the nested lookbehind is greater than the length so far
+    computed for this branch, we must compute an extra value and keep the
+    largest encountered for use when setting the maximum overall lookbehind. */


     case META_LOOKBEHIND:
     case META_LOOKBEHINDNOT:
@@ -9188,15 +9212,14 @@
     in the cache. */


     gptr++;
-    grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
-      &this_recurse, cb);
+    grouplength = get_grouplength(&gptr, FALSE, &groupextra, errcodeptr, lcptr,
+      group, &this_recurse, cb);
     if (grouplength < 0)
       {
       if (*errcodeptr == 0) goto ISNOTFIXED;
       return -1;  /* Error already set */
       }
-    itemlength = grouplength;
-    break;
+    goto OK_GROUP;


     /* Check nested groups - advance past the initial data for each type and
     then seek a fixed length with get_grouplength(). */
@@ -9226,10 +9249,16 @@
     case META_SCRIPT_RUN:
     pptr++;
     CHECK_GROUP:
-    grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
-      recurses, cb);
+    grouplength = get_grouplength(&pptr, TRUE, &groupextra, errcodeptr, lcptr,
+      group, recurses, cb);
     if (grouplength < 0) return -1;
+
+    /* A nested lookbehind within the group may require looking back further
+    than the length of the group. */
+
+    OK_GROUP:
     itemlength = grouplength;
+    if (groupextra - branchlength > extra) extra = groupextra - branchlength;
     break;


     /* Exact repetition is OK; variable repetition is not. A repetition of zero
@@ -9272,15 +9301,7 @@


EXIT:
*pptrptr = pptr;
-
-/* The overall maximum lookbehind for any branch in the pattern takes note of
-any extra value that is generated from a nested lookbehind. For example, for
-/(?<=a(?<=ba)c)/ each individual lookbehind has length 2, but the
-max_lookbehind setting is 3 because matching inspects 3 characters before the
-match starting point. */
-
-if (branchlength + extra > cb->max_lookbehind)
- cb->max_lookbehind = branchlength + extra;
+*extraptr = extra;
return branchlength;

PARSED_SKIP_FAILED:
@@ -9299,9 +9320,14 @@
less than the maximum (65535). On exit, the pointer must be left on the final
ket.

+The function also maintains the max_lookbehind value. Any lookbehind branch
+that contains a nested lookbehind may actually look further back than the
+length of the branch. The additional amount is passed back from
+get_branchlength() as an "extra" value.
+
 Arguments:
   pptrptr     pointer to pointer in the parsed pattern
-  maxptr      where to return maximum length for the whole group
+  maxptr      where to return maximum lookbehind for the whole group
   errcodeptr  pointer to error code
   lcptr       pointer to loop counter
   recurses    chain of recurse_check to catch mutual recursion
@@ -9317,6 +9343,7 @@
 {
 PCRE2_SIZE offset;
 int branchlength;
+int branchextra;
 int max = 0;
 uint32_t *bptr = *pptrptr;


@@ -9326,7 +9353,8 @@
 do
   {
   *pptrptr += 1;
-  branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
+  branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
+    recurses, cb);
   if (branchlength < 0)
     {
     /* The errorcode and offset may already be set from a nested lookbehind. */
@@ -9334,12 +9362,13 @@
     if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
     return FALSE;
     }
-  if (branchlength > max) max = branchlength;
+  if (branchlength + branchextra > max) max = branchlength + branchextra;
   *bptr |= branchlength;  /* branchlength never more than 65535 */
   bptr = *pptrptr;
   }
 while (*bptr == META_ALT);


+if (max > cb->max_lookbehind) cb->max_lookbehind = max;
*maxptr = max;
return TRUE;
}

Modified: code/trunk/testdata/testinput15
===================================================================
--- code/trunk/testdata/testinput15    2019-06-26 16:13:28 UTC (rev 1123)
+++ code/trunk/testdata/testinput15    2019-06-28 16:58:08 UTC (rev 1124)
@@ -157,6 +157,18 @@
 /(?<=ab)cdef/
     xxabcd\=ph


+/(?<=(?<=(?<=a)b)c)./I
+    123abcXYZ
+
+/(?<=ab(cd(?<=...)))./I
+    abcdX
+
+/(?<=ab((?<=...)cd))./I
+    ZabcdX
+
+/(?<=((?<=(?<=ab).))(?1)(?1))./I
+    abxZ
+
 #subject
 # -------------------------------------------------------------------



Modified: code/trunk/testdata/testoutput15
===================================================================
--- code/trunk/testdata/testoutput15    2019-06-26 16:13:28 UTC (rev 1123)
+++ code/trunk/testdata/testoutput15    2019-06-28 16:58:08 UTC (rev 1124)
@@ -335,6 +335,41 @@
 Partial match: abcd
                <<


+/(?<=(?<=(?<=a)b)c)./I
+Capture group count = 0
+Max lookbehind = 3
+Subject length lower bound = 1
+    123abcXYZ
+ 0: abcX
+    <<< 
+
+/(?<=ab(cd(?<=...)))./I
+Capture group count = 1
+Max lookbehind = 4
+Subject length lower bound = 1
+    abcdX
+ 0: abcdX
+    <<<< 
+ 1: cd
+
+/(?<=ab((?<=...)cd))./I
+Capture group count = 1
+Max lookbehind = 5
+Subject length lower bound = 1
+    ZabcdX
+ 0: ZabcdX
+    <<<<< 
+ 1: cd
+
+/(?<=((?<=(?<=ab).))(?1)(?1))./I
+Capture group count = 1
+Max lookbehind = 3
+Subject length lower bound = 1
+    abxZ
+ 0: abxZ
+    <<< 
+ 1: 
+
 #subject
 # -------------------------------------------------------------------