Revision: 1124
http://www.exim.org/viewvc/pcre2?view=rev&revision=1124
Author: ph10
Date: 2019-06-28 17:58:08 +0100 (Fri, 28 Jun 2019)
Log Message:
-----------
Fix bugs in recent patch for setting the maximum lookbehind.
Modified Paths:
--------------
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput15
code/trunk/testdata/testoutput15
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2019-06-26 16:13:28 UTC (rev 1123)
+++ code/trunk/src/pcre2_compile.c 2019-06-28 16:58:08 UTC (rev 1124)
@@ -128,7 +128,7 @@
compile_block *, PCRE2_SIZE *);
static int
- get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
+ get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
compile_block *);
static BOOL
@@ -388,6 +388,9 @@
#define GI_SET_FIXED_LENGTH 0x80000000u
#define GI_NOT_FIXED_LENGTH 0x40000000u
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
+#define GI_EXTRA_MASK 0x0fff0000u
+#define GI_EXTRA_MAX 0xfff /* NB not unsigned */
+#define GI_EXTRA_SHIFT 16
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
and is fast (a good compiler can turn it into a subtraction and unsigned
@@ -8841,6 +8844,7 @@
Arguments:
pptrptr pointer to pointer in the parsed pattern
isinline FALSE if a reference or recursion; TRUE for inline group
+ extraptr pointer to where to return extra lookbehind length
errcodeptr pointer to the errorcode
lcptr pointer to the loop counter
group number of captured group or -1 for a non-capturing group
@@ -8851,11 +8855,13 @@
*/
static int
-get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
- int group, parsed_recurse_check *recurses, compile_block *cb)
+get_grouplength(uint32_t **pptrptr, BOOL isinline, int *extraptr,
+ int *errcodeptr, int *lcptr, int group, parsed_recurse_check *recurses,
+ compile_block *cb)
{
int branchlength;
int grouplength = -1;
+int extra = 0;
/* The cache can be used only if there is no possibility of there being two
groups with the same number. We do not need to set the end pointer for a group
@@ -8869,6 +8875,7 @@
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
{
if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
+ *extraptr = (groupinfo & GI_EXTRA_MASK) >> GI_EXTRA_SHIFT;
return groupinfo & GI_FIXED_LENGTH_MASK;
}
}
@@ -8877,16 +8884,28 @@
for(;;)
{
- branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
+ int branchextra;
+ branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
+ recurses, cb);
if (branchlength < 0) goto ISNOTFIXED;
- if (grouplength == -1) grouplength = branchlength;
- else if (grouplength != branchlength) goto ISNOTFIXED;
+ if (grouplength == -1)
+ {
+ grouplength = branchlength;
+ extra = branchextra;
+ }
+ else if (grouplength != branchlength || extra != branchextra) goto ISNOTFIXED;
if (**pptrptr == META_KET) break;
*pptrptr += 1; /* Skip META_ALT */
}
-if (group > 0)
- cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
+/* There are only 12 bits for caching the extra value, but a pattern that
+needs more than that is weird indeed. */
+
+if (group > 0 && extra <= GI_EXTRA_MAX)
+ cb->groupinfo[group] |= (uint32_t)
+ (GI_SET_FIXED_LENGTH | (extra << GI_EXTRA_SHIFT) | grouplength);
+
+*extraptr = extra;
return grouplength;
ISNOTFIXED:
@@ -8901,13 +8920,17 @@
*************************************************/
/* Return a fixed length for a branch in a lookbehind, giving an error if the
-length is not fixed. If any lookbehinds are encountered on the way, they get
-their length set, and there is a check for them looking further back than the
-current lookbehind. On entry, *pptrptr points to the first element inside the
-branch. On exit it is set to point to the ALT or KET.
+length is not fixed. We also take note of any extra value that is generated
+from a nested lookbehind. For example, for /(?<=a(?<=ba)c)/ each individual
+lookbehind has length 2, but the max_lookbehind setting must be 3 because
+matching inspects 3 characters before the match starting point.
+On entry, *pptrptr points to the first element inside the branch. On exit it is
+set to point to the ALT or KET.
+
Arguments:
pptrptr pointer to pointer in the parsed pattern
+ extraptr pointer to where to return extra lookbehind length
errcodeptr pointer to error code
lcptr pointer to loop counter
recurses chain of recurse_check to catch mutual recursion
@@ -8917,11 +8940,12 @@
*/
static int
-get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
+get_branchlength(uint32_t **pptrptr, int *extraptr, int *errcodeptr, int *lcptr,
parsed_recurse_check *recurses, compile_block *cb)
{
int branchlength = 0;
int grouplength;
+int groupextra;
int max;
int extra = 0; /* Additional lookbehind from nesting */
uint32_t lastitemlength = 0;
@@ -9070,11 +9094,11 @@
}
break;
- /* A lookbehind does not contribute any length to this lookbehind, but must
- itself be checked and have its lengths set. If the maximum lookebhind of
- any branch is greater than the length so far computed for this branch, we
- must set an extra value for use when setting the maximum overall
- lookbehind. */
+ /* A nested lookbehind does not contribute any length to this lookbehind,
+ but must itself be checked and have its lengths set. If the maximum
+ lookbehind for the nested lookbehind is greater than the length so far
+ computed for this branch, we must compute an extra value and keep the
+ largest encountered for use when setting the maximum overall lookbehind. */
case META_LOOKBEHIND:
case META_LOOKBEHINDNOT:
@@ -9188,15 +9212,14 @@
in the cache. */
gptr++;
- grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
- &this_recurse, cb);
+ grouplength = get_grouplength(&gptr, FALSE, &groupextra, errcodeptr, lcptr,
+ group, &this_recurse, cb);
if (grouplength < 0)
{
if (*errcodeptr == 0) goto ISNOTFIXED;
return -1; /* Error already set */
}
- itemlength = grouplength;
- break;
+ goto OK_GROUP;
/* Check nested groups - advance past the initial data for each type and
then seek a fixed length with get_grouplength(). */
@@ -9226,10 +9249,16 @@
case META_SCRIPT_RUN:
pptr++;
CHECK_GROUP:
- grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
- recurses, cb);
+ grouplength = get_grouplength(&pptr, TRUE, &groupextra, errcodeptr, lcptr,
+ group, recurses, cb);
if (grouplength < 0) return -1;
+
+ /* A nested lookbehind within the group may require looking back further
+ than the length of the group. */
+
+ OK_GROUP:
itemlength = grouplength;
+ if (groupextra - branchlength > extra) extra = groupextra - branchlength;
break;
/* Exact repetition is OK; variable repetition is not. A repetition of zero
@@ -9272,15 +9301,7 @@
EXIT:
*pptrptr = pptr;
-
-/* The overall maximum lookbehind for any branch in the pattern takes note of
-any extra value that is generated from a nested lookbehind. For example, for
-/(?<=a(?<=ba)c)/ each individual lookbehind has length 2, but the
-max_lookbehind setting is 3 because matching inspects 3 characters before the
-match starting point. */
-
-if (branchlength + extra > cb->max_lookbehind)
- cb->max_lookbehind = branchlength + extra;
+*extraptr = extra;
return branchlength;
PARSED_SKIP_FAILED:
@@ -9299,9 +9320,14 @@
less than the maximum (65535). On exit, the pointer must be left on the final
ket.
+The function also maintains the max_lookbehind value. Any lookbehind branch
+that contains a nested lookbehind may actually look further back than the
+length of the branch. The additional amount is passed back from
+get_branchlength() as an "extra" value.
+
Arguments:
pptrptr pointer to pointer in the parsed pattern
- maxptr where to return maximum length for the whole group
+ maxptr where to return maximum lookbehind for the whole group
errcodeptr pointer to error code
lcptr pointer to loop counter
recurses chain of recurse_check to catch mutual recursion
@@ -9317,6 +9343,7 @@
{
PCRE2_SIZE offset;
int branchlength;
+int branchextra;
int max = 0;
uint32_t *bptr = *pptrptr;
@@ -9326,7 +9353,8 @@
do
{
*pptrptr += 1;
- branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
+ branchlength = get_branchlength(pptrptr, &branchextra, errcodeptr, lcptr,
+ recurses, cb);
if (branchlength < 0)
{
/* The errorcode and offset may already be set from a nested lookbehind. */
@@ -9334,12 +9362,13 @@
if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
return FALSE;
}
- if (branchlength > max) max = branchlength;
+ if (branchlength + branchextra > max) max = branchlength + branchextra;
*bptr |= branchlength; /* branchlength never more than 65535 */
bptr = *pptrptr;
}
while (*bptr == META_ALT);
+if (max > cb->max_lookbehind) cb->max_lookbehind = max;
*maxptr = max;
return TRUE;
}
Modified: code/trunk/testdata/testinput15
===================================================================
--- code/trunk/testdata/testinput15 2019-06-26 16:13:28 UTC (rev 1123)
+++ code/trunk/testdata/testinput15 2019-06-28 16:58:08 UTC (rev 1124)
@@ -157,6 +157,18 @@
/(?<=ab)cdef/
xxabcd\=ph
+/(?<=(?<=(?<=a)b)c)./I
+ 123abcXYZ
+
+/(?<=ab(cd(?<=...)))./I
+ abcdX
+
+/(?<=ab((?<=...)cd))./I
+ ZabcdX
+
+/(?<=((?<=(?<=ab).))(?1)(?1))./I
+ abxZ
+
#subject
# -------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput15
===================================================================
--- code/trunk/testdata/testoutput15 2019-06-26 16:13:28 UTC (rev 1123)
+++ code/trunk/testdata/testoutput15 2019-06-28 16:58:08 UTC (rev 1124)
@@ -335,6 +335,41 @@
Partial match: abcd
<<
+/(?<=(?<=(?<=a)b)c)./I
+Capture group count = 0
+Max lookbehind = 3
+Subject length lower bound = 1
+ 123abcXYZ
+ 0: abcX
+ <<<
+
+/(?<=ab(cd(?<=...)))./I
+Capture group count = 1
+Max lookbehind = 4
+Subject length lower bound = 1
+ abcdX
+ 0: abcdX
+ <<<<
+ 1: cd
+
+/(?<=ab((?<=...)cd))./I
+Capture group count = 1
+Max lookbehind = 5
+Subject length lower bound = 1
+ ZabcdX
+ 0: ZabcdX
+ <<<<<
+ 1: cd
+
+/(?<=((?<=(?<=ab).))(?1)(?1))./I
+Capture group count = 1
+Max lookbehind = 3
+Subject length lower bound = 1
+ abxZ
+ 0: abxZ
+ <<<
+ 1:
+
#subject
# -------------------------------------------------------------------