Revision: 1133
http://www.exim.org/viewvc/pcre2?view=rev&revision=1133
Author: ph10
Date: 2019-07-16 16:06:21 +0100 (Tue, 16 Jul 2019)
Log Message:
-----------
Fix lookbehind within lookahead within lookbehind misbehaviour bug.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/ChangeLog 2019-07-16 15:06:21 UTC (rev 1133)
@@ -90,7 +90,12 @@
18. Implement non-atomic positive lookaround assertions.
+19. If a lookbehind contained a lookahead that contained another lookbehind
+within it, the nested lookbehind was not correctly processed. For example, if
+/(?<=(?=(?<=a)))b/ was matched to "ab" it gave no match instead of matching
+"b".
+
Version 10.33 16-April-2019
---------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/src/pcre2_compile.c 2019-07-16 15:06:21 UTC (rev 1133)
@@ -135,6 +135,8 @@
set_lookbehind_lengths(uint32_t **, int *, int *, int *,
parsed_recurse_check *, compile_block *);
+static int
+ check_lookbehinds(uint32_t *, uint32_t **, compile_block *);
/*************************************************
@@ -651,7 +653,7 @@
STRING_positive_lookahead0
STRING_positive_lookbehind0
STRING_non_atomic_positive_lookahead0
- STRING_non_atomic_positive_lookbehind0
+ STRING_non_atomic_positive_lookbehind0
STRING_negative_lookahead0
STRING_negative_lookbehind0
STRING_atomic0
@@ -670,7 +672,7 @@
{ 18, META_LOOKAHEAD },
{ 19, META_LOOKBEHIND },
{ 29, META_LOOKAHEAD_NA },
- { 30, META_LOOKBEHIND_NA },
+ { 30, META_LOOKBEHIND_NA },
{ 18, META_LOOKAHEADNOT },
{ 19, META_LOOKBEHINDNOT },
{ 6, META_ATOMIC },
@@ -4738,7 +4740,7 @@
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
- case OP_ASSERTBACK_NA:
+ case OP_ASSERTBACK_NA:
if (!skipassert) return code;
do code += GET(code, 1); while (*code == OP_ALT);
code += PRIV(OP_lengths)[*code];
@@ -6579,7 +6581,7 @@
we must only take the reqcu when the group also set a firstcu. Otherwise,
in that example, 'X' ends up set for both. */
- else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
+ else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
subreqcuflags >= 0 && subfirstcuflags >= 0)
{
reqcu = subreqcu;
@@ -7014,10 +7016,10 @@
case OP_ASSERT:
case OP_ASSERT_NOT:
- case OP_ASSERT_NA:
+ case OP_ASSERT_NA:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
- case OP_ASSERTBACK_NA:
+ case OP_ASSERTBACK_NA:
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_BRA:
@@ -7973,7 +7975,7 @@
/* Remember if this is a lookbehind assertion, and if it is, save its length
and skip over the pattern offset. */
-lookbehind = *code == OP_ASSERTBACK ||
+lookbehind = *code == OP_ASSERTBACK ||
*code == OP_ASSERTBACK_NOT ||
*code == OP_ASSERTBACK_NA;
@@ -8649,10 +8651,10 @@
case OP_CBRAPOS:
case OP_SCBRAPOS:
case OP_ASSERT:
- case OP_ASSERT_NA:
+ case OP_ASSERT_NA:
case OP_ONCE:
case OP_SCRIPT_RUN:
- d = find_firstassertedcu(scode, &dflags, inassert +
+ d = find_firstassertedcu(scode, &dflags, inassert +
((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
if (dflags < 0)
return 0;
@@ -9108,16 +9110,16 @@
}
break;
- /* Lookaheads can be ignored, but we must start the skip inside the group
- so that it isn't treated as a group within the branch. */
+ /* Lookaheads do not contribute to the length of this branch, but they may
+ contain lookbehinds within them whose lengths need to be set. */
case META_LOOKAHEAD:
case META_LOOKAHEADNOT:
case META_LOOKAHEAD_NA:
- pptr = parsed_skip(pptr + 1, PSKIP_KET);
- if (pptr == NULL) goto PARSED_SKIP_FAILED;
+ *errcodeptr = check_lookbehinds(pptr + 1, &pptr, cb);
+ if (*errcodeptr != 0) return -1;
- /* Also ignore any qualifiers that follow a lookahead assertion. */
+ /* Ignore any qualifiers that follow a lookahead assertion. */
switch (pptr[1])
{
@@ -9454,21 +9456,29 @@
the error offset is marked unset. This enables the functions above not to
override settings from deeper nestings.
-Arguments cb points to the compile block
-Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
+This function is called recursively from get_branchlength() for lookaheads in
+order to process any lookbehinds that they may contain. It stops when it hits a
+non-nested closing parenthesis in this case, returning a pointer to it.
+
+Arguments
+ pptr points to where to start (start of pattern or start of lookahead)
+ retptr if not NULL, return the ket pointer here
+ cb points to the compile block
+
+Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
*/
static int
-check_lookbehinds(compile_block *cb)
+check_lookbehinds(uint32_t *pptr, uint32_t **retptr, compile_block *cb)
{
-uint32_t *pptr;
int max;
int errorcode = 0;
int loopcount = 0;
+int nestlevel = 0;
cb->erroroffset = PCRE2_UNSET;
-for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
+for (; *pptr != META_END; pptr++)
{
if (*pptr < META_END) continue; /* Literal */
@@ -9482,14 +9492,31 @@
pptr += 1;
break;
+ case META_KET:
+ if (--nestlevel < 0)
+ {
+ if (retptr != NULL) *retptr = pptr;
+ return 0;
+ }
+ break;
+
+ case META_ATOMIC:
+ case META_CAPTURE:
+ case META_COND_ASSERT:
+ case META_LOOKAHEAD:
+ case META_LOOKAHEADNOT:
+ case META_LOOKAHEAD_NA:
+ case META_NOCAPTURE:
+ case META_SCRIPT_RUN:
+ nestlevel++;
+ break;
+
case META_ACCEPT:
case META_ALT:
case META_ASTERISK:
case META_ASTERISK_PLUS:
case META_ASTERISK_QUERY:
- case META_ATOMIC:
case META_BACKREF:
- case META_CAPTURE:
case META_CIRCUMFLEX:
case META_CLASS:
case META_CLASS_EMPTY:
@@ -9497,15 +9524,9 @@
case META_CLASS_END:
case META_CLASS_NOT:
case META_COMMIT:
- case META_COND_ASSERT:
case META_DOLLAR:
case META_DOT:
case META_FAIL:
- case META_KET:
- case META_LOOKAHEAD:
- case META_LOOKAHEADNOT:
- case META_LOOKAHEAD_NA:
- case META_NOCAPTURE:
case META_PLUS:
case META_PLUS_PLUS:
case META_PLUS_QUERY:
@@ -9515,7 +9536,6 @@
case META_QUERY_QUERY:
case META_RANGE_ESCAPED:
case META_RANGE_LITERAL:
- case META_SCRIPT_RUN:
case META_SKIP:
case META_THEN:
break;
@@ -10021,7 +10041,7 @@
if (has_lookbehind)
{
- errorcode = check_lookbehinds(&cb);
+ errorcode = check_lookbehinds(cb.parsed_pattern, NULL, &cb);
if (errorcode != 0) goto HAD_CB_ERROR;
}
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testinput1 2019-07-16 15:06:21 UTC (rev 1133)
@@ -6377,4 +6377,10 @@
/(?<=a(*SKIP)x)|d/
abcd
+/(?<=(?=.(?<=x)))/aftertext
+ abx
+
+/(?<=(?=(?<=a)))b/
+ ab
+
# End of testinput1
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testinput2 2019-07-16 15:06:21 UTC (rev 1133)
@@ -5690,4 +5690,7 @@
# ----
+/(?<=(?=.(?<=x)))/
+ ab\=ph
+
# End of testinput2
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testoutput1 2019-07-16 15:06:21 UTC (rev 1133)
@@ -10097,4 +10097,13 @@
abcd
0: d
+/(?<=(?=.(?<=x)))/aftertext
+ abx
+ 0:
+ 0+ x
+
+/(?<=(?=(?<=a)))b/
+ ab
+ 0: b
+
# End of testinput1
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testoutput2 2019-07-16 15:06:21 UTC (rev 1133)
@@ -17185,6 +17185,10 @@
# ----
+/(?<=(?=.(?<=x)))/
+ ab\=ph
+No match
+
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data