Revision: 912
http://www.exim.org/viewvc/pcre2?view=rev&revision=912
Author: ph10
Date: 2018-01-31 17:53:56 +0000 (Wed, 31 Jan 2018)
Log Message:
-----------
Fix auto-possessification bug at the end of a capturing group that is called
recursively.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_auto_possess.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/ChangeLog 2018-01-31 17:53:56 UTC (rev 912)
@@ -134,7 +134,16 @@
131072, which allows for the maximum number of captures (65535) plus the
overall match. This fixes oss-fuzz issue 5415.
+31. Auto-possessification at the end of a capturing group was dependent on what
+follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused
+incorrect behaviour when the group was called recursively from elsewhere in the
+pattern where something different might follow. This bug is an unforeseen
+consequence of change #1 for 10.30 - the implementation of backtracking into
+recursions. Iterators at the ends of capturing groups are no longer considered
+for auto-possessification if the pattern contains any recursions. Fixes
+Bugzilla #2232.
+
Version 10.30 14-August-2017
----------------------------
Modified: code/trunk/src/pcre2_auto_possess.c
===================================================================
--- code/trunk/src/pcre2_auto_possess.c 2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/src/pcre2_auto_possess.c 2018-01-31 17:53:56 UTC (rev 912)
@@ -558,6 +558,8 @@
continue;
}
+ /* At the end of a branch, skip to the end of the group. */
+
if (c == OP_ALT)
{
do code += GET(code, 1); while (*code == OP_ALT);
@@ -564,25 +566,49 @@
c = *code;
}
+ /* Inspect the next opcode. */
+
switch(c)
{
+ /* We can always possessify a greedy iterator at the end of the pattern,
+ which is reached after skipping over the final OP_KET. A non-greedy
+ iterator must never be possessified. */
+
case OP_END:
- case OP_KETRPOS:
- /* TRUE only in greedy case. The non-greedy case could be replaced by
- an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
- uses more memory, which we cannot get at this stage.) */
-
return base_list[1] != 0;
+ /* When an iterator is at the end of certain kinds of group we can inspect
+ what follows the group by skipping over the closing ket. Note that this
+ does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
+ iteration is variable (could be another iteration or could be the next
+ item). As these two opcodes are not listed in the next switch, they will
+ end up as the next code to inspect, and return FALSE by virtue of being
+ unsupported. */
+
case OP_KET:
- /* If the bracket is capturing, and referenced by an OP_RECURSE, or
- it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
- cannot be converted to a possessive form. */
+ case OP_KETRPOS:
+ /* The non-greedy case cannot be converted to a possessive form. */
if (base_list[1] == 0) return FALSE;
+ /* If the bracket is capturing it might be referenced by an OP_RECURSE
+ so its last iterator can never be possessified if the pattern contains
+ recursions. (This could be improved by keeping a list of group numbers that
+ are called by recursion.) */
+
switch(*(code - GET(code, 1)))
{
+ case OP_CBRA:
+ case OP_SCBRA:
+ case OP_CBRAPOS:
+ case OP_SCBRAPOS:
+ if (cb->had_recurse) return FALSE;
+ break;
+
+ /* Atomic sub-patterns and assertions can always auto-possessify their
+ last iterator. However, if the group was entered as a result of checking
+ a previous iterator, this is not possible. */
+
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
@@ -589,16 +615,16 @@
case OP_ASSERTBACK_NOT:
case OP_ONCE:
- /* Atomic sub-patterns and assertions can always auto-possessify their
- last iterator. However, if the group was entered as a result of checking
- a previous iterator, this is not possible. */
-
return !entered_a_group;
}
+ /* Skip over the bracket and inspect what comes next. */
+
code += PRIV(OP_lengths)[c];
continue;
+ /* Handle cases where the next item is a group. */
+
case OP_ONCE:
case OP_BRA:
case OP_CBRA:
@@ -637,11 +663,15 @@
code += PRIV(OP_lengths)[c];
continue;
+ /* The next opcode does not need special handling; fall through and use it
+ to see if the base can be possessified. */
+
default:
break;
}
- /* Check for a supported opcode, and load its properties. */
+ /* We now have the next appropriate opcode to compare with the base. Check
+ for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testinput1 2018-01-31 17:53:56 UTC (rev 912)
@@ -6159,4 +6159,34 @@
/((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
XYYZ
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+ aa
+ a
+
+/^(a?)b(?1)a/
+ abaa
+ aba
+ baa
+ ba
+
+/^(a?)+b(?1)a/
+ abaa
+ aba
+ baa
+ ba
+
+/^(a?)++b(?1)a/
+ abaa
+ aba
+ baa
+ ba
+
+/^(a?)+b/
+ b
+ ab
+ aaab
+
+/(?=a+)a(a+)++b/
+ aab
+
# End of testinput1
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testinput2 2018-01-31 17:53:56 UTC (rev 912)
@@ -5412,4 +5412,21 @@
\= Expect no match
\na
+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified.
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+
+/^(a?)b(?1)a/B
+
+/^(a?)+b(?1)a/B
+
+/^(a?)++b(?1)a/B
+
+/^(a?)+b/B
+
+/(?=a+)a(a+)++b/B
+
# End of testinput2
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testoutput1 2018-01-31 17:53:56 UTC (rev 912)
@@ -9758,4 +9758,68 @@
1: Y
2:
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+ aa
+ 0: aa
+ a
+ 0: a
+
+/^(a?)b(?1)a/
+ abaa
+ 0: abaa
+ 1: a
+ aba
+ 0: aba
+ 1: a
+ baa
+ 0: baa
+ 1:
+ ba
+ 0: ba
+ 1:
+
+/^(a?)+b(?1)a/
+ abaa
+ 0: abaa
+ 1:
+ aba
+ 0: aba
+ 1:
+ baa
+ 0: baa
+ 1:
+ ba
+ 0: ba
+ 1:
+
+/^(a?)++b(?1)a/
+ abaa
+ 0: abaa
+ 1:
+ aba
+ 0: aba
+ 1:
+ baa
+ 0: baa
+ 1:
+ ba
+ 0: ba
+ 1:
+
+/^(a?)+b/
+ b
+ 0: b
+ 1:
+ ab
+ 0: ab
+ 1:
+ aaab
+ 0: aaab
+ 1:
+
+/(?=a+)a(a+)++b/
+ aab
+ 0: aab
+ 1: a
+
# End of testinput1
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testoutput2 2018-01-31 17:53:56 UTC (rev 912)
@@ -12701,7 +12701,7 @@
Ket
a
CBraPos 1
- a++
+ a+
KetRpos
a
Ket
@@ -16468,6 +16468,113 @@
\na
No match
+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified.
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+------------------------------------------------------------------
+ Bra
+ Cond
+ Cond false
+ CBra 1
+ a?
+ Ket
+ Ket
+ ^
+ Recurse
+ a
+ $
+ Ket
+ End
+------------------------------------------------------------------
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+------------------------------------------------------------------
+ Bra
+ Cond
+ Cond false
+ CBra 1
+ a?
+ Ket
+ X
+ Ket
+ ^
+ Recurse
+ a
+ $
+ Ket
+ End
+------------------------------------------------------------------
+
+/^(a?)b(?1)a/B
+------------------------------------------------------------------
+ Bra
+ ^
+ CBra 1
+ a?
+ Ket
+ b
+ Recurse
+ a
+ Ket
+ End
+------------------------------------------------------------------
+
+/^(a?)+b(?1)a/B
+------------------------------------------------------------------
+ Bra
+ ^
+ SCBra 1
+ a?
+ KetRmax
+ b
+ Recurse
+ a
+ Ket
+ End
+------------------------------------------------------------------
+
+/^(a?)++b(?1)a/B
+------------------------------------------------------------------
+ Bra
+ ^
+ SCBraPos 1
+ a?
+ KetRpos
+ b
+ Recurse
+ a
+ Ket
+ End
+------------------------------------------------------------------
+
+/^(a?)+b/B
+------------------------------------------------------------------
+ Bra
+ ^
+ SCBra 1
+ a?
+ KetRmax
+ b
+ Ket
+ End
+------------------------------------------------------------------
+
+/(?=a+)a(a+)++b/B
+------------------------------------------------------------------
+ Bra
+ Assert
+ a++
+ Ket
+ a
+ CBraPos 1
+ a++
+ KetRpos
+ b
+ Ket
+ End
+------------------------------------------------------------------
+
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data