Revision: 699
http://www.exim.org/viewvc/pcre2?view=rev&revision=699
Author: ph10
Date: 2017-03-23 17:54:58 +0000 (Thu, 23 Mar 2017)
Log Message:
-----------
Fix issues with (*VERB)s inside recursive subroutine calls.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_intmodedep.h
code/trunk/src/pcre2_match.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2017-03-22 15:12:06 UTC (rev 698)
+++ code/trunk/ChangeLog 2017-03-23 17:54:58 UTC (rev 699)
@@ -15,9 +15,8 @@
run no slower than the old code.
A number of bugs in the refactored code were subsequently fixed during testing
-before release, but after the code was made available in the repository. Many
-of the bugs were discovered by fuzzing testing. These bugs were never in fully
-released code, but are noted here for the record.
+before release, but after the code was made available in the repository. These
+bugs were never in fully released code, but are noted here for the record.
(a) If a pattern had fewer capturing parentheses than the ovector supplied in
the match data block, a memory error (detectable by ASAN) occurred after
@@ -30,6 +29,8 @@
vector on the stack is not big enough to handle at least 10 frames.
Fixes oss-fuzz issue 783.
+ (c) Handling of (*VERB)s in recursions was wrong in some cases.
+
2. Now that pcre2_match() no longer uses recursive function calls (see above),
the "match limit recursion" value seems misnamed. It still exists, and limits
the depth of tree that is searched. To avoid future confusion, it has been
Modified: code/trunk/src/pcre2_intmodedep.h
===================================================================
--- code/trunk/src/pcre2_intmodedep.h 2017-03-22 15:12:06 UTC (rev 698)
+++ code/trunk/src/pcre2_intmodedep.h 2017-03-23 17:54:58 UTC (rev 699)
@@ -826,13 +826,14 @@
PCRE2_SPTR start_code; /* For use when recursing */
PCRE2_SPTR start_subject; /* Start of the subject string */
PCRE2_SPTR end_subject; /* End of the subject string */
- PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
- PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
PCRE2_SPTR end_match_ptr; /* Subject position at end match */
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
PCRE2_SPTR mark; /* Mark pointer to pass back on success */
PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
+ PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
+ PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
+ uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
uint32_t moptions; /* Match options */
uint32_t poptions; /* Pattern options */
uint32_t skip_arg_count; /* For counting SKIP_ARGs */
Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c 2017-03-22 15:12:06 UTC (rev 698)
+++ code/trunk/src/pcre2_match.c 2017-03-23 17:54:58 UTC (rev 699)
@@ -5051,8 +5051,8 @@
offset data is the offset to the starting bracket from the start of the
whole pattern. (This is so that it works from duplicated subpatterns.) */
-#define Lframe_type F->temp_32[0]
-#define Lstart_group F->temp_sptr[0]
+#define Lframe_type F->temp_32[0]
+#define Lstart_branch F->temp_sptr[0]
case OP_RECURSE:
bracode = mb->start_code + GET(Fecode, 1);
@@ -5083,42 +5083,47 @@
bracket. We must leave Fecode unchanged so that the ending code can find
out where to continue. */
- Lstart_group = bracode;
+ Lstart_branch = bracode;
Lframe_type = GF_RECURSE | number;
for (;;)
{
+ PCRE2_SPTR next_ecode;
+
group_frame_type = Lframe_type;
- RMATCH(Lstart_group + PRIV(OP_lengths)[*Lstart_group], RM11);
+ RMATCH(Lstart_branch + PRIV(OP_lengths)[*Lstart_branch], RM11);
+ next_ecode = Lstart_branch + GET(Lstart_branch,1);
- /* See comment above about handling THEN. */
+ /* Handle backtracking verbs, which are defined in a range that can
+ easily be tested for. PCRE does not allow THEN, SKIP, PRUNE or COMMIT to
+ escape beyond a recursion; they cause a NOMATCH for the entire recursion.
- if (rrc == MATCH_THEN)
+ When one of these verbs triggers, the current recursion group number is
+ recorded. If it matches the recursion we are processing, the verb
+ happened within the recursion and we must deal with it. Otherwise it must
+ have happened after the recursion completed, and so has to be passed
+ back. See comment above about handling THEN. */
+
+ if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX &&
+ mb->verb_current_recurse == (Lframe_type ^ GF_RECURSE))
{
- PCRE2_SPTR next_ecode = Lstart_group + GET(Lstart_group,1);
- if (mb->verb_ecode_ptr < next_ecode &&
- (*Lstart_group == OP_ALT || *next_ecode == OP_ALT))
+ if (rrc == MATCH_THEN && mb->verb_ecode_ptr < next_ecode &&
+ (*Lstart_branch == OP_ALT || *next_ecode == OP_ALT))
rrc = MATCH_NOMATCH;
+ else RRETURN(MATCH_NOMATCH);
}
- /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
- recursion; they cause a NOMATCH for the entire recursion. These codes are
- defined in a range that can be tested for. */
-
- if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
- RRETURN(MATCH_NOMATCH);
-
/* Note that carrying on after (*ACCEPT) in a recursion is handled in the
OP_ACCEPT code. Nothing needs to be done here. */
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- Lstart_group += GET(Lstart_group, 1);
- if (*Lstart_group != OP_ALT) RRETURN(MATCH_NOMATCH);
+ Lstart_branch = next_ecode;
+ if (*Lstart_branch != OP_ALT) RRETURN(MATCH_NOMATCH);
}
/* Control never reaches here. */
#undef Lframe_type
-#undef Lstart_group
+#undef Lstart_branch
/* ===================================================================== */
@@ -5535,8 +5540,7 @@
/* Whole-pattern recursion is coded as a recurse into group 0, so it
won't be picked up here. Instead, we catch it when the OP_END is reached.
- Other recursion is handled here. We just have to record the current
- subject position and start match pointer and give a MATCH return. */
+ Other recursion is handled here. */
case OP_CBRA:
case OP_CBRAPOS:
@@ -5545,7 +5549,7 @@
number = GET2(bracode, 1+LINK_SIZE);
/* Handle a recursively called group. We reinstate the previous set of
- captures and then carry on. */
+ captures and then carry on after the recursion call. */
if (Fcurrent_recurse == number)
{
@@ -5837,14 +5841,20 @@
case OP_FAIL:
RRETURN(MATCH_NOMATCH);
+ /* Record the current recursing group number in mb->verb_current_recurse
+ when a backtracking return such as MATCH_COMMIT is given. This enables the
+ recurse processing to catch verbs from within the recursion. */
+
case OP_COMMIT:
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM13);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_COMMIT);
case OP_PRUNE:
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_PRUNE);
case OP_PRUNE_ARG:
@@ -5851,6 +5861,7 @@
Fmark = mb->nomatch_mark = Fecode + 2;
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM15);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_PRUNE);
case OP_SKIP:
@@ -5857,6 +5868,7 @@
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM16);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_skip_ptr = Feptr; /* Pass back current position */
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_SKIP);
/* Note that, for Perl compatibility, SKIP with an argument does NOT set
@@ -5883,6 +5895,7 @@
mb->skip_arg_count. */
mb->verb_skip_ptr = Fecode + 2;
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_SKIP_ARG);
/* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
@@ -5892,14 +5905,15 @@
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM18);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_ecode_ptr = Fecode;
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_THEN);
case OP_THEN_ARG:
Fmark = mb->nomatch_mark = Fecode + 2;
RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM19);
-
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
mb->verb_ecode_ptr = Fecode;
+ mb->verb_current_recurse = Fcurrent_recurse;
RRETURN(MATCH_THEN);
@@ -6243,7 +6257,7 @@
frame_size = sizeof(heapframe) + ((re->top_bracket - 1) * 2 * sizeof(PCRE2_SIZE));
/* If a pattern has very many capturing parentheses, the frame size may be very
-large. Ensure that there are at least 10 available frames by getting an initial
+large. Ensure that there are at least 10 available frames by getting an initial
vector on the heap if necessary. */
if (frame_size <= START_FRAMES_SIZE/10)
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2017-03-22 15:12:06 UTC (rev 698)
+++ code/trunk/testdata/testinput2 2017-03-23 17:54:58 UTC (rev 699)
@@ -4957,9 +4957,12 @@
//
\=ovector=7777777777
+
+# This is here because Perl matches, even though a COMMIT is encountered
+# outside of the recursion.
/(?1)(A(*COMMIT)|B)D/
- BAXBAD\=no_jit
+ BAXBAD
"(?1){2}(a)"B
@@ -5001,7 +5004,7 @@
/^(.|(.)(?1)?\2)$/
abcba
-# The first of these, when run by Perl, give the mark 'aa', which is wrong.
+# The first of these, when run by Perl, gives the mark 'aa', which is wrong.
'(?>a(*:aa))b|ac' mark
ac
@@ -5019,4 +5022,11 @@
/\g{3/
+# Perl matches this one, but PCRE does not because (*ACCEPT) clears out any
+# pending backtracks in the recursion.
+
+/^ (?(DEFINE) (..(*ACCEPT)|...) ) (?1)$/x
+\= Expect no match
+ abc
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2017-03-22 15:12:06 UTC (rev 698)
+++ code/trunk/testdata/testoutput2 2017-03-23 17:54:58 UTC (rev 699)
@@ -15424,11 +15424,13 @@
//
\=ovector=7777777777
** Invalid value in 'ovector=7777777777'
+
+# This is here because Perl matches, even though a COMMIT is encountered
+# outside of the recursion.
/(?1)(A(*COMMIT)|B)D/
- BAXBAD\=no_jit
- 0: BAD
- 1: A
+ BAXBAD
+No match
"(?1){2}(a)"B
------------------------------------------------------------------
@@ -15549,7 +15551,7 @@
1: abcba
2: a
-# The first of these, when run by Perl, give the mark 'aa', which is wrong.
+# The first of these, when run by Perl, gives the mark 'aa', which is wrong.
'(?>a(*:aa))b|ac' mark
ac
@@ -15573,6 +15575,14 @@
/\g{3/
Failed: error 157 at offset 2: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
+# Perl matches this one, but PCRE does not because (*ACCEPT) clears out any
+# pending backtracks in the recursion.
+
+/^ (?(DEFINE) (..(*ACCEPT)|...) ) (?1)$/x
+\= Expect no match
+ abc
+No match
+
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data