Revision: 609
http://vcs.pcre.org/viewvc?view=rev&revision=609
Author: ph10
Date: 2011-06-15 19:09:23 +0100 (Wed, 15 Jun 2011)
Log Message:
-----------
Fix bug with /\A.*?(?:a|b(*THEN)c)/ by removing the tail recursion optimization
for the final branch. Also fix a similar bug for conditional subpatterns.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/testdata/testinput11
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput11
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/ChangeLog 2011-06-15 18:09:23 UTC (rev 609)
@@ -90,6 +90,13 @@
empty string, and PCRE_NOTEMPTY was set, pcre_exec() thought the whole
pattern had matched an empty string, and so incorrectly returned a no
match.
+
+17. There was optimizing code for the last branch of non-capturing parentheses,
+ and also for the obeyed branch of a conditional subexpression, which used
+ tail recursion to cut down on stack usage. Unfortunately, not that there is
+ the possibility of (*THEN) occurring in these branches, tail recursion is
+ no longer possible because the return has to be checked for (*THEN). These
+ two optimizations have therefore been removed.
Version 8.12 15-Jan-2011
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/pcre_exec.c 2011-06-15 18:09:23 UTC (rev 609)
@@ -276,7 +276,7 @@
RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
- RM61, RM62, RM63, RM64 };
+ RM61, RM62, RM63};
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
@@ -858,7 +858,7 @@
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
- if (rrc != MATCH_THEN) md->mark = markptr;
+ if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
RRETURN(MATCH_NOMATCH);
}
@@ -875,34 +875,17 @@
/* Non-capturing bracket, except for possessive with unlimited repeat. Loop
for all the alternatives. When we get to the final alternative within the
- brackets, we would return the result of a recursive call to match()
- whatever happened. We can reduce stack usage by turning this into a tail
- recursion, except in the case of a possibly empty group.*/
+ brackets, we used to return the result of a recursive call to match()
+ whatever happened so it was possible to reduce stack usage by turning this
+ into a tail recursion, except in the case of a possibly empty group.
+ However, now that there is the possiblity of (*THEN) occurring in the final
+ alternative, this optimization is no longer possible. */
case OP_BRA:
case OP_SBRA:
DPRINTF(("start non-capturing bracket\n"));
for (;;)
{
- if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
- {
- if (op >= OP_SBRA) /* Possibly empty group */
- {
- md->match_function_type = MATCH_CBEGROUP;
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
- RM48);
- if (rrc == MATCH_NOMATCH) md->mark = markptr;
- RRETURN(rrc);
- }
- /* Not a possibly empty group; use tail recursion */
- ecode += _pcre_OP_lengths[*ecode];
- DPRINTF(("bracket 0 tail recursion\n"));
- goto TAIL_RECURSE;
- }
-
- /* For non-final alternatives, continue the loop for a NOMATCH result;
- otherwise return. */
-
if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
RM2);
@@ -910,9 +893,12 @@
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
ecode += GET(ecode, 1);
+ if (*ecode != OP_ALT) break;
}
- /* Control never reaches here. */
+ if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
+ RRETURN(MATCH_NOMATCH);
+
/* Handle possessive capturing brackets with an unlimited repeat. We come
here from BRAZERO with allow_zero set TRUE. The offset_vector values are
handled similarly to the normal case above. However, the matching is
@@ -988,7 +974,7 @@
md->offset_vector[md->offset_end - number] = save_offset3;
}
- if (rrc != MATCH_THEN) md->mark = markptr;
+ if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
if (allow_zero || matched_once)
{
ecode += 1 + LINK_SIZE;
@@ -1026,7 +1012,7 @@
{
if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- eptrb, RM64);
+ eptrb, RM48);
if (rrc == MATCH_KETRPOS)
{
eptr = md->end_match_ptr;
@@ -1053,8 +1039,7 @@
/* Conditional group: compilation checked that there are no more than
two branches. If the condition is false, skipping the first branch takes us
past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. As there is only one branch to be
- obeyed, we can use tail recursion to avoid using another stack frame. */
+ exactly what going to the ket would do. */
case OP_COND:
case OP_SCOND:
@@ -1259,20 +1244,20 @@
}
/* We are now at the branch that is to be obeyed. As there is only one,
- we can use tail recursion to avoid using another stack frame, except when
- we have an unlimited repeat of a possibly empty group. If the second
- alternative doesn't exist, we can just plough on. */
+ we used to use tail recursion to avoid using another stack frame, except
+ when there was unlimited repeat of a possibly empty group. However, that
+ strategy no longer works because of the possibilty of (*THEN) being
+ encountered in the branch. A recursive call to match() is always required,
+ unless the second alternative doesn't exist, in which case we can just
+ plough on. */
if (condition || *ecode == OP_ALT)
{
- ecode += 1 + LINK_SIZE;
- if (op == OP_SCOND) /* Possibly empty group */
- {
- md->match_function_type = MATCH_CBEGROUP;
- RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
- RRETURN(rrc);
- }
- else goto TAIL_RECURSE;
+ if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
+ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
+ if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
+ rrc = MATCH_NOMATCH;
+ RRETURN(rrc);
}
else /* Condition false & no alternative */
{
@@ -5703,7 +5688,7 @@
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
+ LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
#ifdef SUPPORT_UTF8
LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
Modified: code/trunk/testdata/testinput11
===================================================================
--- code/trunk/testdata/testinput11 2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testinput11 2011-06-15 18:09:23 UTC (rev 609)
@@ -593,4 +593,34 @@
/(A (A|B(*ACCEPT)|C) D)(E)/x
AB
+/\A.*?(?:a|b(*THEN)c)/
+ ba
+
+/\A.*?(?:a|bc)/
+ ba
+
+/\A.*?(a|b(*THEN)c)/
+ ba
+
+/\A.*?(a|bc)/
+ ba
+
+/\A.*?(?:a|b(*THEN)c)++/
+ ba
+
+/\A.*?(?:a|bc)++/
+ ba
+
+/\A.*?(a|b(*THEN)c)++/
+ ba
+
+/\A.*?(a|bc)++/
+ ba
+
+/\A.*?(?:a|b(*THEN)c|d)/
+ ba
+
+/\A.*?(?:a|bc|d)/
+ ba
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testinput2 2011-06-15 18:09:23 UTC (rev 609)
@@ -3673,6 +3673,12 @@
c
c\N
+/^.*?(?(?=a)a|b(*THEN)c)/
+ ba
+
+/^.*?(?(?=a)a|bc)/
+ ba
+
/-- --/
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput11
===================================================================
--- code/trunk/testdata/testoutput11 2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testoutput11 2011-06-15 18:09:23 UTC (rev 609)
@@ -1142,4 +1142,48 @@
1: AB
2: B
+/\A.*?(?:a|b(*THEN)c)/
+ ba
+ 0: ba
+
+/\A.*?(?:a|bc)/
+ ba
+ 0: ba
+
+/\A.*?(a|b(*THEN)c)/
+ ba
+ 0: ba
+ 1: a
+
+/\A.*?(a|bc)/
+ ba
+ 0: ba
+ 1: a
+
+/\A.*?(?:a|b(*THEN)c)++/
+ ba
+ 0: ba
+
+/\A.*?(?:a|bc)++/
+ ba
+ 0: ba
+
+/\A.*?(a|b(*THEN)c)++/
+ ba
+ 0: ba
+ 1: a
+
+/\A.*?(a|bc)++/
+ ba
+ 0: ba
+ 1: a
+
+/\A.*?(?:a|b(*THEN)c|d)/
+ ba
+ 0: ba
+
+/\A.*?(?:a|bc|d)/
+ ba
+ 0: ba
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testoutput2 2011-06-15 18:09:23 UTC (rev 609)
@@ -4414,12 +4414,12 @@
Need char = 'z'
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\M
Minimum match() limit = 8
-Minimum match() recursion limit = 6
+Minimum match() recursion limit = 7
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\M
Minimum match() limit = 32768
-Minimum match() recursion limit = 42
+Minimum match() recursion limit = 43
No match
/(aaa(?C1)bbb|ab)/I
@@ -6411,7 +6411,7 @@
No need char
/* this is a C style comment */\M
Minimum match() limit = 120
-Minimum match() recursion limit = 6
+Minimum match() recursion limit = 35
0: /* this is a C style comment */
1: /* this is a C style comment */
@@ -11567,6 +11567,14 @@
c\N
0: c
+/^.*?(?(?=a)a|b(*THEN)c)/
+ ba
+ 0: ba
+
+/^.*?(?(?=a)a|bc)/
+ ba
+ 0: ba
+
/-- --/
/-- End of testinput2 --/