Revision: 614
http://vcs.pcre.org/viewvc?view=rev&revision=614
Author: ph10
Date: 2011-07-09 11:48:16 +0100 (Sat, 09 Jul 2011)
Log Message:
-----------
Fix atomic group and assertion capturing problems.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/testdata/testinput1
code/trunk/testdata/testoutput1
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2011-07-02 16:59:52 UTC (rev 613)
+++ code/trunk/ChangeLog 2011-07-09 10:48:16 UTC (rev 614)
@@ -113,6 +113,15 @@
21. When (*ACCEPT) was used in an assertion that matched an empty string and
PCRE_NOTEMPTY was set, PCRE applied the non-empty test to the assertion.
+
+22. When an atomic group that contained a capturing parenthesis was
+ successfully matched, but the branch in which it appeared failed, the
+ capturing was not being forgotten if a higher numbered group was later
+ captured. For example, /(?>(a))b|(a)c/ when matching "ac" set capturing
+ group 1 to "a", when in fact it should be unset. This applied to multi-
+ branched capturing and non-capturing groups, repeated or not, to positive
+ assertions (capturing in negative assertions is not well defined in
+ PCRE), and to nested atomic groups.
Version 8.12 15-Jan-2011
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2011-07-02 16:59:52 UTC (rev 613)
+++ code/trunk/pcre_exec.c 2011-07-09 10:48:16 UTC (rev 614)
@@ -847,6 +847,23 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
+
+ /* If md->end_offset_top is greater than offset_top, it means that the
+ branch we have just failed to match did manage to match some capturing
+ parentheses within an atomic group or an assertion. Although offset_top
+ reverts to its original value at this level, we must unset the captured
+ values in case a later match sets a higher capturing number. Example:
+ matching /((?>(a))b|(a)c)/ against "ac". This captures group 3, but we
+ need to ensure that group 2 - which was captured while matching the
+ atomic group - is unset. */
+
+ if (md->end_offset_top > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + md->end_offset_top;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
@@ -892,6 +909,16 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
+
+ /* See explanatory comment above under OP_CBRA. */
+
+ if (md->end_offset_top > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + md->end_offset_top;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
}
@@ -962,6 +989,16 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
+
+ /* See explanatory comment above under OP_CBRA. */
+
+ if (md->end_offset_top > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + md->end_offset_top;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
@@ -1024,6 +1061,16 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
+
+ /* See explanatory comment above under OP_CBRA. */
+
+ if (md->end_offset_top > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + md->end_offset_top;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
}
@@ -1366,6 +1413,16 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
+
+ /* See explanatory comment above under OP_CBRA. */
+
+ if (md->end_offset_top > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + md->end_offset_top;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
@@ -1593,6 +1650,16 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
+
+ /* See explanatory comment above under OP_CBRA. */
+
+ if (md->end_offset_top > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + md->end_offset_top;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
@@ -1601,8 +1668,8 @@
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
- /* Continue as from after the assertion, updating the offsets high water
- mark, since extracts may have been taken. */
+ /* Continue after the group, updating the offsets high water mark, since
+ extracts may have been taken. */
do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
@@ -6298,6 +6365,7 @@
md->start_used_ptr = start_match;
md->match_call_count = 0;
md->match_function_type = 0;
+ md->end_offset_top = 0;
rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2011-07-02 16:59:52 UTC (rev 613)
+++ code/trunk/testdata/testinput1 2011-07-09 10:48:16 UTC (rev 614)
@@ -4133,4 +4133,28 @@
/((a|)+)+Z/
Z
+/(a)b|(a)c/
+ ac
+
+/(?>(a))b|(a)c/
+ ac
+
+/(?=(a))ab|(a)c/
+ ac
+
+/((?>(a))b|(a)c)/
+ ac
+
+/((?>(a))b|(a)c)++/
+ ac
+
+/(?:(?>(a))b|(a)c)++/
+ ac
+
+/(?=(?>(a))b|(a)c)(..)/
+ ac
+
+/(?>(?>(a))b|(a)c)/
+ ac
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2011-07-02 16:59:52 UTC (rev 613)
+++ code/trunk/testdata/testoutput1 2011-07-09 10:48:16 UTC (rev 614)
@@ -6750,4 +6750,55 @@
1:
2:
+/(a)b|(a)c/
+ ac
+ 0: ac
+ 1: <unset>
+ 2: a
+
+/(?>(a))b|(a)c/
+ ac
+ 0: ac
+ 1: <unset>
+ 2: a
+
+/(?=(a))ab|(a)c/
+ ac
+ 0: ac
+ 1: <unset>
+ 2: a
+
+/((?>(a))b|(a)c)/
+ ac
+ 0: ac
+ 1: ac
+ 2: <unset>
+ 3: a
+
+/((?>(a))b|(a)c)++/
+ ac
+ 0: ac
+ 1: ac
+ 2: <unset>
+ 3: a
+
+/(?:(?>(a))b|(a)c)++/
+ ac
+ 0: ac
+ 1: <unset>
+ 2: a
+
+/(?=(?>(a))b|(a)c)(..)/
+ ac
+ 0: ac
+ 1: <unset>
+ 2: a
+ 3: ac
+
+/(?>(?>(a))b|(a)c)/
+ ac
+ 0: ac
+ 1: <unset>
+ 2: a
+
/-- End of testinput1 --/