Revision: 615
http://vcs.pcre.org/viewvc?view=rev&revision=615
Author: ph10
Date: 2011-07-11 15:23:06 +0100 (Mon, 11 Jul 2011)
Log Message:
-----------
A better patch for the bug in which captures set inside atomic groups or assertions were not being reset.
Modified Paths:
--------------
code/trunk/pcre_exec.c
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2011-07-09 10:48:16 UTC (rev 614)
+++ code/trunk/pcre_exec.c 2011-07-11 14:23:06 UTC (rev 615)
@@ -847,23 +847,6 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
-
- /* If md->end_offset_top is greater than offset_top, it means that the
- branch we have just failed to match did manage to match some capturing
- parentheses within an atomic group or an assertion. Although offset_top
- reverts to its original value at this level, we must unset the captured
- values in case a later match sets a higher capturing number. Example:
- matching /((?>(a))b|(a)c)/ against "ac". This captures 3, but we need
- to ensure that 2 - which was captured in the atomic matching - is
- unset. */
-
- if (md->end_offset_top > offset_top)
- {
- register int *iptr = md->offset_vector + offset_top;
- register int *iend = md->offset_vector + md->end_offset_top;
- while (iptr < iend) *iptr++ = -1;
- }
-
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
@@ -909,16 +892,6 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
-
- /* See explanatory comment above under OP_CBRA. */
-
- if (md->end_offset_top > offset_top)
- {
- register int *iptr = md->offset_vector + offset_top;
- register int *iend = md->offset_vector + md->end_offset_top;
- while (iptr < iend) *iptr++ = -1;
- }
-
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
}
@@ -989,16 +962,6 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
-
- /* See explanatory comment above under OP_CBRA. */
-
- if (md->end_offset_top > offset_top)
- {
- register int *iptr = md->offset_vector + offset_top;
- register int *iend = md->offset_vector + md->end_offset_top;
- while (iptr < iend) *iptr++ = -1;
- }
-
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
@@ -1061,16 +1024,6 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
-
- /* See explanatory comment above under OP_CBRA. */
-
- if (md->end_offset_top > offset_top)
- {
- register int *iptr = md->offset_vector + offset_top;
- register int *iend = md->offset_vector + md->end_offset_top;
- while (iptr < iend) *iptr++ = -1;
- }
-
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
}
@@ -1413,16 +1366,6 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
-
- /* See explanatory comment above under OP_CBRA. */
-
- if (md->end_offset_top > offset_top)
- {
- register int *iptr = md->offset_vector + offset_top;
- register int *iend = md->offset_vector + md->end_offset_top;
- while (iptr < iend) *iptr++ = -1;
- }
-
ecode += GET(ecode, 1);
}
while (*ecode == OP_ALT);
@@ -1650,16 +1593,6 @@
if (rrc != MATCH_NOMATCH &&
(rrc != MATCH_THEN || md->start_match_ptr != ecode))
RRETURN(rrc);
-
- /* See explanatory comment above under OP_CBRA. */
-
- if (md->end_offset_top > offset_top)
- {
- register int *iptr = md->offset_vector + offset_top;
- register int *iend = md->offset_vector + md->end_offset_top;
- while (iptr < iend) *iptr++ = -1;
- }
-
ecode += GET(ecode,1);
}
while (*ecode == OP_ALT);
@@ -1807,6 +1740,25 @@
md->capture_last = number;
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
{
+ /* If offset is greater than offset_top, it means that we are
+ "skipping" a capturing group, and that group's offsets must be marked
+ unset. In earlier versions of PCRE, all the offsets were unset at the
+ start of matching, but this doesn't work because atomic groups and
+ assertions can cause a value to be set that should later be unset.
+ Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
+ part of the atomic group, but this is not on the final matching path,
+ so must be unset when 2 is set. (If there is no group 2, there is no
+ problem, because offset_top will then be 2, indicating no capture.) */
+
+ if (offset > offset_top)
+ {
+ register int *iptr = md->offset_vector + offset_top;
+ register int *iend = md->offset_vector + offset;
+ while (iptr < iend) *iptr++ = -1;
+ }
+
+ /* Now make the extraction */
+
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
@@ -5859,7 +5811,7 @@
PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
int offsetcount)
{
-int rc, resetcount, ocount;
+int rc, ocount;
int first_byte = -1;
int req_byte = -1;
int req_byte2 = -1;
@@ -6108,22 +6060,19 @@
md->offset_overflow = FALSE;
md->capture_last = -1;
-/* Compute the minimum number of offsets that we need to reset each time. Doing
-this makes a huge difference to execution time when there aren't many brackets
-in the pattern. */
-
-resetcount = 2 + re->top_bracket * 2;
-if (resetcount > offsetcount) resetcount = ocount;
-
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
-initialize them to avoid reading uninitialized locations. */
+initialize them to avoid reading uninitialized locations. Also, unset the
+offsets for the matched string. This is really just for tidiness with callouts,
+in case they inspect these fields. */
if (md->offset_vector != NULL)
{
register int *iptr = md->offset_vector + ocount;
- register int *iend = iptr - resetcount/2 + 1;
+ register int *iend = iptr - re->top_bracket;
+ if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
while (--iptr >= iend) *iptr = -1;
+ md->offset_vector[0] = md->offset_vector[1] = -1;
}
/* Set up the first character to match, if available. The first_byte value is
@@ -6157,6 +6106,8 @@
}
+
+
/* ==========================================================================*/
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
@@ -6167,15 +6118,6 @@
USPTR save_end_subject = end_subject;
USPTR new_start_match;
- /* Reset the maximum number of extractions we might see. */
-
- if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector;
- register int *iend = iptr + resetcount;
- while (iptr < iend) *iptr++ = -1;
- }
-
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the first
newline. Implement this by temporarily adjusting end_subject so that we stop