Revision: 447
http://vcs.pcre.org/viewvc?view=rev&revision=447
Author: ph10
Date: 2009-09-15 19:17:54 +0100 (Tue, 15 Sep 2009)
Log Message:
-----------
Capture data when (*ACCEPT) is inside capturing parentheses.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcrecompat.3
code/trunk/doc/pcrepattern.3
code/trunk/pcre_compile.c
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/pcre_printint.src
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/ChangeLog 2009-09-15 18:17:54 UTC (rev 447)
@@ -123,6 +123,11 @@
with unset values at the outer level. The correct (outer level) value is
now given.
+22. If (*ACCEPT) appeared inside capturing parentheses, previous releases of
+ PCRE did not set those parentheses (unlike Perl). I have now found a way to
+ make it do so. The string so far is captured, making this feature
+ compatible with Perl.
+
Version 7.9 11-Apr-09
---------------------
Modified: code/trunk/doc/pcrecompat.3
===================================================================
--- code/trunk/doc/pcrecompat.3 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/doc/pcrecompat.3 2009-09-15 18:17:54 UTC (rev 447)
@@ -83,8 +83,7 @@
.P
11. PCRE does support Perl 5.10's backtracking verbs (*ACCEPT), (*FAIL), (*F),
(*COMMIT), (*PRUNE), (*SKIP), and (*THEN), but only in the forms without an
-argument. PCRE does not support (*MARK). If (*ACCEPT) is within capturing
-parentheses, PCRE does not set that capture group; this is different to Perl.
+argument. PCRE does not support (*MARK).
.P
12. PCRE provides some extensions to the Perl regular expression facilities.
Perl 5.10 will include new features that are not in earlier versions, some of
@@ -143,6 +142,6 @@
.rs
.sp
.nf
-Last updated: 11 September 2009
+Last updated: 15 September 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi
Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/doc/pcrepattern.3 2009-09-15 18:17:54 UTC (rev 447)
@@ -2155,14 +2155,13 @@
.sp
This verb causes the match to end successfully, skipping the remainder of the
pattern. When inside a recursion, only the innermost pattern is ended
-immediately. PCRE differs from Perl in what happens if the (*ACCEPT) is inside
-capturing parentheses. In Perl, the data so far is captured: in PCRE no data is
-captured. For example:
+immediately. If the (*ACCEPT) is inside capturing parentheses, the data so far
+is captured. (This feature was added to PCRE at release 8.00.) For example:
.sp
- A(A|B(*ACCEPT)|C)D
+ A((?:A|B(*ACCEPT)|C)D)
.sp
-This matches "AB", "AAD", or "ACD", but when it matches "AB", no data is
-captured.
+This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by
+the outer parentheses.
.sp
(*FAIL) or (*F)
.sp
@@ -2259,6 +2258,6 @@
.rs
.sp
.nf
-Last updated: 13 September 2009
+Last updated: 15 September 2009
Copyright (c) 1997-2009 University of Cambridge.
.fi
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/pcre_compile.c 2009-09-15 18:17:54 UTC (rev 447)
@@ -4440,8 +4440,19 @@
if (namelen == verbs[i].len &&
strncmp((char *)name, vn, namelen) == 0)
{
- *code = verbs[i].op;
- if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
+ /* Check for open captures before ACCEPT */
+
+ if (verbs[i].op == OP_ACCEPT)
+ {
+ open_capitem *oc;
+ cd->had_accept = TRUE;
+ for (oc = cd->open_caps; oc != NULL; oc = oc->next)
+ {
+ *code++ = OP_CLOSE;
+ PUT2INC(code, 0, oc->number);
+ }
+ }
+ *code++ = verbs[i].op;
break;
}
vn += verbs[i].len + 1;
@@ -5669,6 +5680,8 @@
uschar *last_branch = code;
uschar *start_bracket = code;
uschar *reverse_count = NULL;
+open_capitem capitem;
+int capnumber = 0;
int firstbyte, reqbyte;
int branchfirstbyte, branchreqbyte;
int length;
@@ -5695,6 +5708,17 @@
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */
+/* If this is a capturing subpattern, add to the chain of open capturing items
+so that we can detect them if (*ACCEPT) is encountered. */
+
+if (*code == OP_CBRA)
+ {
+ capnumber = GET2(code, 1 + LINK_SIZE);
+ capitem.number = capnumber;
+ capitem.next = cd->open_caps;
+ cd->open_caps = &capitem;
+ }
+
/* Offset is set zero to mark that this bracket is still open */
PUT(code, 1, 0);
@@ -5830,6 +5854,10 @@
}
while (branch_length > 0);
}
+
+ /* If it was a capturing subpattern, remove it from the chain. */
+
+ if (capnumber > 0) cd->open_caps = cd->open_caps->next;
/* Fill in the ket */
@@ -6398,6 +6426,7 @@
cd->req_varyopt = 0;
cd->external_options = options;
cd->external_flags = 0;
+cd->open_caps = NULL;
/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
don't need to look at the result of the function here. The initial options have
@@ -6472,6 +6501,7 @@
cd->hwm = cworkspace;
cd->req_varyopt = 0;
cd->had_accept = FALSE;
+cd->open_caps = NULL;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, errorcode will be set non-zero, so we don't need to look at the result
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/pcre_exec.c 2009-09-15 18:17:54 UTC (rev 447)
@@ -909,8 +909,32 @@
ecode += 1 + LINK_SIZE;
}
break;
+
+ /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
+ to close any currently open capturing brackets. */
+
+ case OP_CLOSE:
+ number = GET2(ecode, 1);
+ offset = number << 1;
+
+#ifdef DEBUG
+ printf("end bracket %d at *ACCEPT", number);
+ printf("\n");
+#endif
+ md->capture_last = number;
+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ {
+ md->offset_vector[offset] =
+ md->offset_vector[md->offset_end - number];
+ md->offset_vector[offset+1] = eptr - md->start_subject;
+ if (offset_top <= offset) offset_top = offset + 2;
+ }
+ ecode += 3;
+ break;
+
+
/* End of the pattern, either real or forced. If we are in a top-level
recursion, we should restore the offsets appropriately and continue from
after the call. */
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/pcre_internal.h 2009-09-15 18:17:54 UTC (rev 447)
@@ -1364,10 +1364,11 @@
OP_FAIL, /* 109 */
OP_ACCEPT, /* 110 */
+ OP_CLOSE, /* 111 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
- OP_SKIPZERO /* 111 */
+ OP_SKIPZERO /* 112 */
};
@@ -1393,7 +1394,7 @@
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
- "Skip zero"
+ "Close", "Skip zero"
/* This macro defines the length of fixed length operations in the compiled
@@ -1458,7 +1459,7 @@
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
- 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
+ 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
@@ -1521,6 +1522,15 @@
uschar start_bits[32];
} pcre_study_data;
+/* Structure for building a chain of open capturing subpatterns during
+compiling, so that instructions to close them can be compiled when (*ACCEPT) is
+encountered. */
+
+typedef struct open_capitem {
+ struct open_capitem *next; /* Next enclosing open capture; NULL ends the chain */
+ pcre_uint16 number; /* Capture group number, written after OP_CLOSE */
+} open_capitem;
+
/* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe. */
@@ -1533,6 +1543,7 @@
const uschar *start_code; /* The start of the compiled code */
const uschar *start_pattern; /* The start of the pattern */
const uschar *end_pattern; /* The end of the pattern */
+ open_capitem *open_caps; /* Chain of open capture items */
uschar *hwm; /* High watermark of workspace */
uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
Modified: code/trunk/pcre_printint.src
===================================================================
--- code/trunk/pcre_printint.src 2009-09-15 10:49:50 UTC (rev 446)
+++ code/trunk/pcre_printint.src 2009-09-15 18:17:54 UTC (rev 447)
@@ -245,6 +245,10 @@
else fprintf(f, " ");
fprintf(f, "%s", OP_names[*code]);
break;
+
+ case OP_CLOSE:
+ fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
+ break;
case OP_CREF:
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);