Revision: 994
http://vcs.pcre.org/viewvc?view=rev&revision=994
Author: ph10
Date: 2012-07-10 15:29:26 +0100 (Tue, 10 Jul 2012)
Log Message:
-----------
Fix patterns that incorrectly set "anchored" or "start of line" for .* inside
atomic parentheses or when *PRUNE or *SKIP is present.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcrepattern.3
code/trunk/pcre_compile.c
code/trunk/pcre_internal.h
code/trunk/testdata/testinput1
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/ChangeLog 2012-07-10 14:29:26 UTC (rev 994)
@@ -16,6 +16,11 @@
5. Rename the "leave" variable names to "quit" to improve WinCE compatibility.
Reported by Giuseppe D'Angelo.
+
+6. The PCRE_STARTLINE bit, indicating that a match can occur only at the start
+ of a line, was being set incorrectly in cases where .* appeared inside
+ atomic brackets at the start of a pattern, or where there was a subsequent
+ *PRUNE or *SKIP.
Version 8.31 06-July-2012
Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/doc/pcrepattern.3 2012-07-10 14:29:26 UTC (rev 994)
@@ -1621,7 +1621,7 @@
worth setting PCRE_DOTALL in order to obtain this optimization, or
alternatively using ^ to indicate anchoring explicitly.
.P
-However, there is one situation where the optimization cannot be used. When .*
+However, there are some cases where the optimization cannot be used. When .*
is inside capturing parentheses that are the subject of a back reference
elsewhere in the pattern, a match at the start may fail where a later one
succeeds. Consider, for example:
@@ -1631,6 +1631,15 @@
If the subject is "xyz123abc123" the match point is the fourth character. For
this reason, such a pattern is not implicitly anchored.
.P
+Another case where implicit anchoring is not applied is when the leading .* is
+inside an atomic group. Once again, a match at the start may fail where a later
+one succeeds. Consider this pattern:
+.sp
+ (?>.*?a)b
+.sp
+It matches "ab" in the subject "aab". The use of the backtracking control verbs
+(*PRUNE) and (*SKIP) also disable this optimization.
+.P
When a capturing subpattern is repeated, the value captured is the substring
that matched the final iteration. For example, after
.sp
@@ -2913,6 +2922,6 @@
.rs
.sp
.nf
-Last updated: 17 June 2012
+Last updated: 10 July 2012
Copyright (c) 1997-2012 University of Cambridge.
.fi
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/pcre_compile.c 2012-07-10 14:29:26 UTC (rev 994)
@@ -5636,6 +5636,8 @@
if (namelen == verbs[i].len &&
STRNCMP_UC_C8(name, vn, namelen) == 0)
{
+ int setverb;
+
/* Check for open captures before ACCEPT and convert it to
ASSERT_ACCEPT if in an assertion. */
@@ -5653,7 +5655,8 @@
*code++ = OP_CLOSE;
PUT2INC(code, 0, oc->number);
}
- *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
+ setverb = *code++ =
+ (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
/* Do not set firstchar after *ACCEPT */
if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
@@ -5668,8 +5671,7 @@
*errorcodeptr = ERR66;
goto FAILED;
}
- *code = verbs[i].op;
- if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
+ setverb = *code++ = verbs[i].op;
}
else
@@ -5679,14 +5681,28 @@
*errorcodeptr = ERR59;
goto FAILED;
}
- *code = verbs[i].op_arg;
- if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
+ setverb = *code++ = verbs[i].op_arg;
*code++ = arglen;
memcpy(code, arg, IN_UCHARS(arglen));
code += arglen;
*code++ = 0;
}
+ switch (setverb)
+ {
+ case OP_THEN:
+ case OP_THEN_ARG:
+ cd->external_flags |= PCRE_HASTHEN;
+ break;
+
+ case OP_PRUNE:
+ case OP_PRUNE_ARG:
+ case OP_SKIP:
+ case OP_SKIP_ARG:
+ cd->had_pruneorskip = TRUE;
+ break;
+ }
+
break; /* Found verb, exit loop */
}
@@ -7323,19 +7339,23 @@
However, by keeping a bitmap of the first 31 back references, we can catch some
of the more common cases more precisely.
+... A second exception is when the .* appears inside an atomic group, because
+this prevents the number of characters it matches from being adjusted.
+
Arguments:
code points to start of expression (the bracket)
bracket_map a bitmap of which brackets we are inside while testing; this
handles up to substring 31; after that we just have to take
the less precise approach
- backref_map the back reference bitmap
+ cd points to the compile data block
+ atomcount atomic group level
Returns: TRUE or FALSE
*/
static BOOL
is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
- unsigned int backref_map)
+ compile_data *cd, int atomcount)
{
do {
const pcre_uchar *scode = first_significant_code(
@@ -7347,7 +7367,7 @@
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
+ if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
}
/* Capturing brackets */
@@ -7357,30 +7377,40 @@
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
- if (!is_anchored(scode, new_map, backref_map)) return FALSE;
+ if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
}
- /* Other brackets */
+ /* Positive forward assertions and conditions */
- else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
- op == OP_COND)
+ else if (op == OP_ASSERT || op == OP_COND)
{
- if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
+ if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
}
+ /* Atomic groups */
+
+ else if (op == OP_ONCE || op == OP_ONCE_NC)
+ {
+ if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
+ return FALSE;
+ }
+
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
- it isn't in brackets that are or may be referenced. */
+ it isn't in brackets that are or may be referenced or inside an atomic
+ group. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
op == OP_TYPEPOSSTAR))
{
- if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
+ if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
+ atomcount > 0 || cd->had_pruneorskip)
return FALSE;
}
/* Check for explicit anchoring */
else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
+
code += GET(code, 1);
}
while (*code == OP_ALT); /* Loop for each alternative */
@@ -7398,21 +7428,24 @@
matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
-because in that case we can't make the assumption.
+because in that case we can't make the assumption. Also, the appearance of .*
+inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
+count, because once again the assumption no longer holds.
Arguments:
code points to start of expression (the bracket)
bracket_map a bitmap of which brackets we are inside while testing; this
handles up to substring 31; after that we just have to take
the less precise approach
- backref_map the back reference bitmap
+ cd points to the compile data
+ atomcount atomic group level
Returns: TRUE or FALSE
*/
static BOOL
is_startline(const pcre_uchar *code, unsigned int bracket_map,
- unsigned int backref_map)
+ compile_data *cd, int atomcount)
{
do {
const pcre_uchar *scode = first_significant_code(
@@ -7438,7 +7471,7 @@
return FALSE;
default: /* Assertion */
- if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
do scode += GET(scode, 1); while (*scode == OP_ALT);
scode += 1 + LINK_SIZE;
break;
@@ -7452,7 +7485,7 @@
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
}
/* Capturing brackets */
@@ -7462,25 +7495,40 @@
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
- if (!is_startline(scode, new_map, backref_map)) return FALSE;
+ if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
}
- /* Other brackets */
+ /* Positive forward assertions */
- else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
+ else if (op == OP_ASSERT)
{
- if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
}
+
+ /* Atomic brackets */
- /* .* means "start at start or after \n" if it isn't in brackets that
- may be referenced. */
+ else if (op == OP_ONCE || op == OP_ONCE_NC)
+ {
+ if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
+ }
+ /* .* means "start at start or after \n" if it isn't in atomic brackets or
+ brackets that may be referenced, as long as the pattern does not contain
+ *PRUNE or *SKIP, because these break the feature. Consider, for example,
+ /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
+ start of a line. */
+
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
{
- if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+ if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
+ atomcount > 0 || cd->had_pruneorskip)
+ return FALSE;
}
- /* Check for explicit circumflex */
+ /* Check for explicit circumflex; anything else gives a FALSE result. Note
+ in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
+ because the number of characters matched by .* cannot be adjusted inside
+ them. */
else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
@@ -7939,6 +7987,7 @@
cd->hwm = (pcre_uchar *)(cd->start_workspace);
cd->req_varyopt = 0;
cd->had_accept = FALSE;
+cd->had_pruneorskip = FALSE;
cd->check_lookbehind = FALSE;
cd->open_caps = NULL;
@@ -8062,19 +8111,19 @@
}
/* If the anchored option was not passed, set the flag if we can determine that
-the pattern is anchored by virtue of ^ characters or \A or anything else (such
-as starting with .* when DOTALL is set).
+the pattern is anchored by virtue of ^ characters or \A or anything else, such
+as starting with non-atomic .* when DOTALL is set and there are no occurrences
+of *PRUNE or *SKIP.
Otherwise, if we know what the first byte has to be, save it, because that
speeds up unanchored matches no end. If not, see if we can set the
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
-start with ^. and also when all branches start with .* for non-DOTALL matches.
-*/
+start with ^. and also when all branches start with non-atomic .* for
+non-DOTALL matches when *PRUNE and SKIP are not present. */
if ((re->options & PCRE_ANCHORED) == 0)
{
- if (is_anchored(codestart, 0, cd->backref_map))
- re->options |= PCRE_ANCHORED;
+ if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
else
{
if (firstchar < 0)
@@ -8111,8 +8160,8 @@
re->flags |= PCRE_FIRSTSET;
}
- else if (is_startline(codestart, 0, cd->backref_map))
- re->flags |= PCRE_STARTLINE;
+
+ else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
}
}
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/pcre_internal.h 2012-07-10 14:29:26 UTC (rev 994)
@@ -2041,6 +2041,7 @@
int external_flags; /* External flag bits to be set */
int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL had_accept; /* (*ACCEPT) encountered */
+ BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
BOOL check_lookbehind; /* Lookbehinds need later checking */
int nltype; /* Newline type */
int nllen; /* Newline string length */
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/testdata/testinput1 2012-07-10 14:29:26 UTC (rev 994)
@@ -5262,4 +5262,45 @@
/((?>a?)*)*c/
aac
+/(?>.*?a)(?<=ba)/
+ aba
+
+/(?:.*?a)(?<=ba)/
+ aba
+
+/.*?a(*PRUNE)b/
+ aab
+
+/.*?a(*PRUNE)b/s
+ aab
+
+/^a(*PRUNE)b/s
+ aab
+
+/.*?a(*SKIP)b/
+ aab
+
+/(?>.*?a)b/s
+ aab
+
+/(?>.*?a)b/
+ aab
+
+/(?>^a)b/s
+ aab
+
+/(?>.*?)(?<=(abcd)|(wxyz))/
+ alphabetabcd
+ endingwxyz
+
+/(?>.*)(?<=(abcd)|(wxyz))/
+ alphabetabcd
+ endingwxyz
+
+"(?>.*)foo"
+ abcdfooxyz
+
+"(?>.*?)foo"
+ abcdfooxyz
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/testdata/testinput2 2012-07-10 14:29:26 UTC (rev 994)
@@ -3768,5 +3768,40 @@
/((?=a(*COMMIT)b)ab|ac){0}(?:(?1)|a(c))/
ac
+
+/-- These are all run as real matches in test 1; here we are just checking the
+settings of the anchored and startline bits. */
+/(?>.*?a)(?<=ba)/I
+
+/(?:.*?a)(?<=ba)/I
+
+/.*?a(*PRUNE)b/I
+
+/.*?a(*PRUNE)b/sI
+
+/^a(*PRUNE)b/sI
+
+/.*?a(*SKIP)b/I
+
+/(?>.*?a)b/sI
+
+/(?>.*?a)b/I
+
+/(?>^a)b/sI
+
+/(?>.*?)(?<=(abcd)|(wxyz))/I
+
+/(?>.*)(?<=(abcd)|(wxyz))/I
+
+"(?>.*)foo"I
+
+"(?>.*?)foo"I
+
+/(?>^abc)/mI
+
+/(?>.*abc)/mI
+
+/(?:.*abc)/mI
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/testdata/testoutput1 2012-07-10 14:29:26 UTC (rev 994)
@@ -8733,4 +8733,66 @@
0: aac
1:
+/(?>.*?a)(?<=ba)/
+ aba
+ 0: ba
+
+/(?:.*?a)(?<=ba)/
+ aba
+ 0: aba
+
+/.*?a(*PRUNE)b/
+ aab
+ 0: ab
+
+/.*?a(*PRUNE)b/s
+ aab
+ 0: ab
+
+/^a(*PRUNE)b/s
+ aab
+No match
+
+/.*?a(*SKIP)b/
+ aab
+ 0: ab
+
+/(?>.*?a)b/s
+ aab
+ 0: ab
+
+/(?>.*?a)b/
+ aab
+ 0: ab
+
+/(?>^a)b/s
+ aab
+No match
+
+/(?>.*?)(?<=(abcd)|(wxyz))/
+ alphabetabcd
+ 0:
+ 1: abcd
+ endingwxyz
+ 0:
+ 1: <unset>
+ 2: wxyz
+
+/(?>.*)(?<=(abcd)|(wxyz))/
+ alphabetabcd
+ 0: alphabetabcd
+ 1: abcd
+ endingwxyz
+ 0: endingwxyz
+ 1: <unset>
+ 2: wxyz
+
+"(?>.*)foo"
+ abcdfooxyz
+No match
+
+"(?>.*?)foo"
+ abcdfooxyz
+ 0: foo
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2012-07-10 04:33:00 UTC (rev 993)
+++ code/trunk/testdata/testoutput2 2012-07-10 14:29:26 UTC (rev 994)
@@ -768,7 +768,7 @@
/(?>.*)(?<=(abcd)|(xyz))/I
Capturing subpattern count = 2
No options
-First char at start or follows newline
+No first char
No need char
Max lookbehind = 4
alphabetabcd
@@ -10110,7 +10110,7 @@
"(?>.*/)foo"SI
Capturing subpattern count = 0
No options
-First char at start or follows newline
+No first char
Need char = 'o'
Subject length lower bound = 4
No set of starting bytes
@@ -12360,5 +12360,108 @@
/((?=a(*COMMIT)b)ab|ac){0}(?:(?1)|a(c))/
ac
0: ac
+
+/-- These are all run as real matches in test 1; here we are just checking the
+settings of the anchored and startline bits. */
+/(?>.*?a)(?<=ba)/I
+Capturing subpattern count = 0
+No options
+No first char
+Need char = 'a'
+Max lookbehind = 2
+
+/(?:.*?a)(?<=ba)/I
+Capturing subpattern count = 0
+No options
+First char at start or follows newline
+Need char = 'a'
+Max lookbehind = 2
+
+/.*?a(*PRUNE)b/I
+Capturing subpattern count = 0
+No options
+No first char
+Need char = 'b'
+
+/.*?a(*PRUNE)b/sI
+Capturing subpattern count = 0
+Options: dotall
+No first char
+Need char = 'b'
+
+/^a(*PRUNE)b/sI
+Capturing subpattern count = 0
+Options: anchored dotall
+No first char
+No need char
+
+/.*?a(*SKIP)b/I
+Capturing subpattern count = 0
+No options
+No first char
+Need char = 'b'
+
+/(?>.*?a)b/sI
+Capturing subpattern count = 0
+Options: dotall
+No first char
+Need char = 'b'
+
+/(?>.*?a)b/I
+Capturing subpattern count = 0
+No options
+No first char
+Need char = 'b'
+
+/(?>^a)b/sI
+Capturing subpattern count = 0
+Options: anchored dotall
+No first char
+No need char
+
+/(?>.*?)(?<=(abcd)|(wxyz))/I
+Capturing subpattern count = 2
+No options
+No first char
+No need char
+Max lookbehind = 4
+
+/(?>.*)(?<=(abcd)|(wxyz))/I
+Capturing subpattern count = 2
+No options
+No first char
+No need char
+Max lookbehind = 4
+
+"(?>.*)foo"I
+Capturing subpattern count = 0
+No options
+No first char
+Need char = 'o'
+
+"(?>.*?)foo"I
+Capturing subpattern count = 0
+No options
+No first char
+Need char = 'o'
+
+/(?>^abc)/mI
+Capturing subpattern count = 0
+Options: multiline
+First char at start or follows newline
+Need char = 'c'
+
+/(?>.*abc)/mI
+Capturing subpattern count = 0
+Options: multiline
+No first char
+Need char = 'c'
+
+/(?:.*abc)/mI
+Capturing subpattern count = 0
+Options: multiline
+First char at start or follows newline
+Need char = 'c'
+
/-- End of testinput2 --/