Revision: 587
http://www.exim.org/viewvc/pcre2?view=rev&revision=587
Author: ph10
Date: 2016-11-01 15:58:28 +0000 (Tue, 01 Nov 2016)
Log Message:
-----------
Fix auto-anchor bug when .* is inside an assertion.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput1
code/trunk/testdata/testoutput1
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2016-11-01 12:12:45 UTC (rev 586)
+++ code/trunk/ChangeLog 2016-11-01 15:58:28 UTC (rev 587)
@@ -29,15 +29,7 @@
existing subpattern.
(e) A conditional recursion test such as (?(R)...) misbehaved if there was a
group whose name began with "R".
- (f) The amount of memory needed for a compiled pattern was miscalculated if a
- lookbehind contained more than one toplevel branch and the first branch
- was of length zero.
- (g) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero-
- terminated pattern, if a # comment ran on to the end of the pattern, one
- or more code units past the end were being read.
- (h) An unterminated repeat at the end of a non-zero-terminated pattern (e.g.
- "{2,2") could cause reading beyond the pattern.
-
+
One effect of the refactoring is that some error numbers and messages have
changed, and the pattern offset given for compiling errors is not always the
right-most character that has been read. In particular, for a variable-length
@@ -61,6 +53,17 @@
a lookup outside one of the global tables. A similar bug existed for wide
characters in *VERB names.
+ (d) The amount of memory needed for a compiled pattern was miscalculated if a
+ lookbehind contained more than one toplevel branch and the first branch
+ was of length zero.
+
+ (e) In UTF-8 or UTF-16 modes with PCRE2_EXTENDED (/x) set and a non-zero-
+ terminated pattern, if a # comment ran on to the end of the pattern, one
+ or more code units past the end were being read.
+
+ (f) An unterminated repeat at the end of a non-zero-terminated pattern (e.g.
+ "{2,2") could cause reading beyond the pattern.
+
4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the
reference is by name, there is only one group of that name. The referenced
@@ -122,7 +125,11 @@
compiled. A non-installed binary to run the test function locally, called
pcre2fuzzcheck is also compiled.
+18. A pattern with PCRE2_DOTALL (/s) set but not PCRE2_NO_DOTSTAR_ANCHOR, and
+which started with .* inside a positive lookahead, was incorrectly being
+compiled as implicitly anchored.
+
Version 10.22 29-July-2016
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2016-11-01 12:12:45 UTC (rev 586)
+++ code/trunk/src/pcre2_compile.c 2016-11-01 15:58:28 UTC (rev 587)
@@ -7634,6 +7634,7 @@
the less precise approach
cb points to the compile data block
atomcount atomic group level
+ inassert TRUE if in an assertion
Returns: TRUE or FALSE
*/
@@ -7640,7 +7641,7 @@
static BOOL
is_anchored(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
- int atomcount)
+ int atomcount, BOOL inassert)
{
do {
PCRE2_SPTR scode = first_significant_code(
@@ -7652,7 +7653,8 @@
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
+ return FALSE;
}
/* Capturing brackets */
@@ -7662,33 +7664,44 @@
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
- if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE;
+ if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
}
- /* Positive forward assertions and conditions */
+ /* Positive forward assertion */
- else if (op == OP_ASSERT || op == OP_COND)
+ else if (op == OP_ASSERT)
{
- if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
}
+ /* Condition */
+
+ else if (op == OP_COND)
+ {
+ if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
+ return FALSE;
+ }
+
/* Atomic groups */
else if (op == OP_ONCE || op == OP_ONCE_NC)
{
- if (!is_anchored(scode, bracket_map, cb, atomcount + 1))
+ if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
return FALSE;
}
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
it isn't in brackets that are or may be referenced or inside an atomic
- group. There is also an option that disables auto-anchoring. */
+ group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
+ because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
+ with the subject "aab", which matches "b", i.e. not at the start of the subject.
+ There is also an option that disables auto-anchoring. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
op == OP_TYPEPOSSTAR))
{
if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
- atomcount > 0 || cb->had_pruneorskip ||
+ atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
return FALSE;
}
@@ -9423,7 +9436,7 @@
disable this case). */
if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
- is_anchored(codestart, 0, &cb, 0))
+ is_anchored(codestart, 0, &cb, 0, FALSE))
re->overall_options |= PCRE2_ANCHORED;
/* If the pattern is still not anchored and we do not have a first code unit,
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2016-11-01 12:12:45 UTC (rev 586)
+++ code/trunk/testdata/testinput1 2016-11-01 15:58:28 UTC (rev 587)
@@ -5812,4 +5812,7 @@
/(?=.*X)X$/
\ X
+/(?s)(?=.*?)b/
+ aabc
+
# End of testinput1
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2016-11-01 12:12:45 UTC (rev 586)
+++ code/trunk/testdata/testoutput1 2016-11-01 15:58:28 UTC (rev 587)
@@ -9285,4 +9285,8 @@
\ X
0: X
+/(?s)(?=.*?)b/
+ aabc
+ 0: b
+
# End of testinput1