Revision: 573
http://www.exim.org/viewvc/pcre2?view=rev&revision=573
Author: ph10
Date: 2016-10-18 12:22:40 +0100 (Tue, 18 Oct 2016)
Log Message:
-----------
Fix optimization bugs when pattern starts with lookahead.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput2
code/trunk/testdata/testinput4
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
code/trunk/testdata/testoutput4
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/ChangeLog 2016-10-18 11:22:40 UTC (rev 573)
@@ -80,7 +80,20 @@
14. Add the -t (grand total) option to pcre2grep.
+15. A number of bugs have been mended relating to match start-up optimizations
+when the first thing in a pattern is a positive lookahead. These all applied
+only when PCRE2_NO_START_OPTIMIZE was *not* set:
+ (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed
+ both an initial 'X' and a following 'X'.
+ (b) Some patterns starting with an assertion that started with .* were
+ incorrectly optimized as having to match at the start of the subject or
+ after a newline. There are cases where this is not true, for example,
+ (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that
+ start with spaces. Starting .* in an assertion is no longer taken as an
+ indication of matching at the start (or after a newline).
+
+
Version 10.22 29-July-2016
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/src/pcre2_compile.c 2016-10-18 11:22:40 UTC (rev 573)
@@ -3312,8 +3312,8 @@
if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
{
errorcode = ERR58;
- goto FAILED;
- }
+ goto FAILED;
+ }
goto SET_RECURSION;
/* An item starting (?- followed by a digit comes here via the "default"
@@ -5994,7 +5994,7 @@
zerofirstcuflags = firstcuflags;
groupsetfirstcu = FALSE;
- if (bravalue >= OP_ONCE)
+ if (bravalue >= OP_ONCE) /* Not an assertion */
{
/* If we have not yet set a firstcu in this branch, take it from the
subpattern, remembering that it was set here so that a repeat of more
@@ -6034,15 +6034,19 @@
}
}
- /* For a forward assertion, we take the reqcu, if set. This can be
- helpful if the pattern that follows the assertion doesn't set a different
- char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
- for an assertion, however because it leads to incorrect effect for patterns
- such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
- of a firstcu. This is overcome by a scan at the end if there's no
- firstcu, looking for an asserted first char. */
+ /* For a forward assertion, we take the reqcu, if set, provided that the
+ group has also set a firstcu. This can be helpful if the pattern that
+ follows the assertion doesn't set a different char. For example, it's
+ useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
+ because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
+ the "real" "a" would then become a reqcu instead of a firstcu. This is
+ overcome by a scan at the end if there's no firstcu, looking for an
+ asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
+ we must only take the reqcu when the group also set a firstcu. Otherwise,
+ in that example, 'X' ends up set for both. */
- else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
+ else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
+ subfirstcuflags >= 0)
{
reqcu = subreqcu;
reqcuflags = subreqcuflags;
@@ -6542,8 +6546,9 @@
*lengthptr += delta;
}
- /* This is compiling for real. If there is a set first code unit for
- the group, and we have not yet set a "required code unit", set it. */
+ /* This is compiling for real. If there is a set first code unit
+ for the group, and we have not yet set a "required code unit", set
+ it. */
else
{
@@ -7701,8 +7706,8 @@
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .*
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
-count, because once again the assumption no longer holds.
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
+or *SKIP does not count, because once again the assumption no longer holds.
Arguments:
code points to start of the compiled pattern or a group
@@ -7711,6 +7716,7 @@
the less precise approach
cb points to the compile data
atomcount atomic group level
+ inassert TRUE if in an assertion
Returns: TRUE or FALSE
*/
@@ -7717,7 +7723,7 @@
static BOOL
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
- int atomcount)
+ int atomcount, BOOL inassert)
{
do {
PCRE2_SPTR scode = first_significant_code(
@@ -7748,7 +7754,7 @@
return FALSE;
default: /* Assertion */
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
do scode += GET(scode, 1); while (*scode == OP_ALT);
scode += 1 + LINK_SIZE;
break;
@@ -7762,7 +7768,8 @@
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
+ return FALSE;
}
/* Capturing brackets */
@@ -7772,7 +7779,7 @@
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
- if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
}
/* Positive forward assertions */
@@ -7779,7 +7786,8 @@
else if (op == OP_ASSERT)
{
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
+ return FALSE;
}
/* Atomic brackets */
@@ -7786,19 +7794,21 @@
else if (op == OP_ONCE || op == OP_ONCE_NC)
{
- if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
+ return FALSE;
}
/* .* means "start at start or after \n" if it isn't in atomic brackets or
- brackets that may be referenced, as long as the pattern does not contain
- *PRUNE or *SKIP, because these break the feature. Consider, for example,
- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
- start of a line. There is also an option that disables this optimization. */
+ brackets that may be referenced or an assertion, and as long as the pattern
+ does not contain *PRUNE or *SKIP, because these break the feature. Consider,
+ for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
+ i.e. not at the start of a line. There is also an option that disables this
+ optimization. */
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
{
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
- atomcount > 0 || cb->had_pruneorskip ||
+ atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
return FALSE;
}
@@ -9452,7 +9462,8 @@
when *PRUNE and SKIP are not present. (There is an option that disables this
case.) */
- else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
+ else if (is_startline(codestart, 0, &cb, 0, FALSE))
+ re->flags |= PCRE2_STARTLINE;
}
/* Handle the "required code unit", if one is set. In the case of an anchored
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/testdata/testinput1 2016-10-18 11:22:40 UTC (rev 573)
@@ -5806,4 +5806,10 @@
\= Expect no match
baaab
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+ \ Fred:099
+
+/(?=.*X)X$/
+ \ X
+
# End of testinput1
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/testdata/testinput2 2016-10-18 11:22:40 UTC (rev 573)
@@ -4892,4 +4892,6 @@
/(?<R>abc)(?(R)xyz)/B
+/(?=.*[A-Z])/I
+
# End of testinput2
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/testdata/testinput4 2016-10-18 11:22:40 UTC (rev 573)
@@ -2282,4 +2282,10 @@
\x{389}
\x{20ac}
+/(?=.*b)\pL/
+ 11bb
+
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+ 11bb
+
# End of testinput4
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/testdata/testoutput1 2016-10-18 11:22:40 UTC (rev 573)
@@ -9277,4 +9277,12 @@
baaab
No match
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+ \ Fred:099
+ 0:
+
+/(?=.*X)X$/
+ \ X
+ 0: X
+
# End of testinput1
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/testdata/testoutput2 2016-10-18 11:22:40 UTC (rev 573)
@@ -8750,7 +8750,6 @@
/(?(?=.*b).*b|^d)/I
Capturing subpattern count = 0
-First code unit at start or follows newline
Subject length lower bound = 1
/xyz/auto_callout
@@ -15334,6 +15333,11 @@
End
------------------------------------------------------------------
+/(?=.*[A-Z])/I
+Capturing subpattern count = 0
+May match empty string
+Subject length lower bound = 0
+
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2016-10-16 16:48:14 UTC (rev 572)
+++ code/trunk/testdata/testoutput4 2016-10-18 11:22:40 UTC (rev 573)
@@ -3703,4 +3703,12 @@
\x{20ac}
No match
+/(?=.*b)\pL/
+ 11bb
+ 0: b
+
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+ 11bb
+ 0: b
+
# End of testinput4