Revision: 1669
http://vcs.pcre.org/viewvc?view=rev&revision=1669
Author: ph10
Date: 2016-10-18 16:10:09 +0100 (Tue, 18 Oct 2016)
Log Message:
-----------
Fix optimization bugs for patterns starting with lookaheads.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_compile.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput2
code/trunk/testdata/testinput6
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
code/trunk/testdata/testoutput6
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/ChangeLog 2016-10-18 15:10:09 UTC (rev 1669)
@@ -25,19 +25,32 @@
is in the class. There was a bug that caused this not to happen if a
Unicode property item was added to such a class, for example [\D\P{Nd}] or
[\W\pL].
-
+
7. When pcretest was outputting information from a callout, the caret indicator
for the current position in the subject line was incorrect if it was after
an escape sequence for a character whose code point was greater than
\x{ff}.
-
+
8. A pattern such as (?<RA>abc)(?(R)xyz) was incorrectly compiled such that
the conditional was interpreted as a reference to capturing group 1 instead
of a test for recursion. Any group whose name began with R was
- misinterpreted in this way. (The reference interpretation should only
+ misinterpreted in this way. (The reference interpretation should only
happen if the group's name is precisely "R".)
+9. A number of bugs have been mended relating to match start-up optimizations
+ when the first thing in a pattern is a positive lookahead. These all
+ applied only when PCRE_NO_START_OPTIMIZE was *not* set:
+ (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed
+ both an initial 'X' and a following 'X'.
+ (b) Some patterns starting with an assertion that started with .* were
+ incorrectly optimized as having to match at the start of the subject or
+ after a newline. There are cases where this is not true, for example,
+ (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that
+ start with spaces. Starting .* in an assertion is no longer taken as an
+ indication of matching at the start (or after a newline).
+
+
Version 8.39 14-June-2016
-------------------------
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/pcre_compile.c 2016-10-18 15:10:09 UTC (rev 1669)
@@ -7918,15 +7918,17 @@
}
}
- /* For a forward assertion, we take the reqchar, if set. This can be
- helpful if the pattern that follows the assertion doesn't set a different
- char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
- for an assertion, however because it leads to incorrect effect for patterns
- such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
- of a firstchar. This is overcome by a scan at the end if there's no
- firstchar, looking for an asserted first char. */
+ /* For a forward assertion, we take the reqchar, if set, provided that the
+ group has also set a first char. This can be helpful if the pattern that
+ follows the assertion doesn't set a different char. For example, it's
+ useful for /(?=abcde).+/. We can't set firstchar for an assertion, however,
+ because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
+ the "real" "a" would then become a reqchar instead of a firstchar. This is
+ overcome by a scan at the end if there's no firstchar, looking for an
+ asserted first char. */
- else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
+ else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
+ subfirstcharflags >= 0)
{
reqchar = subreqchar;
reqcharflags = subreqcharflags;
@@ -8715,8 +8717,8 @@
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .*
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
-count, because once again the assumption no longer holds.
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
+or *SKIP does not count, because once again the assumption no longer holds.
Arguments:
code points to start of expression (the bracket)
@@ -8725,6 +8727,7 @@
the less precise approach
cd points to the compile data
atomcount atomic group level
+ inassert TRUE if in an assertion
Returns: TRUE or FALSE
*/
@@ -8731,7 +8734,7 @@
static BOOL
is_startline(const pcre_uchar *code, unsigned int bracket_map,
- compile_data *cd, int atomcount)
+ compile_data *cd, int atomcount, BOOL inassert)
{
do {
const pcre_uchar *scode = first_significant_code(
@@ -8758,7 +8761,7 @@
return FALSE;
default: /* Assertion */
- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
do scode += GET(scode, 1); while (*scode == OP_ALT);
scode += 1 + LINK_SIZE;
break;
@@ -8772,7 +8775,7 @@
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
}
/* Capturing brackets */
@@ -8782,7 +8785,7 @@
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
- if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
+ if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
}
/* Positive forward assertions */
@@ -8789,7 +8792,7 @@
else if (op == OP_ASSERT)
{
- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
}
/* Atomic brackets */
@@ -8796,19 +8799,19 @@
else if (op == OP_ONCE || op == OP_ONCE_NC)
{
- if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
+ if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
}
/* .* means "start at start or after \n" if it isn't in atomic brackets or
- brackets that may be referenced, as long as the pattern does not contain
- *PRUNE or *SKIP, because these break the feature. Consider, for example,
- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
- start of a line. */
+ brackets that may be referenced, or in an assertion, as long as the pattern does
+ not contain *PRUNE or *SKIP, because these break the feature. Consider, for
+ example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
+ not at the start of a line. */
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
{
if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
- atomcount > 0 || cd->had_pruneorskip)
+ atomcount > 0 || cd->had_pruneorskip || inassert)
return FALSE;
}
@@ -9663,7 +9666,7 @@
re->flags |= PCRE_FIRSTSET;
}
- else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
+ else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
}
}
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/testdata/testinput1 2016-10-18 15:10:09 UTC (rev 1669)
@@ -5733,4 +5733,10 @@
"(?|(\k'Pm')|(?'Pm'))"
abcd
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+ \ Fred:099
+
+/(?=.*X)X$/
+ \ X
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/testdata/testinput2 2016-10-18 15:10:09 UTC (rev 1669)
@@ -4247,4 +4247,6 @@
/(?<R>abc)(?(R)xyz)/BZ
+/(?=.*[A-Z])/I
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/testdata/testinput6 2016-10-18 15:10:09 UTC (rev 1669)
@@ -1562,4 +1562,10 @@
\x{389}
\x{20ac}
+/(?=.*b)\pL/
+ 11bb
+
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+ 11bb
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/testdata/testoutput1 2016-10-18 15:10:09 UTC (rev 1669)
@@ -9434,4 +9434,12 @@
0:
1:
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+ \ Fred:099
+ 0:
+
+/(?=.*X)X$/
+ \ X
+ 0: X
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/testdata/testoutput2 2016-10-18 15:10:09 UTC (rev 1669)
@@ -9380,7 +9380,7 @@
/(?(?=.*b).*b|^d)/I
Capturing subpattern count = 0
No options
-First char at start or follows newline
+No first char
No need char
/xyz/C
@@ -14698,4 +14698,11 @@
End
------------------------------------------------------------------
+/(?=.*[A-Z])/I
+Capturing subpattern count = 0
+May match empty string
+No options
+No first char
+No need char
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2016-10-14 17:06:22 UTC (rev 1668)
+++ code/trunk/testdata/testoutput6 2016-10-18 15:10:09 UTC (rev 1669)
@@ -2573,4 +2573,12 @@
\x{20ac}
No match
+/(?=.*b)\pL/
+ 11bb
+ 0: b
+
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+ 11bb
+ 0: b
+
/-- End of testinput6 --/