Revision: 453
http://www.exim.org/viewvc/pcre2?view=rev&revision=453
Author: ph10
Date: 2015-11-30 17:31:16 +0000 (Mon, 30 Nov 2015)
Log Message:
-----------
Fix handling of an empty \Q\E between an item and its quantifier in auto-callout mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-11-29 17:45:27 UTC (rev 452)
+++ code/trunk/ChangeLog 2015-11-30 17:31:16 UTC (rev 453)
@@ -349,7 +349,11 @@
104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep.
+105. An empty \Q\E sequence between an item and its quantifier caused
+pcre2_compile() to misbehave when auto callouts were enabled. This bug
+was found by the LLVM fuzzer.
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-11-29 17:45:27 UTC (rev 452)
+++ code/trunk/src/pcre2_compile.c 2015-11-30 17:31:16 UTC (rev 453)
@@ -3947,9 +3947,17 @@
last_code = code;
}
- /* If in \Q...\E, check for the end; if not, we have a literal. If not in
- \Q...\E, an isolated \E is ignored. */
+ /* Before doing anything else we must handle all the special items that do
+ nothing, and which may come between an item and its quantifier. Otherwise,
+ when auto-callouts are enabled, a callout gets incorrectly inserted before
+ the quantifier is recognized. After recognizing a "do nothing" item, restart
+ the loop in case another one follows. */
+ /* If c is not a NUL character we are not at the end of the pattern. If it is
+ NUL, we may still be inside the pattern (ptr < cb->end_pattern) with a NUL
+ data character. In either case, if we are in \Q...\E, check for the \E that
+ ends the literal string; if it is not there, we have a literal character. If
+ not in \Q...\E, an isolated \E is ignored. */
+
if (c != CHAR_NULL || ptr < cb->end_pattern)
{
if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
@@ -3958,7 +3966,7 @@
ptr++;
continue;
}
- else if (inescq)
+ else if (inescq) /* Literal character */
{
if (previous_callout != NULL)
{
@@ -3973,17 +3981,27 @@
}
goto NORMAL_CHAR;
}
+
+ /* Check for the start of a \Q...\E sequence. We must do this here rather
+ than later in case it is immediately followed by \E, which turns it into a
+ "do nothing" sequence. */
+
+ if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
+ {
+ inescq = TRUE;
+ ptr++;
+ continue;
+ }
}
- /* In extended mode, skip white space and comments. We need a loop in order
- to check for more white space and more comments after a comment. */
+ /* In extended mode, skip white space and #-comments that end at newline. */
if ((options & PCRE2_EXTENDED) != 0)
{
- for (;;)
+ PCRE2_SPTR wscptr = ptr;
+ while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
+ if (c == CHAR_NUMBER_SIGN)
{
- while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
- if (c != CHAR_NUMBER_SIGN) break;
ptr++;
while (*ptr != CHAR_NULL)
{
@@ -3997,13 +4015,19 @@
if (utf) FORWARDCHAR(ptr);
#endif
}
- c = *ptr; /* Either NULL or the char after a newline */
}
+
+ /* If we skipped any characters (white space or a #-comment), restart the
+ loop. Otherwise, there was nothing to skip at this position. */
+
+ if (ptr > wscptr)
+ {
+ ptr--;
+ continue;
+ }
}
- /* Skip over (?# comments. We need to do this here because we want to know if
- the next thing is a quantifier, and these comments may come between an item
- and its quantifier. */
+ /* Skip over (?# comments. */
if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
ptr[2] == CHAR_NUMBER_SIGN)
@@ -4018,7 +4042,8 @@
continue;
}
- /* See if the next thing is a quantifier. */
+ /* End of processing "do nothing" items. See if the next thing is a
+ quantifier. */
is_quantifier =
c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
@@ -7133,8 +7158,11 @@
are negative the reference number. Only back references and those types
that consume a character may be repeated. We can test for values between
ESC_b and ESC_Z for the latter; this may have to change if any new ones are
- ever created. */
+ ever created.
+ Note: \Q and \E are handled at the start of the character-processing loop,
+ not here. */
+
case CHAR_BACKSLASH:
tempptr = ptr;
escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
@@ -7145,16 +7173,6 @@
c = ec;
else
{
- if (escape == ESC_Q) /* Handle start of quoted string */
- {
- if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
- ptr += 2; /* avoid empty string */
- else inescq = TRUE;
- continue;
- }
-
- if (escape == ESC_E) continue; /* Perl ignores an orphan \E */
-
/* For metasequences that actually match a character, we disable the
setting of a first character if it hasn't already been set. */
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2015-11-29 17:45:27 UTC (rev 452)
+++ code/trunk/testdata/testinput2 2015-11-30 17:31:16 UTC (rev 453)
@@ -4699,4 +4699,7 @@
/(A*)\E+/B,auto_callout
+/()\Q\E*]/B,auto_callout
+ a[bc]d
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2015-11-29 17:45:27 UTC (rev 452)
+++ code/trunk/testdata/testoutput2 2015-11-30 17:31:16 UTC (rev 453)
@@ -14956,4 +14956,27 @@
End
------------------------------------------------------------------
+/()\Q\E*]/B,auto_callout
+------------------------------------------------------------------
+ Bra
+ Callout 255 0 7
+ Brazero
+ SCBra 1
+ Callout 255 1 0
+ KetRmax
+ Callout 255 7 1
+ ]
+ Callout 255 8 0
+ Ket
+ End
+------------------------------------------------------------------
+ a[bc]d
+--->a[bc]d
+ +0 ^ ()\Q\E*
+ +1 ^ )
+ +7 ^ ]
+ +8 ^^
+ 0: ]
+ 1:
+
# End of testinput2