Revision: 626
http://www.exim.org/viewvc/pcre2?view=rev&revision=626
Author: ph10
Date: 2016-12-23 18:34:10 +0000 (Fri, 23 Dec 2016)
Log Message:
-----------
Fix mis-parsing of a conditional group with callout but a question mark where
the assertion should start.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/ChangeLog 2016-12-23 18:34:10 UTC (rev 626)
@@ -110,6 +110,10 @@
(p) A buffer overflow could occur while sorting the names in the group name
list (depending on the order in which the names were seen).
+
+ (q) A conditional group that started with a callout was not doing the right
+ check for a following assertion, leading to compiling bad code. Example:
+ /(?(C'XX))?!XX/
4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/src/pcre2_compile.c 2016-12-23 18:34:10 UTC (rev 626)
@@ -2325,7 +2325,7 @@
parsed_pattern = manage_callouts(thisptr, &previous_callout, options,
parsed_pattern, cb);
PARSED_LITERAL(c, parsed_pattern);
- meta_quantifier = 0;
+ meta_quantifier = 0;
}
continue; /* Next character */
}
@@ -2475,13 +2475,16 @@
/* If expect_cond_assert is 2, we have just passed (?( and are expecting an
assertion, possibly preceded by a callout. If the value is 1, we have just
had the callout and expect an assertion. There must be at least 3 more
- characters in all cases. We know that the current character is an opening
- parenthesis, as otherwise we wouldn't be here. Note that expect_cond_assert
- may be negative, since all callouts just decrement it. */
+ characters in all cases. When expect_cond_assert is 2, we know that the
+ current character is an opening parenthesis, as otherwise we wouldn't be
+ here. However, when it is 1, we need to check, and it's easiest just to check
+ always. Note that expect_cond_assert may be negative, since all callouts just
+ decrement it. */
if (expect_cond_assert > 0)
{
- BOOL ok = ptrend - ptr >= 3 && ptr[0] == CHAR_QUESTION_MARK;
+ BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
+ ptr[0] == CHAR_QUESTION_MARK;
if (ok) switch(ptr[1])
{
case CHAR_C:
@@ -4527,7 +4530,7 @@
Returns: 0 There's been an error, *errorcodeptr is non-zero
+1 Success, this branch must match at least one character
- -1 Success, this branch may match an empty string
+ -1 Success, this branch may match an empty string
*/
static int
@@ -4538,7 +4541,7 @@
{
int bravalue = 0;
int okreturn = -1;
-int group_return = 0;
+int group_return = 0;
uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
uint32_t greedy_default, greedy_non_default;
uint32_t repeat_type, op_type;
@@ -4622,7 +4625,7 @@
BOOL should_flip_negation;
BOOL match_all_or_no_wide_chars;
BOOL possessive_quantifier;
- BOOL note_group_empty;
+ BOOL note_group_empty;
int class_has_8bitchar;
int i;
uint32_t mclength;
@@ -4687,15 +4690,15 @@
Checking for the legality of quantifiers happens in parse_regex(), except for
a quantifier after an assertion that is a condition. */
- if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
+ if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
{
previous = code;
if (matched_char) okreturn = 1;
- }
+ }
previous_matched_char = matched_char;
matched_char = FALSE;
- note_group_empty = FALSE;
+ note_group_empty = FALSE;
skipunits = 0; /* Default value for most subgroups */
switch(meta)
@@ -4737,7 +4740,7 @@
repeats. The value of reqcu doesn't change either. */
case META_DOT:
- matched_char = TRUE;
+ matched_char = TRUE;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
zerofirstcu = firstcu;
zerofirstcuflags = firstcuflags;
@@ -4755,7 +4758,7 @@
case META_CLASS_EMPTY:
case META_CLASS_EMPTY_NOT:
- matched_char = TRUE;
+ matched_char = TRUE;
*code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
zerofirstcu = firstcu;
@@ -4778,7 +4781,7 @@
case META_CLASS_NOT:
case META_CLASS:
- matched_char = TRUE;
+ matched_char = TRUE;
negate_class = meta == META_CLASS_NOT;
/* We can optimize the case of a single character in a class by generating
@@ -5502,7 +5505,7 @@
goto GROUP_PROCESS_NOTE_EMPTY;
/* The DEFINE condition is always false. It's internal groups may never
- be called, so matched_char must remain false, hence the jump to
+ be called, so matched_char must remain false, hence the jump to
GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
case META_COND_DEFINE:
@@ -5603,12 +5606,12 @@
/* Process nested bracketed regex. The nesting depth is maintained for the
benefit of the stackguard function. The test for too deep nesting is now
- done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
- others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
+ done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
+ others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
note of whether or not they may match an empty string. */
-
+
GROUP_PROCESS_NOTE_EMPTY:
- note_group_empty = TRUE;
+ note_group_empty = TRUE;
GROUP_PROCESS:
cb->parens_depth += 1;
@@ -5619,7 +5622,7 @@
templastcapture = cb->lastcapture; /* Save value before group */
length_prevgroup = 0; /* Initialize for pre-compile phase */
- if ((group_return =
+ if ((group_return =
compile_regex(
options, /* The option state */
&tempcode, /* Where to put code (updated) */
@@ -5636,14 +5639,14 @@
&length_prevgroup /* Pre-compile phase */
)) == 0)
return 0; /* Error */
-
+
cb->parens_depth -= 1;
- /* If that was a non-conditional significant group (not an assertion, not a
+ /* If that was a non-conditional significant group (not an assertion, not a
DEFINE) that matches at least one character, then the current item matches
a character. Conditionals are handled below. */
- if (note_group_empty && bravalue != OP_COND && group_return > 0)
+ if (note_group_empty && bravalue != OP_COND && group_return > 0)
matched_char = TRUE;
/* If that was an atomic group and there are no capturing groups within it,
@@ -5676,7 +5679,7 @@
tc += GET(tc,1);
}
while (*tc != OP_KET);
-
+
/* A DEFINE group is never obeyed inline (the "condition" is always
false). It must have only one branch. Having checked this, change the
opcode to OP_FALSE. */
@@ -5695,7 +5698,7 @@
/* A "normal" conditional group. If there is just one branch, we must not
make use of its firstcu or reqcu, because this is equivalent to an
- empty second branch. Also, it may match an empty string. If there are two
+ empty second branch. Also, it may match an empty string. If there are two
branches, this item must match a character if the group must. */
else
@@ -6018,7 +6021,7 @@
repeat_max = 1;
REPEAT:
- if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
+ if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
/* Remember whether this is a variable length repeat, and default to
single-char opcodes. */
@@ -6087,7 +6090,7 @@
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
code += 2 + 2 * LINK_SIZE;
length_prevgroup = 3 + 3*LINK_SIZE;
- group_return = -1; /* Set "may match empty string" */
+ group_return = -1; /* Set "may match empty string" */
}
/* Now handle repetition for the different types of item. */
@@ -6444,9 +6447,9 @@
else
{
- /* In the compile phase, adjust the opcode if the group can match
+ /* In the compile phase, adjust the opcode if the group can match
an empty string. For a conditional group with only one branch, the
- value of group_return will not show "could be empty", so we must
+ value of group_return will not show "could be empty", so we must
check that separately. */
if (lengthptr == NULL)
@@ -6877,10 +6880,10 @@
character if it hasn't already been set. */
if (meta_arg > ESC_b && meta_arg < ESC_Z)
- {
- matched_char = TRUE;
+ {
+ matched_char = TRUE;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
- }
+ }
/* Set values to reset to if this is followed by a zero repeat. */
@@ -6946,7 +6949,7 @@
NORMAL_CHAR:
meta = *pptr; /* Get the full 32 bits */
NORMAL_CHAR_SET: /* Character is already in meta */
- matched_char = TRUE;
+ matched_char = TRUE;
/* For caseless UTF mode, check whether this character has more than one
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
@@ -7071,7 +7074,7 @@
Returns: 0 There has been an error
+1 Success, this group must match at least one character
-1 Success, this group may match an empty string
-*/
+*/
static int
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
@@ -7156,7 +7159,7 @@
for (;;)
{
int branch_return;
-
+
/* Insert OP_REVERSE if this is as lookbehind assertion. */
if (lookbehind && lookbehindlength > 0)
@@ -7169,14 +7172,14 @@
/* Now compile the branch; in the pre-compile phase its length gets added
into the length. */
- if ((branch_return =
+ if ((branch_return =
compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
cb, (lengthptr == NULL)? NULL : &length)) == 0)
return 0;
-
+
/* If a branch can match an empty string, so can the whole group. */
-
+
if (branch_return < 0) okreturn = -1;
/* In the real compile phase, there is some post-processing to be done. */
@@ -7308,7 +7311,7 @@
}
*lengthptr += length;
}
-// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);
+// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);
return okreturn;
}
@@ -9127,7 +9130,7 @@
pptr = cb.parsed_pattern;
code = (PCRE2_UCHAR *)codestart;
*code = OP_BRA;
-regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
+regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
re->top_bracket = cb.bracount;
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/testdata/testinput2 2016-12-23 18:34:10 UTC (rev 626)
@@ -4944,4 +4944,10 @@
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
+# This one used to compile rubbish instead of a compile error, and then
+# behave unpredictably at match time.
+
+/.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X/
+ .+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/testdata/testoutput2 2016-12-23 18:34:10 UTC (rev 626)
@@ -15424,6 +15424,13 @@
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
+# This one used to compile rubbish instead of a compile error, and then
+# behave unpredictably at match time.
+
+/.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X/
+Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
+ .+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
+
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data