Revision: 390
http://www.exim.org/viewvc/pcre2?view=rev&revision=390
Author: ph10
Date: 2015-10-21 12:29:07 +0100 (Wed, 21 Oct 2015)
Log Message:
-----------
Fix UCP with [[:<:]] and [[:>:]] bad compile bug.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/src/pcre2_intmodedep.h
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/ChangeLog 2015-10-21 11:29:07 UTC (rev 390)
@@ -206,7 +206,10 @@
59. Change 55 above introduced a bug by which certain patterns provoked the
erroneous error "\ at end of pattern".
+60. The special sequences [[:<:]] and [[:>:]] gave rise to incorrect compiling
+errors or other strange effects if compiled in UCP mode.
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/src/pcre2_compile.c 2015-10-21 11:29:07 UTC (rev 390)
@@ -1618,6 +1618,13 @@
not relevant, but the options argument is the final value of the compiled
pattern's options.
+There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
+processed, it is replaced by a nested alternative sequence. If this contains a
+backslash (which it usually does), ptrend does not point to its end - it still
+points to the end of the whole pattern. However, we can detect this case
+because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
+terminated and there are only ever two levels of nesting.
+
Arguments:
ptrptr points to the input position pointer
ptrend points to the end of the input
@@ -1643,10 +1650,14 @@
int escape = 0;
int i;
-/* If backslash is at the end of the string, it's an error. The check must be
-skipped when processing a nested insertion string during compilation. */
+/* Find the end of a nested insert. */
-if ((cb == NULL || cb->nestptr == NULL) && ptr >= ptrend)
+if (cb != NULL && cb->nestptr[0] != NULL)
+ ptrend = ptr + PRIV(strlen)(ptr);
+
+/* If backslash is at the end of the string, it's an error. */
+
+if (ptr >= ptrend)
{
*errorcodeptr = ERR1;
return 0;
@@ -3700,13 +3711,14 @@
c = *ptr;
/* If we are at the end of a nested substitution, revert to the outer level
- string. Nesting only happens one level deep, and the inserted string is
- always zero terminated. */
+ string. Nesting only happens one or two levels deep, and the inserted string
+ is always zero terminated. */
- if (c == CHAR_NULL && cb->nestptr != NULL)
+ if (c == CHAR_NULL && cb->nestptr[0] != NULL)
{
- ptr = cb->nestptr;
- cb->nestptr = NULL;
+ ptr = cb->nestptr[0];
+ cb->nestptr[0] = cb->nestptr[1];
+ cb->nestptr[1] = NULL;
c = *ptr;
}
@@ -3823,7 +3835,7 @@
/* Fill in length of a previous callout, except when the next thing is a
quantifier or when processing a property substitution string in UCP mode. */
- if (!is_quantifier && previous_callout != NULL && cb->nestptr == NULL &&
+ if (!is_quantifier && previous_callout != NULL && cb->nestptr[0] == NULL &&
after_manual_callout-- <= 0)
{
if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
@@ -3834,7 +3846,8 @@
/* Create auto callout, except for quantifiers, or while processing property
strings that are substituted for \w etc in UCP mode. */
- if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier && cb->nestptr == NULL)
+ if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier &&
+ cb->nestptr[0] == NULL)
{
previous_callout = code;
code = auto_callout(code, ptr, cb);
@@ -3926,13 +3939,15 @@
In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
used for "start of word" and "end of word". As these are otherwise illegal
sequences, we don't break anything by recognizing them. They are replaced
- by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
- erroneous and are handled by the normal code below. */
+ by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
+ nesting level, as no other inserted sequences will contain these oddities.
+ Sequences like [a[:<:]] are erroneous and are handled by the normal code
+ below. */
case CHAR_LEFT_SQUARE_BRACKET:
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
{
- cb->nestptr = ptr + 7;
+ cb->nestptr[0] = ptr + 7;
ptr = sub_start_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */
continue;
@@ -3940,7 +3955,7 @@
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
{
- cb->nestptr = ptr + 7;
+ cb->nestptr[0] = ptr + 7;
ptr = sub_end_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */
continue;
@@ -4125,11 +4140,13 @@
int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
/* The posix_substitutes table specifies which POSIX classes can be
- converted to \p or \P items. */
+ converted to \p or \P items. This can only happen at top nesting
+ level, as there will never be a POSIX class in a string that is
+ substituted for something else. */
if (posix_substitutes[pc] != NULL)
{
- cb->nestptr = tempptr + 1;
+ cb->nestptr[0] = tempptr + 1;
ptr = posix_substitutes[pc] - 1;
goto CONTINUE_CLASS;
}
@@ -4263,9 +4280,10 @@
case ESC_DU: /* when PCRE2_UCP is set. We replace the */
case ESC_wu: /* escape sequence with an appropriate \p */
case ESC_WU: /* or \P to test Unicode properties instead */
- case ESC_su: /* of the default ASCII testing. */
- case ESC_SU:
- cb->nestptr = ptr;
+ case ESC_su: /* of the default ASCII testing. This might be */
+ case ESC_SU: /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
+ cb->nestptr[1] = cb->nestptr[0];
+ cb->nestptr[0] = ptr;
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
class_has_8bitchar--; /* Undo! */
break;
@@ -4607,10 +4625,11 @@
CONTINUE_CLASS:
c = *(++ptr);
- if (c == 0 && cb->nestptr != NULL)
+ if (c == CHAR_NULL && cb->nestptr[0] != NULL)
{
- ptr = cb->nestptr;
- cb->nestptr = NULL;
+ ptr = cb->nestptr[0];
+ cb->nestptr[0] = cb->nestptr[1];
+ cb->nestptr[1] = NULL;
c = *(++ptr);
}
@@ -7082,7 +7101,8 @@
#ifdef SUPPORT_UNICODE
if (escape >= ESC_DU && escape <= ESC_wu)
{
- cb->nestptr = ptr + 1; /* Where to resume */
+ cb->nestptr[1] = cb->nestptr[0]; /* Back up if at 2nd level */
+ cb->nestptr[0] = ptr + 1; /* Where to resume */
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
}
else
@@ -8079,7 +8099,7 @@
cb.cx = ccontext;
cb.dupnames = FALSE;
cb.end_pattern = pattern + patlen;
-cb.nestptr = NULL;
+cb.nestptr[0] = cb.nestptr[1] = NULL;
cb.external_flags = 0;
cb.external_options = options;
cb.had_recurse = FALSE;
Modified: code/trunk/src/pcre2_intmodedep.h
===================================================================
--- code/trunk/src/pcre2_intmodedep.h 2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/src/pcre2_intmodedep.h 2015-10-21 11:29:07 UTC (rev 390)
@@ -686,7 +686,7 @@
PCRE2_SPTR start_code; /* The start of the compiled code */
PCRE2_SPTR start_pattern; /* The start of the pattern */
PCRE2_SPTR end_pattern; /* The end of the pattern */
- PCRE2_SPTR nestptr; /* Pointer saved for string substitution */
+ PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
PCRE2_UCHAR *name_table; /* The name/number table */
size_t workspace_size; /* Size of workspace */
uint16_t names_found; /* Number of entries so far */
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/testdata/testinput5 2015-10-21 11:29:07 UTC (rev 390)
@@ -1686,4 +1686,6 @@
\= Expect no match
123
+/(*UCP)(*UTF)[[:>:]]X/B
+
# End of testinput5
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/testdata/testoutput5 2015-10-21 11:29:07 UTC (rev 390)
@@ -4047,4 +4047,17 @@
123
No match
+/(*UCP)(*UTF)[[:>:]]X/B
+------------------------------------------------------------------
+ Bra
+ \b
+ AssertB
+ Reverse
+ prop Xwd
+ Ket
+ X
+ Ket
+ End
+------------------------------------------------------------------
+
# End of testinput5