Revision: 1495
http://vcs.pcre.org/viewvc?view=rev&revision=1495
Author: ph10
Date: 2014-07-12 19:22:54 +0100 (Sat, 12 Jul 2014)
Log Message:
-----------
Fix compiler crash/misbehaviour for zero-repeated groups that include a
recursive back reference.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_compile.c
code/trunk/testdata/testinput11
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput11-16
code/trunk/testdata/testoutput11-32
code/trunk/testdata/testoutput11-8
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/ChangeLog 2014-07-12 18:22:54 UTC (rev 1495)
@@ -84,6 +84,11 @@
18. Avoid a compiler warning (from some compilers) for a function call with
a cast that removes "const" from an lvalue by using an intermediate
variable (to which the compiler does not object).
+
+19. Incorrect code was compiled if a group that contained an internal recursive
+ back reference was optional (had a quantifier with a minimum of zero). This
+ example compiled incorrect code: /(((a\2)|(a*)\g<-1>))*/ and other examples
+ caused segmentation faults because of stack overflows at compile time.
Version 8.35 04-April-2014
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/pcre_compile.c 2014-07-12 18:22:54 UTC (rev 1495)
@@ -549,7 +549,7 @@
"group name must start with a non-digit\0"
/* 85 */
"parentheses are too deeply nested (stack check)\0"
- "digits missing in \\x{} or \\o{}\0"
+ "digits missing in \\x{} or \\o{}\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -1260,7 +1260,7 @@
case CHAR_o:
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
- if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
+ if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
{
ptr += 2;
c = 0;
@@ -1334,7 +1334,7 @@
{
*errorcodeptr = ERR86;
break;
- }
+ }
c = 0;
overflow = FALSE;
while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
@@ -1590,7 +1590,7 @@
int min = 0;
int max = -1;
-while (IS_DIGIT(*p))
+while (IS_DIGIT(*p))
{
min = min * 10 + (int)(*p++ - CHAR_0);
if (min > 65535)
@@ -1598,14 +1598,14 @@
*errorcodeptr = ERR5;
return p;
}
- }
+ }
if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
{
if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
{
max = 0;
- while(IS_DIGIT(*p))
+ while(IS_DIGIT(*p))
{
max = max * 10 + (int)(*p++ - CHAR_0);
if (max > 65535)
@@ -1613,7 +1613,7 @@
*errorcodeptr = ERR5;
return p;
}
- }
+ }
if (max < min)
{
*errorcodeptr = ERR4;
@@ -3096,7 +3096,7 @@
Therefore infinite recursions are not possible. */
c = *code;
-
+
/* Skip over callouts */
if (c == OP_CALLOUT)
@@ -3125,7 +3125,7 @@
/* If the bracket is capturing, and referenced by an OP_RECURSE, or
it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
cannot be converted to a possessive form. */
-
+
if (base_list[1] == 0) return FALSE;
switch(*(code - GET(code, 1)))
@@ -3137,7 +3137,7 @@
case OP_ONCE:
case OP_ONCE_NC:
/* Atomic sub-patterns and assertions can always auto-possessify their
- last iterator. However, if the group was entered as a result of checking
+ last iterator. However, if the group was entered as a result of checking
a previous iterator, this is not possible. */
return !entered_a_group;
@@ -3182,14 +3182,14 @@
continue;
default:
- break;
+ break;
}
/* Check for a supported opcode, and load its properties. */
code = get_chr_property_list(code, utf, cd->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
-
+
/* If either opcode is a small character list, set pointers for comparing
characters from that list with another list, or with a property. */
@@ -3422,7 +3422,7 @@
autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
if (!accepted) return FALSE;
-
+
if (list[1] == 0) return TRUE;
/* Might be an empty repeat. */
continue;
@@ -4694,7 +4694,7 @@
previous = NULL;
if ((options & PCRE_MULTILINE) != 0)
{
- if (firstcharflags == REQ_UNSET)
+ if (firstcharflags == REQ_UNSET)
zerofirstcharflags = firstcharflags = REQ_NONE;
*code++ = OP_CIRCM;
}
@@ -5983,7 +5983,7 @@
just adjust the length as if we had. Do some paranoid checks for
potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
integer type when available, otherwise double. */
-
+
if (lengthptr != NULL)
{
int delta = (repeat_min - 1)*length_prevgroup;
@@ -6701,7 +6701,7 @@
ptr++;
}
namelen = (int)(ptr - name);
- if (lengthptr != NULL && (options & PCRE_DUPNAMES) != 0)
+ if (lengthptr != NULL && (options & PCRE_DUPNAMES) != 0)
*lengthptr += IMM2_SIZE;
}
@@ -6767,7 +6767,7 @@
(slot+IMM2_SIZE)[namelen] != 0) break;
count++;
}
-
+
if (count > 1)
{
PUT2(code, 2+LINK_SIZE, offset);
@@ -7116,7 +7116,7 @@
/* Count named back references. */
if (!is_recurse) cd->namedrefcount++;
-
+
/* If duplicate names are permitted, we have to allow for a named
reference to a duplicated name (this cannot be determined until the
second pass). This needs an extra 16-bit data item. */
@@ -7168,12 +7168,12 @@
for (i++; i < cd->names_found; i++)
{
if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
-
-
+
+
count++;
cslot += cd->name_entry_size;
}
-
+
if (count > 1)
{
if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
@@ -8267,12 +8267,16 @@
/* If it was a capturing subpattern, check to see if it contained any
recursive back references. If so, we must wrap it in atomic brackets.
- In any event, remove the block from the chain. */
+ Because we are moving code along, we must ensure that any pending recursive
+ references are updated. In any event, remove the block from the chain. */
if (capnumber > 0)
{
if (cd->open_caps->flag)
{
+ *code = OP_END;
+ adjust_recurse(start_bracket, 1 + LINK_SIZE,
+ (options & PCRE_UTF8) != 0, cd, cd->hwm);
memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
IN_UCHARS(code - start_bracket));
*start_bracket = OP_ONCE;
@@ -9277,7 +9281,7 @@
if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
-/* Unless disabled, check whether any single character iterators can be
+/* Unless disabled, check whether any single character iterators can be
auto-possessified. The function overwrites the appropriate opcode values, so
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
used in this code because at least one compiler gives a warning about loss of
@@ -9288,7 +9292,7 @@
{
pcre_uchar *temp = (pcre_uchar *)codestart;
auto_possessify(temp, utf, cd);
- }
+ }
/* If there were any lookbehind assertions that contained OP_RECURSE
(recursions or subroutine calls), a flag is set for them to be checked here,
Modified: code/trunk/testdata/testinput11
===================================================================
--- code/trunk/testdata/testinput11 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/testdata/testinput11 2014-07-12 18:22:54 UTC (rev 1495)
@@ -132,4 +132,6 @@
/abc(d|e)(*THEN)x(123(*THEN)4|567(b|q)(*THEN)xx)/B
+/(((a\2)|(a*)\g<-1>))*a?/B
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/testdata/testinput2 2014-07-12 18:22:54 UTC (rev 1495)
@@ -4035,6 +4035,8 @@
/(?(R&6yh)abc)/
+/(((a\2)|(a*)\g<-1>))*a?/BZ
+
/-- Test the ugly "start or end of word" compatibility syntax --/
/[[:<:]]red[[:>:]]/BZ
Modified: code/trunk/testdata/testoutput11-16
===================================================================
--- code/trunk/testdata/testoutput11-16 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/testdata/testoutput11-16 2014-07-12 18:22:54 UTC (rev 1495)
@@ -709,4 +709,28 @@
62 End
------------------------------------------------------------------
+/(((a\2)|(a*)\g<-1>))*a?/B
+------------------------------------------------------------------
+ 0 39 Bra
+ 2 Brazero
+ 3 32 SCBra 1
+ 6 27 Once
+ 8 12 CBra 2
+ 11 7 CBra 3
+ 14 a
+ 16 \2
+ 18 7 Ket
+ 20 11 Alt
+ 22 5 CBra 4
+ 25 a*
+ 27 5 Ket
+ 29 22 Recurse
+ 31 23 Ket
+ 33 27 Ket
+ 35 32 KetRmax
+ 37 a?+
+ 39 39 Ket
+ 41 End
+------------------------------------------------------------------
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testoutput11-32
===================================================================
--- code/trunk/testdata/testoutput11-32 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/testdata/testoutput11-32 2014-07-12 18:22:54 UTC (rev 1495)
@@ -709,4 +709,28 @@
62 End
------------------------------------------------------------------
+/(((a\2)|(a*)\g<-1>))*a?/B
+------------------------------------------------------------------
+ 0 39 Bra
+ 2 Brazero
+ 3 32 SCBra 1
+ 6 27 Once
+ 8 12 CBra 2
+ 11 7 CBra 3
+ 14 a
+ 16 \2
+ 18 7 Ket
+ 20 11 Alt
+ 22 5 CBra 4
+ 25 a*
+ 27 5 Ket
+ 29 22 Recurse
+ 31 23 Ket
+ 33 27 Ket
+ 35 32 KetRmax
+ 37 a?+
+ 39 39 Ket
+ 41 End
+------------------------------------------------------------------
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testoutput11-8
===================================================================
--- code/trunk/testdata/testoutput11-8 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/testdata/testoutput11-8 2014-07-12 18:22:54 UTC (rev 1495)
@@ -709,4 +709,28 @@
76 End
------------------------------------------------------------------
+/(((a\2)|(a*)\g<-1>))*a?/B
+------------------------------------------------------------------
+ 0 57 Bra
+ 3 Brazero
+ 4 48 SCBra 1
+ 9 40 Once
+ 12 18 CBra 2
+ 17 10 CBra 3
+ 22 a
+ 24 \2
+ 27 10 Ket
+ 30 16 Alt
+ 33 7 CBra 4
+ 38 a*
+ 40 7 Ket
+ 43 33 Recurse
+ 46 34 Ket
+ 49 40 Ket
+ 52 48 KetRmax
+ 55 a?+
+ 57 57 Ket
+ 60 End
+------------------------------------------------------------------
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2014-07-10 16:38:05 UTC (rev 1494)
+++ code/trunk/testdata/testoutput2 2014-07-12 18:22:54 UTC (rev 1495)
@@ -14093,6 +14093,30 @@
/(?(R&6yh)abc)/
Failed: group name must start with a non-digit at offset 5
+/(((a\2)|(a*)\g<-1>))*a?/BZ
+------------------------------------------------------------------
+ Bra
+ Brazero
+ SCBra 1
+ Once
+ CBra 2
+ CBra 3
+ a
+ \2
+ Ket
+ Alt
+ CBra 4
+ a*
+ Ket
+ Recurse
+ Ket
+ Ket
+ KetRmax
+ a?+
+ Ket
+ End
+------------------------------------------------------------------
+
/-- Test the ugly "start or end of word" compatibility syntax --/
/[[:<:]]red[[:>:]]/BZ