Revision: 244
http://www.exim.org/viewvc/pcre2?view=rev&revision=244
Author: ph10
Date: 2015-04-08 17:33:58 +0100 (Wed, 08 Apr 2015)
Log Message:
-----------
Fix backtracking bug for \C\X* in UTF mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_match.c
code/trunk/testdata/testinput4
code/trunk/testdata/testoutput4
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-04-06 12:16:36 UTC (rev 243)
+++ code/trunk/ChangeLog 2015-04-08 16:33:58 UTC (rev 244)
@@ -73,7 +73,13 @@
18. There was a similar problem to 17 in pcre2test for global matches, though
the code there did catch the loop.
+19. If a greedy quantified \X was preceded by \C in UTF mode (e.g. \C\X*),
+and a subsequent item in the pattern caused a non-match, backtracking over the
+repeated \X did not stop, but carried on past the start of the subject, causing
+reference to random memory and/or a segfault. This bug was discovered by the
+LLVM fuzzer.
+
Version 10.10 06-March-2015
---------------------------
Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c 2015-04-06 12:16:36 UTC (rev 243)
+++ code/trunk/src/pcre2_match.c 2015-04-08 16:33:58 UTC (rev 244)
@@ -1333,14 +1333,14 @@
if (*ecode == OP_CALLOUT)
{
cb.callout_number = ecode[1 + 2*LINK_SIZE];
- cb.callout_string_offset = 0;
+ cb.callout_string_offset = 0;
cb.callout_string = NULL;
cb.callout_string_length = 0;
}
else
{
cb.callout_number = 0;
- cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
+ cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
cb.callout_string_length =
callout_length - (1 + 4*LINK_SIZE) - 2;
@@ -1408,7 +1408,7 @@
break;
case OP_FALSE:
- case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
+ case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
break;
case OP_TRUE:
@@ -1760,14 +1760,14 @@
if (*ecode == OP_CALLOUT)
{
cb.callout_number = ecode[1 + 2*LINK_SIZE];
- cb.callout_string_offset = 0;
+ cb.callout_string_offset = 0;
cb.callout_string = NULL;
cb.callout_string_length = 0;
}
else
{
cb.callout_number = 0;
- cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
+ cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
cb.callout_string_length =
callout_length - (1 + 4*LINK_SIZE) - 2;
@@ -5723,12 +5723,17 @@
if (possessive) continue; /* No backtracking */
+ /* We use <= pp rather than == pp to detect the start of the run while
+ backtracking because the use of \C in UTF mode can cause BACKCHAR to
+ move back past pp. This is just palliative; the use of \C in UTF mode
+ is fraught with danger. */
+
for(;;)
{
int lgb, rgb;
PCRE2_SPTR fptr;
- if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
+ if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -5746,7 +5751,7 @@
for (;;)
{
- if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
+ if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
fptr = eptr - 1;
if (!utf) c = *fptr; else
{
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2015-04-06 12:16:36 UTC (rev 243)
+++ code/trunk/testdata/testinput4 2015-04-08 16:33:58 UTC (rev 244)
@@ -2221,4 +2221,10 @@
"[\S\V\H]"utf
+/\C\X*TӅ;
+{0,6}\v+
+F
+/utf
+ Ӆ\x0a
+
# End of testinput4
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2015-04-06 12:16:36 UTC (rev 243)
+++ code/trunk/testdata/testoutput4 2015-04-08 16:33:58 UTC (rev 244)
@@ -3741,4 +3741,11 @@
"[\S\V\H]"utf
+/\C\X*TӅ;
+{0,6}\v+
+F
+/utf
+ Ӆ\x0a
+No match
+
# End of testinput4