Revision: 979
http://vcs.pcre.org/viewvc?view=rev&revision=979
Author: ph10
Date: 2012-06-17 20:08:41 +0100 (Sun, 17 Jun 2012)
Log Message:
-----------
Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_dfa_exec.c
code/trunk/testdata/testinput8
code/trunk/testdata/testoutput8
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/ChangeLog 2012-06-17 19:08:41 UTC (rev 979)
@@ -136,6 +136,15 @@
the same checks as \x{...} characters in non-JavaScript mode. Specifically,
codepoints that are too big for the mode are faulted, and in a UTF mode,
disallowed codepoints are also faulted.
+
+39. If PCRE was compiled with UTF support, in three places in the DFA
+ matcher there was code that should only have been obeyed in UTF mode, but
+ was being obeyed unconditionally. In 8-bit mode this could cause incorrect
+ processing when bytes with values greater than 127 were present. In 16-bit
+ mode the bug would be provoked by values in the range 0xdc00 to 0xdfff. In
+ both cases the values are those that cannot be the first data item in a UTF
+ character. The three items that might have provoked this were recursions,
+ possessively repeated groups, and atomic groups.
Version 8.30 04-February-2012
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/pcre_dfa_exec.c 2012-06-17 19:08:41 UTC (rev 979)
@@ -38,7 +38,6 @@
-----------------------------------------------------------------------------
*/
-
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
@@ -382,7 +381,8 @@
next_new_state->count = (y); \
next_new_state->data = (z); \
next_new_state++; \
- DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+ DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
+ (x), (y), (z), __LINE__)); \
} \
else return PCRE_ERROR_DFA_WSSIZE
@@ -611,7 +611,7 @@
if (ptr < end_subject)
{
- clen = 1; /* Number of bytes in the character */
+ clen = 1; /* Number of data items in the character */
#ifdef SUPPORT_UTF
if (utf) { GETCHARLEN(c, ptr, clen); } else
#endif /* SUPPORT_UTF */
@@ -789,7 +789,7 @@
offsets[0] = (int)(current_subject - start_subject);
offsets[1] = (int)(ptr - start_subject);
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
- offsets[1] - offsets[0], current_subject));
+ offsets[1] - offsets[0], (char *)current_subject));
}
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
{
@@ -2797,9 +2797,12 @@
{
int charcount = local_offsets[rc+1] - local_offsets[rc];
#ifdef SUPPORT_UTF
- const pcre_uchar *p = start_subject + local_offsets[rc];
- const pcre_uchar *pp = start_subject + local_offsets[rc+1];
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf)
+ {
+ const pcre_uchar *p = start_subject + local_offsets[rc];
+ const pcre_uchar *pp = start_subject + local_offsets[rc+1];
+ while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ }
#endif
if (charcount > 0)
{
@@ -2898,7 +2901,7 @@
const pcre_uchar *pp = local_ptr;
charcount = (int)(pp - p);
#ifdef SUPPORT_UTF
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
}
@@ -2980,9 +2983,12 @@
else
{
#ifdef SUPPORT_UTF
- const pcre_uchar *p = start_subject + local_offsets[0];
- const pcre_uchar *pp = start_subject + local_offsets[1];
- while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ if (utf)
+ {
+ const pcre_uchar *p = start_subject + local_offsets[0];
+ const pcre_uchar *pp = start_subject + local_offsets[1];
+ while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+ }
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
if (repeat_state_offset >= 0)
Modified: code/trunk/testdata/testinput8
===================================================================
--- code/trunk/testdata/testinput8 2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/testdata/testinput8 2012-06-17 19:08:41 UTC (rev 979)
@@ -4787,4 +4787,15 @@
/abcdef/
abc\R
+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+ text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+
+/^(?>.{4})abc|^\w\w.xabcd/
+ xxxxabcd
+ xx\xa0xabcd
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+ xxxxxxxxabcd
+ xx\xa0xxxxxabcd
+
/-- End of testinput8 --/
Modified: code/trunk/testdata/testoutput8
===================================================================
--- code/trunk/testdata/testoutput8 2012-06-17 16:55:07 UTC (rev 978)
+++ code/trunk/testdata/testoutput8 2012-06-17 19:08:41 UTC (rev 979)
@@ -7996,4 +7996,24 @@
abc\R
Error -30 (invalid data in workspace for DFA restart)
+/<H((?(?!<H|F>)(.)|(?R))++)*F>/
+ text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.
+ 0: <H more text <H texting more hexA0-"\xa0" hex above 7F-"\xbc" F> text xxxxx <H text F> text F>
+
+/^(?>.{4})abc|^\w\w.xabcd/
+ xxxxabcd
+ 0: xxxxabcd
+ 1: xxxxabc
+ xx\xa0xabcd
+ 0: xx\xa0xabcd
+ 1: xx\xa0xabc
+
+/^(.{4}){2}+abc|^\w\w.x\w\w\w\wabcd/
+ xxxxxxxxabcd
+ 0: xxxxxxxxabcd
+ 1: xxxxxxxxabc
+ xx\xa0xxxxxabcd
+ 0: xx\xa0xxxxxabcd
+ 1: xx\xa0xxxxxabc
+
/-- End of testinput8 --/