Revision: 365
http://vcs.pcre.org/viewvc?view=rev&revision=365
Author: ph10
Date: 2008-07-11 18:06:55 +0100 (Fri, 11 Jul 2008)
Log Message:
-----------
Further fixes for bumpalong processing in UTF-8 mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/testdata/testinput8
code/trunk/testdata/testoutput8
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/ChangeLog 2008-07-11 17:06:55 UTC (rev 365)
@@ -61,7 +61,7 @@
14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
the data contained the byte 0x85 as part of a UTF-8 character within its
- first line.
+ first line. This applied both to normal and DFA matching.
Version 7.7 07-May-08
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/pcre_dfa_exec.c 2008-07-11 17:06:55 UTC (rev 365)
@@ -2733,7 +2733,18 @@
if (firstline)
{
- const uschar *t = current_subject;
+ USPTR t = current_subject;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (t < md->end_subject && !IS_NEWLINE(t))
+ {
+ t++;
+ while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ }
+ }
+ else
+#endif
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
@@ -2755,9 +2766,22 @@
{
if (current_subject > md->start_subject + start_offset)
{
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
+ {
+ current_subject++;
+ while(current_subject < end_subject &&
+ (*current_subject & 0xc0) == 0x80)
+ current_subject++;
+ }
+ }
+ else
+#endif
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
current_subject++;
-
+
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one more
character. */
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/pcre_exec.c 2008-07-11 17:06:55 UTC (rev 365)
@@ -4695,7 +4695,7 @@
if (firstline)
{
- USPTR *t = start_match;
+ USPTR t = start_match;
#ifdef SUPPORT_UTF8
if (utf8)
{
@@ -4716,39 +4716,11 @@
if (first_byte >= 0)
{
if (first_byte_caseless)
- {
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject && md->lcc[*start_match] != first_byte)
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- }
- else
-#endif
while (start_match < end_subject && md->lcc[*start_match] != first_byte)
start_match++;
- }
- else /* Caseful case */
- {
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject && *start_match != first_byte)
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- }
- else
-#endif
+ else
while (start_match < end_subject && *start_match != first_byte)
start_match++;
- }
}
/* Or to just after a linebreak for a multiline match */
@@ -4788,23 +4760,6 @@
else if (start_bits != NULL)
{
-#ifdef SUPPORT_UTF8
- if (utf8)
- {
- while (start_match < end_subject)
- {
- register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- {
- start_match++;
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
- }
- else break;
- }
- }
- else
-#endif
while (start_match < end_subject)
{
register unsigned int c = *start_match;
Modified: code/trunk/testdata/testinput8
===================================================================
--- code/trunk/testdata/testinput8 2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/testdata/testinput8 2008-07-11 17:06:55 UTC (rev 365)
@@ -664,4 +664,7 @@
a\x{85}b\<bsr_anycrlf>
a\x0bb\<bsr_anycrlf>
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+
/ End of testinput 8 /
Modified: code/trunk/testdata/testoutput8
===================================================================
--- code/trunk/testdata/testoutput8 2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/testdata/testoutput8 2008-07-11 17:06:55 UTC (rev 365)
@@ -1284,4 +1284,8 @@
a\x0bb\<bsr_anycrlf>
No match
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+ 0: X
+
/ End of testinput 8 /