Revision: 364
http://vcs.pcre.org/viewvc?view=rev&revision=364
Author: ph10
Date: 2008-07-11 15:53:41 +0100 (Fri, 11 Jul 2008)
Log Message:
-----------
Several bugs concerned with skipping over UTF-8 characters at the start of
matching (8.0/13, 8.0/14).
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/ChangeLog 2008-07-11 14:53:41 UTC (rev 364)
@@ -55,6 +55,13 @@
pcre_dfa_exec() could read past the end of the passed subject if there was
no match. To help with detecting such bugs (e.g. with valgrind), I modified
pcretest so that it places the subject at the end of its malloc-ed buffer.
+
+13. The change to pcretest in 12 above threw up a couple more cases when
+ pcre_exec() might read past the end of the data buffer in UTF-8 mode.
+
+14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
+ the data contained the byte 0x85 as part of a UTF-8 character within its
+ first line.
Version 7.7 07-May-08
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/pcre_exec.c 2008-07-11 14:53:41 UTC (rev 364)
@@ -4695,32 +4695,82 @@
if (firstline)
{
USPTR t = start_match;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ /* Step over whole characters: after each lead byte, also skip any
+ continuation bytes (10xxxxxx), stopping at the end of the subject. */
+ while (t < md->end_subject && !IS_NEWLINE(t))
+ {
+ t++;
+ while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+ }
+ }
+ else
+#endif
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
- /* Now test for a unique first byte */
+ /* Now advance to a unique first byte if there is one. */
if (first_byte >= 0)
{
if (first_byte_caseless)
- while (start_match < end_subject &&
- md->lcc[*start_match] != first_byte)
- { NEXTCHAR(start_match); }
- else
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
+ while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+ start_match++;
+ }
+ else /* Caseful case */
+ {
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && *start_match != first_byte)
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
while (start_match < end_subject && *start_match != first_byte)
- { NEXTCHAR(start_match); }
+ start_match++;
+ }
}
- /* Or to just after a linebreak for a multiline match if possible */
+ /* Or to just after a linebreak for a multiline match */
else if (startline)
{
if (start_match > md->start_subject + start_offset)
{
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject && !WAS_NEWLINE(start_match))
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ }
+ else
+#endif
while (start_match < end_subject && !WAS_NEWLINE(start_match))
- { NEXTCHAR(start_match); }
+ start_match++;
/* If we have just passed a CR and the newline option is ANY or ANYCRLF,
and we are now at a LF, advance the match position by one more character.
@@ -4734,16 +4784,32 @@
}
}
- /* Or to a non-unique first char after study */
+ /* Or to a non-unique first byte after study */
else if (start_bits != NULL)
{
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ {
+ while (start_match < end_subject)
+ {
+ register unsigned int c = *start_match;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ start_match++;
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+ }
+ else break;
+ }
+ }
+ else
+#endif
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- { NEXTCHAR(start_match); }
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
+ else break;
}
}
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/pcre_internal.h 2008-07-11 14:53:41 UTC (rev 364)
@@ -381,7 +381,6 @@
support is omitted, we don't even define it. */
#ifndef SUPPORT_UTF8
-#define NEXTCHAR(p) p++;
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@@ -391,13 +390,6 @@
#else /* SUPPORT_UTF8 */
-/* Advance a character pointer one byte in non-UTF-8 mode and by one character
-in UTF-8 mode. */
-
-#define NEXTCHAR(p) \
- p++; \
- if (utf8) { while((*p & 0xc0) == 0x80) p++; }
-
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/testdata/testinput5 2008-07-11 14:53:41 UTC (rev 364)
@@ -477,4 +477,7 @@
\x{de}\x{de}
\x{123}
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+
/ End of testinput5 /
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/testdata/testoutput5 2008-07-11 14:53:41 UTC (rev 364)
@@ -1637,4 +1637,8 @@
** Truncation will probably give the wrong result.
No match
+/X/8f<any>
+ A\x{1ec5}ABCXYZ
+ 0: X
+
/ End of testinput5 /