Revision: 538
http://vcs.pcre.org/viewvc?view=rev&revision=538
Author: ph10
Date: 2010-06-09 20:30:57 +0100 (Wed, 09 Jun 2010)
Log Message:
-----------
Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/pcre_study.c
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/ChangeLog 2010-06-09 19:30:57 UTC (rev 538)
@@ -72,6 +72,18 @@
18. If the last data line in a file for pcretest does not have a newline on
the end, a newline was missing in the output.
+19. The default pcre_chartables.c file recognizes only ASCII characters (values
+ less than 128) in its various bitmaps. However, there is a facility for
+ generating tables according to the current locale when PCRE is compiled. It
+ turns out that in some environments, 0x85 and 0xa0, which are Unicode space
+ characters, are recognized by isspace() and therefore were getting set in
+ these tables. This caused a problem in UTF-8 mode when pcre_study() was
+ used to create a list of bytes that can start a match. For \s, it was
+ including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I
+ have changed the code so that only real ASCII characters (less than 128)
+ are set in this case because the \s etc escapes are documented as
+ recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above -
+ the code is different altogether.)
Version 8.02 19-Mar-2010
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/pcre_dfa_exec.c 2010-06-09 19:30:57 UTC (rev 538)
@@ -3109,8 +3109,16 @@
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ current_subject++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(current_subject < end_subject &&
+ (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif
+ }
+ else break;
}
}
}
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/pcre_exec.c 2010-06-09 19:30:57 UTC (rev 538)
@@ -5959,8 +5959,16 @@
while (start_match < end_subject)
{
register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
- else break;
+ if ((start_bits[c/8] & (1 << (c&7))) == 0)
+ {
+ start_match++;
+#ifdef SUPPORT_UTF8
+ if (utf8)
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
+#endif
+ }
+ else break;
}
}
} /* Starting optimizations */
Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c 2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/pcre_study.c 2010-06-09 19:30:57 UTC (rev 538)
@@ -519,6 +519,7 @@
{
register int c;
int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
@@ -676,13 +677,14 @@
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- SET_BIT(0xA0);
if (utf8)
{
+ SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
+ else SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -692,24 +694,33 @@
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- SET_BIT(0x85);
- if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
try_next = FALSE;
break;
/* Single character types set the bits and stop. Note that if PCRE_UCP
is set, we do not see these op codes because \d etc are converted to
properties. Therefore, these apply in the case when only ASCII characters
- are recognized to match the types. */
+ are recognized to match the types. In UTF-8 mode, we must restrict
+ ourselves to bytes less than 128, as otherwise there can be confusion
+ with bytes in the middle of UTF-8 characters. (In a "traditional"
+ environment, the tables will only recognize ASCII characters anyway, but
+ in at least one Windows environment, some bits for higher bytes were set in
+ the tables.) */
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
try_next = FALSE;
break;
@@ -718,7 +729,7 @@
discard it. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -731,7 +742,7 @@
discard it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -741,13 +752,13 @@
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
try_next = FALSE;
break;
@@ -789,13 +800,14 @@
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- SET_BIT(0xA0);
if (utf8)
{
+ SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
+ else SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -804,17 +816,21 @@
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- SET_BIT(0x85);
- if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
break;
@@ -822,7 +838,7 @@
discard it. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -834,7 +850,7 @@
discard it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
@@ -843,12 +859,12 @@
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
+ for (c = 0; c < table_limit; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
break;
}
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/testdata/testinput5 2010-06-09 19:30:57 UTC (rev 538)
@@ -777,4 +777,6 @@
/\v+A/SI8
+/\s?xxx\s/8SI
+
/-- End of testinput5 --/
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/testdata/testoutput5 2010-06-09 19:30:57 UTC (rev 538)
@@ -2090,13 +2090,13 @@
No first char
No need char
Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3
+Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3
ABC\x{09}
0: \x{09}
ABC\x{20}
0:
ABC\x{a0}
- 0: \xa0
+ 0: \x{a0}
ABC\x{1680}
0: \x{1680}
ABC\x{180e}
@@ -2124,7 +2124,7 @@
No first char
No need char
Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
ABC\x{0a}
0: \x{0a}
ABC\x{0b}
@@ -2134,7 +2134,7 @@
ABC\x{0d}
0: \x{0d}
ABC\x{85}
- 0: \x85
+ 0: \x{85}
ABC\x{2028}
0: \x{2028}
@@ -2152,7 +2152,7 @@
No first char
No need char
Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
/\h*A/SI8
Capturing subpattern count = 0
@@ -2160,7 +2160,7 @@
No first char
Need char = 'A'
Subject length lower bound = 1
-Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3
+Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3
CDBABC
0: A
@@ -2170,6 +2170,14 @@
No first char
Need char = 'A'
Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x
+
/-- End of testinput5 --/