Revision: 520
http://vcs.pcre.org/viewvc?view=rev&revision=520
Author: ph10
Date: 2010-05-22 19:54:05 +0100 (Sat, 22 May 2010)
Log Message:
-----------
Fix caseless bug with pcre_study() for starting letter greater than 127.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_study.c
code/trunk/testdata/testinput12
code/trunk/testdata/testoutput12
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/ChangeLog 2010-05-22 18:54:05 UTC (rev 520)
@@ -39,7 +39,12 @@
10. Added --line-buffered to pcregrep.
+11. In UTF-8 mode, if a pattern that was compiled with PCRE_CASELESS was
+ studied, and the match started with a letter with a code point greater than
+ 127 whose first byte was different to the first byte of the other case of
+ the letter, the other case of this starting letter was not recognized.
+
Version 8.02 19-Mar-2010
------------------------
Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c 2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/pcre_study.c 2010-05-22 18:54:05 UTC (rev 520)
@@ -440,25 +440,51 @@
* Set a bit and maybe its alternate case *
*************************************************/
-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.
Arguments:
start_bits points to the bit map
- c is the character
+ p points to the character
caseless the caseless flag
cd the block with char table pointers
+ utf8 TRUE for UTF-8 mode
-Returns: nothing
+Returns: pointer after the character
*/
-static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
- compile_data *cd)
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+ compile_data *cd, BOOL utf8)
{
+unsigned int c = *p;
start_bits[c/8] |= (1 << (c&7));
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ uschar buff[8];
+ c = UCD_OTHERCASE(c);
+ (void)_pcre_ord2utf8(c, buff);
+ c = buff[0];
+ start_bits[c/8] |= (1 << (c&7));
+ }
+#endif
+ return p;
+ }
+#endif
+
+/* Not UTF-8 mode, or character is less than 128. */
+
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+return p + 1;
}
@@ -616,12 +642,7 @@
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- set_table_bit(start_bits, tcode[1], caseless, cd);
- tcode += 2;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
break;
/* Single-char upto sets the bit and tries the next */
@@ -629,12 +650,7 @@
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- set_table_bit(start_bits, tcode[3], caseless, cd);
- tcode += 4;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
break;
/* At least one single char sets the bit and stops */
@@ -647,11 +663,14 @@
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- set_table_bit(start_bits, tcode[1], caseless, cd);
+ (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
try_next = FALSE;
break;
- /* Single character type sets the bits and stops */
+ /* Single character types set the bits and stop. Note that if PCRE_UCP
+ is set, we do not see these op codes because \d etc are converted to
+ properties. Therefore, these apply in the case when only ASCII characters
+ are recognized to match the types. */
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12 2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/testdata/testinput12 2010-05-22 18:54:05 UTC (rev 520)
@@ -479,4 +479,8 @@
abc\x{123}
\x{660}abc
+/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI
+ \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
+ \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+
/-- End of testinput12 --/
Modified: code/trunk/testdata/testoutput12
===================================================================
--- code/trunk/testdata/testoutput12 2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/testdata/testoutput12 2010-05-22 18:54:05 UTC (rev 520)
@@ -1047,4 +1047,16 @@
\x{660}abc
0: abc
+/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI
+Capturing subpattern count = 0
+Options: caseless utf8
+No first char
+No need char
+Subject length lower bound = 17
+Starting byte set: \xd0 \xd1
+ \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
+ 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
+ \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+ 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+
/-- End of testinput12 --/