[Pcre-svn] [520] code/trunk: Fix caseless bug with pcre_study() for starting letter greater than 127.

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [520] code/trunk: Fix caseless bug with pcre_study() for starting letter greater than 127.
Revision: 520
          http://vcs.pcre.org/viewvc?view=rev&revision=520
Author:   ph10
Date:     2010-05-22 19:54:05 +0100 (Sat, 22 May 2010)


Log Message:
-----------
Fix caseless bug with pcre_study() for starting letter greater than 127.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_study.c
    code/trunk/testdata/testinput12
    code/trunk/testdata/testoutput12


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/ChangeLog    2010-05-22 18:54:05 UTC (rev 520)
@@ -39,7 +39,12 @@


10. Added --line-buffered to pcregrep.

+11. In UTF-8 mode, if a pattern that was compiled with PCRE_CASELESS was 
+    studied, and the match started with a letter with a code point greater than 
+    127 whose first byte was different to the first byte of the other case of
+    the letter, the other case of this starting letter was not recognized.


+
Version 8.02 19-Mar-2010
------------------------


Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c    2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/pcre_study.c    2010-05-22 18:54:05 UTC (rev 520)
@@ -440,25 +440,51 @@
 *      Set a bit and maybe its alternate case    *
 *************************************************/


-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.

 Arguments:
   start_bits    points to the bit map
-  c             is the character
+  p             points to the character
   caseless      the caseless flag
   cd            the block with char table pointers
+  utf8          TRUE for UTF-8 mode 


-Returns:        nothing
+Returns:        pointer after the character
 */


-static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
-  compile_data *cd)
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+  compile_data *cd, BOOL utf8)
 {
+unsigned int c = *p;
 start_bits[c/8] |= (1 << (c&7));
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+  {
+  GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+  if (caseless)
+    {
+    uschar buff[8]; 
+    c = UCD_OTHERCASE(c);
+    (void)_pcre_ord2utf8(c, buff); 
+    c = buff[0];
+    start_bits[c/8] |= (1 << (c&7));
+    }  
+#endif 
+  return p;
+  }
+#endif    
+
+/* Not UTF-8 mode, or character is less than 128. */
+
 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+return p + 1;
 }



@@ -616,12 +642,7 @@
       case OP_QUERY:
       case OP_MINQUERY:
       case OP_POSQUERY:
-      set_table_bit(start_bits, tcode[1], caseless, cd);
-      tcode += 2;
-#ifdef SUPPORT_UTF8
-      if (utf8 && tcode[-1] >= 0xc0)
-        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+      tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       break;


       /* Single-char upto sets the bit and tries the next */
@@ -629,12 +650,7 @@
       case OP_UPTO:
       case OP_MINUPTO:
       case OP_POSUPTO:
-      set_table_bit(start_bits, tcode[3], caseless, cd);
-      tcode += 4;
-#ifdef SUPPORT_UTF8
-      if (utf8 && tcode[-1] >= 0xc0)
-        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+      tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
       break;


       /* At least one single char sets the bit and stops */
@@ -647,11 +663,14 @@
       case OP_PLUS:
       case OP_MINPLUS:
       case OP_POSPLUS:
-      set_table_bit(start_bits, tcode[1], caseless, cd);
+      (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       try_next = FALSE;
       break;


-      /* Single character type sets the bits and stops */
+      /* Single character types set the bits and stop. Note that if PCRE_UCP 
+      is set, we do not see these op codes because \d etc are converted to 
+      properties. Therefore, these apply in the case when only ASCII characters 
+      are recognized to match the types. */


       case OP_NOT_DIGIT:
       for (c = 0; c < 32; c++)


Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/testdata/testinput12    2010-05-22 18:54:05 UTC (rev 520)
@@ -479,4 +479,8 @@
     abc\x{123}
     \x{660}abc   


+/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI
+    \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
+    \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+
 /-- End of testinput12 --/


Modified: code/trunk/testdata/testoutput12
===================================================================
--- code/trunk/testdata/testoutput12    2010-05-21 16:43:17 UTC (rev 519)
+++ code/trunk/testdata/testoutput12    2010-05-22 18:54:05 UTC (rev 520)
@@ -1047,4 +1047,16 @@
     \x{660}abc   
  0: abc


+/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI
+Capturing subpattern count = 0
+Options: caseless utf8
+No first char
+No need char
+Subject length lower bound = 17
+Starting byte set: \xd0 \xd1 
+    \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
+ 0: \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
+    \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+ 0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+
 /-- End of testinput12 --/