[Pcre-svn] [538] code/trunk: Fix pcre_study() problem with …

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [538] code/trunk: Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.
Revision: 538
          http://vcs.pcre.org/viewvc?view=rev&revision=538
Author:   ph10
Date:     2010-06-09 20:30:57 +0100 (Wed, 09 Jun 2010)


Log Message:
-----------
Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_dfa_exec.c
    code/trunk/pcre_exec.c
    code/trunk/pcre_study.c
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/ChangeLog    2010-06-09 19:30:57 UTC (rev 538)
@@ -72,6 +72,18 @@
 18. If the last data line in a file for pcretest does not have a newline on
     the end, a newline was missing in the output. 


+19. The default pcre_chartables.c file recognizes only ASCII characters (values 
+    less than 128) in its various bitmaps. However, there is a facility for 
+    generating tables according to the current locale when PCRE is compiled. It 
+    turns out that in some environments, 0x85 and 0xa0, which are Unicode space 
+    characters, are recognized by isspace() and therefore were getting set in 
+    these tables. This caused a problem in UTF-8 mode when pcre_study() was
+    used to create a list of bytes that can start a match. For \s, it was
+    including 0x85 and 0xa0, which of course cannot start UTF-8 characters. I
+    have changed the code so that only real ASCII characters (less than 128)
+    are set in this case because the \s etc escapes are documented as 
+    recognizing only ASCII characters. (When PCRE_UCP is set - see 9 above - 
+    the code is different altogether.)



Version 8.02 19-Mar-2010

Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/pcre_dfa_exec.c    2010-06-09 19:30:57 UTC (rev 538)
@@ -3109,8 +3109,16 @@
         while (current_subject < end_subject)
           {
           register unsigned int c = *current_subject;
-          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
-            else break;
+          if ((start_bits[c/8] & (1 << (c&7))) == 0) 
+            {
+            current_subject++;
+#ifdef SUPPORT_UTF8
+            if (utf8)
+              while(current_subject < end_subject && 
+                    (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif            
+            }
+          else break;
           }
         }
       }


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/pcre_exec.c    2010-06-09 19:30:57 UTC (rev 538)
@@ -5959,8 +5959,16 @@
       while (start_match < end_subject)
         {
         register unsigned int c = *start_match;
-        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
-          else break;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0) 
+          {
+          start_match++;
+#ifdef SUPPORT_UTF8
+          if (utf8)
+            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+              start_match++;
+#endif            
+          }
+        else break;
         }
       }
     }   /* Starting optimizations */


Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c    2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/pcre_study.c    2010-06-09 19:30:57 UTC (rev 538)
@@ -519,6 +519,7 @@
 {
 register int c;
 int yield = SSB_DONE;
+int table_limit = utf8? 16:32;


 #if 0
 /* ========================================================================= */
@@ -676,13 +677,14 @@
       case OP_HSPACE:
       SET_BIT(0x09);
       SET_BIT(0x20);
-      SET_BIT(0xA0);
       if (utf8)
         {
+        SET_BIT(0xC2);  /* For U+00A0 */ 
         SET_BIT(0xE1);  /* For U+1680, U+180E */
         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
         SET_BIT(0xE3);  /* For U+3000 */
         }
+      else SET_BIT(0xA0);
       try_next = FALSE;
       break;


@@ -692,24 +694,33 @@
       SET_BIT(0x0B);
       SET_BIT(0x0C);
       SET_BIT(0x0D);
-      SET_BIT(0x85);
-      if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */
+      if (utf8) 
+        { 
+        SET_BIT(0xC2);  /* For U+0085 */ 
+        SET_BIT(0xE2);  /* For U+2028, U+2029 */
+        } 
+      else SET_BIT(0x85);
       try_next = FALSE;
       break;


       /* Single character types set the bits and stop. Note that if PCRE_UCP
       is set, we do not see these op codes because \d etc are converted to
       properties. Therefore, these apply in the case when only ASCII characters
-      are recognized to match the types. */
+      are recognized to match the types. In UTF-8 mode, we must restrict 
+      ourselves to bytes less than 128, as otherwise there can be confusion 
+      with bytes in the middle of UTF-8 characters. (In a "traditional" 
+      environment, the tables will only recognize ASCII characters anyway, but 
+      in at least one Windows environment, bits for some higher bytes were set in 
+      the tables.) */


       case OP_NOT_DIGIT:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= ~cd->cbits[c+cbit_digit];
       try_next = FALSE;
       break;


       case OP_DIGIT:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= cd->cbits[c+cbit_digit];
       try_next = FALSE;
       break;
@@ -718,7 +729,7 @@
       discard it. */


       case OP_NOT_WHITESPACE:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         {
         int d = cd->cbits[c+cbit_space];
         if (c == 1) d &= ~0x08;
@@ -731,7 +742,7 @@
       discard it. */


       case OP_WHITESPACE:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         {
         int d = cd->cbits[c+cbit_space];
         if (c == 1) d &= ~0x08;
@@ -741,13 +752,13 @@
       break;


       case OP_NOT_WORDCHAR:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= ~cd->cbits[c+cbit_word];
       try_next = FALSE;
       break;


       case OP_WORDCHAR:
-      for (c = 0; c < 32; c++)
+      for (c = 0; c < table_limit; c++)
         start_bits[c] |= cd->cbits[c+cbit_word];
       try_next = FALSE;
       break;
@@ -789,13 +800,14 @@
         case OP_HSPACE:
         SET_BIT(0x09);
         SET_BIT(0x20);
-        SET_BIT(0xA0);
         if (utf8)
           {
+          SET_BIT(0xC2);  /* For U+00A0 */ 
           SET_BIT(0xE1);  /* For U+1680, U+180E */
           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
           SET_BIT(0xE3);  /* For U+3000 */
           }
+        else SET_BIT(0xA0);
         break;


         case OP_ANYNL:
@@ -804,17 +816,21 @@
         SET_BIT(0x0B);
         SET_BIT(0x0C);
         SET_BIT(0x0D);
-        SET_BIT(0x85);
-        if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */
+        if (utf8) 
+          {
+          SET_BIT(0xC2);  /* For U+0085 */ 
+          SET_BIT(0xE2);  /* For U+2028, U+2029 */
+          } 
+        else SET_BIT(0x85);
         break;


         case OP_NOT_DIGIT:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= ~cd->cbits[c+cbit_digit];
         break;


         case OP_DIGIT:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= cd->cbits[c+cbit_digit];
         break;


@@ -822,7 +838,7 @@
         discard it. */


         case OP_NOT_WHITESPACE:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           {
           int d = cd->cbits[c+cbit_space];
           if (c == 1) d &= ~0x08;
@@ -834,7 +850,7 @@
         discard it. */


         case OP_WHITESPACE:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           {
           int d = cd->cbits[c+cbit_space];
           if (c == 1) d &= ~0x08;
@@ -843,12 +859,12 @@
         break;


         case OP_NOT_WORDCHAR:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= ~cd->cbits[c+cbit_word];
         break;


         case OP_WORDCHAR:
-        for (c = 0; c < 32; c++)
+        for (c = 0; c < table_limit; c++)
           start_bits[c] |= cd->cbits[c+cbit_word];
         break;
         }


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/testdata/testinput5    2010-06-09 19:30:57 UTC (rev 538)
@@ -777,4 +777,6 @@


/\v+A/SI8

+/\s?xxx\s/8SI
+
/-- End of testinput5 --/

Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2010-06-08 15:28:26 UTC (rev 537)
+++ code/trunk/testdata/testoutput5    2010-06-09 19:30:57 UTC (rev 538)
@@ -2090,13 +2090,13 @@
 No first char
 No need char
 Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3 
+Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 
     ABC\x{09}
  0: \x{09}
     ABC\x{20}
  0:  
     ABC\x{a0}
- 0: \xa0
+ 0: \x{a0}
     ABC\x{1680}
  0: \x{1680}
     ABC\x{180e}
@@ -2124,7 +2124,7 @@
 No first char
 No need char
 Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
     ABC\x{0a}
  0: \x{0a}
     ABC\x{0b}
@@ -2134,7 +2134,7 @@
     ABC\x{0d}
  0: \x{0d}
     ABC\x{85}
- 0: \x85
+ 0: \x{85}
     ABC\x{2028}
  0: \x{2028}


@@ -2152,7 +2152,7 @@
No first char
No need char
Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2

 /\h*A/SI8
 Capturing subpattern count = 0
@@ -2160,7 +2160,7 @@
 No first char
 Need char = 'A'
 Subject length lower bound = 1
-Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3 
+Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 
     CDBABC
  0: A


@@ -2170,6 +2170,14 @@
No first char
Need char = 'A'
Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2

+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x
+
/-- End of testinput5 --/