[Pcre-svn] [524] code/trunk: Make pcre_study() recognize \h,…

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [524] code/trunk: Make pcre_study() recognize \h, \v, and \R.
Revision: 524
          http://vcs.pcre.org/viewvc?view=rev&revision=524
Author:   ph10
Date:     2010-05-24 18:06:28 +0100 (Mon, 24 May 2010)


Log Message:
-----------
Make pcre_study() recognize \h, \v, and \R.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_study.c
    code/trunk/testdata/testinput12
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput12
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/ChangeLog    2010-05-24 17:06:28 UTC (rev 524)
@@ -54,6 +54,9 @@
     setting up an incorrect bitmap of starting bytes, but fortunately it could 
     not have actually happened in practice until change 8 above was made (it 
     added property types that matched character-matching opcodes).
+    
+14. pcre_study() now recognizes \h, \v, and \R when constructing a bit map of 
+    possible starting bytes for non-anchored patterns. 





Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c    2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/pcre_study.c    2010-05-24 17:06:28 UTC (rev 524)
@@ -48,6 +48,7 @@


#include "pcre_internal.h"

+#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))

/* Returns from set_start_bits() */

@@ -460,8 +461,9 @@
compile_data *cd, BOOL utf8)
{
unsigned int c = *p;
-start_bits[c/8] |= (1 << (c&7));

+SET_BIT(c);
+
 #ifdef SUPPORT_UTF8
 if (utf8 && c > 127)
   {
@@ -472,8 +474,7 @@
     uschar buff[8]; 
     c = UCD_OTHERCASE(c);
     (void)_pcre_ord2utf8(c, buff); 
-    c = buff[0];
-    start_bits[c/8] |= (1 << (c&7));
+    SET_BIT(buff[0]); 
     }  
 #endif 
   return p;
@@ -482,8 +483,7 @@


/* Not UTF-8 mode, or character is less than 128. */

-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
- start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
}

@@ -666,7 +666,37 @@
       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
       try_next = FALSE;
       break;
+      
+      /* Special spacing and line-terminating items. These recognize specific 
+      lists of characters. The difference between VSPACE and ANYNL is that the 
+      latter can match the two-character CRLF sequence, but that is not 
+      relevant for finding the first character, so their code here is 
+      identical. */
+      
+      case OP_HSPACE:
+      SET_BIT(0x09);
+      SET_BIT(0x20);
+      SET_BIT(0xA0);
+      if (utf8)
+        {  
+        SET_BIT(0xE1);  /* For U+1680, U+180E */
+        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
+        SET_BIT(0xE3);  /* For U+3000 */ 
+        }
+      try_next = FALSE;
+      break;         


+      case OP_ANYNL:  
+      case OP_VSPACE:
+      SET_BIT(0x0A); 
+      SET_BIT(0x0B); 
+      SET_BIT(0x0C); 
+      SET_BIT(0x0D); 
+      SET_BIT(0x85); 
+      if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */ 
+      try_next = FALSE;
+      break;  
+
       /* Single character types set the bits and stop. Note that if PCRE_UCP 
       is set, we do not see these op codes because \d etc are converted to 
       properties. Therefore, these apply in the case when only ASCII characters 
@@ -727,6 +757,7 @@


       case OP_TYPEPLUS:
       case OP_TYPEMINPLUS:
+      case OP_TYPEPOSPLUS: 
       tcode++;
       break;


@@ -754,7 +785,29 @@
         case OP_ANY:
         case OP_ALLANY:
         return SSB_FAIL;
-
+        
+        case OP_HSPACE:
+        SET_BIT(0x09);
+        SET_BIT(0x20);
+        SET_BIT(0xA0);
+        if (utf8)
+          {  
+          SET_BIT(0xE1);  /* For U+1680, U+180E */
+          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
+          SET_BIT(0xE3);  /* For U+3000 */ 
+          }
+        break;         
+  
+        case OP_ANYNL:  
+        case OP_VSPACE:
+        SET_BIT(0x0A); 
+        SET_BIT(0x0B); 
+        SET_BIT(0x0C); 
+        SET_BIT(0x0D); 
+        SET_BIT(0x85); 
+        if (utf8) SET_BIT(0xE2);    /* For U+2028, U+2029 */ 
+        break;  
+ 
         case OP_NOT_DIGIT:
         for (c = 0; c < 32; c++)
           start_bits[c] |= ~cd->cbits[c+cbit_digit];


Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testinput12    2010-05-24 17:06:28 UTC (rev 524)
@@ -483,4 +483,6 @@
     \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
     \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}


+/\p{Xps}*/SI
+
/-- End of testinput12 --/

Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testinput5    2010-05-24 17:06:28 UTC (rev 524)
@@ -745,4 +745,36 @@
 /X\W{3}X/8
     \PX


+/\h/SI
+
+/\h/SI8
+    ABC\x{09}
+    ABC\x{20}
+    ABC\x{a0}
+    ABC\x{1680}
+    ABC\x{180e}
+    ABC\x{2000}
+    ABC\x{202f} 
+    ABC\x{205f} 
+    ABC\x{3000} 
+
+/\v/SI
+
+/\v/SI8
+    ABC\x{0a}
+    ABC\x{0b}
+    ABC\x{0c}
+    ABC\x{0d}
+    ABC\x{85}
+    ABC\x{2028}
+
+/\R/SI
+
+/\R/SI8
+
+/\h*A/SI8
+    CDBABC
+    
+/\v+A/SI8
+
 /-- End of testinput5 --/


Modified: code/trunk/testdata/testoutput12
===================================================================
--- code/trunk/testdata/testoutput12    2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testoutput12    2010-05-24 17:06:28 UTC (rev 524)
@@ -1059,4 +1059,12 @@
     \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
  0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}


+/\p{Xps}*/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 0
+No set of starting bytes
+
/-- End of testinput12 --/

Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testoutput5    2010-05-24 17:06:28 UTC (rev 524)
@@ -2076,4 +2076,100 @@
     \PX
 Partial match: X


+/\h/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0 
+
+/\h/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3 
+    ABC\x{09}
+ 0: \x{09}
+    ABC\x{20}
+ 0:  
+    ABC\x{a0}
+ 0: \xa0
+    ABC\x{1680}
+ 0: \x{1680}
+    ABC\x{180e}
+ 0: \x{180e}
+    ABC\x{2000}
+ 0: \x{2000}
+    ABC\x{202f} 
+ 0: \x{202f}
+    ABC\x{205f} 
+ 0: \x{205f}
+    ABC\x{3000} 
+ 0: \x{3000}
+
+/\v/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 
+
+/\v/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+    ABC\x{0a}
+ 0: \x{0a}
+    ABC\x{0b}
+ 0: \x{0b}
+    ABC\x{0c}
+ 0: \x{0c}
+    ABC\x{0d}
+ 0: \x{0d}
+    ABC\x{85}
+ 0: \x85
+    ABC\x{2028}
+ 0: \x{2028}
+
+/\R/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 
+
+/\R/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+No need char
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+
+/\h*A/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'A'
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3 
+    CDBABC
+ 0: A
+    
+/\v+A/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'A'
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2 
+
 /-- End of testinput5 --/