Revision: 524
http://vcs.pcre.org/viewvc?view=rev&revision=524
Author: ph10
Date: 2010-05-24 18:06:28 +0100 (Mon, 24 May 2010)
Log Message:
-----------
Make pcre_study() recognize \h, \v, and \R.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_study.c
code/trunk/testdata/testinput12
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput12
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/ChangeLog 2010-05-24 17:06:28 UTC (rev 524)
@@ -54,6 +54,9 @@
setting up an incorrect bitmap of starting bytes, but fortunately it could
not have actually happened in practice until change 8 above was made (it
added property types that matched character-matching opcodes).
+
+14. pcre_study() now recognizes \h, \v, and \R when constructing a bit map of
+ possible starting bytes for non-anchored patterns.
Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c 2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/pcre_study.c 2010-05-24 17:06:28 UTC (rev 524)
@@ -48,6 +48,7 @@
#include "pcre_internal.h"
+/* Set the bit for byte value c in the bit map called start_bits, which must
+be in scope wherever this macro is used. The argument is evaluated twice, so
+it must not have side effects. */
+#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))
/* Returns from set_start_bits() */
@@ -460,8 +461,9 @@
compile_data *cd, BOOL utf8)
{
unsigned int c = *p;
-start_bits[c/8] |= (1 << (c&7));
+SET_BIT(c);
+
#ifdef SUPPORT_UTF8
if (utf8 && c > 127)
{
@@ -472,8 +474,7 @@
uschar buff[8];
c = UCD_OTHERCASE(c);
(void)_pcre_ord2utf8(c, buff);
- c = buff[0];
- start_bits[c/8] |= (1 << (c&7));
+ SET_BIT(buff[0]);
}
#endif
return p;
@@ -482,8 +483,7 @@
/* Not UTF-8 mode, or character is less than 127. */
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
- start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
}
@@ -666,7 +666,37 @@
(void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
try_next = FALSE;
break;
+
+ /* Special spacing and line-terminating items. These recognize specific
+ lists of characters. The difference between VSPACE and ANYNL is that the
+ latter can match the two-character CRLF sequence, but that is not
+ relevant for finding the first character, so their code here is
+ identical. */
+
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ SET_BIT(0xA0);
+ if (utf8)
+ {
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ try_next = FALSE;
+ break;
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ SET_BIT(0x85);
+ if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ try_next = FALSE;
+ break;
+
/* Single character types set the bits and stop. Note that if PCRE_UCP
is set, we do not see these op codes because \d etc are converted to
properties. Therefore, these apply in the case when only ASCII characters
@@ -727,6 +757,7 @@
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
+ case OP_TYPEPOSPLUS:
tcode++;
break;
@@ -754,7 +785,29 @@
case OP_ANY:
case OP_ALLANY:
return SSB_FAIL;
-
+
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ SET_BIT(0xA0);
+ if (utf8)
+ {
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ SET_BIT(0x85);
+ if (utf8) SET_BIT(0xE2); /* For U+2028, U+2029 */
+ break;
+
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12 2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testinput12 2010-05-24 17:06:28 UTC (rev 524)
@@ -483,4 +483,6 @@
\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
\x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+/\p{Xps}*/SI
+
/-- End of testinput12 --/
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testinput5 2010-05-24 17:06:28 UTC (rev 524)
@@ -745,4 +745,36 @@
/X\W{3}X/8
\PX
+/\h/SI
+
+/\h/SI8
+ ABC\x{09}
+ ABC\x{20}
+ ABC\x{a0}
+ ABC\x{1680}
+ ABC\x{180e}
+ ABC\x{2000}
+ ABC\x{202f}
+ ABC\x{205f}
+ ABC\x{3000}
+
+/\v/SI
+
+/\v/SI8
+ ABC\x{0a}
+ ABC\x{0b}
+ ABC\x{0c}
+ ABC\x{0d}
+ ABC\x{85}
+ ABC\x{2028}
+
+/\R/SI
+
+/\R/SI8
+
+/\h*A/SI8
+ CDBABC
+
+/\v+A/SI8
+
/-- End of testinput5 --/
Modified: code/trunk/testdata/testoutput12
===================================================================
--- code/trunk/testdata/testoutput12 2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testoutput12 2010-05-24 17:06:28 UTC (rev 524)
@@ -1059,4 +1059,12 @@
\x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
0: \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
+/\p{Xps}*/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 0
+No set of starting bytes
+
/-- End of testinput12 --/
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2010-05-23 18:48:54 UTC (rev 523)
+++ code/trunk/testdata/testoutput5 2010-05-24 17:06:28 UTC (rev 524)
@@ -2076,4 +2076,100 @@
\PX
Partial match: X
+/\h/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0
+
+/\h/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0 \xe1 \xe2 \xe3
+ ABC\x{09}
+ 0: \x{09}
+ ABC\x{20}
+ 0:
+ ABC\x{a0}
+ 0: \xa0
+ ABC\x{1680}
+ 0: \x{1680}
+ ABC\x{180e}
+ 0: \x{180e}
+ ABC\x{2000}
+ 0: \x{2000}
+ ABC\x{202f}
+ 0: \x{202f}
+ ABC\x{205f}
+ 0: \x{205f}
+ ABC\x{3000}
+ 0: \x{3000}
+
+/\v/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85
+
+/\v/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+ ABC\x{0a}
+ 0: \x{0a}
+ ABC\x{0b}
+ 0: \x{0b}
+ ABC\x{0c}
+ 0: \x{0c}
+ ABC\x{0d}
+ 0: \x{0d}
+ ABC\x{85}
+ 0: \x85
+ ABC\x{2028}
+ 0: \x{2028}
+
+/\R/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85
+
+/\R/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+No need char
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+
+/\h*A/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'A'
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 A \xa0 \xe1 \xe2 \xe3
+ CDBABC
+ 0: A
+
+/\v+A/SI8
+Capturing subpattern count = 0
+Options: utf8
+No first char
+Need char = 'A'
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xe2
+
/-- End of testinput5 --/