Revision: 925
http://vcs.pcre.org/viewvc?view=rev&revision=925
Author: ph10
Date: 2012-02-22 14:24:56 +0000 (Wed, 22 Feb 2012)
Log Message:
-----------
Upgrade DFA support to handle OP_NOTI with multibyte characters (other related
opcodes were OK).
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_dfa_exec.c
code/trunk/testdata/testinput10
code/trunk/testdata/testinput9
code/trunk/testdata/testoutput10
code/trunk/testdata/testoutput9
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-02-22 10:23:56 UTC (rev 924)
+++ code/trunk/ChangeLog 2012-02-22 14:24:56 UTC (rev 925)
@@ -42,7 +42,7 @@
7. Individual JIT compile options can be set in pcretest by following -s+[+]
or /S+[+] with a digit between 1 and 7.
-8. OP_NOT now supports any UTF character not just single character ones.
+8. OP_NOT now supports any UTF character not just single-byte ones.
Version 8.30 04-February-2012
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2012-02-22 10:23:56 UTC (rev 924)
+++ code/trunk/pcre_dfa_exec.c 2012-02-22 14:24:56 UTC (rev 925)
@@ -695,10 +695,10 @@
permitted.
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
- argument that is not a data character - but is always one byte long. We
- have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
- this case. To keep the other cases fast, convert these ones to new opcodes.
- */
+ argument that is not a data character - but is always one byte long because
+ the values are small. We have to take special action to deal with \P, \p,
+ \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
+ these ones to new opcodes. */
if (coptable[codevalue] > 0)
{
@@ -2266,22 +2266,32 @@
break;
/*-----------------------------------------------------------------*/
- /* Match a negated single character casefully. This is only used for
- one-byte characters, that is, we know that d < 256. The character we are
- checking (c) can be multibyte. */
+ /* Match a negated single character casefully. */
case OP_NOT:
if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
break;
/*-----------------------------------------------------------------*/
- /* Match a negated single character caselessly. This is only used for
- one-byte characters, that is, we know that d < 256. The character we are
- checking (c) can be multibyte. */
+ /* Match a negated single character caselessly. */
case OP_NOTI:
- if (clen > 0 && c != d && c != fcc[d])
- { ADD_NEW(state_offset + dlen + 1, 0); }
+ if (clen > 0)
+ {
+ unsigned int otherd;
+#ifdef SUPPORT_UTF
+ if (utf && d >= 128)
+ {
+#ifdef SUPPORT_UCP
+ otherd = UCD_OTHERCASE(d);
+#endif /* SUPPORT_UCP */
+ }
+ else
+#endif /* SUPPORT_UTF */
+ otherd = TABLE_GET(d, fcc, d);
+ if (c != d && c != otherd)
+ { ADD_NEW(state_offset + dlen + 1, 0); }
+ }
break;
/*-----------------------------------------------------------------*/
Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10 2012-02-22 10:23:56 UTC (rev 924)
+++ code/trunk/testdata/testinput10 2012-02-22 14:24:56 UTC (rev 925)
@@ -985,5 +985,13 @@
abc_
!\x{c0}++\x{c1}\x{c2}
!\x{c0}+++++
+
+/-- Caseless single negated characters > 127 need UCP support --/
+/[^\x{100}]/8i
+ \x{100}\x{101}X
+
+/[^\x{100}]+/8i
+ \x{100}\x{101}XX
+
/-- End of testinput10 --/
Modified: code/trunk/testdata/testinput9
===================================================================
--- code/trunk/testdata/testinput9 2012-02-22 10:23:56 UTC (rev 924)
+++ code/trunk/testdata/testinput9 2012-02-22 14:24:56 UTC (rev 925)
@@ -740,4 +740,10 @@
\r\r\r\P
\r\r\r\P\P
+/[^\x{100}]/8
+ \x{100}\x{101}X
+
+/[^\x{100}]+/8
+ \x{100}\x{101}X
+
/-- End of testinput9 --/
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2012-02-22 10:23:56 UTC (rev 924)
+++ code/trunk/testdata/testoutput10 2012-02-22 14:24:56 UTC (rev 925)
@@ -2033,5 +2033,16 @@
0: ++\xc1
!\x{c0}+++++
0: \xc0++
+
+/-- Caseless single negated characters > 127 need UCP support --/
+/[^\x{100}]/8i
+ \x{100}\x{101}X
+ 0: X
+
+/[^\x{100}]+/8i
+ \x{100}\x{101}XX
+ 0: XX
+ 1: X
+
/-- End of testinput10 --/
Modified: code/trunk/testdata/testoutput9
===================================================================
--- code/trunk/testdata/testoutput9 2012-02-22 10:23:56 UTC (rev 924)
+++ code/trunk/testdata/testoutput9 2012-02-22 14:24:56 UTC (rev 925)
@@ -1414,4 +1414,13 @@
\r\r\r\P\P
Partial match: \x{0d}\x{0d}\x{0d}
+/[^\x{100}]/8
+ \x{100}\x{101}X
+ 0: \x{101}
+
+/[^\x{100}]+/8
+ \x{100}\x{101}X
+ 0: \x{101}X
+ 1: \x{101}
+
/-- End of testinput9 --/