Revision: 443
http://www.exim.org/viewvc/pcre2?view=rev&revision=443
Author: ph10
Date: 2015-11-20 16:55:36 +0000 (Fri, 20 Nov 2015)
Log Message:
-----------
Fix wide character problem with negated POSIX ascii and xdigit class items.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput4
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput4
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-11-18 08:01:33 UTC (rev 442)
+++ code/trunk/ChangeLog 2015-11-20 16:55:36 UTC (rev 443)
@@ -321,7 +321,13 @@
96. [:punct:] in UCP mode was matching some characters in the range 128-255
that should not have been matched.
+97. If [:^ascii:] or [:^xdigit:] is present in a non-negated class, all
+characters with code points greater than 255 are in the class. When a Unicode
+property was also in the class (if PCRE2_UCP is set, escapes such as \w are
+turned into Unicode properties), wide characters were not correctly handled,
+and could fail to match.
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-11-18 08:01:33 UTC (rev 442)
+++ code/trunk/src/pcre2_compile.c 2015-11-20 16:55:36 UTC (rev 443)
@@ -3857,6 +3857,7 @@
{
BOOL negate_class;
BOOL should_flip_negation;
+ BOOL match_all_wide_chars;
BOOL possessive_quantifier;
BOOL is_quantifier;
BOOL is_recurse;
@@ -4187,11 +4188,12 @@
break;
}
- /* If a class contains a negative special such as \S, we need to flip the
- negation flag at the end, so that support for characters > 255 works
- correctly (they are all included in the class). */
+ /* If a non-extended class contains a negative special such as \S, we need
+ to flip the negation flag at the end, so that support for characters > 255
+ works correctly (they are all included in the class). An extended class may
+ need to insert specific matching code for wide characters. */
- should_flip_negation = FALSE;
+ should_flip_negation = match_all_wide_chars = FALSE;
/* Extended class (xclass) will be used when characters > 255
might match. */
@@ -4345,10 +4347,23 @@
ptr = tempptr + 1;
goto CONTINUE_CLASS;
- /* For all other POSIX classes, no special action is taken in UCP
- mode. Fall through to the non_UCP case. */
+ /* For the other POSIX classes (ascii, xdigit) we are going to fall
+ through to the non-UCP case and build a bit map for characters with
+ code points less than 256. If we are in a negated POSIX class
+ within a non-negated overall class, characters with code points
+ greater than 255 must all match. In the special case where we have
+ not yet generated any xclass data, and this is the final item in
+ the overall class, we need do nothing: later on, the opcode
+ OP_NCLASS will be used to indicate that characters greater than 255
+ are acceptable. If we have already seen an xclass item or one may
+ follow (we have to assume that it might if this is not the end of
+ the class), set a flag to cause the generation of an explicit range
+ for all wide codepoints. */
default:
+ if (!negate_class && local_negate &&
+ (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
+ match_all_wide_chars = TRUE;
break;
}
}
@@ -4848,10 +4863,16 @@
unless there were no property settings and there was a negated special such
as \S in the class, and PCRE2_UCP is not set, because in that case all
characters > 255 are in the class, so any that were explicitly given as
- well can be ignored. If (when there are explicit characters > 255 or
- property settings that must be listed) there are no characters < 256, we
- can omit the bitmap in the actual compiled code. */
+ well can be ignored.
+ In the UCP case, if certain negated POSIX classes ([:^ascii:] or
+ [:^xdigit:]) were present in a non-negated class, we again have to match
+ all wide characters, indicated by match_all_wide_chars being true. We do
+ this by including an explicit range.
+
+ If, when generating an xclass, there are no characters < 256, we can omit
+ the bitmap in the actual compiled code. */
+
#ifdef SUPPORT_WIDE_CHARS
#ifdef SUPPORT_UNICODE
if (xclass && (xclass_has_prop || !should_flip_negation ||
@@ -4860,6 +4881,13 @@
if (xclass && (xclass_has_prop || !should_flip_negation))
#endif
{
+ if (match_all_wide_chars)
+ {
+ *class_uchardata++ = XCL_RANGE;
+ class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT,
+ class_uchardata);
+ }
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
code += LINK_SIZE;
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2015-11-18 08:01:33 UTC (rev 442)
+++ code/trunk/testdata/testinput4 2015-11-20 16:55:36 UTC (rev 443)
@@ -2236,4 +2236,40 @@
/[[:punct:]]/utf,ucp
\x{b4}
+/[[:^ascii:]]/utf,ucp
+ \x{100}
+ \x{200}
+ \x{300}
+ \x{37e}
+\= Expect no match
+ aa
+ 99
+
+/[[:^ascii:]\w]/utf,ucp
+ aa
+ 99
+ gg
+ \x{100}
+ \x{200}
+ \x{300}
+ \x{37e}
+
+/[\w[:^ascii:]]/utf,ucp
+ aa
+ 99
+ gg
+ \x{100}
+ \x{200}
+ \x{300}
+ \x{37e}
+
+/[^[:ascii:]\W]/utf,ucp
+ \x{100}
+ \x{200}
+\= Expect no match
+ aa
+ 99
+ gg
+ \x{37e}
+
# End of testinput4
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2015-11-18 08:01:33 UTC (rev 442)
+++ code/trunk/testdata/testinput5 2015-11-20 16:55:36 UTC (rev 443)
@@ -1699,4 +1699,19 @@
/a[b[:punct:]]/utf,ucp,bincode
+/[[:^ascii:]]/utf,ucp,bincode
+
+/[[:^ascii:]\w]/utf,ucp,bincode
+
+/[\w[:^ascii:]]/utf,ucp,bincode
+
+/[^[:ascii:]\W]/utf,ucp,bincode
+ \x{de}
+ \x{200}
+\= Expect no match
+ \x{300}
+ \x{37e}
+
+/[[:^ascii:]a]/utf,ucp,bincode
+
# End of testinput5
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2015-11-18 08:01:33 UTC (rev 442)
+++ code/trunk/testdata/testoutput4 2015-11-20 16:55:36 UTC (rev 443)
@@ -3624,4 +3624,66 @@
\x{b4}
No match
+/[[:^ascii:]]/utf,ucp
+ \x{100}
+ 0: \x{100}
+ \x{200}
+ 0: \x{200}
+ \x{300}
+ 0: \x{300}
+ \x{37e}
+ 0: \x{37e}
+\= Expect no match
+ aa
+No match
+ 99
+No match
+
+/[[:^ascii:]\w]/utf,ucp
+ aa
+ 0: a
+ 99
+ 0: 9
+ gg
+ 0: g
+ \x{100}
+ 0: \x{100}
+ \x{200}
+ 0: \x{200}
+ \x{300}
+ 0: \x{300}
+ \x{37e}
+ 0: \x{37e}
+
+/[\w[:^ascii:]]/utf,ucp
+ aa
+ 0: a
+ 99
+ 0: 9
+ gg
+ 0: g
+ \x{100}
+ 0: \x{100}
+ \x{200}
+ 0: \x{200}
+ \x{300}
+ 0: \x{300}
+ \x{37e}
+ 0: \x{37e}
+
+/[^[:ascii:]\W]/utf,ucp
+ \x{100}
+ 0: \x{100}
+ \x{200}
+ 0: \x{200}
+\= Expect no match
+ aa
+No match
+ 99
+No match
+ gg
+No match
+ \x{37e}
+No match
+
# End of testinput4
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2015-11-18 08:01:33 UTC (rev 442)
+++ code/trunk/testdata/testoutput5 2015-11-20 16:55:36 UTC (rev 443)
@@ -4099,4 +4099,53 @@
End
------------------------------------------------------------------
+/[[:^ascii:]]/utf,ucp,bincode
+------------------------------------------------------------------
+ Bra
+ [\x80-\xff] (neg)
+ Ket
+ End
+------------------------------------------------------------------
+
+/[[:^ascii:]\w]/utf,ucp,bincode
+------------------------------------------------------------------
+ Bra
+ [\x80-\xff\p{Xwd}\x{100}-\x{10ffff}]
+ Ket
+ End
+------------------------------------------------------------------
+
+/[\w[:^ascii:]]/utf,ucp,bincode
+------------------------------------------------------------------
+ Bra
+ [\x80-\xff\p{Xwd}\x{100}-\x{10ffff}]
+ Ket
+ End
+------------------------------------------------------------------
+
+/[^[:ascii:]\W]/utf,ucp,bincode
+------------------------------------------------------------------
+ Bra
+ [^\x00-\x7f\P{Xwd}]
+ Ket
+ End
+------------------------------------------------------------------
+ \x{de}
+ 0: \x{de}
+ \x{200}
+ 0: \x{200}
+\= Expect no match
+ \x{300}
+No match
+ \x{37e}
+No match
+
+/[[:^ascii:]a]/utf,ucp,bincode
+------------------------------------------------------------------
+ Bra
+ [a\x80-\xff] (neg)
+ Ket
+ End
+------------------------------------------------------------------
+
# End of testinput5