Revision: 554
http://www.exim.org/viewvc/pcre2?view=rev&revision=554
Author: ph10
Date: 2016-08-03 18:22:59 +0100 (Wed, 03 Aug 2016)
Log Message:
-----------
Fix bug that caused chars > 255 not to be matched by classes like [\W\pL] when
PCRE2_UCP was not set.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput10
code/trunk/testdata/testinput12
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput10
code/trunk/testdata/testoutput12-16
code/trunk/testdata/testoutput12-32
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/ChangeLog 2016-08-03 17:22:59 UTC (rev 554)
@@ -8,7 +8,13 @@
1. Extended pcre2test with the utf8_input modifier so that it is able to
generate all possible 16-bit and 32-bit code unit values in non-UTF modes.
+2. In any wide-character mode (8-bit UTF or any 16-bit or 32-bit mode), without
+PCRE2_UCP set, a negative character type such as \D in a positive class should
+cause all characters greater than 255 to match, whatever else is in the class.
+There was a bug that caused this not to happen if a Unicode property item was
+added to such a class, for example [\D\P{Nd}] or [\W\pL].
+
Version 10.22 29-July-2016
--------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/src/pcre2_compile.c 2016-08-03 17:22:59 UTC (rev 554)
@@ -4950,11 +4950,11 @@
}
#ifdef SUPPORT_WIDE_CHARS
- /* If any wide characters have been encountered, set xclass = TRUE. Then,
- in the pre-compile phase, accumulate the length of the wide characters
- and reset the pointer. This is so that very large classes that contain a
- zillion wide characters do not overwrite the work space (which is on the
- stack). */
+ /* If any wide characters or Unicode properties have been encountered,
+ set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
+ of the wide characters etc. and reset the pointer. This is so that very
+ large classes that contain a zillion wide characters do not overwrite the
+ work space (which is on the stack). */
if (class_uchardata > class_uchardata_base)
{
@@ -4994,22 +4994,43 @@
negated). This requirement is indicated by match_all_or_no_wide_chars being
true. We do this by including an explicit range, which works in both cases.
+ When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
+ class where \S etc. is present without PCRE2_UCP, causing an extended class
+ to be compiled, we make sure that all characters > 255 are included by
+ forcing match_all_or_no_wide_chars to be true.
+
If, when generating an xclass, there are no characters < 256, we can omit
the bitmap in the actual compiled code. */
-#ifdef SUPPORT_WIDE_CHARS
+#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
+ if (xclass && (
#ifdef SUPPORT_UNICODE
- if (xclass && (xclass_has_prop || !should_flip_negation ||
- (options & PCRE2_UCP) != 0))
-#elif PCRE2_CODE_UNIT_WIDTH != 8
- if (xclass && (xclass_has_prop || !should_flip_negation))
+ (options & PCRE2_UCP) != 0 ||
#endif
+ xclass_has_prop || !should_flip_negation))
{
- if (match_all_or_no_wide_chars)
+ if (match_all_or_no_wide_chars || (
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ utf &&
+#endif
+ should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
{
*class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
- class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
+ if (utf) /* Will always be utf in the 8-bit library */
+ {
+ class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
+ class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
+ }
+ else /* Can only happen for the 16-bit & 32-bit libraries */
+ {
+#if PCRE2_CODE_UNIT_WIDTH == 16
+ *class_uchardata++ = 0x100;
+ *class_uchardata++ = 0xffffu;
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+ *class_uchardata++ = 0x100;
+ *class_uchardata++ = 0xffffffffu;
+#endif
+ }
}
*class_uchardata++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
@@ -5037,7 +5058,7 @@
PUT(previous, 1, (int)(code - previous));
break; /* End of class handling */
}
-#endif
+#endif /* SUPPORT_WIDE_CHARS */
/* If there are no characters > 255, or they are all to be included or
excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testinput10 2016-08-03 17:22:59 UTC (rev 554)
@@ -445,4 +445,13 @@
/(?<=(a)(?-1))x/I,utf
a\x80zx\=offset=3
+/[\W\p{Any}]/B
+ abc
+ 123
+
+/[\W\pL]/B
+ abc
+\= Expect no match
+ 123
+
# End of testinput10
Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testinput12 2016-08-03 17:22:59 UTC (rev 554)
@@ -347,4 +347,15 @@
/ab\x{7FFFFFFF}z/utf
+/[\W\p{Any}]/B
+ abc
+ 123
+
+/[\W\pL]/B
+ abc
+ \x{100}
+ \x{308}
+\= Expect no match
+ 123
+
# End of testinput12
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testinput5 2016-08-03 17:22:59 UTC (rev 554)
@@ -1677,15 +1677,6 @@
/((?<digit>\d)|(?<letter>\p{L}))/g,substitute_extended,replace=<${digit:+digit; :not digit; }${letter:+letter:not a letter}>
ab12cde
-/[\W\p{Any}]/B
- abc
- 123
-
-/[\W\pL]/B
- abc
-\= Expect no match
- 123
-
/(*UCP)(*UTF)[[:>:]]X/B
/abc/utf,replace=xyz
@@ -1718,4 +1709,21 @@
/(*UTF)C\x09((?<!'(?x)!*H? #\xcc\x9a[^$]/
+/[\D]/utf
+ \x{1d7cf}
+
+/[\D\P{Nd}]/utf
+ \x{1d7cf}
+
+/[^\D]/utf
+ a9b
+\= Expect no match
+ \x{1d7cf}
+
+/[^\D\P{Nd}]/utf
+ a9b
+ \x{1d7cf}
+\= Expect no match
+ \x{10000}
+
# End of testinput5
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testoutput10 2016-08-03 17:22:59 UTC (rev 554)
@@ -1539,4 +1539,29 @@
a\x80zx\=offset=3
Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1
+/[\W\p{Any}]/B
+------------------------------------------------------------------
+ Bra
+ [\x00-/:-@[-^`{-\xff\p{Any}]
+ Ket
+ End
+------------------------------------------------------------------
+ abc
+ 0: a
+ 123
+ 0: 1
+
+/[\W\pL]/B
+------------------------------------------------------------------
+ Bra
+ [\x00-/:-@[-^`{-\xff\p{L}]
+ Ket
+ End
+------------------------------------------------------------------
+ abc
+ 0: a
+\= Expect no match
+ 123
+No match
+
# End of testinput10
Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testoutput12-16 2016-08-03 17:22:59 UTC (rev 554)
@@ -1378,4 +1378,33 @@
/ab\x{7FFFFFFF}z/utf
** Failed: character value greater than 0x10ffff cannot be converted to UTF
+/[\W\p{Any}]/B
+------------------------------------------------------------------
+ Bra
+ [\x00-/:-@[-^`{-\xff\p{Any}\x{100}-\x{ffff}]
+ Ket
+ End
+------------------------------------------------------------------
+ abc
+ 0: a
+ 123
+ 0: 1
+
+/[\W\pL]/B
+------------------------------------------------------------------
+ Bra
+ [\x00-/:-@[-^`{-\xff\p{L}\x{100}-\x{ffff}]
+ Ket
+ End
+------------------------------------------------------------------
+ abc
+ 0: a
+ \x{100}
+ 0: \x{100}
+ \x{308}
+ 0: \x{308}
+\= Expect no match
+ 123
+No match
+
# End of testinput12
Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testoutput12-32 2016-08-03 17:22:59 UTC (rev 554)
@@ -1372,4 +1372,33 @@
/ab\x{7FFFFFFF}z/utf
** Failed: character value greater than 0x10ffff cannot be converted to UTF
+/[\W\p{Any}]/B
+------------------------------------------------------------------
+ Bra
+ [\x00-/:-@[-^`{-\xff\p{Any}\x{100}-\x{ffffffff}]
+ Ket
+ End
+------------------------------------------------------------------
+ abc
+ 0: a
+ 123
+ 0: 1
+
+/[\W\pL]/B
+------------------------------------------------------------------
+ Bra
+ [\x00-/:-@[-^`{-\xff\p{L}\x{100}-\x{ffffffff}]
+ Ket
+ End
+------------------------------------------------------------------
+ abc
+ 0: a
+ \x{100}
+ 0: \x{100}
+ \x{308}
+ 0: \x{308}
+\= Expect no match
+ 123
+No match
+
# End of testinput12
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2016-08-03 09:01:02 UTC (rev 553)
+++ code/trunk/testdata/testoutput5 2016-08-03 17:22:59 UTC (rev 554)
@@ -4022,31 +4022,6 @@
ab12cde
7: <not digit; letter><not digit; letter><digit; not a letter><digit; not a letter><not digit; letter><not digit; letter><not digit; letter>
-/[\W\p{Any}]/B
-------------------------------------------------------------------
- Bra
- [\x00-/:-@[-^`{-\xff\p{Any}]
- Ket
- End
-------------------------------------------------------------------
- abc
- 0: a
- 123
- 0: 1
-
-/[\W\pL]/B
-------------------------------------------------------------------
- Bra
- [\x00-/:-@[-^`{-\xff\p{L}]
- Ket
- End
-------------------------------------------------------------------
- abc
- 0: a
-\= Expect no match
- 123
-No match
-
/(*UCP)(*UTF)[[:>:]]X/B
------------------------------------------------------------------
Bra
@@ -4163,4 +4138,28 @@
/(*UTF)C\x09((?<!'(?x)!*H? #\xcc\x9a[^$]/
Failed: error 114 at offset 39: missing closing parenthesis
+/[\D]/utf
+ \x{1d7cf}
+ 0: \x{1d7cf}
+
+/[\D\P{Nd}]/utf
+ \x{1d7cf}
+ 0: \x{1d7cf}
+
+/[^\D]/utf
+ a9b
+ 0: 9
+\= Expect no match
+ \x{1d7cf}
+No match
+
+/[^\D\P{Nd}]/utf
+ a9b
+ 0: 9
+ \x{1d7cf}
+ 0: \x{1d7cf}
+\= Expect no match
+ \x{10000}
+No match
+
# End of testinput5