Revision: 1231
http://www.exim.org/viewvc/pcre2?view=rev&revision=1231
Author: ph10
Date: 2020-02-26 16:53:39 +0000 (Wed, 26 Feb 2020)
Log Message:
-----------
Fix bugs in new UCP casing code for back references and characters with more
than 2 cases.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/src/pcre2_match.c
code/trunk/testdata/testinput12
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput12-16
code/trunk/testdata/testoutput12-32
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/ChangeLog 2020-02-26 16:53:39 UTC (rev 1231)
@@ -69,7 +69,7 @@
18. Changes in many areas of the code so that when Unicode is supported and
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
upper/lower case computations on characters whose code points are greater than
-127. Documentation is not yet updated. JIT is not yet updated.
+127.
19. The function for checking UTF-16 validity was returning an incorrect offset
for the start of the error when a high surrogate was not followed by a valid
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/src/pcre2_compile.c 2020-02-26 16:53:39 UTC (rev 1231)
@@ -5565,12 +5565,12 @@
zerofirstcu = firstcu;
zerofirstcuflags = firstcuflags;
- /* For caseless UTF mode, check whether this character has more than
- one other case. If so, generate a special OP_NOTPROP item instead of
+ /* For caseless UTF or UCP mode, check whether this character has more
+ than one other case. If so, generate a special OP_NOTPROP item instead of
OP_NOTI. */
#ifdef SUPPORT_UNICODE
- if (utf && (options & PCRE2_CASELESS) != 0 &&
+ if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
(d = UCD_CASESET(c)) != 0)
{
*code++ = OP_NOTPROP;
@@ -7824,11 +7824,12 @@
NORMAL_CHAR_SET: /* Character is already in meta */
matched_char = TRUE;
- /* For caseless UTF mode, check whether this character has more than one
- other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
+ /* For caseless UTF or UCP mode, check whether this character has more than
+ one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
+ */
#ifdef SUPPORT_UNICODE
- if (utf && (options & PCRE2_CASELESS) != 0)
+ if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
{
uint32_t caseset = UCD_CASESET(meta);
if (caseset != 0)
Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/src/pcre2_match.c 2020-02-26 16:53:39 UTC (rev 1231)
@@ -381,8 +381,12 @@
if (caseless)
{
#if defined SUPPORT_UNICODE
- if ((mb->poptions & PCRE2_UTF) != 0)
+ BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+
+ if (utf || (mb->poptions & PCRE2_UCP) != 0)
{
+ PCRE2_SPTR endptr = p + length;
+
/* Match characters up to the end of the reference. NOTE: the number of
code units matched may differ, because in UTF-8 there are some characters
whose upper and lower case codes have different numbers of bytes. For
@@ -390,16 +394,25 @@
bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
sequence of two of the latter. It is important, therefore, to check the
length along the reference, not along the subject (earlier code did this
- wrong). */
-
- PCRE2_SPTR endptr = p + length;
+ wrong). UCP without UTF uses Unicode properties, but without UTF encoding. */
+
while (p < endptr)
{
uint32_t c, d;
const ucd_record *ur;
if (eptr >= mb->end_subject) return 1; /* Partial match */
- GETCHARINC(c, eptr);
- GETCHARINC(d, p);
+
+ if (utf)
+ {
+ GETCHARINC(c, eptr);
+ GETCHARINC(d, p);
+ }
+ else
+ {
+ c = *eptr++;
+ d = *p++;
+ }
+
ur = GET_UCD(d);
if (c != d && c != (uint32_t)((int)d + ur->other_case))
{
@@ -415,7 +428,7 @@
else
#endif
- /* Not in UTF mode */
+ /* Not in UTF or UCP mode */
{
for (; length > 0; length--)
{
@@ -432,7 +445,8 @@
}
/* In the caseful case, we can just compare the code units, whether or not we
-are in UTF mode. When partial matching, we have to do this unit-by-unit. */
+are in UTF and/or UCP mode. When partial matching, we have to do this unit by
+unit. */
else
{
Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/testdata/testinput12 2020-02-26 16:53:39 UTC (rev 1231)
@@ -530,6 +530,20 @@
/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
X\x{121}Y
+/s/i,ucp
+ \x{17f}
+
+/s/i,utf
+ \x{17f}
+
+/[^s]/i,ucp
+\= Expect no match
+ \x{17f}
+
+/[^s]/i,utf
+\= Expect no match
+ \x{17f}
+
# ----------------------------------------------------
# End of testinput12
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/testdata/testinput5 2020-02-26 16:53:39 UTC (rev 1231)
@@ -2184,4 +2184,7 @@
/(|\xDF)7/caseless,ucp
+/(\xc1)\1/i,ucp
+ \xc1\xe1\=no_jit
+
# End of testinput5
Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/testdata/testoutput12-16 2020-02-26 16:53:39 UTC (rev 1231)
@@ -1761,6 +1761,24 @@
X\x{121}Y
1: >\x{120}<
+/s/i,ucp
+ \x{17f}
+ 0: \x{17f}
+
+/s/i,utf
+ \x{17f}
+ 0: \x{17f}
+
+/[^s]/i,ucp
+\= Expect no match
+ \x{17f}
+No match
+
+/[^s]/i,utf
+\= Expect no match
+ \x{17f}
+No match
+
# ----------------------------------------------------
# End of testinput12
Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/testdata/testoutput12-32 2020-02-26 16:53:39 UTC (rev 1231)
@@ -1759,6 +1759,24 @@
X\x{121}Y
1: >\x{120}<
+/s/i,ucp
+ \x{17f}
+ 0: \x{17f}
+
+/s/i,utf
+ \x{17f}
+ 0: \x{17f}
+
+/[^s]/i,ucp
+\= Expect no match
+ \x{17f}
+No match
+
+/[^s]/i,utf
+\= Expect no match
+ \x{17f}
+No match
+
# ----------------------------------------------------
# End of testinput12
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2020-02-26 10:18:43 UTC (rev 1230)
+++ code/trunk/testdata/testoutput5 2020-02-26 16:53:39 UTC (rev 1231)
@@ -4943,4 +4943,9 @@
/(|\xDF)7/caseless,ucp
+/(\xc1)\1/i,ucp
+ \xc1\xe1\=no_jit
+ 0: \xc1\xe1
+ 1: \xc1
+
# End of testinput5