Revision: 1185
http://www.exim.org/viewvc/pcre2?view=rev&revision=1185
Author: ph10
Date: 2019-11-16 17:30:07 +0000 (Sat, 16 Nov 2019)
Log Message:
-----------
Fix sometimes failing caseless non-ASCII matching in assertion.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput4
code/trunk/testdata/testoutput4
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2019-11-12 13:10:44 UTC (rev 1184)
+++ code/trunk/ChangeLog 2019-11-16 17:30:07 UTC (rev 1185)
@@ -177,7 +177,12 @@
37. Add NEON vectorization to JIT to speed up matching of first character and
pairs of characters on ARM64 CPUs.
+38. If a non-ASCII character was the first in a starting assertion in a
+caseless match, the "first code unit" optimization did not get the casing
+right, and the assertion failed to match the character in its other case
+when that other-case form did not start with the same code unit.
+
Version 10.33 16-April-2019
---------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2019-11-12 13:10:44 UTC (rev 1184)
+++ code/trunk/src/pcre2_compile.c 2019-11-16 17:30:07 UTC (rev 1185)
@@ -8741,6 +8741,19 @@
case OP_MINPLUSI:
case OP_POSPLUSI:
if (inassert == 0) return 0;
+
+ /* If the character is more than one code unit long, we cannot set its first
+ code unit when matching caselessly, because its other case may start with a
+ different code unit. Later scanning may pick up multiple code units. */
+
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (scode[1] >= 0x80) return 0;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+ if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
+#endif
+#endif
+
if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
else if (c != scode[1]) return 0;
break;
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2019-11-12 13:10:44 UTC (rev 1184)
+++ code/trunk/testdata/testinput4 2019-11-16 17:30:07 UTC (rev 1185)
@@ -2483,4 +2483,12 @@
/\X*/
\xF3aaa\xE4\xEA\xEB\xFEa
+/Я/i,utf
+ \x{42f}
+ \x{44f}
+
+/(?=Я)/i,utf
+ \x{42f}
+ \x{44f}
+
# End of testinput4
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2019-11-12 13:10:44 UTC (rev 1184)
+++ code/trunk/testdata/testoutput4 2019-11-16 17:30:07 UTC (rev 1185)
@@ -4016,4 +4016,16 @@
\xF3aaa\xE4\xEA\xEB\xFEa
0: \xf3aaa\xe4\xea\xeb\xfea
+/Я/i,utf
+ \x{42f}
+ 0: \x{42f}
+ \x{44f}
+ 0: \x{44f}
+
+/(?=Я)/i,utf
+ \x{42f}
+ 0:
+ \x{44f}
+ 0:
+
# End of testinput4