Revision: 1170
http://www.exim.org/viewvc/pcre2?view=rev&revision=1170
Author: ph10
Date: 2019-09-10 16:38:42 +0100 (Tue, 10 Sep 2019)
Log Message:
-----------
Improve starting-byte bit map for UTF-8 patterns with wide characters in
classes.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_study.c
code/trunk/testdata/testinput10
code/trunk/testdata/testoutput10
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/ChangeLog 2019-09-10 15:38:42 UTC (rev 1170)
@@ -153,7 +153,14 @@
same character, to be treated as a single caseless character. This causes the
first and required code unit optimizations to kick in where relevant.
+34. Improve the bitmap of starting bytes for positive classes that include wide
+characters, but no property types, in UTF-8 mode. Previously, on encountering
+such a class, the bits for all bytes greater than \xc4 were set, thus
+specifying any character with codepoint >= 0x100. Now the only bits that are
+set are for the relevant bytes that start the wide characters. This can give a
+noticeable performance improvement.
+
Version 10.33 16-April-2019
---------------------------
Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c 2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/src/pcre2_study.c 2019-09-10 15:38:42 UTC (rev 1170)
@@ -909,7 +909,7 @@
/*************************************************
-* Create bitmap of starting bytes *
+* Create bitmap of starting code units *
*************************************************/
/* This function scans a compiled unanchored expression recursively and
@@ -959,6 +959,9 @@
{
int rc;
uint8_t *classmap = NULL;
+#ifdef SUPPORT_WIDE_CHARS
+ PCRE2_UCHAR xclassflags;
+#endif
switch(*tcode)
{
@@ -1467,20 +1470,59 @@
negative XCLASS without a map, give up. If there are no property checks,
there must be wide characters on the XCLASS list, because otherwise an
XCLASS would not have been created. This means that code points >= 255
- are always potential starters. */
+ are potential starters. In the UTF-8 case we can scan them and set bits
+ for the relevant leading bytes. */
#ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS:
- if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
- (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
+ xclassflags = tcode[1 + LINK_SIZE];
+ if ((xclassflags & XCL_HASPROP) != 0 ||
+ (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
return SSB_FAIL;
/* We have a positive XCLASS or a negative one without a map. Set up the
map pointer if there is one, and fall through. */
- classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
+ classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
-#endif
+
+ /* In UTF-8 mode, scan the character list and set bits for leading bytes,
+ then jump to handle the map. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (utf && (xclassflags & XCL_NOT) == 0)
+ {
+ PCRE2_UCHAR b, e;
+ PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
+ tcode += GET(tcode, 1);
+
+ for (;;) switch (*p++)
+ {
+ case XCL_SINGLE:
+ b = *p++;
+ while ((*p & 0xc0) == 0x80) p++;
+ re->start_bitmap[b/8] |= (1u << (b&7));
+ break;
+
+ case XCL_RANGE:
+ b = *p++;
+ while ((*p & 0xc0) == 0x80) p++;
+ e = *p++;
+ while ((*p & 0xc0) == 0x80) p++;
+ for (; b <= e; b++)
+ re->start_bitmap[b/8] |= (1u << (b&7));
+ break;
+
+ case XCL_END:
+ goto HANDLE_CLASSMAP;
+
+ default:
+ return SSB_UNKNOWN; /* Internal error, should not occur */
+ }
+ }
+#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
+#endif /* SUPPORT_WIDE_CHARS */
+
/* It seems that the fall through comment must be outside the #ifdef if
it is to avoid the gcc compiler warning. */
@@ -1522,6 +1564,9 @@
greater than 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
+#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
+ HANDLE_CLASSMAP:
+#endif
if (classmap != NULL)
{
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10 2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/testdata/testinput10 2019-09-10 15:38:42 UTC (rev 1170)
@@ -561,4 +561,10 @@
/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
+/[,]/BI,utf
+
+/[\x{fff4}-\x{ffff8}]/I,utf
+
+/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
+
# End of testinput10
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/testdata/testoutput10 2019-09-10 15:38:42 UTC (rev 1170)
@@ -1256,11 +1256,7 @@
------------------------------------------------------------------
Capture group count = 0
Options: utf
-Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
- \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
- \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
- \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
- \xfb \xfc \xfd \xfe \xff
+Starting code units: Z \xc4
Subject length lower bound = 1
Z\x{100}
0: Z
@@ -1278,11 +1274,7 @@
------------------------------------------------------------------
Capture group count = 0
Options: utf
-Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9
- \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8
- \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7
- \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6
- \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
+Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4
Subject length lower bound = 1
/[z\Qa-d]Ā\E]/IB,utf
@@ -1294,11 +1286,7 @@
------------------------------------------------------------------
Capture group count = 0
Options: utf
-Starting code units: - ] a d z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc
- \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb
- \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea
- \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9
- \xfa \xfb \xfc \xfd \xfe \xff
+Starting code units: - ] a d z \xc4
Subject length lower bound = 1
\x{100}
0: \x{100}
@@ -1319,11 +1307,7 @@
------------------------------------------------------------------
Capture group count = 1
Options: utf
-Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
- \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
- \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
- \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
- \xfb \xfc \xfd \xfe \xff
+Starting code units: a b \xc4
Last code unit = 'z'
Subject length lower bound = 7
@@ -1440,11 +1424,7 @@
------------------------------------------------------------------
Capture group count = 0
Options: caseless utf
-Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce
- \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd
- \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec
- \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb
- \xfc \xfd \xfe \xff
+Starting code units: \xc4
Subject length lower bound = 1
\x{104}
0: \x{104}
@@ -1467,11 +1447,7 @@
------------------------------------------------------------------
Capture group count = 0
Options: caseless utf
-Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8
- \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
- \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
- \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
- \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
+Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
Subject length lower bound = 1
Z
0: Z
@@ -1508,11 +1484,7 @@
------------------------------------------------------------------
Capture group count = 0
Options: caseless utf
-Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8
- \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7
- \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6
- \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5
- \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
+Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2
Subject length lower bound = 1
/\x{3a3}B/IBi,utf
@@ -1773,4 +1745,28 @@
Last code unit = 'X'
Subject length lower bound = 3
+/[,]/BI,utf
+------------------------------------------------------------------
+ Bra
+ [,\x{fff9f}]
+ Ket
+ End
+------------------------------------------------------------------
+Capture group count = 0
+Options: utf
+Starting code units: , \xf3
+Subject length lower bound = 1
+
+/[\x{fff4}-\x{ffff8}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xef \xf0 \xf1 \xf2 \xf3
+Subject length lower bound = 1
+
+/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xef \xf0 \xf1 \xf2 \xf4
+Subject length lower bound = 1
+
# End of testinput10