[Pcre-svn] [1170] code/trunk: Improve starting-byte bit map for UTF-8 patterns with wide characters in

Author: Subversion repository
Date:
To: pcre-svn
Subject: [Pcre-svn] [1170] code/trunk: Improve starting-byte bit map for UTF-8 patterns with wide characters in

Revision: 1170

          http://www.exim.org/viewvc/pcre2?view=rev&revision=1170
Author:   ph10
Date:     2019-09-10 16:38:42 +0100 (Tue, 10 Sep 2019)
Log Message:
-----------
Improve starting-byte bit map for UTF-8 patterns with wide characters in 
classes.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testoutput10

Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/ChangeLog    2019-09-10 15:38:42 UTC (rev 1170)
@@ -153,7 +153,14 @@
 same character, to be treated as a single caseless character. This causes the 
 first and required code unit optimizations to kick in where relevant.

+34. Improve the bitmap of starting bytes for positive classes that include wide
+characters, but no property types, in UTF-8 mode. Previously, on encountering
+such a class, the bits for all bytes greater than \xc4 were set, thus
+specifying any character with codepoint >= 0x100. Now the only bits that are
+set are for the relevant bytes that start the wide characters. This can give a
+noticeable performance improvement.

+
Version 10.33 16-April-2019
---------------------------

Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/src/pcre2_study.c    2019-09-10 15:38:42 UTC (rev 1170)
@@ -909,7 +909,7 @@

 /*************************************************
-*          Create bitmap of starting bytes       *
+*      Create bitmap of starting code units      *
 *************************************************/

 /* This function scans a compiled unanchored expression recursively and
@@ -959,6 +959,9 @@
     {
     int rc;
     uint8_t *classmap = NULL;
+#ifdef SUPPORT_WIDE_CHARS
+    PCRE2_UCHAR xclassflags;
+#endif

     switch(*tcode)
       {
@@ -1467,20 +1470,59 @@
       negative XCLASS without a map, give up. If there are no property checks,
       there must be wide characters on the XCLASS list, because otherwise an
       XCLASS would not have been created. This means that code points >= 255
-      are always potential starters. */
+      are potential starters. In the UTF-8 case we can scan them and set bits
+      for the relevant leading bytes. */

 #ifdef SUPPORT_WIDE_CHARS
       case OP_XCLASS:
-      if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
-          (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
+      xclassflags = tcode[1 + LINK_SIZE];
+      if ((xclassflags & XCL_HASPROP) != 0 ||
+          (xclassflags & (XCL_MAP|XCL_NOT)) == XCL_NOT)
         return SSB_FAIL;

       /* We have a positive XCLASS or a negative one without a map. Set up the
       map pointer if there is one, and fall through. */

-      classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
+      classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
-#endif
+
+      /* In UTF-8 mode, scan the character list and set bits for leading bytes,
+      then jump to handle the map. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      if (utf && (xclassflags & XCL_NOT) == 0)
+        {
+        PCRE2_UCHAR b, e;
+        PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
+        tcode += GET(tcode, 1);
+
+        for (;;) switch (*p++)
+          {
+          case XCL_SINGLE:
+          b = *p++;
+          while ((*p & 0xc0) == 0x80) p++;
+          re->start_bitmap[b/8] |= (1u << (b&7));
+          break;
+
+          case XCL_RANGE:
+          b = *p++;
+          while ((*p & 0xc0) == 0x80) p++;
+          e = *p++;
+          while ((*p & 0xc0) == 0x80) p++;
+          for (; b <= e; b++)
+            re->start_bitmap[b/8] |= (1u << (b&7));
+          break;
+
+          case XCL_END:
+          goto HANDLE_CLASSMAP;
+
+          default:
+          return SSB_UNKNOWN;   /* Internal error, should not occur */
+          }
+        }
+#endif  /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
+#endif  /* SUPPORT_WIDE_CHARS */
+
       /* It seems that the fall through comment must be outside the #ifdef if
       it is to avoid the gcc compiler warning. */

@@ -1522,6 +1564,9 @@
       greater than 127. In fact, there are only two possible starting bytes for
       characters in the range 128 - 255. */

+#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
+      HANDLE_CLASSMAP:
+#endif
       if (classmap != NULL)
         {
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8

Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/testdata/testinput10    2019-09-10 15:38:42 UTC (rev 1170)
@@ -561,4 +561,10 @@

/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf

+/[󿾟,]/BI,utf
+
+/[\x{fff4}-\x{ffff8}]/I,utf
+
+/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
+
# End of testinput10

Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2019-09-10 13:22:08 UTC (rev 1169)
+++ code/trunk/testdata/testoutput10    2019-09-10 15:38:42 UTC (rev 1170)
@@ -1256,11 +1256,7 @@
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
-Starting code units: Z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd 
-  \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc 
-  \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb 
-  \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa 
-  \xfb \xfc \xfd \xfe \xff 
+Starting code units: Z \xc4 
 Subject length lower bound = 1
     Z\x{100}
  0: Z
@@ -1278,11 +1274,7 @@
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
-Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 
-  \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 
-  \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 
-  \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 
-  \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Starting code units: z { | } ~ \x7f \xc2 \xc3 \xc4 
 Subject length lower bound = 1

 /[z\Qa-d]Ā\E]/IB,utf
@@ -1294,11 +1286,7 @@
 ------------------------------------------------------------------
 Capture group count = 0
 Options: utf
-Starting code units: - ] a d z \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc 
-  \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb 
-  \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea 
-  \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 
-  \xfa \xfb \xfc \xfd \xfe \xff 
+Starting code units: - ] a d z \xc4 
 Subject length lower bound = 1
     \x{100}
  0: \x{100}
@@ -1319,11 +1307,7 @@
 ------------------------------------------------------------------
 Capture group count = 1
 Options: utf
-Starting code units: a b \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd 
-  \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc 
-  \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb 
-  \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa 
-  \xfb \xfc \xfd \xfe \xff 
+Starting code units: a b \xc4 
 Last code unit = 'z'
 Subject length lower bound = 7

@@ -1440,11 +1424,7 @@
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless utf
-Starting code units: \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce 
-  \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd 
-  \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec 
-  \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb 
-  \xfc \xfd \xfe \xff 
+Starting code units: \xc4 
 Subject length lower bound = 1
     \x{104}
  0: \x{104}
@@ -1467,11 +1447,7 @@
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless utf
-Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 
-  \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 
-  \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 
-  \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 
-  \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 
 Subject length lower bound = 1
     Z
  0: Z
@@ -1508,11 +1484,7 @@
 ------------------------------------------------------------------
 Capture group count = 0
 Options: caseless utf
-Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 
-  \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 
-  \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 
-  \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 
-  \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Starting code units: Z z { | } ~ \x7f \xc2 \xc3 \xc4 \xc5 \xce \xe1 \xe2 
 Subject length lower bound = 1

/\x{3a3}B/IBi,utf
@@ -1773,4 +1745,28 @@
Last code unit = 'X'
Subject length lower bound = 3

+/[󿾟,]/BI,utf
+------------------------------------------------------------------
+        Bra
+        [,\x{fff9f}]
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: utf
+Starting code units: , \xf3 
+Subject length lower bound = 1
+
+/[\x{fff4}-\x{ffff8}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xef \xf0 \xf1 \xf2 \xf3 
+Subject length lower bound = 1
+
+/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xef \xf0 \xf1 \xf2 \xf4 
+Subject length lower bound = 1
+
 # End of testinput10

This message is part of the following thread:
	the complete thread tree sorted by date

[Pcre-svn] [1170] code/trunk: Improve starting-byte bit map …