[Pcre-svn] [1171] code/trunk: Optimize certain starting code…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1171] code/trunk: Optimize certain starting code unit bit maps into a single starting code unit.
Revision: 1171
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1171
Author:   ph10
Date:     2019-09-13 18:02:06 +0100 (Fri, 13 Sep 2019)
Log Message:
-----------
Optimize certain starting code unit bit maps into a single starting code unit.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testinput12
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput12-16
    code/trunk/testdata/testoutput12-32
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/ChangeLog    2019-09-13 17:02:06 UTC (rev 1171)
@@ -160,7 +160,17 @@
 set are for the relevant bytes that start the wide characters. This can give a 
 noticeable performance improvement.


+35. If the bitmap of starting code units contains only 1 or 2 bits, replace it
+with a single starting code unit (1 bit) or a caseless single starting code
+unit if the two relevant characters are case-partners. This is particularly
+relevant to the 8-bit library, though it applies to all. It can give a
+performance boost for patterns such as [Ww]ord and (word|WORD). However, this
+optimization doesn't happen if there is a "required" code unit of the same
+value (because the search for a "required" code unit starts at the match start
+for non-unique first code unit patterns, but after a unique first code unit,
+and patterns such as a*a need the former action).

+
Version 10.33 16-April-2019
---------------------------


Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/src/pcre2_study.c    2019-09-13 17:02:06 UTC (rev 1171)
@@ -1666,7 +1666,101 @@
   {
   int rc = set_start_bits(re, code, utf);
   if (rc == SSB_UNKNOWN) return 1;
-  if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
+
+  /* If a list of starting code units was set up, scan the list to see if only
+  one or two were listed. Having only one listed is rare because usually a
+  single starting code unit will have been recognized and PCRE2_FIRSTSET set.
+  If two are listed, see if they are caseless versions of the same character;
+  if so we can replace the list with a caseless first code unit. This gives
+  better performance and is plausibly worth doing for patterns such as [Ww]ord
+  or (word|WORD). */
+
+  if (rc == SSB_DONE)
+    {
+    int i;
+    int a = -1;
+    int b = -1;
+    uint8_t *p = re->start_bitmap;
+    uint32_t flags = PCRE2_FIRSTMAPSET;
+
+    for (i = 0; i < 256; p++, i += 8)
+      {
+      uint8_t x = *p;
+      if (x != 0)
+        {
+        int c;
+        uint8_t y = x & (~x + 1);   /* Least significant bit */
+        if (y != x) goto DONE;      /* More than one bit set */
+
+        /* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
+        all wide characters", so we cannot use it here. */
+
+#if PCRE2_CODE_UNIT_WIDTH != 8
+        if (i == 248 && x == 0x80) goto DONE;
+#endif
+
+        /* Compute the character value */
+
+        c = i;
+        switch (x)
+          {
+          case 1:   break;
+          case 2:   c += 1; break;  case 4:  c += 2; break;
+          case 8:   c += 3; break;  case 16: c += 4; break;
+          case 32:  c += 5; break;  case 64: c += 6; break;
+          case 128: c += 7; break;
+          }
+
+        /* c contains the code unit value, in the range 0-255. In 8-bit UTF
+        mode, only values < 128 can be used. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+        if (c > 127) goto DONE;
+#endif
+        if (a < 0) a = c;   /* First one found */
+        else if (b < 0)     /* Second one found */
+          {
+          int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
+
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+          if (utf && UCD_CASESET(c) != 0) goto DONE;   /* Multiple case set */
+#else   /* 16-bit or 32-bit */
+          if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
+          if (utf && c > 127) d = UCD_OTHERCASE(c);
+#endif  /* Code width */
+#endif  /* SUPPORT_UNICODE */
+
+          if (d != a) goto DONE;   /* Not other case of a */
+          b = c;
+          }
+        else goto DONE;   /* More than two characters found */
+        }
+      }
+
+    /* Replace the start code unit bits with a first code unit, but only if it 
+    is not the same as a required later code unit. This is because a search for
+    a required code unit starts after an explicit first code unit, but at a
+    code unit found from the bitmap. Patterns such as /a*a/ don't work 
+    if both the start unit and required unit are the same. */
+
+    if (a >= 0 && 
+        (
+        (re->flags & PCRE2_LASTSET) == 0 || 
+          (
+          re->last_codeunit != (uint32_t)a && 
+          (b < 0 || re->last_codeunit != (uint32_t)b)
+          )
+        ))
+      {
+      re->first_codeunit = a;
+      flags = PCRE2_FIRSTSET;
+      if (b >= 0) flags |= PCRE2_FIRSTCASELESS;
+      }
+
+    DONE:
+    re->flags |= flags;
+    }
   }


/* Find the minimum length of subject string. If the pattern can match an empty

Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testinput10    2019-09-13 17:02:06 UTC (rev 1171)
@@ -567,4 +567,16 @@


/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf

+/[\xff\x{ffff}]/I,utf
+
+/[\xff\x{ff}]/I,utf
+
+/[\xff\x{ff}]/I
+
+/[Ss]/I
+
+/[Ss]/I,utf
+
+/(?:\x{ff}|\x{3000})/I,utf
+
# End of testinput10

Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testinput12    2019-09-13 17:02:06 UTC (rev 1171)
@@ -451,4 +451,16 @@


/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf

+/[\xff\x{ffff}]/I,utf
+
+/[\xff\x{ff}]/I,utf
+
+/[\xff\x{ff}]/I
+
+/[Ss]/I
+
+/[Ss]/I,utf
+
+/(?:\x{ff}|\x{3000})/I,utf
+
# End of testinput12

Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testinput2    2019-09-13 17:02:06 UTC (rev 1171)
@@ -1625,6 +1625,7 @@
 /^a*A\d/IBi
     aaaA5
     aaaa5
+    a5 


/(a*|b*)[cd]/I

@@ -5760,4 +5761,15 @@

/[aA]b[cC]/IB

+/[cc]abcd/I
+
+/[Cc]abcd/I
+
+/[c]abcd/I
+
+/(?:c|C)abcd/I
+
+/(a)?a/I
+    manm
+
 # End of testinput2


Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testoutput10    2019-09-13 17:02:06 UTC (rev 1171)
@@ -1769,4 +1769,38 @@
 Starting code units: \xef \xf0 \xf1 \xf2 \xf4 
 Subject length lower bound = 1


+/[\xff\x{ffff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xc3 \xef
+Subject length lower bound = 1
+
+/[\xff\x{ff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xc3
+Subject length lower bound = 1
+
+/[\xff\x{ff}]/I
+Capture group count = 0
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[Ss]/I
+Capture group count = 0
+First code unit = 'S' (caseless)
+Subject length lower bound = 1
+
+/[Ss]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: S s
+Subject length lower bound = 1
+
+/(?:\x{ff}|\x{3000})/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xc3 \xe3
+Subject length lower bound = 1
+
# End of testinput10

Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testoutput12-16    2019-09-13 17:02:06 UTC (rev 1171)
@@ -1594,4 +1594,38 @@
 Last code unit = \x{145} (caseless)
 Subject length lower bound = 3


+/[\xff\x{ffff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[\xff\x{ff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[\xff\x{ff}]/I
+Capture group count = 0
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[Ss]/I
+Capture group count = 0
+Starting code units: S s
+Subject length lower bound = 1
+
+/[Ss]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: S s
+Subject length lower bound = 1
+
+/(?:\x{ff}|\x{3000})/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xff
+Subject length lower bound = 1
+
# End of testinput12

Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testoutput12-32    2019-09-13 17:02:06 UTC (rev 1171)
@@ -1592,4 +1592,38 @@
 Last code unit = \x{145} (caseless)
 Subject length lower bound = 3


+/[\xff\x{ffff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[\xff\x{ff}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[\xff\x{ff}]/I
+Capture group count = 0
+Starting code units: \xff
+Subject length lower bound = 1
+
+/[Ss]/I
+Capture group count = 0
+Starting code units: S s
+Subject length lower bound = 1
+
+/[Ss]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: S s
+Subject length lower bound = 1
+
+/(?:\x{ff}|\x{3000})/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xff
+Subject length lower bound = 1
+
# End of testinput12

Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2019-09-10 15:38:42 UTC (rev 1170)
+++ code/trunk/testdata/testoutput2    2019-09-13 17:02:06 UTC (rev 1171)
@@ -816,7 +816,7 @@
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
-Starting code units: a 
+First code unit = 'a'
 Subject length lower bound = 4
 \= Expect no match
     aaaa
@@ -6492,6 +6492,8 @@
  0: aaaA5
     aaaa5
  0: aaaa5
+    a5 
+ 0: a5


/(a*|b*)[cd]/I
Capture group count = 1
@@ -17401,6 +17403,38 @@
Last code unit = 'c' (caseless)
Subject length lower bound = 3

+/[cc]abcd/I
+Capture group count = 0
+First code unit = 'c'
+Last code unit = 'd'
+Subject length lower bound = 5
+
+/[Cc]abcd/I
+Capture group count = 0
+First code unit = 'C' (caseless)
+Last code unit = 'd'
+Subject length lower bound = 5
+
+/[c]abcd/I
+Capture group count = 0
+First code unit = 'c'
+Last code unit = 'd'
+Subject length lower bound = 5
+
+/(?:c|C)abcd/I
+Capture group count = 0
+First code unit = 'C' (caseless)
+Last code unit = 'd'
+Subject length lower bound = 5
+
+/(a)?a/I
+Capture group count = 1
+Starting code units: a 
+Last code unit = 'a'
+Subject length lower bound = 1
+    manm
+ 0: a
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data