[Pcre-svn] [1168] code/trunk: Optimize classes such as [Aa] …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1168] code/trunk: Optimize classes such as [Aa] to be a single caseless character.
Revision: 1168
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1168
Author:   ph10
Date:     2019-09-09 18:00:19 +0100 (Mon, 09 Sep 2019)
Log Message:
-----------
Optimize classes such as [Aa] to be a single caseless character.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testinput12
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput12-16
    code/trunk/testdata/testoutput12-32
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/ChangeLog    2019-09-09 17:00:19 UTC (rev 1168)
@@ -146,10 +146,14 @@
 31. Installed a .gitignore file on a user's suggestion. When using the svn
 repository with git (through git svn) this helps keep it tidy.


-32. Add underflow check in JIT which may occure when the value of subject
+32. Add underflow check in JIT which may occur when the value of subject
string pointer is close to 0.

+33. Arrange for classes such as [Aa], which contain just the two cases of the
+same character, to be treated as a single caseless character. This causes the
+first and required code unit optimizations to kick in where relevant.

+
Version 10.33 16-April-2019
---------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/src/pcre2_compile.c    2019-09-09 17:00:19 UTC (rev 1168)
@@ -132,7 +132,7 @@
     compile_block *);


 static BOOL
-  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *, 
+  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
     compile_block *);


 static int
@@ -3635,6 +3635,8 @@
       if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
       }     /* End of class-processing loop */


+    /* -] at the end of a class is a literal '-' */
+
     if (class_range_state == RANGE_STARTED)
       {
       parsed_pattern[-1] = CHAR_MINUS;
@@ -5302,6 +5304,7 @@
 BOOL had_accept = FALSE;
 BOOL matched_char = FALSE;
 BOOL previous_matched_char = FALSE;
+BOOL reset_caseful = FALSE;
 const uint8_t *cbits = cb->cbits;
 uint8_t classbits[32];


@@ -5578,8 +5581,38 @@
       }        /* End of 1-char optimization */


     /* Handle character classes that contain more than just one literal
-    character. */
+    character. If there are exactly two characters in a positive class, see if
+    they are case partners. This can be optimized to generate a caseless single
+    character match (which also sets first/required code units if relevant). */


+    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END && 
+        pptr[3] == META_CLASS_END)
+      {
+      uint32_t c = pptr[1];
+
+#ifdef SUPPORT_UNICODE
+      if (UCD_CASESET(c) == 0)
+#endif
+        {
+        uint32_t d = TABLE_GET(c, cb->fcc, c);
+#ifdef SUPPORT_UNICODE
+        if (utf && c > 127) d = UCD_OTHERCASE(c);
+#endif
+        if (c != d && pptr[2] == d)
+          {
+          pptr += 3;                 /* Move on to class end */
+          meta = c;
+          if ((options & PCRE2_CASELESS) == 0)
+            {
+            reset_caseful = TRUE;
+            options |= PCRE2_CASELESS;
+            req_caseopt = REQ_CASELESS;
+            } 
+          goto CLASS_CASELESS_CHAR;
+          }
+        }
+      }
+
     /* If a non-extended class contains a negative special such as \S, we need
     to flip the negation flag at the end, so that support for characters > 255
     works correctly (they are all included in the class). An extended class may
@@ -7818,10 +7851,16 @@
       }
 #endif


-    /* Caseful matches, or not one of the multicase characters. Get the
-    character's code units into mcbuffer, with the length in mclength. When not
-    in UTF mode, the length is always 1. */
+    /* Caseful matches, or caseless and not one of the multicase characters. We
+    come here by goto in the case of a positive class that contains only
+    case-partners of a character with just two cases; matched_char has already
+    been set TRUE and options fudged if necessary. */


+    CLASS_CASELESS_CHAR:
+
+    /* Get the character's code units into mcbuffer, with the length in
+    mclength. When not in UTF mode, the length is always 1. */
+
 #ifdef SUPPORT_UNICODE
     if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
 #endif
@@ -7852,8 +7891,9 @@
       zeroreqcu = reqcu;
       zeroreqcuflags = reqcuflags;


-      /* If the character is more than one code unit long, we can set firstcu
-      only if it is not to be matched caselessly. */
+      /* If the character is more than one code unit long, we can set a single
+      firstcu only if it is not to be matched caselessly. Multiple possible 
+      starting code units may be picked up later in the studying code. */


       if (mclength == 1 || req_caseopt == 0)
         {
@@ -7883,7 +7923,17 @@
         reqcuflags = req_caseopt | cb->req_varyopt;
         }
       }
-    break;    /* End default meta handling */
+
+    /* If caselessness was temporarily instated, reset it. */
+
+    if (reset_caseful)
+      {
+      options &= ~PCRE2_CASELESS;
+      req_caseopt = 0;
+      reset_caseful = FALSE;
+      }
+
+    break;    /* End literal character handling */
     }         /* End of big switch */
   }           /* End of big loop */


@@ -8051,7 +8101,7 @@
     /* If this is not the first branch, the first char and reqcu have to
     match the values from all the previous branches, except that if the
     previous value for reqcu didn't have REQ_VARY set, it can still match,
-    and we set REQ_VARY for the regex. */
+    and we set REQ_VARY for the group from this branch's value. */


     else
       {
@@ -8090,7 +8140,7 @@
       else
         {
         reqcu = branchreqcu;
-        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
+        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
         }
       }
     }
@@ -8938,7 +8988,7 @@
   *pptrptr += 1;   /* Skip META_ALT */
   }


-if (group > 0)
+if (group > 0)
cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
return grouplength;

@@ -9235,7 +9285,7 @@
     in the cache. */


     gptr++;
-    grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group, 
+    grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
       &this_recurse, cb);
     if (grouplength < 0)
       {
@@ -9273,7 +9323,7 @@
     case META_SCRIPT_RUN:
     pptr++;
     CHECK_GROUP:
-    grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group, 
+    grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
       recurses, cb);
     if (grouplength < 0) return -1;
     itemlength = grouplength;
@@ -9372,7 +9422,7 @@
 */


 static BOOL
-set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr, 
+set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
   parsed_recurse_check *recurses, compile_block *cb)
 {
 PCRE2_SIZE offset;
@@ -10329,9 +10379,10 @@
            is_startline(codestart, 0, &cb, 0, FALSE))
     re->flags |= PCRE2_STARTLINE;


- /* Handle the "required code unit", if one is set. We can increment the
- minimum minimum length only if we are sure this really is a different
- character, because the count is in characters, not code units. */
+ /* Handle the "required code unit", if one is set. In the UTF case we can
+ increment the minimum length only if we are sure this really is a
+ different character and not a non-starting code unit of the first character,
+ because the minimum length count is in characters, not code units. */

   if (reqcuflags >= 0)
     {


Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testinput10    2019-09-09 17:00:19 UTC (rev 1168)
@@ -559,4 +559,6 @@


/(*UTF)(?=\x{123})/I

+/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
+
# End of testinput10

Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testinput12    2019-09-09 17:00:19 UTC (rev 1168)
@@ -449,4 +449,6 @@


/(*UTF)(?=\x{123})/I

+/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
+
# End of testinput12

Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testinput2    2019-09-09 17:00:19 UTC (rev 1168)
@@ -5758,4 +5758,6 @@


/(?(VERSION=10.4)b)((?<=b).*)/B

+/[aA]b[cC]/IB
+
# End of testinput2

Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testoutput10    2019-09-09 17:00:19 UTC (rev 1168)
@@ -1766,4 +1766,11 @@
 Last code unit = \xa3
 Subject length lower bound = 1


+/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
+Capture group count = 0
+Options: utf
+Starting code units: \xc3
+Last code unit = 'X'
+Subject length lower bound = 3
+
# End of testinput10

Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testoutput12-16    2019-09-09 17:00:19 UTC (rev 1168)
@@ -1587,4 +1587,11 @@
 First code unit = \x{123}
 Subject length lower bound = 1


+/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
+Capture group count = 0
+Options: utf
+First code unit = \xc1 (caseless)
+Last code unit = \x{145} (caseless)
+Subject length lower bound = 3
+
# End of testinput12

Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testoutput12-32    2019-09-09 17:00:19 UTC (rev 1168)
@@ -1585,4 +1585,11 @@
 First code unit = \x{123}
 Subject length lower bound = 1


+/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
+Capture group count = 0
+Options: utf
+First code unit = \xc1 (caseless)
+Last code unit = \x{145} (caseless)
+Subject length lower bound = 3
+
# End of testinput12

Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2019-09-09 07:12:00 UTC (rev 1167)
+++ code/trunk/testdata/testoutput2    2019-09-09 17:00:19 UTC (rev 1168)
@@ -17387,6 +17387,20 @@
         End
 ------------------------------------------------------------------


+/[aA]b[cC]/IB
+------------------------------------------------------------------
+        Bra
+     /i a
+        b
+     /i c
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+First code unit = 'a' (caseless)
+Last code unit = 'c' (caseless)
+Subject length lower bound = 3
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data