[Pcre-svn] [1097] code/trunk: Tweak limits on "must have" co…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1097] code/trunk: Tweak limits on "must have" code unit searches ( improves some performance).
Revision: 1097
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1097
Author:   ph10
Date:     2019-05-28 17:34:28 +0100 (Tue, 28 May 2019)
Log Message:
-----------
Tweak limits on "must have" code unit searches (improves some performance).


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_dfa_match.c
    code/trunk/src/pcre2_internal.h
    code/trunk/src/pcre2_match.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-05-28 14:14:22 UTC (rev 1096)
+++ code/trunk/ChangeLog    2019-05-28 16:34:28 UTC (rev 1097)
@@ -22,7 +22,10 @@


6. Add support for invalid UTF-8 to pcre2grep.

+7. Adjust the limit for "must have" code unit searching, in particular,
+increase it substantially for non-anchored patterns.

+
Version 10.33 16-April-2019
---------------------------


Modified: code/trunk/src/pcre2_dfa_match.c
===================================================================
--- code/trunk/src/pcre2_dfa_match.c    2019-05-28 14:14:22 UTC (rev 1096)
+++ code/trunk/src/pcre2_dfa_match.c    2019-05-28 16:34:28 UTC (rev 1097)
@@ -3294,10 +3294,10 @@
 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
   return PCRE2_ERROR_BADOPTION;
-  
+
 /* Invalid UTF support is not available for DFA matching. */


-if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
+if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
return PCRE2_ERROR_DFA_UINVALID_UTF;

 /* Check that the first field in the block is the magic number. If it is not,
@@ -3658,7 +3658,7 @@
           while (start_match < end_subject && UCHAR21TEST(start_match) !=
                  first_cu)
             start_match++;
-#else
+#else  /* 8-bit code units */
           start_match = memchr(start_match, first_cu, end_subject - start_match);
           if (start_match == NULL) start_match = end_subject;
 #endif
@@ -3745,6 +3745,8 @@


     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
       {
+      PCRE2_SPTR p;
+
       /* The minimum matching length is a lower bound; no actual string of that
       length may actually match the pattern. Although the value is, strictly,
       in characters, we treat it as code units to avoid spending too much time
@@ -3758,37 +3760,63 @@
       point. This optimization can save a huge amount of backtracking in
       patterns with nested unlimited repeats that aren't going to match.
       Writing separate code for cased/caseless versions makes it go faster, as
-      does using an autoincrement and backing off on a match.
+      does using an autoincrement and backing off on a match. As in the case of
+      the first code unit, using memchr() in the 8-bit library gives a big
+      speed up. Unlike the first_cu check above, we do not need to call
+      memchr() twice in the caseless case because we only need to check for the
+      presence of the character in either case, not find the first occurrence.


+      The search can be skipped if the code unit was found later than the
+      current starting point in a previous iteration of the bumpalong loop.
+
       HOWEVER: when the subject string is very, very long, searching to its end
       can take a long time, and give bad performance on quite ordinary
       patterns. This showed up when somebody was matching something like
       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
-      sufficiently long. */
+      sufficiently long, but it's worth searching a lot more for unanchored
+      patterns. */


-      if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+      p = start_match + (has_first_cu? 1:0);
+      if (has_req_cu && p > req_cu_ptr)
         {
-        PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+        PCRE2_SIZE check_length = end_subject - start_match;


-        /* We don't need to repeat the search if we haven't yet reached the
-        place we found it at last time. */
-
-        if (p > req_cu_ptr)
+        if (check_length < REQ_CU_MAX ||
+              (!anchored && check_length < REQ_CU_MAX * 1000))
           {
-          if (req_cu != req_cu2)
+          if (req_cu != req_cu2)  /* Caseless */
             {
+#if PCRE2_CODE_UNIT_WIDTH != 8
             while (p < end_subject)
               {
               uint32_t pp = UCHAR21INCTEST(p);
               if (pp == req_cu || pp == req_cu2) { p--; break; }
               }
+#else  /* 8-bit code units */
+            PCRE2_SPTR pp = p;
+            p = memchr(pp, req_cu, end_subject - pp);
+            if (p == NULL)
+              {
+              p = memchr(pp, req_cu2, end_subject - pp);
+              if (p == NULL) p = end_subject;
+              }
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
             }
+
+          /* The caseful case */
+
           else
             {
+#if PCRE2_CODE_UNIT_WIDTH != 8
             while (p < end_subject)
               {
               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
               }
+
+#else  /* 8-bit code units */
+            p = memchr(p, req_cu, end_subject - p);
+            if (p == NULL) p = end_subject;
+#endif
             }


           /* If we can't find the required code unit, break the matching loop,


Modified: code/trunk/src/pcre2_internal.h
===================================================================
--- code/trunk/src/pcre2_internal.h    2019-05-28 14:14:22 UTC (rev 1096)
+++ code/trunk/src/pcre2_internal.h    2019-05-28 16:34:28 UTC (rev 1097)
@@ -535,13 +535,14 @@
 #define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */


/* The maximum remaining length of subject we are prepared to search for a
-req_unit match. In 8-bit mode, memchr() is used and is much faster than the
-search loop that has to be used in 16-bit and 32-bit modes. */
+req_unit match from an anchored pattern. In 8-bit mode, memchr() is used and is
+much faster than the search loop that has to be used in 16-bit and 32-bit
+modes. */

 #if PCRE2_CODE_UNIT_WIDTH == 8
-#define REQ_CU_MAX 2000
+#define REQ_CU_MAX       5000
 #else
-#define REQ_CU_MAX 1000
+#define REQ_CU_MAX       2000
 #endif


/* Offsets for the bitmap tables in the cbits set of tables. Each table

Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c    2019-05-28 14:14:22 UTC (rev 1096)
+++ code/trunk/src/pcre2_match.c    2019-05-28 16:34:28 UTC (rev 1097)
@@ -6783,6 +6783,8 @@


     if (!mb->partial)
       {
+      PCRE2_SPTR p;
+
       /* The minimum matching length is a lower bound; no string of that length
       may actually match the pattern. Although the value is, strictly, in
       characters, we treat it as code units to avoid spending too much time in
@@ -6806,60 +6808,57 @@
       memchr() twice in the caseless case because we only need to check for the
       presence of the character in either case, not find the first occurrence.


+      The search can be skipped if the code unit was found later than the
+      current starting point in a previous iteration of the bumpalong loop.
+
       HOWEVER: when the subject string is very, very long, searching to its end
       can take a long time, and give bad performance on quite ordinary
-      patterns. This showed up when somebody was matching something like
-      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
-      sufficiently long. */
+      anchored patterns. This showed up when somebody was matching something
+      like /^\d+C/ on a 32-megabyte string... so we don't do this when the
+      string is sufficiently long, but it's worth searching a lot more for
+      unanchored patterns. */


-      if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+      p = start_match + (has_first_cu? 1:0);
+      if (has_req_cu && p > req_cu_ptr)
         {
-        PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+        PCRE2_SIZE check_length = end_subject - start_match;


-        /* We don't need to repeat the search if we haven't yet reached the
-        place we found it last time round the bumpalong loop. */
-
-        if (p > req_cu_ptr)
+        if (check_length < REQ_CU_MAX ||
+              (!anchored && check_length < REQ_CU_MAX * 1000))
           {
-          if (p < end_subject)
+          if (req_cu != req_cu2)  /* Caseless */
             {
-            if (req_cu != req_cu2)  /* Caseless */
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            while (p < end_subject)
               {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-              do
-                {
-                uint32_t pp = UCHAR21INCTEST(p);
-                if (pp == req_cu || pp == req_cu2) { p--; break; }
-                }
-              while (p < end_subject);
-
+              uint32_t pp = UCHAR21INCTEST(p);
+              if (pp == req_cu || pp == req_cu2) { p--; break; }
+              }
 #else  /* 8-bit code units */
-              PCRE2_SPTR pp = p;
-              p = memchr(pp, req_cu, end_subject - pp);
-              if (p == NULL)
-                {
-                p = memchr(pp, req_cu2, end_subject - pp);
-                if (p == NULL) p = end_subject;
-                }
+            PCRE2_SPTR pp = p;
+            p = memchr(pp, req_cu, end_subject - pp);
+            if (p == NULL)
+              {
+              p = memchr(pp, req_cu2, end_subject - pp);
+              if (p == NULL) p = end_subject;
+              }
 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
-              }
+            }


-            /* The caseful case */
+          /* The caseful case */


-            else
+          else
+            {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            while (p < end_subject)
               {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-              do
-                {
-                if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
-                }
-              while (p < end_subject);
+              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
+              }


 #else  /* 8-bit code units */
-              p = memchr(p, req_cu, end_subject - p);
-              if (p == NULL) p = end_subject;
+            p = memchr(p, req_cu, end_subject - p);
+            if (p == NULL) p = end_subject;
 #endif
-              }
             }


           /* If we can't find the required code unit, break the bumpalong loop,