[Pcre-svn] [342] code/trunk: Slight performance improvement …

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [342] code/trunk: Slight performance improvement by using the new OP_ALLANY opcode for cases of the metacharacter "." when DOTALL is set
Revision: 342
          http://vcs.pcre.org/viewvc?view=rev&revision=342
Author:   ph10
Date:     2008-04-20 18:10:13 +0100 (Sun, 20 Apr 2008)


Log Message:
-----------
Slight performance improvement by using the new OP_ALLANY opcode for cases of
the metacharacter "." when DOTALL is set. Also, some tidies consequent upon its
invention.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/HACKING
    code/trunk/pcre_compile.c
    code/trunk/pcre_dfa_exec.c
    code/trunk/pcre_exec.c
    code/trunk/pcre_study.c
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/ChangeLog    2008-04-20 17:10:13 UTC (rev 342)
@@ -86,6 +86,13 @@
     it was being rejected as not supported by pcre_dfa_exec(), even though 
     other assertions are supported. I have made pcre_dfa_exec() support 
     (*FAIL). 
+    
+16. The implementation of 13c above involved the invention of a new opcode,
+    OP_ALLANY, which is like OP_ANY but doesn't check the /s flag. Since /s 
+    cannot be changed at match time, I realized I could make a small 
+    improvement to matching performance by compiling OP_ALLANY instead of 
+    OP_ANY for "." when DOTALL was set, and then removing the runtime tests 
+    on the OP_ANY path. 



Version 7.6 28-Jan-08

Modified: code/trunk/HACKING
===================================================================
--- code/trunk/HACKING    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/HACKING    2008-04-20 17:10:13 UTC (rev 342)
@@ -125,7 +125,8 @@
 These items are all just one byte long


   OP_END                 end of pattern
-  OP_ANY                 match any character
+  OP_ANY                 match any one character other than newline
+  OP_ALLANY              match any one character, including newline
   OP_ANYBYTE             match any single byte, even in UTF-8 mode
   OP_SOD                 match start of data: \A
   OP_SOM,                start of match (subject + offset): \G


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_compile.c    2008-04-20 17:10:13 UTC (rev 342)
@@ -1301,6 +1301,7 @@
     case OP_NOT_WORDCHAR:
     case OP_WORDCHAR:
     case OP_ANY:
+    case OP_ALLANY:
     branchlength++;
     cc++;
     break;
@@ -1679,6 +1680,7 @@
     case OP_NOT_WORDCHAR:
     case OP_WORDCHAR:
     case OP_ANY:
+    case OP_ALLANY: 
     case OP_ANYBYTE:
     case OP_CHAR:
     case OP_CHARNC:
@@ -2665,7 +2667,7 @@
     zerofirstbyte = firstbyte;
     zeroreqbyte = reqbyte;
     previous = code;
-    *code++ = OP_ANY;
+    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
     break;



@@ -5753,14 +5755,14 @@
      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
      }


- /* .* is not anchored unless DOTALL is set and it isn't in brackets that
- are or may be referenced. */
+ /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
+ it isn't in brackets that are or may be referenced. */

    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
-             op == OP_TYPEPOSSTAR) &&
-            (*options & PCRE_DOTALL) != 0)
+             op == OP_TYPEPOSSTAR))
      {
-     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) 
+       return FALSE;
      }


    /* Check for explicit anchoring */


Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_dfa_exec.c    2008-04-20 17:10:13 UTC (rev 342)
@@ -739,7 +739,7 @@


       /*-----------------------------------------------------------------*/
       case OP_ANY:
-      if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
+      if (clen > 0 && !IS_NEWLINE(ptr))
         { ADD_NEW(state_offset + 1, 0); }
       break;


@@ -877,10 +877,7 @@
         {
         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
             (c < 256 &&
-              (d != OP_ANY ||
-               (ims & PCRE_DOTALL) != 0 ||
-               !IS_NEWLINE(ptr)
-              ) &&
+              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
           {
           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
@@ -903,10 +900,7 @@
         {
         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
             (c < 256 &&
-              (d != OP_ANY ||
-               (ims & PCRE_DOTALL) != 0 ||
-               !IS_NEWLINE(ptr)
-              ) &&
+              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
           {
           if (codevalue == OP_TYPEPOSQUERY)
@@ -928,10 +922,7 @@
         {
         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
             (c < 256 &&
-              (d != OP_ANY ||
-               (ims & PCRE_DOTALL) != 0 ||
-               !IS_NEWLINE(ptr)
-              ) &&
+              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
           {
           if (codevalue == OP_TYPEPOSSTAR)
@@ -951,10 +942,7 @@
         {
         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
             (c < 256 &&
-              (d != OP_ANY ||
-               (ims & PCRE_DOTALL) != 0 ||
-               !IS_NEWLINE(ptr)
-              ) &&
+              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
           {
           if (++count >= GET2(code, 1))
@@ -975,10 +963,7 @@
         {
         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
             (c < 256 &&
-              (d != OP_ANY ||
-               (ims & PCRE_DOTALL) != 0 ||
-               !IS_NEWLINE(ptr)
-              ) &&
+              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
           {
           if (codevalue == OP_TYPEPOSUPTO)


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_exec.c    2008-04-20 17:10:13 UTC (rev 342)
@@ -1429,16 +1429,12 @@
     /* Match a single character type; inline for speed */


     case OP_ANY:
-    if ((ims & PCRE_DOTALL) == 0)
-      {
-      if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
-      }
+    if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
     /* Fall through */   


     case OP_ALLANY:
     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
-    if (utf8)
-      while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+    if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
     ecode++;
     break;


@@ -2955,8 +2951,7 @@
         case OP_ANY:
         for (i = 1; i <= min; i++)
           {
-          if (eptr >= md->end_subject ||
-               ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+          if (eptr >= md->end_subject || IS_NEWLINE(eptr))
             RRETURN(MATCH_NOMATCH);
           eptr++;
           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -3180,15 +3175,11 @@
       switch(ctype)
         {
         case OP_ANY:
-        if ((ims & PCRE_DOTALL) == 0)
+        for (i = 1; i <= min; i++)
           {
-          for (i = 1; i <= min; i++)
-            {
-            if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
-            eptr++;
-            }
+          if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+          eptr++;
           }
-        else eptr += min;
         break;


         case OP_ALLANY:
@@ -3449,14 +3440,13 @@
           RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (fi >= max || eptr >= md->end_subject ||
-               (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
-                IS_NEWLINE(eptr)))
+               (ctype == OP_ANY && IS_NEWLINE(eptr)))
             RRETURN(MATCH_NOMATCH);


           GETCHARINC(c, eptr);
           switch(ctype)
             {
-            case OP_ANY:        /* This is the DOTALL case */
+            case OP_ANY:        /* This is the non-NL case */
             case OP_ALLANY: 
             case OP_ANYBYTE:
             break;
@@ -3609,13 +3599,13 @@
           RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (fi >= max || eptr >= md->end_subject ||
-               ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+               (ctype == OP_ANY && IS_NEWLINE(eptr)))
             RRETURN(MATCH_NOMATCH);


           c = *eptr++;
           switch(ctype)
             {
-            case OP_ANY:     /* This is the DOTALL case */
+            case OP_ANY:     /* This is the non-NL case */
             case OP_ALLANY: 
             case OP_ANYBYTE:
             break;
@@ -3870,43 +3860,24 @@
           case OP_ANY:
           if (max < INT_MAX)
             {
-            if ((ims & PCRE_DOTALL) == 0)
+            for (i = min; i < max; i++)
               {
-              for (i = min; i < max; i++)
-                {
-                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
-                eptr++;
-                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
-                }
+              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+              eptr++;
+              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
               }
-            else
-              {
-              for (i = min; i < max; i++)
-                {
-                if (eptr >= md->end_subject) break;
-                eptr++;
-                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
-                }
-              }
             }


           /* Handle unlimited UTF-8 repeat */


           else
             {
-            if ((ims & PCRE_DOTALL) == 0)
+            for (i = min; i < max; i++)
               {
-              for (i = min; i < max; i++)
-                {
-                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
-                eptr++;
-                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
-                }
+              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+              eptr++;
+              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
               }
-            else
-              {
-              eptr = md->end_subject;
-              }
             }
           break;


@@ -4108,16 +4079,12 @@
         switch(ctype)
           {
           case OP_ANY:
-          if ((ims & PCRE_DOTALL) == 0)
+          for (i = min; i < max; i++)
             {
-            for (i = min; i < max; i++)
-              {
-              if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
-              eptr++;
-              }
-            break;
+            if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+            eptr++;
             }
-          /* For DOTALL case, fall through */
+          break;


           case OP_ALLANY:
           case OP_ANYBYTE:


Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_study.c    2008-04-20 17:10:13 UTC (rev 342)
@@ -348,6 +348,7 @@
       switch(tcode[1])
         {
         case OP_ANY:
+        case OP_ALLANY: 
         return SSB_FAIL;


         case OP_NOT_DIGIT:


Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/testdata/testoutput10    2008-04-20 17:10:13 UTC (rev 342)
@@ -21,7 +21,7 @@
 ------------------------------------------------------------------
   0  21 Bra
   3   9 CBra 1
-  8     Any*
+  8     AllAny*
  10     X
  12   6 Alt
  15     ^
@@ -37,7 +37,7 @@
   0  25 Bra
   3   9 Bra
   6  04 Opt
-  8     Any*
+  8     AllAny*
  10     X
  12   8 Alt
  15  04 Opt


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/testdata/testoutput2    2008-04-20 17:10:13 UTC (rev 342)
@@ -1126,7 +1126,7 @@
 /.*X/IDZs
 ------------------------------------------------------------------
         Bra
-        Any*
+        AllAny*
         X
         Ket
         End
@@ -1160,7 +1160,7 @@
 ------------------------------------------------------------------
         Bra
         CBra 1
-        Any*
+        AllAny*
         X
         Alt
         ^
@@ -1179,7 +1179,7 @@
 ------------------------------------------------------------------
         Bra
         CBra 1
-        Any*
+        AllAny*
         X
         Alt
         ^
@@ -1199,7 +1199,7 @@
         Bra
         Bra
      04 Opt
-        Any*
+        AllAny*
         X
         Alt
      04 Opt
@@ -1212,8 +1212,8 @@
 ------------------------------------------------------------------
 Capturing subpattern count = 0
 Partial matching not supported
-No options
-First char at start or follows newline
+Options: anchored
+No first char
 No need char


/\Biss\B/I+