[Pcre-svn] [1311] code/trunk: Use tail recursion in maximizi…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1311] code/trunk: Use tail recursion in maximizing character and character type repetitions, to
Revision: 1311
          http://vcs.pcre.org/viewvc?view=rev&revision=1311
Author:   ph10
Date:     2013-04-22 18:35:23 +0100 (Mon, 22 Apr 2013)


Log Message:
-----------
Use tail recursion in maximizing character and character type repetitions, to
reduce stack usage.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/testdata/testinput1
    code/trunk/testdata/testinput4
    code/trunk/testdata/testoutput1
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput4


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/ChangeLog    2013-04-22 17:35:23 UTC (rev 1311)
@@ -134,6 +134,9 @@


 35. Implement PCRE_NEVER_UTF to lock out the use of UTF, in particular, 
     blocking (*UTF) etc.
+    
+36. In the interpreter, maximizing pattern repetitions for characters and 
+    character types now use tail recursion, which reduces stack usage.



Version 8.32 30-November-2012

Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/pcre_exec.c    2013-04-22 17:35:23 UTC (rev 1311)
@@ -3387,7 +3387,22 @@
     max = rep_max[c];                 /* zero for max => infinity */
     if (max == 0) max = INT_MAX;


-    /* Common code for all repeated single-character matches. */
+    /* Common code for all repeated single-character matches. We first check 
+    for the minimum number of characters. If the minimum equals the maximum, we 
+    are done. Otherwise, if minimizing, check the rest of the pattern for a 
+    match; if there isn't one, advance up to the maximum, one character at a 
+    time.
+    
+    If maximizing, advance up to the maximum number of matching characters, 
+    until eptr is past the end of the maximum run. If possessive, we are
+    then done (no backing up). Otherwise, match at this position; anything
+    other than no match is immediately returned. For nomatch, back up one
+    character, unless we are matching \R and the last thing matched was
+    \r\n, in which case, back up two bytes. When we reach the first optional 
+    character position, we can save stack by doing a tail recurse. 
+    
+    The various UTF/non-UTF and caseful/caseless cases are handled separately,
+    for speed. */


     REPEATCHAR:
 #ifdef SUPPORT_UTF
@@ -3471,13 +3486,13 @@
               }
             }


-          if (possessive) continue;
-
+          if (possessive) continue;    /* No backtracking */
           for(;;)
             {
+            if (eptr == pp) goto TAIL_RECURSE;
             RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
+            /* if (eptr == pp) { RRETURN(MATCH_NOMATCH); } */
 #ifdef SUPPORT_UCP
             eptr--;
             BACKCHAR(eptr);
@@ -3577,10 +3592,10 @@
           eptr++;
           }


-        if (possessive) continue;
-
-        while (eptr >= pp)
+        if (possessive) continue;       /* No backtracking */
+        for (;;)
           {
+          if (eptr == pp) goto TAIL_RECURSE; 
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
           eptr--;
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3635,10 +3650,10 @@
           if (fc != RAWUCHARTEST(eptr)) break;
           eptr++;
           }
-        if (possessive) continue;
-
-        while (eptr >= pp)
+        if (possessive) continue;    /* No backtracking */
+        for (;;)
           {
+          if (eptr == pp) goto TAIL_RECURSE;
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
           eptr--;
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3815,7 +3830,7 @@
           }
         }
       else
-#endif
+#endif  /* SUPPORT_UTF */
       /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
@@ -3853,7 +3868,7 @@
             }
           }
         else
-#endif
+#endif  /*SUPPORT_UTF */
         /* Not UTF mode */
           {
           for (fi = min;; fi++)
@@ -3895,17 +3910,18 @@
             if (fc == d || (unsigned int)foc == d) break;
             eptr += len;
             }
-          if (possessive) continue;
+          if (possessive) continue;    /* No backtracking */
           for(;;)
             {
+            if (eptr == pp) goto TAIL_RECURSE;
             RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (eptr-- == pp) break;        /* Stop if tried at original pos */
+            eptr--;
             BACKCHAR(eptr);
             }
           }
         else
-#endif
+#endif  /* SUPPORT_UTF */
         /* Not UTF mode */
           {
           for (i = min; i < max; i++)
@@ -3918,9 +3934,10 @@
             if (fc == *eptr || foc == *eptr) break;
             eptr++;
             }
-          if (possessive) continue;
-          while (eptr >= pp)
+          if (possessive) continue;    /* No backtracking */
+          for (;;)
             {
+            if (eptr == pp) goto TAIL_RECURSE;
             RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
             eptr--;
@@ -4030,12 +4047,13 @@
             if (fc == d) break;
             eptr += len;
             }
-          if (possessive) continue;
+          if (possessive) continue;    /* No backtracking */
           for(;;)
             {
+            if (eptr == pp) goto TAIL_RECURSE;
             RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-            if (eptr-- == pp) break;        /* Stop if tried at original pos */
+            eptr--;
             BACKCHAR(eptr);
             }
           }
@@ -4053,9 +4071,10 @@
             if (fc == *eptr) break;
             eptr++;
             }
-          if (possessive) continue;
-          while (eptr >= pp)
+          if (possessive) continue;    /* No backtracking */
+          for (;;)
             {
+            if (eptr == pp) goto TAIL_RECURSE; 
             RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
             eptr--;
@@ -5613,12 +5632,13 @@


         /* eptr is now past the end of the maximum run */


-        if (possessive) continue;
+        if (possessive) continue;    /* No backtracking */
         for(;;)
           {
+          if (eptr == pp) goto TAIL_RECURSE;
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (eptr-- == pp) break;        /* Stop if tried at original pos */
+          eptr--;
           if (utf) BACKCHAR(eptr);
           }
         }
@@ -5655,13 +5675,13 @@


         /* eptr is now past the end of the maximum run */


-        if (possessive) continue;
-
+        if (possessive) continue;    /* No backtracking */
         for(;;)
           {
+          if (eptr == pp) goto TAIL_RECURSE;
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (eptr-- == pp) break;        /* Stop if tried at original pos */
+          eptr--; 
           for (;;)                        /* Move back over one extended */
             {
             if (!utf) c = *eptr; else
@@ -5936,18 +5956,13 @@
           RRETURN(PCRE_ERROR_INTERNAL);
           }


-        /* eptr is now past the end of the maximum run. If possessive, we are
-        done (no backing up). Otherwise, match at this position; anything other
-        than no match is immediately returned. For nomatch, back up one
-        character, unless we are matching \R and the last thing matched was
-        \r\n, in which case, back up two bytes. */
-
-        if (possessive) continue;
+        if (possessive) continue;    /* No backtracking */
         for(;;)
           {
+          if (eptr == pp) goto TAIL_RECURSE; 
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
-          if (eptr-- == pp) break;        /* Stop if tried at original pos */
+          eptr--;
           BACKCHAR(eptr);
           if (ctype == OP_ANYNL && eptr > pp  && RAWUCHAR(eptr) == CHAR_NL &&
               RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
@@ -6185,15 +6200,10 @@
           RRETURN(PCRE_ERROR_INTERNAL);
           }


-        /* eptr is now past the end of the maximum run. If possessive, we are
-        done (no backing up). Otherwise, match at this position; anything other
-        than no match is immediately returned. For nomatch, back up one
-        character (byte), unless we are matching \R and the last thing matched
-        was \r\n, in which case, back up two bytes. */
-
-        if (possessive) continue;
-        while (eptr >= pp)
+        if (possessive) continue;    /* No backtracking */
+        for (;;)
           {
+          if (eptr == pp) goto TAIL_RECURSE;
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           eptr--;


Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testinput1    2013-04-22 17:35:23 UTC (rev 1311)
@@ -5588,4 +5588,24 @@
 /(*THEN:m(m)(?&y)(?(DEFINE)(?<y>b))/K
     abc


+/^\d*\w{4}/
+    1234
+    123 
+
+/^[^b]*\w{4}/
+    aaaa
+    aaa     
+
+/^[^b]*\w{4}/i
+    aaaa
+    aaa     
+
+/^a*\w{4}/
+    aaaa
+    aaa     
+
+/^a*\w{4}/i
+    aaaa
+    aaa     
+
 /-- End of testinput1 --/


Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testinput4    2013-04-22 17:35:23 UTC (rev 1311)
@@ -691,4 +691,32 @@
     \x{fdee}
     \x{fdef}


+/^\d*\w{4}/8
+    1234
+    123 
+    
+/^\p{Any}*\d{4}/8
+    1234
+    123 
+ 
+/^\X*\w{4}/8
+    1234
+    123  
+    
+/^[^b]*\w{4}/8
+    aaaa
+    aaa  
+ 
+/^[^b]*\w{4}/8i
+    aaaa
+    aaa  
+ 
+/^\x{100}*.{4}/8
+    \x{100}\x{100}\x{100}\x{100}
+    \x{100}\x{100}\x{100}
+
+/^\x{100}*.{4}/8i
+    \x{100}\x{100}\x{100}\x{100}
+    \x{100}\x{100}\x{100}
+
 /-- End of testinput4 --/


Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testoutput1    2013-04-22 17:35:23 UTC (rev 1311)
@@ -9172,4 +9172,34 @@
  0: b
 MK: m(m


+/^\d*\w{4}/
+    1234
+ 0: 1234
+    123 
+No match
+
+/^[^b]*\w{4}/
+    aaaa
+ 0: aaaa
+    aaa     
+No match
+
+/^[^b]*\w{4}/i
+    aaaa
+ 0: aaaa
+    aaa     
+No match
+
+/^a*\w{4}/
+    aaaa
+ 0: aaaa
+    aaa     
+No match
+
+/^a*\w{4}/i
+    aaaa
+ 0: aaaa
+    aaa     
+No match
+
 /-- End of testinput1 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testoutput2    2013-04-22 17:35:23 UTC (rev 1311)
@@ -4348,7 +4348,7 @@
  1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
   aaaaaaaaaaaaaz\M
 Minimum match() limit = 32768
-Minimum match() recursion limit = 42
+Minimum match() recursion limit = 29
 No match


/(aaa(?C1)bbb|ab)/I

Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4    2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testoutput4    2013-04-22 17:35:23 UTC (rev 1311)
@@ -1227,4 +1227,46 @@
     \x{fdef}
  0: \x{fdef}


+/^\d*\w{4}/8
+    1234
+ 0: 1234
+    123 
+No match
+    
+/^\p{Any}*\d{4}/8
+    1234
+ 0: 1234
+    123 
+No match
+ 
+/^\X*\w{4}/8
+    1234
+ 0: 1234
+    123  
+No match
+    
+/^[^b]*\w{4}/8
+    aaaa
+ 0: aaaa
+    aaa  
+No match
+ 
+/^[^b]*\w{4}/8i
+    aaaa
+ 0: aaaa
+    aaa  
+No match
+ 
+/^\x{100}*.{4}/8
+    \x{100}\x{100}\x{100}\x{100}
+ 0: \x{100}\x{100}\x{100}\x{100}
+    \x{100}\x{100}\x{100}
+No match
+
+/^\x{100}*.{4}/8i
+    \x{100}\x{100}\x{100}\x{100}
+ 0: \x{100}\x{100}\x{100}\x{100}
+    \x{100}\x{100}\x{100}
+No match
+
 /-- End of testinput4 --/