[Pcre-svn] [609] code/trunk: Fix bug with /\A.*?(?:a|b(*THEN…

トップ ページ
このメッセージを削除
著者: Subversion repository
日付:  
To: pcre-svn
題目: [Pcre-svn] [609] code/trunk: Fix bug with /\A.*?(?:a|b(*THEN)c)/ by removing the tail recursion optimization
Revision: 609
          http://vcs.pcre.org/viewvc?view=rev&revision=609
Author:   ph10
Date:     2011-06-15 19:09:23 +0100 (Wed, 15 Jun 2011)


Log Message:
-----------
Fix bug with /\A.*?(?:a|b(*THEN)c)/ by removing the tail recursion optimization
for the final branch. Also fix a similar bug for conditional subpatterns.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/testdata/testinput11
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput11
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/ChangeLog    2011-06-15 18:09:23 UTC (rev 609)
@@ -90,6 +90,13 @@
     empty string, and PCRE_NOTEMPTY was set, pcre_exec() thought the whole
     pattern had matched an empty string, and so incorrectly returned a no
     match.
+    
+17. There was optimizing code for the last branch of non-capturing parentheses,
+    and also for the obeyed branch of a conditional subexpression, which used 
+    tail recursion to cut down on stack usage. Unfortunately, not that there is 
+    the possibility of (*THEN) occurring in these branches, tail recursion is 
+    no longer possible because the return has to be checked for (*THEN). These 
+    two optimizations have therefore been removed.  



Version 8.12 15-Jan-2011

Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/pcre_exec.c    2011-06-15 18:09:23 UTC (rev 609)
@@ -276,7 +276,7 @@
        RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
        RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
        RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
-       RM61,  RM62, RM63, RM64 };
+       RM61,  RM62, RM63};


 /* These versions of the macros use the stack, as normal. There are debugging
 versions and production versions. Note that the "rw" argument of RMATCH isn't
@@ -858,7 +858,7 @@
       md->offset_vector[offset+1] = save_offset2;
       md->offset_vector[md->offset_end - number] = save_offset3;


-      if (rrc != MATCH_THEN) md->mark = markptr;
+      if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
       RRETURN(MATCH_NOMATCH);
       }


@@ -875,34 +875,17 @@

     /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
     for all the alternatives. When we get to the final alternative within the
-    brackets, we would return the result of a recursive call to match()
-    whatever happened. We can reduce stack usage by turning this into a tail
-    recursion, except in the case of a possibly empty group.*/
+    brackets, we used to return the result of a recursive call to match()
+    whatever happened so it was possible to reduce stack usage by turning this
+    into a tail recursion, except in the case of a possibly empty group. 
+    However, now that there is the possiblity of (*THEN) occurring in the final 
+    alternative, this optimization is no longer possible. */


     case OP_BRA:
     case OP_SBRA:
     DPRINTF(("start non-capturing bracket\n"));
     for (;;)
       {
-      if (ecode[GET(ecode, 1)] != OP_ALT)   /* Final alternative */
-        {
-        if (op >= OP_SBRA)   /* Possibly empty group */
-          {
-          md->match_function_type = MATCH_CBEGROUP; 
-          RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb, 
-            RM48);
-          if (rrc == MATCH_NOMATCH) md->mark = markptr;
-          RRETURN(rrc);
-          } 
-        /* Not a possibly empty group; use tail recursion */
-        ecode += _pcre_OP_lengths[*ecode];
-        DPRINTF(("bracket 0 tail recursion\n"));
-        goto TAIL_RECURSE;
-        }
-
-      /* For non-final alternatives, continue the loop for a NOMATCH result;
-      otherwise return. */
-
       if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
       RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb, 
         RM2);
@@ -910,9 +893,12 @@
           (rrc != MATCH_THEN || md->start_match_ptr != ecode))
         RRETURN(rrc);
       ecode += GET(ecode, 1);
+      if (*ecode != OP_ALT) break; 
       }
-    /* Control never reaches here. */


+    if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
+    RRETURN(MATCH_NOMATCH);
+
     /* Handle possessive capturing brackets with an unlimited repeat. We come 
     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
     handled similarly to the normal case above. However, the matching is
@@ -988,7 +974,7 @@
         md->offset_vector[md->offset_end - number] = save_offset3;
         }


-      if (rrc != MATCH_THEN) md->mark = markptr;
+      if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
       if (allow_zero || matched_once)
         { 
         ecode += 1 + LINK_SIZE;
@@ -1026,7 +1012,7 @@
       {
       if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;   
       RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
-        eptrb, RM64);
+        eptrb, RM48);
       if (rrc == MATCH_KETRPOS)
         {
         eptr = md->end_match_ptr;
@@ -1053,8 +1039,7 @@
     /* Conditional group: compilation checked that there are no more than
     two branches. If the condition is false, skipping the first branch takes us
     past the end if there is only one branch, but that's OK because that is
-    exactly what going to the ket would do. As there is only one branch to be
-    obeyed, we can use tail recursion to avoid using another stack frame. */
+    exactly what going to the ket would do. */


     case OP_COND:
     case OP_SCOND:
@@ -1259,20 +1244,20 @@
       }


     /* We are now at the branch that is to be obeyed. As there is only one,
-    we can use tail recursion to avoid using another stack frame, except when
-    we have an unlimited repeat of a possibly empty group. If the second
-    alternative doesn't exist, we can just plough on. */
+    we used to use tail recursion to avoid using another stack frame, except
+    when there was unlimited repeat of a possibly empty group. However, that 
+    strategy no longer works because of the possibilty of (*THEN) being 
+    encountered in the branch. A recursive call to match() is always required,
+    unless the second alternative doesn't exist, in which case we can just
+    plough on. */


     if (condition || *ecode == OP_ALT)
       {
-      ecode += 1 + LINK_SIZE;
-      if (op == OP_SCOND)        /* Possibly empty group */
-        {
-        md->match_function_type = MATCH_CBEGROUP; 
-        RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
-        RRETURN(rrc);
-        }
-      else goto TAIL_RECURSE;
+      if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP; 
+      RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
+      if (rrc == MATCH_THEN && md->start_match_ptr == ecode) 
+        rrc = MATCH_NOMATCH;
+      RRETURN(rrc);
       }
     else                         /* Condition false & no alternative */
       {
@@ -5703,7 +5688,7 @@
   LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
   LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
   LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
-  LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
+  LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
 #ifdef SUPPORT_UTF8
   LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
   LBL(32) LBL(34) LBL(42) LBL(46)


Modified: code/trunk/testdata/testinput11
===================================================================
--- code/trunk/testdata/testinput11    2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testinput11    2011-06-15 18:09:23 UTC (rev 609)
@@ -593,4 +593,34 @@
 /(A (A|B(*ACCEPT)|C) D)(E)/x
     AB


+/\A.*?(?:a|b(*THEN)c)/
+    ba
+
+/\A.*?(?:a|bc)/
+    ba
+
+/\A.*?(a|b(*THEN)c)/
+    ba
+
+/\A.*?(a|bc)/
+    ba
+
+/\A.*?(?:a|b(*THEN)c)++/
+    ba
+
+/\A.*?(?:a|bc)++/
+    ba
+
+/\A.*?(a|b(*THEN)c)++/
+    ba
+
+/\A.*?(a|bc)++/
+    ba
+
+/\A.*?(?:a|b(*THEN)c|d)/
+    ba
+
+/\A.*?(?:a|bc|d)/
+    ba
+
 /-- End of testinput11 --/


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testinput2    2011-06-15 18:09:23 UTC (rev 609)
@@ -3673,6 +3673,12 @@
     c
     c\N 


+/^.*?(?(?=a)a|b(*THEN)c)/
+    ba
+
+/^.*?(?(?=a)a|bc)/
+    ba
+
 /-- --/


/-- End of testinput2 --/

Modified: code/trunk/testdata/testoutput11
===================================================================
--- code/trunk/testdata/testoutput11    2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testoutput11    2011-06-15 18:09:23 UTC (rev 609)
@@ -1142,4 +1142,48 @@
  1: AB
  2: B


+/\A.*?(?:a|b(*THEN)c)/
+    ba
+ 0: ba
+
+/\A.*?(?:a|bc)/
+    ba
+ 0: ba
+
+/\A.*?(a|b(*THEN)c)/
+    ba
+ 0: ba
+ 1: a
+
+/\A.*?(a|bc)/
+    ba
+ 0: ba
+ 1: a
+
+/\A.*?(?:a|b(*THEN)c)++/
+    ba
+ 0: ba
+
+/\A.*?(?:a|bc)++/
+    ba
+ 0: ba
+
+/\A.*?(a|b(*THEN)c)++/
+    ba
+ 0: ba
+ 1: a
+
+/\A.*?(a|bc)++/
+    ba
+ 0: ba
+ 1: a
+
+/\A.*?(?:a|b(*THEN)c|d)/
+    ba
+ 0: ba
+
+/\A.*?(?:a|bc|d)/
+    ba
+ 0: ba
+
 /-- End of testinput11 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2011-06-12 16:25:55 UTC (rev 608)
+++ code/trunk/testdata/testoutput2    2011-06-15 18:09:23 UTC (rev 609)
@@ -4414,12 +4414,12 @@
 Need char = 'z'
   aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\M
 Minimum match() limit = 8
-Minimum match() recursion limit = 6
+Minimum match() recursion limit = 7
  0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
  1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
   aaaaaaaaaaaaaz\M
 Minimum match() limit = 32768
-Minimum match() recursion limit = 42
+Minimum match() recursion limit = 43
 No match


 /(aaa(?C1)bbb|ab)/I
@@ -6411,7 +6411,7 @@
 No need char
    /* this is a C style comment */\M
 Minimum match() limit = 120
-Minimum match() recursion limit = 6
+Minimum match() recursion limit = 35
  0: /* this is a C style comment */
  1: /* this is a C style comment */


@@ -11567,6 +11567,14 @@
     c\N 
  0: c


+/^.*?(?(?=a)a|b(*THEN)c)/
+    ba
+ 0: ba
+
+/^.*?(?(?=a)a|bc)/
+    ba
+ 0: ba
+
 /-- --/


/-- End of testinput2 --/