[Pcre-svn] [1296] code/trunk: Code changes for simpler backt…

トップ ページ
このメッセージを削除
著者: Subversion repository
日付:  
To: pcre-svn
題目: [Pcre-svn] [1296] code/trunk: Code changes for simpler backtracking handling (docs to follow).
Revision: 1296
          http://vcs.pcre.org/viewvc?view=rev&revision=1296
Author:   ph10
Date:     2013-03-19 16:29:12 +0000 (Tue, 19 Mar 2013)


Log Message:
-----------
Code changes for simpler backtracking handling (docs to follow).

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/testdata/testinput1
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput1
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2013-03-17 17:22:10 UTC (rev 1295)
+++ code/trunk/ChangeLog    2013-03-19 16:29:12 UTC (rev 1296)
@@ -110,13 +110,18 @@


30. Update RunTest with additional test selector options.

-31. PCRE has been changed to be more compatible with Perl when there is more
-    than one backtracking verb present. Previously, in something like 
-    (*COMMIT)(*SKIP), COMMIT would override SKIP. Apart from one anomaly (which 
-    has been reported), Perl seems to act on whichever backtracking verb is 
-    reached first, so PCRE has been changed to follow this behaviour.
+31. The way PCRE handles backtracking verbs has been changed in two ways.


+    (1) Previously, in something like (*COMMIT)(*SKIP), COMMIT would override
+    SKIP. Now, PCRE acts on whichever backtracking verb is reached first by
+    backtracking. In some cases this makes it more Perl-compatible, but Perl's
+    rather obscure rules do not always do the same thing.
+    
+    (2) Previously, backtracking verbs were confined within assertions. This is 
+    no longer the case. Again, this sometimes improves Perl compatibility, and 
+    sometimes does not. 


+
Version 8.32 30-November-2012
-----------------------------


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2013-03-17 17:22:10 UTC (rev 1295)
+++ code/trunk/pcre_exec.c    2013-03-19 16:29:12 UTC (rev 1296)
@@ -1603,6 +1603,8 @@
       }
     else condassert = FALSE;


+    /* Loop for each branch */
+     
     do
       {
       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
@@ -1613,18 +1615,28 @@
         }
       md->mark = save_mark;


-      /* A COMMIT failure must fail the entire assertion, without trying any
-      subsequent branches. */
+      /* See comment in the code for capturing groups above about handling
+      THEN. */


-      if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
+      if (rrc == MATCH_THEN)
+        {
+        next = ecode + GET(ecode,1);
+        if (md->start_match_ptr < next &&
+            (*ecode == OP_ALT || *next == OP_ALT))
+          rrc = MATCH_NOMATCH;
+        }
+        
+      /* Anything other than NOMATCH causes the assertion to fail. This 
+      includes COMMIT, SKIP, and PRUNE. However, this consistent approach does 
+      not always have exactly the same effect as in Perl. */


-      /* PCRE does not allow THEN to escape beyond an assertion; it
-      is treated as NOMATCH. */
-
-      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += GET(ecode, 1);
       }
     while (*ecode == OP_ALT);
+    
+    /* If we have tried all the alternative branches, the assertion has
+    failed. */ 


     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);


@@ -1632,17 +1644,16 @@

     if (condassert) RRETURN(MATCH_MATCH);


-    /* Continue from after the assertion, updating the offsets high water
-    mark, since extracts may have been taken during the assertion. */
+    /* Continue from after a successful assertion, updating the offsets high
+    water mark, since extracts may have been taken during the assertion. */


     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
     ecode += 1 + LINK_SIZE;
     offset_top = md->end_offset_top;
     continue;


-    /* Negative assertion: all branches must fail to match. Encountering SKIP,
-    PRUNE, or COMMIT means we must assume failure without checking subsequent
-    branches. */
+    /* Negative assertion: all branches must fail to match for the assertion to 
+    succeed. */


     case OP_ASSERT_NOT:
     case OP_ASSERTBACK_NOT:
@@ -1654,28 +1665,42 @@
       }
     else condassert = FALSE;


+    /* Loop for each alternative branch. */
+     
     do
       {
       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
       md->mark = save_mark;
+      
+      /* A successful match means the assertion has failed. */
+       
       if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
-      if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
+
+      /* See comment in the code for capturing groups above about handling
+      THEN. */
+
+      if (rrc == MATCH_THEN)
         {
-        do ecode += GET(ecode,1); while (*ecode == OP_ALT);
-        break;
+        next = ecode + GET(ecode,1);
+        if (md->start_match_ptr < next &&
+            (*ecode == OP_ALT || *next == OP_ALT))
+          rrc = MATCH_NOMATCH;
         }
+        
+      /* No match on a branch means we must carry on and try the next branch. 
+      Anything else, in particular, SKIP, PRUNE, etc. causes a failure in the 
+      enclosing branch. This is a consistent approach, but does not always have 
+      the same effect as in Perl. */ 


-      /* PCRE does not allow THEN to escape beyond an assertion; it is treated
-      as NOMATCH. */
-
-      if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
+      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
       ecode += GET(ecode,1);
       }
     while (*ecode == OP_ALT);
+    
+    /* All branches in the assertion failed to match. */


     if (condassert) RRETURN(MATCH_MATCH);  /* Condition assertion */
-
-    ecode += 1 + LINK_SIZE;
+    ecode += 1 + LINK_SIZE;                /* Continue with current branch */
     continue;


     /* Move the subject pointer back. This occurs only at the start of


Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1    2013-03-17 17:22:10 UTC (rev 1295)
+++ code/trunk/testdata/testinput1    2013-03-19 16:29:12 UTC (rev 1296)
@@ -4656,16 +4656,10 @@
 /(?<pn> \( ( [^()]++ | (?&pn) )* \) )/x
     (ab(cd)ef)


-/^(?!a(*SKIP)b)/
-    ac
-    
 /^(?=a(*SKIP)b|ac)/
     ** Failers
     ac


-/^(?=a(*THEN)b|ac)/
-    ac
-    
 /^(?=a(*PRUNE)b)/
     ab  
     ** Failers 
@@ -4674,9 +4668,6 @@
 /^(?=a(*ACCEPT)b)/
     ac


-/^(?(?!a(*SKIP)b))/
-    ac
-
 /(?>a\Kb)/
     ab


@@ -4899,33 +4890,15 @@
 /(A (A|B(*ACCEPT)|C) D)(E)/x
     AB


-/\A.*?(?:a|b(*THEN)c)/
-    ba
-
-/\A.*?(?:a|bc)/
-    ba
-
-/\A.*?(a|b(*THEN)c)/
-    ba
-
 /\A.*?(a|bc)/
     ba


-/\A.*?(?:a|b(*THEN)c)++/
-    ba
-
 /\A.*?(?:a|bc)++/
     ba


-/\A.*?(a|b(*THEN)c)++/
-    ba
-
 /\A.*?(a|bc)++/
     ba


-/\A.*?(?:a|b(*THEN)c|d)/
-    ba
-
 /\A.*?(?:a|bc|d)/
     ba


@@ -5253,9 +5226,6 @@
 /(a(*COMMIT)b){0}a(?1)|aac/
     aac


-/(?!a(*COMMIT)b)ac|cd/
-    ac
-
 /((?:a?)*)*c/
   aac   


@@ -5309,9 +5279,6 @@
 /(?:(a(*SKIP)b)){0}(?:(?1)|ac)/
     ac 


-/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
-    aac 
-
 /(?<=(*SKIP)ac)a/
     aa


@@ -5442,4 +5409,39 @@
 /a(*:m)a(*COMMIT)(*SKIP:m)b|a+c/K
     aaaaaac


+/.?(a|b(*THEN)c)/
+    ba
+
+/(a(*COMMIT)b)c|abd/
+    abc
+    abd
+
+/(?=a(*COMMIT)b)abc|abd/
+    abc
+    abd
+
+/(?>a(*COMMIT)b)c|abd/
+    abc
+    abd
+
+/a(?=b(*COMMIT)c)[^d]|abd/
+    abd
+    abc 
+
+/a(?=bc).|abd/
+    abd
+    abc 
+    
+/a(?>b(*COMMIT)c)d|abd/
+    abceabd 
+
+/a(?>bc)d|abd/
+    abceabd 
+
+/(?>a(*COMMIT)b)c|abd/
+    abd
+
+/(?>a(*COMMIT)c)d|abd/
+    abd
+
 /-- End of testinput1 --/


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2013-03-17 17:22:10 UTC (rev 1295)
+++ code/trunk/testdata/testinput2    2013-03-19 16:29:12 UTC (rev 1296)
@@ -3855,5 +3855,66 @@


 /aaaaa(*COMMIT)(*PRUNE)b|a+c/
     aaaaaac
+    
+/-- Here are some that Perl treats differently because of the way it handles
+backtracking verbs. --/


+ /^(?!a(*SKIP)b)/
+     ac
+
+ /^(?!a(*SKIP)b)../
+     acd
+
+/(?!a(*SKIP)b)../
+     acd
+
+/^(?(?!a(*SKIP)b))/
+     ac
+
+/^(?!a(*PRUNE)b)../
+     acd
+
+/(?!a(*PRUNE)b)../
+     acd
+
+ /(?!a(*COMMIT)b)ac|cd/
+     ac
+
+ /(?!a(*COMMIT)b)ac|ad/
+     ac
+     ad 
+
+/^(?!a(*THEN)b|ac)../
+     ac
+     ad 
+
+/^(?=a(*THEN)b|ac)/
+    ac
+    
+/\A.*?(?:a|b(*THEN)c)/
+    ba
+
+/\A.*?(?:a|bc)/
+    ba
+
+/\A.*?(?:a|b(*THEN)c)++/
+    ba
+
+/\A.*?(?:a|b(*THEN)c|d)/
+    ba
+
+/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
+    aac 
+
+/\A.*?(a|b(*THEN)c)/
+    ba
+
+/^(A(*THEN)B|C(*THEN)D)/
+    CD           
+
+/^(A(*THEN)B|A(*THEN)D)/
+    AD           
+
+/-- End of Perl diffences --/ 
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1    2013-03-17 17:22:10 UTC (rev 1295)
+++ code/trunk/testdata/testoutput1    2013-03-19 16:29:12 UTC (rev 1296)
@@ -7770,20 +7770,12 @@
  1: (ab(cd)ef)
  2: ef


-/^(?!a(*SKIP)b)/
-    ac
- 0: 
-    
 /^(?=a(*SKIP)b|ac)/
     ** Failers
 No match
     ac
 No match


-/^(?=a(*THEN)b|ac)/
-    ac
- 0: 
-    
 /^(?=a(*PRUNE)b)/
     ab  
  0: 
@@ -7796,10 +7788,6 @@
     ac
  0: 


-/^(?(?!a(*SKIP)b))/
-    ac
- 0: 
-
 /(?>a\Kb)/
     ab
  0: b
@@ -8169,46 +8157,20 @@
  1: AB
  2: B


-/\A.*?(?:a|b(*THEN)c)/
-    ba
- 0: ba
-
-/\A.*?(?:a|bc)/
-    ba
- 0: ba
-
-/\A.*?(a|b(*THEN)c)/
-    ba
- 0: ba
- 1: a
-
 /\A.*?(a|bc)/
     ba
  0: ba
  1: a


-/\A.*?(?:a|b(*THEN)c)++/
-    ba
- 0: ba
-
 /\A.*?(?:a|bc)++/
     ba
  0: ba


-/\A.*?(a|b(*THEN)c)++/
-    ba
- 0: ba
- 1: a
-
 /\A.*?(a|bc)++/
     ba
  0: ba
  1: a


-/\A.*?(?:a|b(*THEN)c|d)/
-    ba
- 0: ba
-
 /\A.*?(?:a|bc|d)/
     ba
  0: ba
@@ -8719,10 +8681,6 @@
     aac
  0: aac


-/(?!a(*COMMIT)b)ac|cd/
-    ac
- 0: ac
-
 /((?:a?)*)*c/
   aac   
  0: aac
@@ -8803,10 +8761,6 @@
     ac 
  0: ac


-/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
-    aac 
- 0: aac
-
 /(?<=(*SKIP)ac)a/
     aa
 No match
@@ -8967,4 +8921,56 @@
     aaaaaac
  0: ac


+/.?(a|b(*THEN)c)/
+    ba
+ 0: ba
+ 1: a
+
+/(a(*COMMIT)b)c|abd/
+    abc
+ 0: abc
+ 1: ab
+    abd
+No match
+
+/(?=a(*COMMIT)b)abc|abd/
+    abc
+ 0: abc
+    abd
+ 0: abd
+
+/(?>a(*COMMIT)b)c|abd/
+    abc
+ 0: abc
+    abd
+ 0: abd
+
+/a(?=b(*COMMIT)c)[^d]|abd/
+    abd
+No match
+    abc 
+ 0: ab
+
+/a(?=bc).|abd/
+    abd
+ 0: abd
+    abc 
+ 0: ab
+    
+/a(?>b(*COMMIT)c)d|abd/
+    abceabd 
+No match
+
+/a(?>bc)d|abd/
+    abceabd 
+ 0: abd
+
+/(?>a(*COMMIT)b)c|abd/
+    abd
+ 0: abd
+
+/(?>a(*COMMIT)c)d|abd/
+    abd
+No match
+
 /-- End of testinput1 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2013-03-17 17:22:10 UTC (rev 1295)
+++ code/trunk/testdata/testoutput2    2013-03-19 16:29:12 UTC (rev 1296)
@@ -12345,13 +12345,11 @@


 /(?=a(*COMMIT)b|ac)ac|ac/
     ac
- 0: ac
+No match


 /(?=a(*COMMIT)b|(ac)) ac | (a)c/x
     ac
- 0: ac
- 1: <unset>
- 2: a
+No match


 "AB(C(D))(E(F))?(?(?=\2)(?=\4))"
     ABCDGHI\O03
@@ -12648,5 +12646,89 @@
 /aaaaa(*COMMIT)(*PRUNE)b|a+c/
     aaaaaac
  0: aaaac
+    
+/-- Here are some that Perl treats differently because of the way it handles
+backtracking verbs. --/


+ /^(?!a(*SKIP)b)/
+     ac
+No match
+
+ /^(?!a(*SKIP)b)../
+     acd
+No match
+
+/(?!a(*SKIP)b)../
+     acd
+ 0: cd
+
+/^(?(?!a(*SKIP)b))/
+     ac
+No match
+
+/^(?!a(*PRUNE)b)../
+     acd
+No match
+
+/(?!a(*PRUNE)b)../
+     acd
+ 0: cd
+
+ /(?!a(*COMMIT)b)ac|cd/
+     ac
+No match
+
+ /(?!a(*COMMIT)b)ac|ad/
+     ac
+No match
+     ad 
+No match
+
+/^(?!a(*THEN)b|ac)../
+     ac
+No match
+     ad 
+ 0: ad
+
+/^(?=a(*THEN)b|ac)/
+    ac
+ 0: 
+    
+/\A.*?(?:a|b(*THEN)c)/
+    ba
+ 0: ba
+
+/\A.*?(?:a|bc)/
+    ba
+ 0: ba
+
+/\A.*?(?:a|b(*THEN)c)++/
+    ba
+ 0: ba
+
+/\A.*?(?:a|b(*THEN)c|d)/
+    ba
+ 0: ba
+
+/(?:(a(*MARK:X)a+(*SKIP:X)b)){0}(?:(?1)|aac)/
+    aac 
+ 0: aac
+
+/\A.*?(a|b(*THEN)c)/
+    ba
+ 0: ba
+ 1: a
+
+/^(A(*THEN)B|C(*THEN)D)/
+    CD           
+ 0: CD
+ 1: CD
+
+/^(A(*THEN)B|A(*THEN)D)/
+    AD           
+ 0: AD
+ 1: AD
+
+/-- End of Perl diffences --/ 
+
 /-- End of testinput2 --/