[Pcre-svn] [1478] code/trunk: Fix empty-matching possessive zero-repeat groups bug.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1478] code/trunk: Fix empty-matching possessive zero-repeat groups bug.
Revision: 1478
          http://vcs.pcre.org/viewvc?view=rev&revision=1478
Author:   ph10
Date:     2014-05-27 14:18:31 +0100 (Tue, 27 May 2014)


Log Message:
-----------
Fix empty-matching possessive zero-repeat groups bug.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/testdata/testinput1
    code/trunk/testdata/testinput8
    code/trunk/testdata/testoutput1
    code/trunk/testdata/testoutput8


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/ChangeLog    2014-05-27 13:18:31 UTC (rev 1478)
@@ -25,6 +25,21 @@


 6.  Inserted two casts and changed some ints to size_t in the light of some
     reported 64-bit compiler warnings (Bugzilla 1477). 
+    
+7.  Fixed a bug concerning zero-minimum possessive groups that could match
+    an empty string; such groups sometimes behaved incorrectly in the
+    interpreter (though correctly in the JIT matcher). This pcretest input is
+    an example:
+    
+      '\A(?:[^"]++|"(?:[^"]*+|"")*+")++'
+      NON QUOTED "QUOT""ED" AFTER "NOT MATCHED
+  
+    The interpreter was reporting a match of 'NON QUOTED ' only, whereas the
+    JIT matcher and Perl both matched 'NON QUOTED "QUOT""ED" AFTER '. The test
+    for an empty string was breaking the inner loop and carrying on at a lower
+    level, when possessive repeated groups should always return to a higher
+    level as they have no backtrack points in them. The empty string test now
+    occurs at the outer level.



Version 8.35 04-April-2014

Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/pcre_exec.c    2014-05-27 13:18:31 UTC (rev 1478)
@@ -1167,11 +1167,16 @@
         if (rrc == MATCH_KETRPOS)
           {
           offset_top = md->end_offset_top;
-          eptr = md->end_match_ptr;
           ecode = md->start_code + code_offset;
           save_capture_last = md->capture_last;
           matched_once = TRUE;
           mstart = md->start_match_ptr;    /* In case \K changed it */
+          if (eptr == md->end_match_ptr)   /* Matched an empty string */
+            {
+            do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+            break;
+            }  
+          eptr = md->end_match_ptr;
           continue;
           }


@@ -1241,10 +1246,15 @@
       if (rrc == MATCH_KETRPOS)
         {
         offset_top = md->end_offset_top;
-        eptr = md->end_match_ptr;
         ecode = md->start_code + code_offset;
         matched_once = TRUE;
         mstart = md->start_match_ptr;   /* In case \K reset it */
+        if (eptr == md->end_match_ptr)  /* Matched an empty string */
+          {
+          do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+          break;
+          }  
+        eptr = md->end_match_ptr;
         continue;
         }


@@ -1894,7 +1904,7 @@
     case OP_KETRMAX:
     case OP_KETRPOS:
     prev = ecode - GET(ecode, 1);
-
+    
     /* If this was a group that remembered the subject start, in order to break
     infinite repeats of empty string matches, retrieve the subject start from
     the chain. Otherwise, set it NULL. */
@@ -1919,7 +1929,7 @@
       md->start_match_ptr = mstart;
       RRETURN(MATCH_MATCH);         /* Sets md->mark */
       }
-
+      
     /* For capturing groups we have to check the group number back at the start
     and if necessary complete handling an extraction by setting the offsets and
     bumping the high water mark. Whole-pattern recursion is coded as a recurse
@@ -1979,6 +1989,19 @@
         }
       }


+    /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
+    and return the MATCH_KETRPOS. This makes it possible to do the repeats one
+    at a time from the outer level, thus saving stack. This must precede the 
+    empty string test - in this case that test is done at the outer level. */
+
+    if (*ecode == OP_KETRPOS)
+      {
+      md->start_match_ptr = mstart;    /* In case \K reset it */
+      md->end_match_ptr = eptr;
+      md->end_offset_top = offset_top;
+      RRETURN(MATCH_KETRPOS);
+      }
+
     /* For an ordinary non-repeating ket, just continue at this level. This
     also happens for a repeating ket if no characters were matched in the
     group. This is the forcible breaking of infinite loops as implemented in
@@ -2001,18 +2024,6 @@
       break;
       }


-    /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
-    and return the MATCH_KETRPOS. This makes it possible to do the repeats one
-    at a time from the outer level, thus saving stack. */
-
-    if (*ecode == OP_KETRPOS)
-      {
-      md->start_match_ptr = mstart;    /* In case \K reset it */
-      md->end_match_ptr = eptr;
-      md->end_offset_top = offset_top;
-      RRETURN(MATCH_KETRPOS);
-      }
-
     /* The normal repeating kets try the rest of the pattern or restart from
     the preceding bracket, in the appropriate order. In the second case, we can
     use tail recursion to avoid using another stack frame, unless we have an


Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1    2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testinput1    2014-05-27 13:18:31 UTC (rev 1478)
@@ -5669,4 +5669,13 @@
 /(?:x|(?:(xx|yy)+|x|x|x|x|x)|a|a|a)bc/
     acb


+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")++\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
 /-- End of testinput1 --/


Modified: code/trunk/testdata/testinput8
===================================================================
--- code/trunk/testdata/testinput8    2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testinput8    2014-05-27 13:18:31 UTC (rev 1478)
@@ -4831,4 +4831,10 @@
 /[ab]{2,}?/
     aaaa    


+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
 /-- End of testinput8 --/


Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1    2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testoutput1    2014-05-27 13:18:31 UTC (rev 1478)
@@ -9317,4 +9317,16 @@
     acb
 No match


+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER 
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER 
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")++\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER 
+
 /-- End of testinput1 --/


Modified: code/trunk/testdata/testoutput8
===================================================================
--- code/trunk/testdata/testoutput8    2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testoutput8    2014-05-27 13:18:31 UTC (rev 1478)
@@ -7777,4 +7777,12 @@
  1: aaa
  2: aa


+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER 
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+    NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER 
+
 /-- End of testinput8 --/