Revision: 1478
http://vcs.pcre.org/viewvc?view=rev&revision=1478
Author: ph10
Date: 2014-05-27 14:18:31 +0100 (Tue, 27 May 2014)
Log Message:
-----------
Fix a bug in zero-minimum possessive repeated groups that can match an empty string.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput8
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput8
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/ChangeLog 2014-05-27 13:18:31 UTC (rev 1478)
@@ -25,6 +25,21 @@
6. Inserted two casts and changed some ints to size_t in the light of some
reported 64-bit compiler warnings (Bugzilla 1477).
+
+7. Fixed a bug in which zero-minimum possessive groups that could match an
+ empty string sometimes behaved incorrectly in the interpreter (though
+ correctly in the JIT matcher). This pcretest input is an example:
+
+ '\A(?:[^"]++|"(?:[^"]*+|"")*+")++'
+ NON QUOTED "QUOT""ED" AFTER "NOT MATCHED
+
+ The interpreter reported a match of 'NON QUOTED ' only, whereas the JIT
+ matcher and Perl both matched 'NON QUOTED "QUOT""ED" AFTER '. The test
+ for an empty string was breaking the inner loop and carrying on at a lower
+ level, whereas possessive repeated groups should always return to a higher
+ level because they have no backtrack points in them. The empty string test
+ now occurs at the outer level.
Version 8.35 04-April-2014
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/pcre_exec.c 2014-05-27 13:18:31 UTC (rev 1478)
@@ -1167,11 +1167,16 @@
if (rrc == MATCH_KETRPOS)
{
offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
save_capture_last = md->capture_last;
matched_once = TRUE;
mstart = md->start_match_ptr; /* In case \K changed it */
+ if (eptr == md->end_match_ptr) /* Matched an empty string */
+ {
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+ break;
+ }
+ eptr = md->end_match_ptr;
continue;
}
@@ -1241,10 +1246,15 @@
if (rrc == MATCH_KETRPOS)
{
offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
matched_once = TRUE;
mstart = md->start_match_ptr; /* In case \K reset it */
+ if (eptr == md->end_match_ptr) /* Matched an empty string */
+ {
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
+ break;
+ }
+ eptr = md->end_match_ptr;
continue;
}
@@ -1894,7 +1904,7 @@
case OP_KETRMAX:
case OP_KETRPOS:
prev = ecode - GET(ecode, 1);
-
+
/* If this was a group that remembered the subject start, in order to break
infinite repeats of empty string matches, retrieve the subject start from
the chain. Otherwise, set it NULL. */
@@ -1919,7 +1929,7 @@
md->start_match_ptr = mstart;
RRETURN(MATCH_MATCH); /* Sets md->mark */
}
-
+
/* For capturing groups we have to check the group number back at the start
and if necessary complete handling an extraction by setting the offsets and
bumping the high water mark. Whole-pattern recursion is coded as a recurse
@@ -1979,6 +1989,19 @@
}
}
+ /* OP_KETRPOS is a possessive repeating ket. Remember the current position
+ and return MATCH_KETRPOS. This makes it possible to do the repeats one
+ at a time from the outer level, thus saving stack. This must precede the
+ empty string test; for OP_KETRPOS that test is done at the outer level. */
+
+ if (*ecode == OP_KETRPOS)
+ {
+ md->start_match_ptr = mstart; /* In case \K reset it */
+ md->end_match_ptr = eptr;
+ md->end_offset_top = offset_top;
+ RRETURN(MATCH_KETRPOS);
+ }
+
/* For an ordinary non-repeating ket, just continue at this level. This
also happens for a repeating ket if no characters were matched in the
group. This is the forcible breaking of infinite loops as implemented in
@@ -2001,18 +2024,6 @@
break;
}
- /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
- and return the MATCH_KETRPOS. This makes it possible to do the repeats one
- at a time from the outer level, thus saving stack. */
-
- if (*ecode == OP_KETRPOS)
- {
- md->start_match_ptr = mstart; /* In case \K reset it */
- md->end_match_ptr = eptr;
- md->end_offset_top = offset_top;
- RRETURN(MATCH_KETRPOS);
- }
-
/* The normal repeating kets try the rest of the pattern or restart from
the preceding bracket, in the appropriate order. In the second case, we can
use tail recursion to avoid using another stack frame, unless we have an
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testinput1 2014-05-27 13:18:31 UTC (rev 1478)
@@ -5669,4 +5669,13 @@
/(?:x|(?:(xx|yy)+|x|x|x|x|x)|a|a|a)bc/
acb
+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")++\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testinput8
===================================================================
--- code/trunk/testdata/testinput8 2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testinput8 2014-05-27 13:18:31 UTC (rev 1478)
@@ -4831,4 +4831,10 @@
/[ab]{2,}?/
aaaa
+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+
/-- End of testinput8 --/
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testoutput1 2014-05-27 13:18:31 UTC (rev 1478)
@@ -9317,4 +9317,16 @@
acb
No match
+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")++\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testoutput8
===================================================================
--- code/trunk/testdata/testoutput8 2014-05-21 17:53:49 UTC (rev 1477)
+++ code/trunk/testdata/testoutput8 2014-05-27 13:18:31 UTC (rev 1478)
@@ -7777,4 +7777,12 @@
1: aaa
2: aa
+'\A(?:[^\"]++|\"(?:[^\"]*+|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER
+
+'\A(?:[^\"]++|\"(?:[^\"]++|\"\")*+\")++'
+ NON QUOTED \"QUOT\"\"ED\" AFTER \"NOT MATCHED
+ 0: NON QUOTED "QUOT""ED" AFTER
+
/-- End of testinput8 --/