Revision: 500
http://vcs.pcre.org/viewvc?view=rev&revision=500
Author: ph10
Date: 2010-03-06 19:00:29 +0000 (Sat, 06 Mar 2010)
Log Message:
-----------
Fix bugs with \K in atomic groups, subroutines, and assertions.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcrepattern.3
code/trunk/pcre_compile.c
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/testdata/testinput11
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput11
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/ChangeLog 2010-03-06 19:00:29 UTC (rev 500)
@@ -38,6 +38,13 @@
counting zeros. There was no check for running off the end of the string,
which could happen if a new error number was added without updating the
string.
+
+10. \K gave a compile-time error if it appeared in a lookbehind assertion.
+
+11. \K was not working if it appeared in an atomic group or in a group that
+ was called as a "subroutine", or in an assertion. Perl 5.11 documents that
+ \K is "not well defined" if used in an assertion. PCRE now acts on it if
+ the assertion is positive, but ignores it if the assertion is negative.
Version 8.01 19-Jan-2010
Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/doc/pcrepattern.3 2010-03-06 19:00:29 UTC (rev 500)
@@ -737,6 +737,10 @@
(foo)\eKbar
.sp
matches "foobar", the first substring is still set to "foo".
+.P
+Perl documents that the use of \eK within assertions is "not well defined". In
+PCRE, \eK is acted upon when it occurs inside positive assertions, but is
+ignored in negative assertions.
.
.
.\" HTML <a name="smallassertions"></a>
@@ -2453,6 +2457,6 @@
.rs
.sp
.nf
-Last updated: 01 March 2010
+Last updated: 06 March 2010
Copyright (c) 1997-2010 University of Cambridge.
.fi
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/pcre_compile.c 2010-03-06 19:00:29 UTC (rev 500)
@@ -1450,6 +1450,7 @@
case OP_CALLOUT:
case OP_SOD:
case OP_SOM:
+ case OP_SET_SOM:
case OP_EOD:
case OP_EODN:
case OP_CIRC:
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/pcre_exec.c 2010-03-06 19:00:29 UTC (rev 500)
@@ -1070,7 +1070,6 @@
memmove(md->offset_vector, rec->offset_save,
rec->saved_max * sizeof(int));
offset_top = rec->save_offset_top;
- mstart = rec->save_start;
ims = original_ims;
ecode = rec->after_call;
break;
@@ -1114,7 +1113,11 @@
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
RM4);
- if (rrc == MATCH_MATCH) break;
+ if (rrc == MATCH_MATCH)
+ {
+ mstart = md->start_match_ptr; /* In case \K reset it */
+ break;
+ }
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode, 1);
}
@@ -1267,9 +1270,7 @@
memcpy(new_recursive.offset_save, md->offset_vector,
new_recursive.saved_max * sizeof(int));
- new_recursive.save_start = mstart;
new_recursive.save_offset_top = offset_top;
- mstart = eptr;
/* OK, now we can do the recursion. For each top-level alternative we
restore the offset and recursion data. */
@@ -1316,7 +1317,8 @@
a move back into the brackets. Friedl calls these "atomic" subpatterns.
Check the alternative branches in turn - the matching won't pass the KET
for this kind of subpattern. If any one branch matches, we carry on as at
- the end of a normal bracket, leaving the subject pointer. */
+ the end of a normal bracket, leaving the subject pointer, but resetting
+ the start-of-match value in case it was changed by \K. */
case OP_ONCE:
prev = ecode;
@@ -1325,7 +1327,11 @@
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
- if (rrc == MATCH_MATCH) break;
+ if (rrc == MATCH_MATCH)
+ {
+ mstart = md->start_match_ptr;
+ break;
+ }
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode,1);
}
@@ -1444,9 +1450,10 @@
}
else saved_eptr = NULL;
- /* If we are at the end of an assertion group, stop matching and return
- MATCH_MATCH, but record the current high water mark for use by positive
- assertions. Do this also for the "once" (atomic) groups. */
+ /* If we are at the end of an assertion group or an atomic group, stop
+ matching and return MATCH_MATCH, but record the current high water mark for
+ use by positive assertions. We also need to record the match start in case
+ it was changed by \K. */
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
@@ -1454,6 +1461,7 @@
{
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
+ md->start_match_ptr = mstart;
RRETURN(MATCH_MATCH);
}
@@ -1490,7 +1498,6 @@
recursion_info *rec = md->recursive;
DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
md->recursive = rec->prevrec;
- mstart = rec->save_start;
memcpy(md->offset_vector, rec->offset_save,
rec->saved_max * sizeof(int));
offset_top = rec->save_offset_top;
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/pcre_internal.h 2010-03-06 19:00:29 UTC (rev 500)
@@ -1617,7 +1617,6 @@
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
int group_num; /* Number of group that was called */
const uschar *after_call; /* "Return value": points after the call in the expr */
- USPTR save_start; /* Old value of mstart */
int *offset_save; /* Pointer to start of saved offsets */
int saved_max; /* Number of saved offsets */
int save_offset_top; /* Current value of offset_top */
Modified: code/trunk/testdata/testinput11
===================================================================
--- code/trunk/testdata/testinput11 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/testdata/testinput11 2010-03-06 19:00:29 UTC (rev 500)
@@ -357,4 +357,26 @@
/^(?(?!a(*SKIP)b))/
ac
+/(?>a\Kb)/
+ ab
+
+/((?>a\Kb))/
+ ab
+
+/(a\Kb)/
+ ab
+
+/^a\Kcz|ac/
+ ac
+
+/(?>a\Kbz|ab)/
+ ab
+
+/^(?&t)(?(DEFINE)(?<t>a\Kb))$/
+ ab
+
+/^([^()]|\((?1)*\))*$/
+ a(b)c
+ a(b(c)d)e
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/testdata/testinput2 2010-03-06 19:00:29 UTC (rev 500)
@@ -3208,4 +3208,26 @@
/^(?&t)*(?(DEFINE)(?<t>.))$/BZ
+/ -- The first four of these are not in the Perl 5.10 test because Perl
+ documents that the use of \K in assertions is "not well defined". The
+ last is here because Perl gives the match as "b" rather than "ab". I
+ believe this to be a Perl bug. --/
+
+/(?=a\Kb)ab/
+ ab
+
+/(?!a\Kb)ac/
+ ac
+
+/^abc(?<=b\Kc)d/
+ abcd
+
+/^abc(?<!b\Kq)d/
+ abcd
+
+/(?>a\Kb)z|(ab)/
+ ab
+
+/----------------------/
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput11
===================================================================
--- code/trunk/testdata/testoutput11 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/testdata/testoutput11 2010-03-06 19:00:29 UTC (rev 500)
@@ -742,4 +742,38 @@
ac
0:
+/(?>a\Kb)/
+ ab
+ 0: b
+
+/((?>a\Kb))/
+ ab
+ 0: b
+ 1: ab
+
+/(a\Kb)/
+ ab
+ 0: b
+ 1: ab
+
+/^a\Kcz|ac/
+ ac
+ 0: ac
+
+/(?>a\Kbz|ab)/
+ ab
+ 0: ab
+
+/^(?&t)(?(DEFINE)(?<t>a\Kb))$/
+ ab
+ 0: b
+
+/^([^()]|\((?1)*\))*$/
+ a(b)c
+ 0: a(b)c
+ 1: c
+ a(b(c)d)e
+ 0: a(b(c)d)e
+ 1: e
+
/-- End of testinput11 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2010-03-03 20:09:39 UTC (rev 499)
+++ code/trunk/testdata/testoutput2 2010-03-06 19:00:29 UTC (rev 500)
@@ -10637,4 +10637,32 @@
End
------------------------------------------------------------------
+/ -- The first four of these are not in the Perl 5.10 test because Perl
+ documents that the use of \K in assertions is "not well defined". The
+ last is here because Perl gives the match as "b" rather than "ab". I
+ believe this to be a Perl bug. --/
+
+/(?=a\Kb)ab/
+ ab
+ 0: b
+
+/(?!a\Kb)ac/
+ ac
+ 0: ac
+
+/^abc(?<=b\Kc)d/
+ abcd
+ 0: cd
+
+/^abc(?<!b\Kq)d/
+ abcd
+ 0: abcd
+
+/(?>a\Kb)z|(ab)/
+ ab
+ 0: ab
+ 1: ab
+
+/----------------------/
+
/-- End of testinput2 --/