Revision: 1248
http://vcs.pcre.org/viewvc?view=rev&revision=1248
Author: ph10
Date: 2013-02-13 17:36:38 +0000 (Wed, 13 Feb 2013)
Log Message:
-----------
Fix various save/restore cases for capture_last (after a backtrack and in
recursion), and also don't diagnose capture vector overflow when the
overflowing capture was on a path that has since been abandoned.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcrecallout.3
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/pcretest.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/ChangeLog 2013-02-13 17:36:38 UTC (rev 1248)
@@ -45,7 +45,19 @@
WARNING: Callouts are not release ready! <- this line needs to be removed when it is.
+11. The value of capture_last that is passed to callouts was incorrect in some
+ cases when there was a capture on one path that was subsequently abandoned
+ after a backtrack. Also, the capture_last value is now reset after a
+ recursion, since all captures are also reset in this case.
+
+12. The interpreter no longer returns the "too many substrings" error in the
+ case when an overflowing capture is in a branch that is subsequently
+ abandoned after a backtrack.
+
+13. In the pathological case when an offset vector of size 2 is used, pcretest
+ now prints out the matched string after a yield of 0 or 1.
+
Version 8.32 30-November-2012
-----------------------------
Modified: code/trunk/doc/pcrecallout.3
===================================================================
--- code/trunk/doc/pcrecallout.3 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/doc/pcrecallout.3 2013-02-13 17:36:38 UTC (rev 1248)
@@ -1,4 +1,4 @@
-.TH PCRECALLOUT 3 "24 June 2012" "PCRE 8.30"
+.TH PCRECALLOUT 3 "13 January 2013" "PCRE 8.33"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH SYNOPSIS
@@ -49,10 +49,6 @@
command has an option that sets automatic callouts; when it is used, the output
indicates how the pattern is matched. This is useful information when you are
trying to optimize the performance of a particular pattern.
-.P
-The use of callouts in a pattern makes it ineligible for optimization by the
-just-in-time compiler. Studying such a pattern with the PCRE_STUDY_JIT_COMPILE
-option always fails.
.
.
.SH "MISSING CALLOUTS"
@@ -144,8 +140,10 @@
functions are used, because they do not support captured substrings.
.P
The \fIcapture_last\fP field contains the number of the most recently captured
-substring. If no substrings have been captured, its value is -1. This is always
-the case for the DFA matching functions.
+substring. However, when a recursion exits, the value reverts to what it was
+outside the recursion, as do the values of all captured substrings. If no
+substrings have been captured, the value of \fIcapture_last\fP is -1. This is
+always the case for the DFA matching functions.
.P
The \fIcallout_data\fP field contains a value that is passed to a matching
function specifically so that it can be passed back in callouts. It is passed
@@ -209,6 +207,6 @@
.rs
.sp
.nf
-Last updated: 24 June 2012
-Copyright (c) 1997-2012 University of Cambridge.
+Last updated: 13 January 2013
+Copyright (c) 1997-2013 University of Cambridge.
.fi
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/pcre_exec.c 2013-02-13 17:36:38 UTC (rev 1248)
@@ -56,6 +56,20 @@
#undef min
#undef max
+/* The md->capture_last field uses the lower 16 bits for the last captured
+substring (which can never be greater than 65535) and a bit in the top half
+to mean "capture vector overflowed". This odd way of doing things was
+implemented when it was realized that preserving and restoring the overflow bit
+whenever the last capture number was saved/restored made for a neater
+interface, and doing it this way saved on (a) another variable, which would
+have increased the stack frame size (a big NO-NO in PCRE) and (b) another
+separate set of save/restore instructions. The following defines are used in
+implementing this. */
+
+#define CAPLMASK 0x0000ffff /* The bits used for the last capture number */
+#define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
+#define OVFLBIT 0x00010000 /* The bit that is set for overflow */
+
/* Values for setting in md->match_function_type to indicate two special types
of call to match(). We do it this way to save on using another stack variable,
as stack usage is to be discouraged. */
@@ -419,7 +433,7 @@
unsigned int Xnumber;
int Xoffset;
unsigned int Xop;
- int Xsave_capture_last;
+ pcre_int32 Xsave_capture_last;
int Xsave_offset1, Xsave_offset2, Xsave_offset3;
int Xstacksave[REC_STACK_SAVE_MAX];
@@ -635,7 +649,7 @@
unsigned int number;
int offset;
unsigned int op;
-int save_capture_last;
+pcre_int32 save_capture_last;
int save_offset1, save_offset2, save_offset3;
int stacksave[REC_STACK_SAVE_MAX];
@@ -1066,6 +1080,7 @@
/* In all other cases, we have to make another call to match(). */
save_mark = md->mark;
+ save_capture_last = md->capture_last;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
RM2);
@@ -1097,6 +1112,7 @@
ecode += GET(ecode, 1);
md->mark = save_mark;
if (*ecode != OP_ALT) break;
+ md->capture_last = save_capture_last;
}
RRETURN(MATCH_NOMATCH);
@@ -1218,6 +1234,7 @@
POSSESSIVE_NON_CAPTURE:
matched_once = FALSE;
code_offset = (int)(ecode - md->start_code);
+ save_capture_last = md->capture_last;
for (;;)
{
@@ -1247,6 +1264,7 @@
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
+ md->capture_last = save_capture_last;
}
if (matched_once || allow_zero)
@@ -1291,7 +1309,9 @@
cb.pattern_position = GET(ecode, LINK_SIZE + 3);
cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
+ cb.capture_last = md->capture_last & CAPLMASK;
+ /* Internal change requires this for API compatibility. */
+ if (cb.capture_last == 0) cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = md->nomatch_mark;
if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
@@ -1513,7 +1533,7 @@
to close any currently open capturing brackets. */
case OP_CLOSE:
- number = GET2(ecode, 1);
+ number = GET2(ecode, 1); /* Must be less than 65536 */
offset = number << 1;
#ifdef PCRE_DEBUG
@@ -1521,8 +1541,8 @@
printf("\n");
#endif
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ md->capture_last = (md->capture_last & OVFLMASK) | number;
+ if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
@@ -1716,7 +1736,9 @@
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
+ cb.capture_last = md->capture_last & CAPLMASK;
+ /* Internal change requires this for API compatibility. */
+ if (cb.capture_last == 0) cb.capture_last = -1;
cb.callout_data = md->callout_data;
cb.mark = md->nomatch_mark;
if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
@@ -1762,6 +1784,7 @@
/* Add to "recursing stack" */
new_recursive.group_num = recno;
+ new_recursive.saved_capture_last = md->capture_last;
new_recursive.subject_position = eptr;
new_recursive.prevrec = md->recursive;
md->recursive = &new_recursive;
@@ -1785,8 +1808,9 @@
new_recursive.saved_max * sizeof(int));
/* OK, now we can do the recursion. After processing each alternative,
- restore the offset data. If there were nested recursions, md->recursive
- might be changed, so reset it before looping. */
+ restore the offset data and the last captured value. If there were nested
+ recursions, md->recursive might be changed, so reset it before looping.
+ */
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
cbegroup = (*callpat >= OP_SBRA);
@@ -1797,6 +1821,7 @@
md, eptrb, RM6);
memcpy(md->offset_vector, new_recursive.offset_save,
new_recursive.saved_max * sizeof(int));
+ md->capture_last = new_recursive.saved_capture_last;
md->recursive = new_recursive.prevrec;
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
{
@@ -1947,8 +1972,8 @@
/* Deal with capturing */
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
+ md->capture_last = (md->capture_last & OVFLMASK) | number;
+ if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
{
/* If offset is greater than offset_top, it means that we are
"skipping" a capturing group, and that group's offsets must be marked
@@ -6539,11 +6564,9 @@
DPRINTF(("Got memory to hold back references\n"));
}
else md->offset_vector = offsets;
-
md->offset_end = ocount;
md->offset_max = (2*ocount)/3;
-md->offset_overflow = FALSE;
-md->capture_last = -1;
+md->capture_last = 0;
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
@@ -6940,7 +6963,7 @@
(arg_offset_max - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
- if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
+ if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
DPRINTF(("Freeing temporary memory\n"));
(PUBL(free))(md->offset_vector);
}
@@ -6948,7 +6971,8 @@
/* Set the return code to the number of captured strings, or 0 if there were
too many to fit into the vector. */
- rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
+ rc = ((md->capture_last & OVFLBIT) != 0 &&
+ md->end_offset_top >= arg_offset_max)?
0 : md->end_offset_top/2;
/* If there is space in the offset vector, set any unused pairs at the end of
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/pcre_internal.h 2013-02-13 17:36:38 UTC (rev 1248)
@@ -2431,6 +2431,7 @@
unsigned int group_num; /* Number of group that was called */
int *offset_save; /* Pointer to start of saved offsets */
int saved_max; /* Number of saved offsets */
+ int saved_capture_last; /* Last capture number */
PCRE_PUCHAR subject_position; /* Position at start of recursion */
} recursion_info;
@@ -2472,7 +2473,6 @@
const pcre_uint8 *lcc; /* Points to lower casing table */
const pcre_uint8 *fcc; /* Points to case-flipping table */
const pcre_uint8 *ctypes; /* Points to table of type maps */
- BOOL offset_overflow; /* Set if too many extractions */
BOOL notbol; /* NOTBOL flag */
BOOL noteol; /* NOTEOL flag */
BOOL utf; /* UTF-8 / UTF-16 flag */
@@ -2493,7 +2493,7 @@
PCRE_PUCHAR start_used_ptr; /* Earliest consulted character */
int partial; /* PARTIAL options */
int end_offset_top; /* Highwater mark at end of match */
- int capture_last; /* Most recent capture number */
+ pcre_int32 capture_last; /* Most recent capture number + overflow flag */
int start_offset; /* The start offset value */
int match_function_type; /* Set for certain special calls of MATCH() */
eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/pcretest.c 2013-02-13 17:36:38 UTC (rev 1248)
@@ -5029,7 +5029,8 @@
if (count == 0)
{
fprintf(outfile, "Matched, but too many substrings\n");
- count = use_size_offsets/3;
+ /* 2 is a special case; match can be returned */
+ count = (use_size_offsets == 2)? 1 : use_size_offsets/3;
}
}
@@ -5043,7 +5044,8 @@
#if !defined NODFA
if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else
#endif
- maxcount = use_size_offsets/3;
+ /* 2 is a special case; match can be returned */
+ maxcount = (use_size_offsets == 2)? 1 : use_size_offsets/3;
/* This is a check against a lunatic return value. */
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/testdata/testinput2 2013-02-13 17:36:38 UTC (rev 1248)
@@ -3810,4 +3810,31 @@
/.?/S!I
+/(?:(a)+(?C1)bb|aa(?C2)b)/
+ aab\C+
+
+/(?:(a)++(?C1)bb|aa(?C2)b)/
+ aab\C+
+
+/(?:(?>(a))(?C1)bb|aa(?C2)b)/
+ aab\C+
+
+/(?:(?1)(?C1)x|ab(?C2))((a)){0}/
+ aab\C+
+
+/(?1)(?C1)((a)(?C2)){0}/
+ aab\C+
+
+/(?:(a)+(?C1)bb|aa(?C2)b)++/
+ aab\C+
+ aab\C+\O2
+
+/(ab)x|ab/
+ ab\O3
+ ab\O2
+
+/(ab)/
+ ab\O3
+ ab\O2
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2013-02-11 21:37:46 UTC (rev 1247)
+++ code/trunk/testdata/testoutput2 2013-02-13 17:36:38 UTC (rev 1248)
@@ -11319,7 +11319,6 @@
/(a)b|ac/++SS
ac\O3
-Matched, but too many substrings
0: ac
0+
@@ -12481,4 +12480,124 @@
Subject length lower bound = -1
No set of starting bytes
+/(?:(a)+(?C1)bb|aa(?C2)b)/
+ aab\C+
+Callout 1: last capture = 1
+ 0: <unset>
+ 1: a
+--->aab
+ ^ ^ b
+Callout 1: last capture = 1
+ 0: <unset>
+ 1: a
+--->aab
+ ^^ b
+Callout 2: last capture = -1
+ 0: <unset>
+--->aab
+ ^ ^ b
+ 0: aab
+
+/(?:(a)++(?C1)bb|aa(?C2)b)/
+ aab\C+
+Callout 1: last capture = 1
+ 0: <unset>
+ 1: a
+--->aab
+ ^ ^ b
+Callout 2: last capture = -1
+ 0: <unset>
+--->aab
+ ^ ^ b
+ 0: aab
+
+/(?:(?>(a))(?C1)bb|aa(?C2)b)/
+ aab\C+
+Callout 1: last capture = 1
+ 0: <unset>
+ 1: a
+--->aab
+ ^^ b
+Callout 2: last capture = -1
+ 0: <unset>
+--->aab
+ ^ ^ b
+ 0: aab
+
+/(?:(?1)(?C1)x|ab(?C2))((a)){0}/
+ aab\C+
+Callout 1: last capture = -1
+ 0: <unset>
+--->aab
+ ^^ x
+Callout 1: last capture = -1
+ 0: <unset>
+--->aab
+ ^^ x
+Callout 2: last capture = -1
+ 0: <unset>
+--->aab
+ ^ ^ )
+ 0: ab
+
+/(?1)(?C1)((a)(?C2)){0}/
+ aab\C+
+Callout 2: last capture = 2
+ 0: <unset>
+ 1: <unset>
+ 2: a
+--->aab
+ ^^ )
+Callout 1: last capture = -1
+ 0: <unset>
+--->aab
+ ^^ ((a)(?C2)){0}
+ 0: a
+
+/(?:(a)+(?C1)bb|aa(?C2)b)++/
+ aab\C+
+Callout 1: last capture = 1
+ 0: <unset>
+ 1: a
+--->aab
+ ^ ^ b
+Callout 1: last capture = 1
+ 0: <unset>
+ 1: a
+--->aab
+ ^^ b
+Callout 2: last capture = -1
+ 0: <unset>
+--->aab
+ ^ ^ b
+ 0: aab
+ aab\C+\O2
+Callout 1: last capture = 1
+ 0: <unset>
+--->aab
+ ^ ^ b
+Callout 1: last capture = 1
+ 0: <unset>
+--->aab
+ ^^ b
+Callout 2: last capture = -1
+ 0: <unset>
+--->aab
+ ^ ^ b
+ 0: aab
+
+/(ab)x|ab/
+ ab\O3
+ 0: ab
+ ab\O2
+ 0: ab
+
+/(ab)/
+ ab\O3
+Matched, but too many substrings
+ 0: ab
+ ab\O2
+Matched, but too many substrings
+ 0: ab
+
/-- End of testinput2 --/