Revision: 1007
http://www.exim.org/viewvc/pcre2?view=rev&revision=1007
Author: ph10
Date: 2018-09-15 18:10:39 +0100 (Sat, 15 Sep 2018)
Log Message:
-----------
Add "allvector" to pcre2test.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/configure.ac
code/trunk/doc/pcre2test.1
code/trunk/src/pcre2test.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2018-09-15 12:35:56 UTC (rev 1006)
+++ code/trunk/ChangeLog 2018-09-15 17:10:39 UTC (rev 1007)
@@ -2,9 +2,17 @@
--------------------
-Version 10.32-RC1 10-September-2018
+Version 10.33-RC1 15-September-2018
-----------------------------------
+1. Added "allvector" to pcre2test to make it easy to check the part of the
+ovector that shouldn't be changed, in particular after substitute and failed or
+partial matches.
+
+
+Version 10.32 10-September-2018
+-------------------------------
+
1. When matching using the the REG_STARTEND feature of the POSIX API with a
non-zero starting offset, unset capturing groups with lower numbers than a
group that did capture something were not being correctly returned as "unset"
Modified: code/trunk/configure.ac
===================================================================
--- code/trunk/configure.ac 2018-09-15 12:35:56 UTC (rev 1006)
+++ code/trunk/configure.ac 2018-09-15 17:10:39 UTC (rev 1007)
@@ -9,9 +9,9 @@
dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre2_major, [10])
-m4_define(pcre2_minor, [32])
-m4_define(pcre2_prerelease, [])
-m4_define(pcre2_date, [2018-09-10])
+m4_define(pcre2_minor, [33])
+m4_define(pcre2_prerelease, [-RC1])
+m4_define(pcre2_date, [2018-09-14])
# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.
Modified: code/trunk/doc/pcre2test.1
===================================================================
--- code/trunk/doc/pcre2test.1 2018-09-15 12:35:56 UTC (rev 1006)
+++ code/trunk/doc/pcre2test.1 2018-09-15 17:10:39 UTC (rev 1007)
@@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "21 July 2018" "PCRE 10.32"
+.TH PCRE2TEST 1 "15 September 2018" "PCRE 10.33"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@@ -1003,6 +1003,7 @@
aftertext show text after match
allaftertext show text after captures
allcaptures show all captures
+ allvector show the entire ovector
allusedtext show all consulted text
altglobal alternative global matching
/g global global matching
@@ -1154,6 +1155,7 @@
aftertext show text after match
allaftertext show text after captures
allcaptures show all captures
+ allvector show the entire ovector
allusedtext show all consulted text (non-JIT only)
altglobal alternative global matching
callout_capture show captures at callout time
@@ -1248,9 +1250,27 @@
highest one actually used in the match are output (corresponding to the return
code from \fBpcre2_match()\fP). Groups that did not take part in the match
are output as "<unset>". This modifier is not relevant for DFA matching (which
-does no capturing); it is ignored, with a warning message, if present.
+does no capturing) and does not apply when \fBreplace\fP is specified; it is
+ignored, with a warning message, if present.
.
.
+.SS "Showing the entire ovector, for all outcomes"
+.rs
+.sp
+The \fBallvector\fP modifier requests that the entire ovector be shown,
+whatever the outcome of the match. Compare \fBallcaptures\fP, which shows only
+up to the maximum number of capture groups for the pattern, and then only for a
+successful complete non-DFA match. This modifier, which acts after any match
+result, and also for DFA matching, provides a means of checking that there are
+no unexpected modifications to ovector fields. Before each match attempt, the
+ovector is filled with a special value, and if this is found in both elements
+of a capturing pair, "<unchanged>" is output. After a successful match, this
+applies to all groups after the maximum capture group for the pattern. In other
+cases it applies to the entire ovector. After a partial match, the first two
+elements are the only ones that should be set. After a DFA match, the amount of
+ovector that is used depends on the number of matches that were found.
+.
+.
.SS "Testing callouts"
.rs
.sp
@@ -1982,6 +2002,6 @@
.rs
.sp
.nf
-Last updated: 21 July 2018
+Last updated: 15 September 2018
Copyright (c) 1997-2018 University of Cambridge.
.fi
Modified: code/trunk/src/pcre2test.c
===================================================================
--- code/trunk/src/pcre2test.c 2018-09-15 12:35:56 UTC (rev 1006)
+++ code/trunk/src/pcre2test.c 2018-09-15 17:10:39 UTC (rev 1007)
@@ -491,6 +491,7 @@
#define CTL2_SUBJECT_LITERAL 0x00000010u
#define CTL2_CALLOUT_NO_WHERE 0x00000020u
#define CTL2_CALLOUT_EXTRA 0x00000040u
+#define CTL2_ALLVECTOR 0x00000080u
#define CTL2_NL_SET 0x40000000u /* Informational */
#define CTL2_BSR_SET 0x80000000u /* Informational */
@@ -513,7 +514,8 @@
#define CTL2_ALLPD (CTL2_SUBSTITUTE_EXTENDED|\
CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\
CTL2_SUBSTITUTE_UNKNOWN_UNSET|\
- CTL2_SUBSTITUTE_UNSET_EMPTY)
+ CTL2_SUBSTITUTE_UNSET_EMPTY|\
+ CTL2_ALLVECTOR)
/* Structures for holding modifier information for patterns and subject strings
(data). Fields containing modifiers that can be set either for a pattern or a
@@ -592,6 +594,7 @@
{ "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) },
{ "allow_surrogate_escapes", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES, CO(extra_options) },
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
+ { "allvector", MOD_PND, MOD_CTL, CTL2_ALLVECTOR, PO(control2) },
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
{ "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) },
@@ -888,6 +891,7 @@
static uint32_t maxlookbehind;
static uint32_t max_oveccount;
static uint32_t callout_count;
+static uint32_t maxcapcount;
static uint16_t local_newline_default = 0;
@@ -4018,12 +4022,13 @@
static void
show_controls(uint32_t controls, uint32_t controls2, const char *before)
{
-fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
((controls & CTL_ALLCAPTURES) != 0)? " allcaptures" : "",
((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "",
+ ((controls2 & CTL2_ALLVECTOR) != 0)? " allvector" : "",
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
((controls & CTL_BINCODE) != 0)? " bincode" : "",
((controls2 & CTL2_BSR_SET) != 0)? " bsr" : "",
@@ -5717,6 +5722,11 @@
if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
return PR_ABEND;
+/* Remember the number of captures. */
+
+if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0)
+ return PR_ABEND;
+
/* If an explicit newline modifier was given, set the information flag in the
pattern so that it is preserved over push/pop. */
@@ -6318,6 +6328,42 @@
/*************************************************
+* Show an entire ovector *
+*************************************************/
+
+/* This function is called after substitution, partial matching, or match
+failure, when the "allvector" modifier is set. It shows the entire ovector so
+that unexpected changes to fields that should be untouched become visible.
+
+Arguments:
+  ovector      points to the ovector
+  oveccount    number of pairs
+
+Returns:       nothing
+*/
+
+static void
+show_ovector(PCRE2_SIZE *ovector, uint32_t oveccount)
+{
+uint32_t i;
+for (i = 0; i < 2*oveccount; i += 2)
+  {
+  PCRE2_SIZE start = ovector[i];
+  PCRE2_SIZE end = ovector[i+1];
+
+  fprintf(outfile, "%2u: ", (unsigned int)(i/2));   /* pair number */
+  if (start == PCRE2_UNSET && end == PCRE2_UNSET)
+    fprintf(outfile, "<unset>\n");
+  else if (start == JUNK_OFFSET && end == JUNK_OFFSET)
+    fprintf(outfile, "<unchanged>\n");   /* still holds the pre-match fill */
+  else
+    fprintf(outfile, "%lu %lu\n", (unsigned long int)start,
+      (unsigned long int)end);
+  }
+}
+
+
+/*************************************************
* Process a data line *
*************************************************/
@@ -6342,7 +6388,10 @@
void *use_dat_context;
BOOL utf;
BOOL subject_literal;
+
+PCRE2_SIZE *ovector;
PCRE2_SIZE ovecsave[3];
+uint32_t oveccount;
#ifdef SUPPORT_PCRE2_8
uint8_t *q8 = NULL;
@@ -6722,11 +6771,23 @@
}
}
-if (pat_patctl.replacement[0] != 0 &&
- (dat_datctl.control & CTL_NULLCONTEXT) != 0)
+if (pat_patctl.replacement[0] != 0)
{
- fprintf(outfile, "** Replacement text is not supported with null_context.\n");
- return PR_OK;
+ if ((dat_datctl.control & CTL_NULLCONTEXT) != 0)
+ {
+ fprintf(outfile, "** Replacement text is not supported with null_context.\n");
+ return PR_OK;
+ }
+ if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
+ fprintf(outfile, "** Ignored with replacement text: allcaptures\n");
+ }
+
+/* Warn for modifiers that are ignored for DFA. */
+
+if ((dat_datctl.control & CTL_DFA) != 0)
+ {
+ if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
+ fprintf(outfile, "** Ignored after DFA matching: allcaptures\n");
}
/* We now have the subject in dbuffer, with len containing the byte length, and
@@ -6955,6 +7016,9 @@
return PR_OK;
}
+ovector = FLD(match_data, ovector);
+PCRE2_GET_OVECTOR_COUNT(oveccount, match_data);
+
/* Replacement processing is ignored for DFA matching. */
if (dat_datctl.replacement[0] != 0 && (dat_datctl.control & CTL_DFA) != 0)
@@ -6974,7 +7038,7 @@
uint8_t rbuffer[REPLACE_BUFFSIZE];
uint8_t nbuffer[REPLACE_BUFFSIZE];
uint32_t xoptions;
- PCRE2_SIZE rlen, nsize, erroroffset;
+ PCRE2_SIZE j, rlen, nsize, erroroffset;
BOOL badutf = FALSE;
#ifdef SUPPORT_PCRE2_8
@@ -6987,6 +7051,11 @@
uint32_t *r32 = NULL;
#endif
+ /* Fill the ovector with junk to detect elements that do not get set
+ when they should be (relevant only when "allvector" is specified). */
+
+ for (j = 0; j < 2*oveccount; j++) ovector[j] = JUNK_OFFSET;
+
if (timeitm)
fprintf(outfile, "** Timing is not supported with replace: ignored\n");
@@ -7112,6 +7181,12 @@
fprintf(outfile, "\n");
show_memory = FALSE;
+
+ /* Show final ovector contents if requested. */
+
+ if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0)
+ show_ovector(ovector, oveccount);
+
return PR_OK;
} /* End of substitution handling */
@@ -7125,14 +7200,11 @@
{
PCRE2_SIZE j;
int capcount;
- PCRE2_SIZE *ovector;
- ovector = FLD(match_data, ovector);
-
/* Fill the ovector with junk to detect elements that do not get set
when they should be. */
- for (j = 0; j < 2*dat_datctl.oveccount; j++) ovector[j] = JUNK_OFFSET;
+ for (j = 0; j < 2*oveccount; j++) ovector[j] = JUNK_OFFSET;
/* When matching is via pcre2_match(), we will detect the use of JIT via the
stack callback function. */
@@ -7280,12 +7352,8 @@
if (capcount >= 0)
{
int i;
- uint32_t oveccount;
- /* This is a check against a lunatic return value. */
-
- PCRE2_GET_OVECTOR_COUNT(oveccount, match_data);
- if (capcount > (int)oveccount)
+ if (capcount > (int)oveccount) /* Check for lunatic return value */
{
fprintf(outfile,
"** PCRE2 error: returned count %d is too big for ovector count %d\n",
@@ -7325,24 +7393,18 @@
/* "allcaptures" requests showing of all captures in the pattern, to check
unset ones at the end. It may be set on the pattern or the data. Implement
by setting capcount to the maximum. This is not relevant for DFA matching,
- so ignore it. */
+ so ignore it (warning given above). */
- if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
+ if ((dat_datctl.control & (CTL_ALLCAPTURES|CTL_DFA)) == CTL_ALLCAPTURES)
{
- uint32_t maxcapcount;
- if ((dat_datctl.control & CTL_DFA) != 0)
- {
- fprintf(outfile, "** Ignored after DFA matching: allcaptures\n");
- }
- else
- {
- if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0)
- return PR_SKIP;
- capcount = maxcapcount + 1; /* Allow for full match */
- if (capcount > (int)oveccount) capcount = oveccount;
- }
+ capcount = maxcapcount + 1; /* Allow for full match */
+ if (capcount > (int)oveccount) capcount = oveccount;
}
+ /* "allvector" requests showing the entire ovector. */
+
+ if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0) capcount = oveccount;
+
/* Output the captured substrings. Note that, for the matched string,
the use of \K in an assertion can make the start later than the end. */
@@ -7364,7 +7426,7 @@
/* Check for an unset group */
- if (start == PCRE2_UNSET)
+ if (start == PCRE2_UNSET && end == PCRE2_UNSET)
{
fprintf(outfile, "<unset>\n");
continue;
@@ -7371,12 +7433,19 @@
}
/* Check for silly offsets, in particular, values that have not been
- set when they should have been. */
+ set when they should have been. However, if we are past the end of the
+ captures for this pattern ("allvector" causes this), or if we are DFA
+ matching, it isn't an error if the entry is unchanged. */
if (start > ulen || end > ulen)
{
- fprintf(outfile, "ERROR: bad value(s) for offset(s): 0x%lx 0x%lx\n",
- (unsigned long int)start, (unsigned long int)end);
+ if (((dat_datctl.control & CTL_DFA) != 0 ||
+ i >= (int)(2*maxcapcount + 2)) &&
+ start == JUNK_OFFSET && end == JUNK_OFFSET)
+ fprintf(outfile, "<unchanged>\n");
+ else
+ fprintf(outfile, "ERROR: bad value(s) for offset(s): 0x%lx 0x%lx\n",
+ (unsigned long int)start, (unsigned long int)end);
continue;
}
@@ -7517,10 +7586,19 @@
fprintf(outfile, "\n");
}
+ if (ulen != ovector[1])
+ fprintf(outfile, "** ovector[1] is not equal to the subject length: "
+ "%ld != %ld\n", (unsigned long int)ovector[1], (unsigned long int)ulen);
+
/* Process copy/get strings */
if (!copy_and_get(utf, 1)) return PR_ABEND;
+ /* "allvector" outputs the entire vector */
+
+ if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0)
+ show_ovector(ovector, oveccount);
+
break; /* Out of the /g loop */
} /* End of handling partial match */
@@ -7590,6 +7668,11 @@
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
fprintf(outfile, " (JIT)");
fprintf(outfile, "\n");
+
+ /* "allvector" outputs the entire vector */
+
+ if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0)
+ show_ovector(ovector, oveccount);
}
break;
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2018-09-15 12:35:56 UTC (rev 1006)
+++ code/trunk/testdata/testinput2 2018-09-15 17:10:39 UTC (rev 1007)
@@ -5505,4 +5505,13 @@
bbc
xbc
+/a(b)c|xyz/g,allvector,replace=<$0>
+ abcdefabcpqr\=ovector=4
+ abxyz\=ovector=4
+ abcdefxyz\=ovector=4
+
+/a(b)c|xyz/allvector
+ abcdef\=ovector=4
+ abxyz\=ovector=4
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2018-09-15 12:35:56 UTC (rev 1006)
+++ code/trunk/testdata/testoutput2 2018-09-15 17:10:39 UTC (rev 1007)
@@ -16763,6 +16763,38 @@
0: b
0+ c
+/a(b)c|xyz/g,allvector,replace=<$0>
+ abcdefabcpqr\=ovector=4
+ 2: <abc>def<abc>pqr
+ 0: 6 9
+ 1: 7 8
+ 2: <unchanged>
+ 3: <unchanged>
+ abxyz\=ovector=4
+ 1: ab<xyz>
+ 0: 2 5
+ 1: <unset>
+ 2: <unchanged>
+ 3: <unchanged>
+ abcdefxyz\=ovector=4
+ 2: <abc>def<xyz>
+ 0: 6 9
+ 1: <unset>
+ 2: <unchanged>
+ 3: <unchanged>
+
+/a(b)c|xyz/allvector
+ abcdef\=ovector=4
+ 0: abc
+ 1: b
+ 2: <unchanged>
+ 3: <unchanged>
+ abxyz\=ovector=4
+ 0: xyz
+ 1: <unset>
+ 2: <unchanged>
+ 3: <unchanged>
+
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data