Revision: 1111
http://www.exim.org/viewvc/pcre2?view=rev&revision=1111
Author: ph10
Date: 2019-06-19 17:27:50 +0100 (Wed, 19 Jun 2019)
Log Message:
-----------
Don't ignore {1}+ when it is applied to a parenthesized item.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput1
code/trunk/testdata/testoutput1
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2019-06-18 16:07:43 UTC (rev 1110)
+++ code/trunk/ChangeLog 2019-06-19 16:27:50 UTC (rev 1111)
@@ -61,7 +61,12 @@
50, (b) the new --om-capture option changes the limit, (c) an error is raised
if -o asks for a group that is above the limit.
+12. The quantifier {1} was always being ignored, but this is incorrect when it
+is made possessive and applied to an item in parentheses, because a
+parenthesized item may contain multiple branches or other backtracking points,
+for example /(a|ab){1}+c/ or /(a+){1}+a/.
+
Version 10.33 16-April-2019
---------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2019-06-18 16:07:43 UTC (rev 1110)
+++ code/trunk/src/pcre2_compile.c 2019-06-19 16:27:50 UTC (rev 1111)
@@ -6758,10 +6758,6 @@
reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
op_type = 0;
- /* If the repeat is {1} we can ignore it. */
-
- if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
-
/* Adjust first and required code units for a zero repeat. */
if (repeat_min == 0)
@@ -6804,7 +6800,10 @@
tempcode = previous;
op_previous = *previous;
- /* Now handle repetition for the different types of item. */
+ /* Now handle repetition for the different types of item. If the repeat
+ minimum and the repeat maximum are both 1, we can ignore the quantifier for
+ non-parenthesized items, as they have only one alternative. For anything in
+ parentheses, we must not ignore {1} if it is possessive. */
switch (op_previous)
{
@@ -6818,6 +6817,7 @@
case OP_CHARI:
case OP_NOT:
case OP_NOTI:
+ if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
op_type = chartypeoffset[op_previous - OP_CHAR];
/* Deal with UTF characters that take up more than one code unit. */
@@ -6864,6 +6864,7 @@
code = previous;
goto END_REPEAT;
}
+ if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
*code++ = OP_CRSTAR + repeat_type;
@@ -6898,6 +6899,8 @@
repetition. */
case OP_RECURSE:
+ if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
+ goto END_REPEAT;
/* Generate unwrapped repeats for a non-zero minimum, except when the
minimum is 1 and the maximum unlimited, because that can be handled with
@@ -6980,6 +6983,9 @@
PCRE2_UCHAR *bralink = NULL;
PCRE2_UCHAR *brazeroptr = NULL;
+ if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
+ goto END_REPEAT;
+
/* Repeating a DEFINE group (or any group where the condition is always
FALSE and there is only one branch) is pointless, but Perl allows the
syntax, so we just ignore the repeat. */
@@ -7196,11 +7202,12 @@
and SCRIPT_RUN groups at runtime, but in a different way.]
Then, if the quantifier was possessive and the bracket is not a
- conditional, we convert the BRA code to the POS form, and the KET code to
- KETRPOS. (It turns out to be convenient at runtime to detect this kind of
- subpattern at both the start and at the end.) The use of special opcodes
- makes it possible to reduce greatly the stack usage in pcre2_match(). If
- the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
+ conditional, we convert the BRA code to the POS form, and the KET code
+ to KETRPOS. (It turns out to be convenient at runtime to detect this
+ kind of subpattern at both the start and at the end.) The use of
+ special opcodes makes it possible to reduce greatly the stack usage in
+ pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
+ OP_BRAPOSZERO.
Then, if the minimum number of matches is 1 or 0, cancel the possessive
flag so that the default action below, of wrapping everything inside
@@ -7301,6 +7308,8 @@
int prop_type, prop_value;
PCRE2_UCHAR *oldcode;
+ if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
+
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
mclength = 0; /* Not a character */
@@ -10041,7 +10050,7 @@
{
reqcu = 0; /* Must disable after (*ACCEPT) */
reqcuflags = REQ_NONE;
- re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
+ re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
}
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2019-06-18 16:07:43 UTC (rev 1110)
+++ code/trunk/testdata/testinput1 2019-06-19 16:27:50 UTC (rev 1111)
@@ -6351,4 +6351,18 @@
acb
abc
+/(?:a|ab){1}+c/
+\= Expect no match
+ abc
+
+/(a|ab){1}+c/
+ abc
+
+/(a+){1}+a/
+\= Expect no match
+ aaaa
+
+/(?(DEFINE)(a|ab))(?1){1}+c/
+ abc
+
# End of testinput1
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2019-06-18 16:07:43 UTC (rev 1110)
+++ code/trunk/testdata/testoutput1 2019-06-19 16:27:50 UTC (rev 1111)
@@ -10063,4 +10063,22 @@
0: a
MK: 2
+/(?:a|ab){1}+c/
+\= Expect no match
+ abc
+No match
+
+/(a|ab){1}+c/
+ abc
+No match
+
+/(a+){1}+a/
+\= Expect no match
+ aaaa
+No match
+
+/(?(DEFINE)(a|ab))(?1){1}+c/
+ abc
+No match
+
# End of testinput1