Revision: 1072
http://www.exim.org/viewvc/pcre2?view=rev&revision=1072
Author: ph10
Date: 2019-02-13 17:30:24 +0000 (Wed, 13 Feb 2019)
Log Message:
-----------
Compile \p{Any} the same as . in DOTALL mode, to benefit from auto-anchoring.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/ChangeLog 2019-02-13 17:30:24 UTC (rev 1072)
@@ -128,7 +128,10 @@
31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh}
construct.
+32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits
+from auto-anchoring if \p{Any}* starts a pattern.
+
Version 10.32 10-September-2018
-------------------------------
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/src/pcre2_compile.c 2019-02-13 17:30:24 UTC (rev 1072)
@@ -1459,7 +1459,7 @@
int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
- int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
+ int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
compile_block *cb)
{
BOOL utf = (options & PCRE2_UTF) != 0;
@@ -1551,7 +1551,7 @@
/* Escapes that need further processing, including those that are unknown, have
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
-\o, and \x are recognized (\u and \U can never appear as they are used for case
+\o, and \x are recognized (\u and \U can never appear as they are used for case
forcing). */
else
@@ -1559,7 +1559,7 @@
int s;
PCRE2_SPTR oldptr;
BOOL overflow;
- BOOL alt_bsux =
+ BOOL alt_bsux =
((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
/* Filter calls from pcre2_substitute(). */
@@ -1571,8 +1571,8 @@
*errorcodeptr = ERR3;
return 0;
}
- alt_bsux = FALSE; /* Do not modify \x handling */
- }
+ alt_bsux = FALSE; /* Do not modify \x handling */
+ }
switch (c)
{
@@ -1595,37 +1595,37 @@
if (!alt_bsux) *errorcodeptr = ERR37; else
{
uint32_t xc;
-
+
if (ptr >= ptrend) break;
- if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
+ if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
(extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
{
PCRE2_SPTR hptr = ptr + 1;
cc = 0;
-
+
while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
- {
+ {
if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
{
*errorcodeptr = ERR77;
ptr = hptr; /* Show where */
- break; /* *hptr != } will cause another break below */
- }
+ break; /* *hptr != } will cause another break below */
+ }
cc = (cc << 4) | xc;
- hptr++;
- }
-
+ hptr++;
+ }
+
if (hptr == ptr + 1 || /* No hex digits */
hptr >= ptrend || /* Hit end of input */
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
break; /* Hex escape not recognized */
-
+
c = cc; /* Accept the code point */
- ptr = hptr + 1;
+ ptr = hptr + 1;
}
-
+
else /* Must be exactly 4 hex digits */
- {
+ {
if (ptrend - ptr < 4) break; /* Less than 4 chars */
if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
@@ -1635,8 +1635,8 @@
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 4;
- }
-
+ }
+
if (utf)
{
if (c > 0x10ffffU) *errorcodeptr = ERR77;
@@ -3424,7 +3424,7 @@
else
{
tempptr = ptr;
- escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
+ escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
cb->cx->extra_options, TRUE, cb);
if (errorcode != 0)
@@ -7631,9 +7631,20 @@
{
uint32_t ptype = *(++pptr) >> 16;
uint32_t pdata = *pptr & 0xffff;
- *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
- *code++ = ptype;
- *code++ = pdata;
+
+ /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
+ from the auto-anchoring code. */
+
+ if (meta_arg == ESC_p && ptype == PT_ANY)
+ {
+ *code++ = OP_ALLANY;
+ }
+ else
+ {
+ *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
+ *code++ = ptype;
+ *code++ = pdata;
+ }
break; /* End META_ESCAPE */
}
#endif
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/testdata/testinput5 2019-02-13 17:30:24 UTC (rev 1072)
@@ -2173,4 +2173,8 @@
/(?'X²ABC'...)/utf
+# -------
+
+/\p{Any}*xyz/I
+
# End of testinput5
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/testdata/testoutput5 2019-02-13 17:30:24 UTC (rev 1072)
@@ -3296,27 +3296,27 @@
/\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp
------------------------------------------------------------------
Bra
- prop Any +
- prop Any
- prop Any +
+ AllAny+
+ AllAny
+ AllAny+
notprop Any
- prop Any +
+ AllAny+
prop L&
- prop Any +
+ AllAny+
prop L
- prop Any +
+ AllAny+
prop Lu
- prop Any +
+ AllAny+
prop Han
- prop Any +
+ AllAny+
prop Xan
- prop Any +
+ AllAny+
prop Xsp
- prop Any +
+ AllAny+
prop Xps
prop Xwd +
- prop Any
- prop Any +
+ AllAny
+ AllAny+
prop Xuc
Ket
End
@@ -3326,7 +3326,7 @@
------------------------------------------------------------------
Bra
prop L& +
- prop Any
+ AllAny
prop L& +
prop L&
notprop L& ++
@@ -3357,7 +3357,7 @@
------------------------------------------------------------------
Bra
prop N +
- prop Any
+ AllAny
prop N +
prop L&
prop N ++
@@ -3388,7 +3388,7 @@
------------------------------------------------------------------
Bra
prop Lu +
- prop Any
+ AllAny
prop Lu +
prop L&
prop Lu +
@@ -3450,7 +3450,7 @@
------------------------------------------------------------------
Bra
prop Xan +
- prop Any
+ AllAny
prop Xan +
prop L&
notprop Xan ++
@@ -3481,7 +3481,7 @@
------------------------------------------------------------------
Bra
prop Xsp +
- prop Any
+ AllAny
prop Xsp ++
prop L&
prop Xsp ++
@@ -3510,7 +3510,7 @@
------------------------------------------------------------------
Bra
prop Xwd +
- prop Any
+ AllAny
prop Xwd +
prop L&
prop Xwd +
@@ -3539,7 +3539,7 @@
------------------------------------------------------------------
Bra
prop Xuc +
- prop Any
+ AllAny
prop Xuc +
prop L&
prop Xuc +
@@ -4927,4 +4927,13 @@
/(?'X²ABC'...)/utf
Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)
+# -------
+
+/\p{Any}*xyz/I
+Capture group count = 0
+Compile options: <none>
+Overall options: anchored
+Last code unit = 'z'
+Subject length lower bound = 3
+
# End of testinput5