[Pcre-svn] [1072] code/trunk: Compile \p{Any} the same as .

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1072] code/trunk: Compile \p{Any} the same as .
Revision: 1072
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1072
Author:   ph10
Date:     2019-02-13 17:30:24 +0000 (Wed, 13 Feb 2019)
Log Message:
-----------
Compile \p{Any} the same as . in DOTALL mode, to benefit from auto-anchoring.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/ChangeLog    2019-02-13 17:30:24 UTC (rev 1072)
@@ -128,7 +128,10 @@
 31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh} 
 construct.


+32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits
+from auto-anchoring if \p{Any}* starts a pattern.

+
Version 10.32 10-September-2018
-------------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/src/pcre2_compile.c    2019-02-13 17:30:24 UTC (rev 1072)
@@ -1459,7 +1459,7 @@


int
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
- int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
+ int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
compile_block *cb)
{
BOOL utf = (options & PCRE2_UTF) != 0;
@@ -1551,7 +1551,7 @@

/* Escapes that need further processing, including those that are unknown, have
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
-\o, and \x are recognized (\u and \U can never appear as they are used for case
+\o, and \x are recognized (\u and \U can never appear as they are used for case
forcing). */

 else
@@ -1559,7 +1559,7 @@
   int s;
   PCRE2_SPTR oldptr;
   BOOL overflow;
-  BOOL alt_bsux = 
+  BOOL alt_bsux =
     ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;


   /* Filter calls from pcre2_substitute(). */
@@ -1571,8 +1571,8 @@
       *errorcodeptr = ERR3;
       return 0;
       }
-    alt_bsux = FALSE;   /* Do not modify \x handling */   
-    }   
+    alt_bsux = FALSE;   /* Do not modify \x handling */
+    }


   switch (c)
     {
@@ -1595,37 +1595,37 @@
     if (!alt_bsux) *errorcodeptr = ERR37; else
       {
       uint32_t xc;
-      
+
       if (ptr >= ptrend) break;
-      if (*ptr == CHAR_LEFT_CURLY_BRACKET && 
+      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
           (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
         {
         PCRE2_SPTR hptr = ptr + 1;
         cc = 0;
-        
+
         while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
-          { 
+          {
           if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
             {
             *errorcodeptr = ERR77;
             ptr = hptr;   /* Show where */
-            break;        /* *hptr != } will cause another break below */  
-            } 
+            break;        /* *hptr != } will cause another break below */
+            }
           cc = (cc << 4) | xc;
-          hptr++; 
-          } 
-          
+          hptr++;
+          }
+
         if (hptr == ptr + 1 ||   /* No hex digits */
             hptr >= ptrend ||    /* Hit end of input */
             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
           break;         /* Hex escape not recognized */
-           
+
         c = cc;          /* Accept the code point */
-        ptr = hptr + 1; 
+        ptr = hptr + 1;
         }
-         
+
       else  /* Must be exactly 4 hex digits */
-        {      
+        {
         if (ptrend - ptr < 4) break;               /* Less than 4 chars */
         if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
         if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
@@ -1635,8 +1635,8 @@
         if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
         c = (cc << 4) | xc;
         ptr += 4;
-        } 
- 
+        }
+
       if (utf)
         {
         if (c > 0x10ffffU) *errorcodeptr = ERR77;
@@ -3424,7 +3424,7 @@
       else
         {
         tempptr = ptr;
-        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options, 
+        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
           cb->cx->extra_options, TRUE, cb);


         if (errorcode != 0)
@@ -7631,9 +7631,20 @@
       {
       uint32_t ptype = *(++pptr) >> 16;
       uint32_t pdata = *pptr & 0xffff;
-      *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
-      *code++ = ptype;
-      *code++ = pdata;
+
+      /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
+      from the auto-anchoring code. */
+
+      if (meta_arg == ESC_p && ptype == PT_ANY)
+        {
+        *code++ = OP_ALLANY;
+        }
+      else
+        {
+        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
+        *code++ = ptype;
+        *code++ = pdata;
+        }
       break;  /* End META_ESCAPE */
       }
 #endif


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/testdata/testinput5    2019-02-13 17:30:24 UTC (rev 1072)
@@ -2173,4 +2173,8 @@


/(?'X²ABC'...)/utf

+# -------
+
+/\p{Any}*xyz/I
+
# End of testinput5

Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2019-02-13 16:48:30 UTC (rev 1071)
+++ code/trunk/testdata/testoutput5    2019-02-13 17:30:24 UTC (rev 1072)
@@ -3296,27 +3296,27 @@
 /\p{Any}+\p{Any} \p{Any}+\P{Any} \p{Any}+\p{L&} \p{Any}+\p{L} \p{Any}+\p{Lu} \p{Any}+\p{Han} \p{Any}+\p{Xan} \p{Any}+\p{Xsp} \p{Any}+\p{Xps} \p{Xwd}+\p{Any} \p{Any}+\p{Xuc}/Bx,ucp
 ------------------------------------------------------------------
         Bra
-        prop Any +
-        prop Any
-        prop Any +
+        AllAny+
+        AllAny
+        AllAny+
         notprop Any
-        prop Any +
+        AllAny+
         prop L&
-        prop Any +
+        AllAny+
         prop L
-        prop Any +
+        AllAny+
         prop Lu
-        prop Any +
+        AllAny+
         prop Han
-        prop Any +
+        AllAny+
         prop Xan
-        prop Any +
+        AllAny+
         prop Xsp
-        prop Any +
+        AllAny+
         prop Xps
         prop Xwd +
-        prop Any
-        prop Any +
+        AllAny
+        AllAny+
         prop Xuc
         Ket
         End
@@ -3326,7 +3326,7 @@
 ------------------------------------------------------------------
         Bra
         prop L& +
-        prop Any
+        AllAny
         prop L& +
         prop L&
         notprop L& ++
@@ -3357,7 +3357,7 @@
 ------------------------------------------------------------------
         Bra
         prop N +
-        prop Any
+        AllAny
         prop N +
         prop L&
         prop N ++
@@ -3388,7 +3388,7 @@
 ------------------------------------------------------------------
         Bra
         prop Lu +
-        prop Any
+        AllAny
         prop Lu +
         prop L&
         prop Lu +
@@ -3450,7 +3450,7 @@
 ------------------------------------------------------------------
         Bra
         prop Xan +
-        prop Any
+        AllAny
         prop Xan +
         prop L&
         notprop Xan ++
@@ -3481,7 +3481,7 @@
 ------------------------------------------------------------------
         Bra
         prop Xsp +
-        prop Any
+        AllAny
         prop Xsp ++
         prop L&
         prop Xsp ++
@@ -3510,7 +3510,7 @@
 ------------------------------------------------------------------
         Bra
         prop Xwd +
-        prop Any
+        AllAny
         prop Xwd +
         prop L&
         prop Xwd +
@@ -3539,7 +3539,7 @@
 ------------------------------------------------------------------
         Bra
         prop Xuc +
-        prop Any
+        AllAny
         prop Xuc +
         prop L&
         prop Xuc +
@@ -4927,4 +4927,13 @@
 /(?'X²ABC'...)/utf
 Failed: error 142 at offset 4: syntax error in subpattern name (missing terminator?)


+# -------
+
+/\p{Any}*xyz/I
+Capture group count = 0
+Compile options: <none>
+Overall options: anchored
+Last code unit = 'z'
+Subject length lower bound = 3
+
# End of testinput5