[Pcre-svn] [361] code/trunk: Complete escape processing for …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [361] code/trunk: Complete escape processing for PCRE2_ALT_VERBNAMES
Revision: 361
          http://www.exim.org/viewvc/pcre2?view=rev&revision=361
Author:   ph10
Date:     2015-09-01 18:32:42 +0100 (Tue, 01 Sep 2015)
Log Message:
-----------
Complete escape processing for PCRE2_ALT_VERBNAMES


Modified Paths:
--------------
    code/trunk/doc/pcre2api.3
    code/trunk/doc/pcre2pattern.3
    code/trunk/src/pcre2_compile.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput2


Modified: code/trunk/doc/pcre2api.3
===================================================================
--- code/trunk/doc/pcre2api.3    2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/doc/pcre2api.3    2015-09-01 17:32:42 UTC (rev 361)
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "01 September 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@@ -1060,7 +1060,10 @@
 parenthesis. The name is not processed in any way, and it is not possible to
 include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
 option is set, normal backslash processing is applied to verb names and only an
-unescaped closing parenthesis terminates the name.
+unescaped closing parenthesis terminates the name. A closing parenthesis can be
+included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
+option is set, unescaped whitespace in verb names is skipped and #-comments are
+recognized, exactly as in the rest of the pattern.
 .sp
   PCRE2_AUTO_CALLOUT
 .sp
@@ -2962,6 +2965,6 @@
 .rs
 .sp
 .nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi


Modified: code/trunk/doc/pcre2pattern.3
===================================================================
--- code/trunk/doc/pcre2pattern.3    2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/doc/pcre2pattern.3    2015-09-01 17:32:42 UTC (rev 361)
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -2953,7 +2953,10 @@
 any way, and it is not possible to include a closing parenthesis in the name.
 However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing 
 is applied to verb names and only an unescaped closing parenthesis terminates 
-the name.
+the name. A closing parenthesis can be included in a name either as \e) or 
+between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace 
+in verb names is skipped and #-comments are recognized, exactly as in the rest 
+of the pattern.
 .P
 The maximum length of a name is 255 in the 8-bit library and 65535 in the
 16-bit and 32-bit libraries. If the name is empty, that is, if the closing
@@ -3383,6 +3386,6 @@
 .rs
 .sp
 .nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/src/pcre2_compile.c    2015-09-01 17:32:42 UTC (rev 361)
@@ -2793,6 +2793,148 @@



 /*************************************************
+*       Process (*VERB) name for escapes         *
+*************************************************/
+
+/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
+process the characters in a verb's name argument. It is called twice, once with 
+codeptr == NULL, to find out the length of the processed name, and again to put 
+the name into memory.
+
+Arguments:
+  ptrptr        pointer to the input pointer
+  codeptr       pointer to the compiled code pointer
+  errorcodeptr  pointer to the error code
+  utf           TRUE if processing UTF
+  cb            compile data block
+
+Returns:        length of the processed name, or < 0 on error
+*/
+
+static int
+process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
+  uint32_t options, BOOL utf, compile_block *cb)
+{
+int arglen = 0;
+BOOL inescq = FALSE;
+PCRE2_SPTR ptr = *ptrptr;
+PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
+
+for (; ptr < cb->end_pattern; ptr++)
+  {
+  uint32_t x = *ptr;
+
+  /* Skip over literals */
+
+  if (inescq)
+    {
+    if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
+      {
+      inescq = FALSE;
+      ptr++;;
+      continue;
+      }
+    }
+
+  else  /* Not a literal character */
+    { 
+    if (x == CHAR_RIGHT_PARENTHESIS) break;
+ 
+    /* Skip over comments and whitespace in extended mode. Need a loop to handle
+    whitespace after a comment. */
+  
+    if ((options & PCRE2_EXTENDED) != 0)
+      {
+      for (;;)
+        {
+        while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
+        if (x != CHAR_NUMBER_SIGN) break;
+        ptr++;
+        while (*ptr != CHAR_NULL)
+          {
+          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
+            {                          /* IS_NEWLINE sets cb->nllen. */
+            ptr += cb->nllen;
+            break;
+            }
+          ptr++;
+#ifdef SUPPORT_UNICODE
+          if (utf) FORWARDCHAR(ptr);
+#endif
+          }
+        x = *ptr;     /* Either NULL or the char after a newline */
+        }
+      if (ptr >= cb->end_pattern) break;   
+      }
+  
+    /* Process escapes */
+  
+    if (x == '\\')
+      {
+      int rc;
+      *errorcodeptr = 0;
+      rc = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
+      *ptrptr = ptr;   /* For possible error */ 
+      if (*errorcodeptr != 0) return -1;
+      if (rc != 0)
+        {
+        if (rc == ESC_Q) 
+          {
+          inescq = TRUE;
+          continue;
+          }
+        if (rc == ESC_E) continue;
+        *errorcodeptr = ERR40;
+        return -1;
+        }
+      }
+    }   
+    
+  /* We have the next character in the name. */
+
+#ifdef SUPPORT_UNICODE
+  if (utf)
+    {
+    if (code == NULL)   /* Just want the length */
+      {
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      int i;
+      for (i = 0; i < PRIV(utf8_table1_size); i++)
+        if ((int)x <= PRIV(utf8_table1)[i]) break;
+      arglen += i;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+      if (x > 0xffff) arglen++;
+#endif
+      }
+    else
+      {
+      PCRE2_UCHAR cbuff[8];
+      x = PRIV(ord2utf)(x, cbuff);
+      memcpy(code, cbuff, CU2BYTES(x));
+      code += x;
+      }
+    }
+  else
+#endif  /* SUPPORT_UNICODE */
+
+  /* Not UTF */
+    {
+    if (code != NULL) *code++ = x;
+    }
+
+  arglen++;
+  }
+
+/* Update the pointers before returning. */
+
+*ptrptr = ptr;
+if (codeptr != NULL) *codeptr = code;
+return arglen;
+}
+
+
+
+/*************************************************
 *      Scan regex to identify named groups       *
 *************************************************/


@@ -5399,33 +5541,9 @@
           }
         else
           {
-          arglen = 0;
-          while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
-            {
-            if (*ptr == '\\')
-              {
-              uint32_t x;
-              *errorcodeptr = 0;
-              i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
-              if (*errorcodeptr != 0) goto FAILED;
-              if (i != 0)
-                {
-                *errorcodeptr = ERR40;
-                goto FAILED;
-                }
-#ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
-              for (i = 0; i < PRIV(utf8_table1_size); i++)
-                if ((int)x <= PRIV(utf8_table1)[i]) break;
-              arglen += i;
-#elif PCRE2_CODE_UNIT_WIDTH == 16
-              if (x > 0xffff) arglen++;
-#endif
-#endif
-              }
-            arglen++;
-            ptr++;
-            }
+          arglen = process_verb_name(&ptr, NULL, errorcodeptr, options, 
+            utf, cb);
+          if (arglen < 0) goto FAILED;
           }


         if ((unsigned int)arglen > MAX_MARK)
@@ -5495,35 +5613,12 @@
               }
             setverb = *code++ = verbs[i].op_arg;
             *code++ = arglen;
-
-            /* If we are processing the argument for escapes, we don't need
-            to apply checks here because it was all checked above when
-            computing the length. */
-
             if ((options & PCRE2_ALT_VERBNAMES) != 0)
               {
-              for (; arg != ptr; arg++)
-                {
-                if (*arg == '\\')
-                  {
-                  uint32_t x;
-                  *errorcodeptr = 0;
-                  (void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
-                    cb);
-#ifdef SUPPORT_UNICODE
-                  if (utf)
-                    {
-                    PCRE2_UCHAR cbuff[8];
-                    x = PRIV(ord2utf)(x, cbuff);
-                    memcpy(code, cbuff, CU2BYTES(x));
-                    code += x;
-                    }
-                  else
-#endif
-                  *code++ = x;
-                  }
-                else *code++ = *arg;
-                }
+              PCRE2_UCHAR *memcode = code;  /* code is "register" */
+              (void)process_verb_name(&arg, &memcode, errorcodeptr, options, 
+                utf, cb);
+              code = memcode;  
               }
             else   /* No argument processing */
               {


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/testdata/testinput2    2015-09-01 17:32:42 UTC (rev 361)
@@ -4449,4 +4449,20 @@
 /(*:ab\t(d\)c)xxx/alt_verbnames,mark
     cxxxz


+/(*:A\Qxx)x\EB)x/alt_verbnames,mark
+    x
+    
+/(*:A\ExxxB)x/alt_verbnames,mark
+    x 
+    
+/(*: A \ and #comment
+     \ B)x/x,alt_verbnames,mark
+    x  
+    
+/(*:A
+B)x/alt_verbnames,mark 
+    x
+
+/(*:abc\Qpqr)/alt_verbnames
+
 # End of testinput2 


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/testdata/testoutput2    2015-09-01 17:32:42 UTC (rev 361)
@@ -14724,4 +14724,29 @@
  0: xxx
 MK: ab\x09(d)c


+/(*:A\Qxx)x\EB)x/alt_verbnames,mark
+    x
+ 0: x
+MK: Axx)xB
+    
+/(*:A\ExxxB)x/alt_verbnames,mark
+    x 
+ 0: x
+MK: AxxxB
+    
+/(*: A \ and #comment
+     \ B)x/x,alt_verbnames,mark
+    x  
+ 0: x
+MK: A and B
+    
+/(*:A
+B)x/alt_verbnames,mark 
+    x
+ 0: x
+MK: A\x0aB
+
+/(*:abc\Qpqr)/alt_verbnames
+Failed: error 160 at offset 12: (*VERB) not recognized or malformed
+
 # End of testinput2