Revision: 361
http://www.exim.org/viewvc/pcre2?view=rev&revision=361
Author: ph10
Date: 2015-09-01 18:32:42 +0100 (Tue, 01 Sep 2015)
Log Message:
-----------
Complete escape processing for PCRE2_ALT_VERBNAMES
Modified Paths:
--------------
code/trunk/doc/pcre2api.3
code/trunk/doc/pcre2pattern.3
code/trunk/src/pcre2_compile.c
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/doc/pcre2api.3
===================================================================
--- code/trunk/doc/pcre2api.3 2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/doc/pcre2api.3 2015-09-01 17:32:42 UTC (rev 361)
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "01 September 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@@ -1060,7 +1060,10 @@
parenthesis. The name is not processed in any way, and it is not possible to
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
option is set, normal backslash processing is applied to verb names and only an
-unescaped closing parenthesis terminates the name.
+unescaped closing parenthesis terminates the name. A closing parenthesis can be
+included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
+option is set, unescaped whitespace in verb names is skipped and #-comments are
+recognized, exactly as in the rest of the pattern.
.sp
PCRE2_AUTO_CALLOUT
.sp
@@ -2962,6 +2965,6 @@
.rs
.sp
.nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
Modified: code/trunk/doc/pcre2pattern.3
===================================================================
--- code/trunk/doc/pcre2pattern.3 2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/doc/pcre2pattern.3 2015-09-01 17:32:42 UTC (rev 361)
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -2953,7 +2953,10 @@
any way, and it is not possible to include a closing parenthesis in the name.
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
is applied to verb names and only an unescaped closing parenthesis terminates
-the name.
+the name. A closing parenthesis can be included in a name either as \e) or
+between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace
+in verb names is skipped and #-comments are recognized, exactly as in the rest
+of the pattern.
.P
The maximum length of a name is 255 in the 8-bit library and 65535 in the
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
@@ -3383,6 +3386,6 @@
.rs
.sp
.nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/src/pcre2_compile.c 2015-09-01 17:32:42 UTC (rev 361)
@@ -2793,6 +2793,148 @@
/*************************************************
+* Process (*VERB) name for escapes *
+*************************************************/
+
+/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
+process the characters in a verb's name argument. It is called twice, once with
+codeptr == NULL, to find out the length of the processed name, and again to put
+the name into memory.
+
+The name ends at the first closing parenthesis that is neither produced by an
+escape nor inside a \Q...\E sequence. When PCRE2_EXTENDED is set in options,
+unescaped whitespace and #-comments are skipped, and a closing parenthesis that
+follows such skipped text still terminates the name.
+
+Arguments:
+  ptrptr        pointer to the input pointer; updated on return, and also set
+                  to the current position when an escape is in error
+  codeptr       pointer to the compiled code pointer, or NULL when only the
+                  length is wanted; updated on return when not NULL
+  errorcodeptr  pointer to the error code
+  options       the compile options; PCRE2_EXTENDED is tested here, and the
+                  options are passed on to check_escape()
+  utf           TRUE if processing UTF
+  cb            compile data block
+
+Returns:        length of the processed name, or < 0 on error
+*/
+
+static int
+process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
+  uint32_t options, BOOL utf, compile_block *cb)
+{
+int arglen = 0;
+BOOL inescq = FALSE;
+PCRE2_SPTR ptr = *ptrptr;
+PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
+
+for (; ptr < cb->end_pattern; ptr++)
+  {
+  uint32_t x = *ptr;
+
+  /* Between \Q and \E every character is literal; only \E is recognized, and
+  it is consumed without contributing to the name. */
+
+  if (inescq)
+    {
+    if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
+      {
+      inescq = FALSE;
+      ptr++;
+      continue;
+      }
+    }
+
+  else  /* Not a literal character */
+    {
+    if (x == CHAR_RIGHT_PARENTHESIS) break;
+
+    /* Skip over comments and whitespace in extended mode. Need a loop to handle
+    whitespace after a comment. */
+
+    if ((options & PCRE2_EXTENDED) != 0)
+      {
+      for (;;)
+        {
+        while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
+        if (x != CHAR_NUMBER_SIGN) break;
+        ptr++;
+
+        /* The comment runs to the next newline or to the end of the pattern.
+        The scan is bounded by cb->end_pattern, like the outer loop, so that a
+        zero code unit inside the comment does not end it prematurely. */
+
+        while (ptr < cb->end_pattern)
+          {
+          if (IS_NEWLINE(ptr))       /* For non-fixed-length newline cases, */
+            {                        /* IS_NEWLINE sets cb->nllen. */
+            ptr += cb->nllen;
+            break;
+            }
+          ptr++;
+#ifdef SUPPORT_UNICODE
+          if (utf) FORWARDCHAR(ptr);
+#endif
+          }
+        x = *ptr;   /* Either the pattern end or the char after a newline */
+        }
+      if (ptr >= cb->end_pattern) break;
+
+      /* The terminator test above was made before any skipping. If whitespace
+      or a comment has been passed over, the current character may now be the
+      closing parenthesis, which must end the name rather than be added to
+      it. */
+
+      if (x == CHAR_RIGHT_PARENTHESIS) break;
+      }
+
+    /* Process escapes */
+
+    if (x == CHAR_BACKSLASH)
+      {
+      int rc;
+      *errorcodeptr = 0;
+      rc = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
+      *ptrptr = ptr;   /* For possible error */
+      if (*errorcodeptr != 0) return -1;
+      if (rc != 0)
+        {
+        if (rc == ESC_Q)
+          {
+          inescq = TRUE;
+          continue;
+          }
+        if (rc == ESC_E) continue;   /* An isolated \E is ignored */
+        *errorcodeptr = ERR40;       /* No other escape is allowed in a name */
+        return -1;
+        }
+      }
+    }
+
+  /* We have the next character in the name. */
+
+  /* NOTE(review): unless x came from check_escape() it holds a single code
+  unit taken from *ptr, which the UTF branch below then encodes as if it were
+  a complete character. Confirm that literal multi-unit UTF characters in a
+  name are handled as intended. */
+
+#ifdef SUPPORT_UNICODE
+  if (utf)
+    {
+    if (code == NULL)   /* Just want the length */
+      {
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      int i;
+      for (i = 0; i < PRIV(utf8_table1_size); i++)
+        if ((int)x <= PRIV(utf8_table1)[i]) break;
+      arglen += i;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+      if (x > 0xffff) arglen++;
+#endif
+      }
+    else
+      {
+      PCRE2_UCHAR cbuff[8];
+      x = PRIV(ord2utf)(x, cbuff);   /* x now holds the number of code units */
+      memcpy(code, cbuff, CU2BYTES(x));
+      code += x;
+      }
+    }
+  else
+#endif  /* SUPPORT_UNICODE */
+
+  /* Not UTF */
+    {
+    if (code != NULL) *code++ = (PCRE2_UCHAR)x;
+    }
+
+  arglen++;
+  }
+
+/* Update the pointers before returning. */
+
+*ptrptr = ptr;
+if (codeptr != NULL) *codeptr = code;
+return arglen;
+}
+
+
+
+/*************************************************
* Scan regex to identify named groups *
*************************************************/
@@ -5399,33 +5541,9 @@
}
else
{
- arglen = 0;
- while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
- {
- if (*ptr == '\\')
- {
- uint32_t x;
- *errorcodeptr = 0;
- i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
- if (*errorcodeptr != 0) goto FAILED;
- if (i != 0)
- {
- *errorcodeptr = ERR40;
- goto FAILED;
- }
-#ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
- for (i = 0; i < PRIV(utf8_table1_size); i++)
- if ((int)x <= PRIV(utf8_table1)[i]) break;
- arglen += i;
-#elif PCRE2_CODE_UNIT_WIDTH == 16
- if (x > 0xffff) arglen++;
-#endif
-#endif
- }
- arglen++;
- ptr++;
- }
+ arglen = process_verb_name(&ptr, NULL, errorcodeptr, options,
+ utf, cb);
+ if (arglen < 0) goto FAILED;
}
if ((unsigned int)arglen > MAX_MARK)
@@ -5495,35 +5613,12 @@
}
setverb = *code++ = verbs[i].op_arg;
*code++ = arglen;
-
- /* If we are processing the argument for escapes, we don't need
- to apply checks here because it was all checked above when
- computing the length. */
-
if ((options & PCRE2_ALT_VERBNAMES) != 0)
{
- for (; arg != ptr; arg++)
- {
- if (*arg == '\\')
- {
- uint32_t x;
- *errorcodeptr = 0;
- (void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
- cb);
-#ifdef SUPPORT_UNICODE
- if (utf)
- {
- PCRE2_UCHAR cbuff[8];
- x = PRIV(ord2utf)(x, cbuff);
- memcpy(code, cbuff, CU2BYTES(x));
- code += x;
- }
- else
-#endif
- *code++ = x;
- }
- else *code++ = *arg;
- }
+ PCRE2_UCHAR *memcode = code; /* code is "register" */
+ (void)process_verb_name(&arg, &memcode, errorcodeptr, options,
+ utf, cb);
+ code = memcode;
}
else /* No argument processing */
{
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/testdata/testinput2 2015-09-01 17:32:42 UTC (rev 361)
@@ -4449,4 +4449,20 @@
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
cxxxz
+/(*:A\Qxx)x\EB)x/alt_verbnames,mark
+ x
+
+/(*:A\ExxxB)x/alt_verbnames,mark
+ x
+
+/(*: A \ and #comment
+ \ B)x/x,alt_verbnames,mark
+ x
+
+/(*:A
+B)x/alt_verbnames,mark
+ x
+
+/(*:abc\Qpqr)/alt_verbnames
+
# End of testinput2
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2015-08-30 17:47:36 UTC (rev 360)
+++ code/trunk/testdata/testoutput2 2015-09-01 17:32:42 UTC (rev 361)
@@ -14724,4 +14724,29 @@
0: xxx
MK: ab\x09(d)c
+/(*:A\Qxx)x\EB)x/alt_verbnames,mark
+ x
+ 0: x
+MK: Axx)xB
+
+/(*:A\ExxxB)x/alt_verbnames,mark
+ x
+ 0: x
+MK: AxxxB
+
+/(*: A \ and #comment
+ \ B)x/x,alt_verbnames,mark
+ x
+ 0: x
+MK: A and B
+
+/(*:A
+B)x/alt_verbnames,mark
+ x
+ 0: x
+MK: A\x0aB
+
+/(*:abc\Qpqr)/alt_verbnames
+Failed: error 160 at offset 12: (*VERB) not recognized or malformed
+
# End of testinput2