Revision: 782
http://www.exim.org/viewvc/pcre2?view=rev&revision=782
Author: ph10
Date: 2017-05-14 13:04:13 +0100 (Sun, 14 May 2017)
Log Message:
-----------
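Experimental conversion: fix handling of POSIX BRE patterns that start with ^* (the asterisk following the initial anchor is now treated as a literal).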
Modified Paths:
--------------
code/trunk/src/pcre2_convert.c
code/trunk/testdata/testinput24
code/trunk/testdata/testoutput24
Modified: code/trunk/src/pcre2_convert.c
===================================================================
--- code/trunk/src/pcre2_convert.c 2017-05-13 17:46:27 UTC (rev 781)
+++ code/trunk/src/pcre2_convert.c 2017-05-14 12:04:13 UTC (rev 782)
@@ -68,10 +68,11 @@
#define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS
#define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS
-/* States for range and POSIX class processing */
+/* States for range and POSIX processing */
enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED };
-enum { POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };
+enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET,
+ POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };
/* Macro to add a character string to the output buffer, checking for overflow. */
@@ -119,10 +120,9 @@
PCRE2_SIZE convlength = 0;
uint32_t bracount = 0;
-uint32_t posix_class_state = POSIX_CLASS_NOT_STARTED;
+uint32_t posix_state = POSIX_START_REGEX;
uint32_t lastspecial = 0;
BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;
-BOOL inclass = FALSE;
BOOL nextisliteral = FALSE;
(void)utf; /* Not used when Unicode not supported */
@@ -132,14 +132,8 @@
*bufflenptr = plength;
-/* Now scan the input. In non-extended patterns, an initial asterisk is treated
-as literal. Still figuring out what happens in extended patterns... */
+/* Now scan the input. */
-if (plength > 0 && *posix == CHAR_ASTERISK)
- {
- if (!extended) nextisliteral = TRUE;
- }
-
while (plength > 0)
{
uint32_t c, sc;
@@ -168,12 +162,12 @@
/* Handle a character within a class. */
- if (inclass)
+ if (posix_state >= POSIX_CLASS_NOT_STARTED)
{
if (c == CHAR_RIGHT_SQUARE_BRACKET)
{
PUTCHARS(STR_RIGHT_SQUARE_BRACKET);
- inclass = FALSE;
+ posix_state = POSIX_NOT_BRACKET;
}
/* Not the end of the class */
@@ -180,11 +174,11 @@
else
{
- switch (posix_class_state)
+ switch (posix_state)
{
case POSIX_CLASS_STARTED:
if (c <= 127 && islower(c)) break; /* Remain in started state */
- posix_class_state = POSIX_CLASS_NOT_STARTED;
+ posix_state = POSIX_CLASS_NOT_STARTED;
if (c == CHAR_COLON && plength > 0 &&
*posix == CHAR_RIGHT_SQUARE_BRACKET)
{
@@ -197,11 +191,11 @@
case POSIX_CLASS_NOT_STARTED:
if (c == CHAR_LEFT_SQUARE_BRACKET)
- posix_class_state = POSIX_CLASS_STARTING;
+ posix_state = POSIX_CLASS_STARTING;
break;
case POSIX_CLASS_STARTING:
- if (c == CHAR_COLON) posix_class_state = POSIX_CLASS_STARTED;
+ if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;
break;
}
@@ -242,8 +236,7 @@
/* Handle "normal" character classes */
- posix_class_state = POSIX_CLASS_NOT_STARTED;
- inclass = TRUE;
+ posix_state = POSIX_CLASS_NOT_STARTED;
/* Handle ^ and ] as first characters */
@@ -295,6 +288,7 @@
case CHAR_DOT:
case CHAR_DOLLAR_SIGN:
+ posix_state = POSIX_NOT_BRACKET;
COPY_SPECIAL:
lastspecial = c;
if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
@@ -302,15 +296,22 @@
break;
case CHAR_ASTERISK:
- if (lastspecial != CHAR_ASTERISK) goto COPY_SPECIAL;
+ if (lastspecial != CHAR_ASTERISK)
+ {
+ if (!extended && posix_state < POSIX_NOT_BRACKET)
+ goto ESCAPE_LITERAL;
+ goto COPY_SPECIAL;
+ }
break; /* Ignore second and subsequent asterisks */
case CHAR_CIRCUMFLEX_ACCENT:
- if (extended ||
- lastspecial == 0 ||
- lastspecial == CHAR_LEFT_PARENTHESIS ||
- lastspecial == CHAR_VERTICAL_LINE)
+ if (extended) goto COPY_SPECIAL;
+ if (posix_state == POSIX_START_REGEX ||
+ lastspecial == CHAR_LEFT_PARENTHESIS)
+ {
+ posix_state = POSIX_ANCHORED;
goto COPY_SPECIAL;
+ }
/* Fall through */
default:
@@ -323,11 +324,13 @@
if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
memcpy(p, posix - clength, CU2BYTES(clength));
p += clength;
+ posix_state = POSIX_NOT_BRACKET;
break;
}
}
-if (inclass) return ERROR_MISSING_SQUARE_BRACKET;
+if (posix_state >= POSIX_CLASS_NOT_STARTED)
+ return ERROR_MISSING_SQUARE_BRACKET;
convlength += p - pp; /* Final segment */
*bufflenptr = convlength;
*p++ = 0;
Modified: code/trunk/testdata/testinput24
===================================================================
--- code/trunk/testdata/testinput24 2017-05-13 17:46:27 UTC (rev 781)
+++ code/trunk/testdata/testinput24 2017-05-14 12:04:13 UTC (rev 782)
@@ -266,6 +266,8 @@
/^how to \^how to/
+/^*abc/
+
/*abc/
X*abcY
@@ -272,7 +274,7 @@
/**abc/
XabcY
X*abcY
- X**abcY
+ X**abcY
/^b\(c^d\)\(^e^f\)/
Modified: code/trunk/testdata/testoutput24
===================================================================
--- code/trunk/testdata/testoutput24 2017-05-13 17:46:27 UTC (rev 781)
+++ code/trunk/testdata/testoutput24 2017-05-14 12:04:13 UTC (rev 782)
@@ -426,6 +426,9 @@
/^how to \^how to/
^how to \^how to
+/^*abc/
+^\*abc
+
/*abc/
\*abc
X*abcY
@@ -437,7 +440,7 @@
0: abc
X*abcY
0: *abc
- X**abcY
+ X**abcY
0: **abc
/^b\(c^d\)\(^e^f\)/