Revision: 802
http://www.exim.org/viewvc/pcre2?view=rev&revision=802
Author: zherczeg
Date: 2017-05-24 11:14:43 +0100 (Wed, 24 May 2017)
Log Message:
-----------
Rework character range parsing in glob conversion.
Modified Paths:
--------------
code/trunk/src/pcre2_convert.c
code/trunk/testdata/testinput24
code/trunk/testdata/testoutput24
Modified: code/trunk/src/pcre2_convert.c
===================================================================
--- code/trunk/src/pcre2_convert.c 2017-05-24 09:10:29 UTC (rev 801)
+++ code/trunk/src/pcre2_convert.c 2017-05-24 10:14:43 UTC (rev 802)
@@ -423,6 +423,47 @@
}
+/* Prints the separator into the output.
+
+Arguments:
+ out output context
+ separator glob separator
+ with_escape backslash is needed before separator
+*/
+
+static void
+convert_glob_print_separator(pcre2_output_context *out,
+ PCRE2_UCHAR separator, BOOL with_escape)
+{
+if (with_escape)
+ convert_glob_write(out, CHAR_BACKSLASH);
+
+convert_glob_write(out, separator);
+}
+
+
+/* Prints a wildcard into the output.
+
+Arguments:
+ out output context
+ separator glob separator
+ with_escape backslash is needed before separator
+*/
+
+static void
+convert_glob_print_wildcard(pcre2_output_context *out,
+ PCRE2_UCHAR separator, BOOL with_escape)
+{
+out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
+convert_glob_write_str(out, 2);
+
+convert_glob_print_separator(out, separator, with_escape);
+
+convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
+}
+
+
/* Parse a posix class.
Arguments:
@@ -519,12 +560,19 @@
static int
convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
- pcre2_output_context *out, PCRE2_UCHAR separator, BOOL with_escape)
+ pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
+ BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
{
+BOOL is_negative = FALSE;
+BOOL separator_seen = FALSE;
+BOOL has_prev_c;
PCRE2_SPTR pattern = *from;
-PCRE2_UCHAR c;
+PCRE2_SPTR char_start = NULL;
+uint32_t c, prev_c;
int result, len;
+(void)utf; /* Avoid compiler warning. */
+
if (pattern >= pattern_end)
{
*from = pattern;
@@ -531,65 +579,70 @@
return ERROR_MISSING_SQUARE_BRACKET;
}
-c = *pattern;
-
-if (c == CHAR_EXCLAMATION_MARK
- || c == CHAR_CIRCUMFLEX_ACCENT)
+if (*pattern == CHAR_EXCLAMATION_MARK
+ || *pattern == CHAR_CIRCUMFLEX_ACCENT)
{
- out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
- out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
- len = 2;
- }
-else
- {
- out->out_str[0] = CHAR_LEFT_PARENTHESIS;
- out->out_str[1] = CHAR_QUESTION_MARK;
- out->out_str[2] = CHAR_EXCLAMATION_MARK;
- len = 3;
- }
+ pattern++;
-if (with_escape)
- {
- out->out_str[len] = CHAR_BACKSLASH;
- len++;
- }
-
-out->out_str[len] = (uint8_t) separator;
-
-convert_glob_write_str(out, len + 1);
-
-if (c == CHAR_EXCLAMATION_MARK
- || c == CHAR_CIRCUMFLEX_ACCENT)
- {
- pattern++;
if (pattern >= pattern_end)
{
*from = pattern;
return ERROR_MISSING_SQUARE_BRACKET;
}
- c = *pattern;
+
+ is_negative = TRUE;
+
+ out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+ out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
+ len = 2;
+
+ if (!no_wildsep)
+ {
+ if (with_escape)
+ {
+ out->out_str[len] = CHAR_BACKSLASH;
+ len++;
+ }
+ out->out_str[len] = (uint8_t) separator;
+ }
+
+ convert_glob_write_str(out, len + 1);
}
else
- {
- out->out_str[0] = CHAR_RIGHT_PARENTHESIS;
- out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET;
- convert_glob_write_str(out, 2);
- }
+ convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
-if (c == CHAR_MINUS || c == CHAR_RIGHT_SQUARE_BRACKET)
+has_prev_c = FALSE;
+prev_c = 0;
+
+if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
{
- convert_glob_write(out, CHAR_BACKSLASH);
- convert_glob_write(out, c);
+ convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
+ has_prev_c = TRUE;
+ prev_c = CHAR_RIGHT_SQUARE_BRACKET;
pattern++;
}
while (pattern < pattern_end)
{
- c = *pattern++;
+ char_start = pattern;
+ GETCHARINCTEST(c, pattern);
if (c == CHAR_RIGHT_SQUARE_BRACKET)
{
convert_glob_write(out, c);
+
+ if (!is_negative && !no_wildsep && separator_seen)
+ {
+ out->out_str[0] = CHAR_LEFT_PARENTHESIS;
+ out->out_str[1] = CHAR_QUESTION_MARK;
+ out->out_str[2] = CHAR_LESS_THAN_SIGN;
+ out->out_str[3] = CHAR_EXCLAMATION_MARK;
+ convert_glob_write_str(out, 4);
+
+ convert_glob_print_separator(out, separator, with_escape);
+ convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
+ }
+
*from = pattern;
return 0;
}
@@ -605,32 +658,64 @@
pattern = *from;
- /* A dash after a character class is a normal character. */
- if (pattern >= pattern_end || *pattern != CHAR_MINUS)
- continue;
-
- c = CHAR_MINUS;
- pattern++;
+ has_prev_c = FALSE;
+ prev_c = 0;
+ separator_seen = TRUE;
+ continue;
}
- else if (c == CHAR_MINUS)
+ else if (c == CHAR_MINUS && has_prev_c &&
+ *pattern != CHAR_RIGHT_SQUARE_BRACKET)
{
convert_glob_write(out, CHAR_MINUS);
- c = *pattern++;
- if (c == CHAR_BACKSLASH)
+ char_start = pattern;
+ GETCHARINCTEST(c, pattern);
+
+ if (pattern >= pattern_end) break;
+
+ if (escape != 0 && c == escape)
{
+ char_start = pattern;
+ GETCHARINCTEST(c, pattern);
+ }
+ else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
+ {
+ *from = pattern;
+ return PCRE2_ERROR_CONVERT_SYNTAX;
+ }
+
+ if (prev_c > c)
+ {
+ *from = pattern;
+ return PCRE2_ERROR_CONVERT_SYNTAX;
+ }
+
+ if (prev_c < separator && separator < c) separator_seen = TRUE;
+
+ has_prev_c = FALSE;
+ prev_c = 0;
+ }
+ else
+ {
+ if (escape != 0 && c == escape)
+ {
+ char_start = pattern;
+ GETCHARINCTEST(c, pattern);
+
if (pattern >= pattern_end) break;
- c = *pattern++;
}
+
+ has_prev_c = TRUE;
+ prev_c = c;
}
- else if (c == CHAR_BACKSLASH)
- c = *pattern++;
if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
c == CHAR_BACKSLASH || c == CHAR_MINUS)
convert_glob_write(out, CHAR_BACKSLASH);
- convert_glob_write(out, c);
+ if (c == separator) separator_seen = TRUE;
+
+ do convert_glob_write(out, *char_start++); while (char_start < pattern);
}
*from = pattern;
@@ -638,47 +723,6 @@
}
-/* Prints the separator into the output.
-
-Arguments:
- out output context
- separator glob separator
- with_escape backslash is needed before separator
-*/
-
-static void
-convert_glob_print_separator(pcre2_output_context *out,
- PCRE2_UCHAR separator, BOOL with_escape)
-{
-if (with_escape)
- convert_glob_write(out, CHAR_BACKSLASH);
-
-convert_glob_write(out, separator);
-}
-
-
-/* Prints a wildcard into the output.
-
-Arguments:
- out output context
- separator glob separator
- with_escape backslash is needed before separator
-*/
-
-static void
-convert_glob_print_wildcard(pcre2_output_context *out,
- PCRE2_UCHAR separator, BOOL with_escape)
-{
-out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
-out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
-convert_glob_write_str(out, 2);
-
-convert_glob_print_separator(out, separator, with_escape);
-
-convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
-}
-
-
/* Prints a (*COMMIT) into the output.
Arguments:
@@ -727,8 +771,8 @@
PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength;
PCRE2_UCHAR separator = ccontext->glob_separator;
+PCRE2_UCHAR escape = ccontext->glob_escape;
PCRE2_UCHAR c;
-BOOL no_escape = ccontext->glob_escape == 0;
BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
BOOL in_atomic = FALSE;
@@ -736,14 +780,16 @@
BOOL with_escape, is_start;
int result, len;
-(void)utf; /* Avoid compiler warning */
+(void)utf; /* Avoid compiler warning. */
-if (separator >= 128)
+#ifdef SUPPORT_UNICODE
+if (utf && (separator >= 128 || escape >= 128))
{
- /* Currently only ASCII separators are supported. */
+ /* Currently only ASCII characters are supported. */
*bufflenptr = 0;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
+#endif
with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
@@ -809,7 +855,7 @@
break;
}
- if (!no_escape && *pattern == ccontext->glob_escape)
+ if (escape != 0 && *pattern == escape)
{
pattern++;
if (pattern >= pattern_end)
@@ -908,6 +954,8 @@
out.out_str[0] = CHAR_ASTERISK;
out.out_str[1] = CHAR_QUESTION_MARK;
+ if (pattern >= pattern_end)
+ out.out_str[1] = CHAR_PLUS;
convert_glob_write_str(&out, 2);
continue;
}
@@ -924,12 +972,12 @@
if (c == CHAR_LEFT_SQUARE_BRACKET)
{
result = convert_glob_parse_range(&pattern, pattern_end,
- &out, separator, with_escape);
+ &out, utf, separator, with_escape, escape, no_wildsep);
if (result != 0) break;
continue;
}
- if (!no_escape && c == ccontext->glob_escape)
+ if (escape != 0 && c == escape)
{
if (pattern >= pattern_end)
{
Modified: code/trunk/testdata/testinput24
===================================================================
--- code/trunk/testdata/testinput24 2017-05-24 09:10:29 UTC (rev 801)
+++ code/trunk/testdata/testinput24 2017-05-24 10:14:43 UTC (rev 802)
@@ -227,6 +227,18 @@
/[[:alpha:][:xdigit:][:word:]]/
+"[/-/]"
+
+/[-----]/
+
+/[------]/
+
+/[!------]/
+
+/[[:alpha:]-a]/
+
+/[a-[:alpha:]]/
+
/[[:alpha:/
/[[:alpha:]/
Modified: code/trunk/testdata/testoutput24
===================================================================
--- code/trunk/testdata/testoutput24 2017-05-24 09:10:29 UTC (rev 801)
+++ code/trunk/testdata/testoutput24 2017-05-24 10:14:43 UTC (rev 802)
@@ -22,10 +22,10 @@
# Can't have separator in a class
"[ab/cd]"
-(?s)\A(?!/)[ab/cd]\z
+(?s)\A[ab/cd](?<!/)\z
"[,-/]"
-(?s)\A(?!/)[,-/]\z
+(?s)\A[,-/](?<!/)\z
/[ab/
** Pattern conversion error at offset 3: missing terminating ] for character class
@@ -41,7 +41,7 @@
# Now some actual tests
/a?b[]xy]*c/
-(?s)\Aa[^/]b(?!/)[\]xy](*COMMIT)[^/]*?c\z
+(?s)\Aa[^/]b[]xy](*COMMIT)[^/]*?c\z
azb]1234c
0: azb]1234c
@@ -70,7 +70,7 @@
No match
/*/
-(?s)\A[^/]*?\z
+(?s)\A[^/]*+\z
foo
0: foo
\
@@ -77,7 +77,7 @@
0:
/f*/
-(?s)\Af(*COMMIT)[^/]*?\z
+(?s)\Af(*COMMIT)[^/]*+\z
foo
0: foo
f
@@ -92,7 +92,7 @@
No match
/*foo*/
-(?s)\A[^/]*?foo(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?foo(*COMMIT)[^/]*+\z
foo
0: foo
food
@@ -101,7 +101,7 @@
0: aprilfool
/*ob*a*r*/
-(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*+\z
foobar
0: foobar
@@ -127,38 +127,41 @@
0: f\oo
/*[al]?/
-(?s)\A[^/]*?(?!/)[al][^/]\z
+(?s)\A[^/]*?[al][^/]\z
ball
0: ball
/[ten]/
-(?s)\A(?!/)[ten]\z
+(?s)\A[ten]\z
\= Expect no match
ten
No match
/t[a-g]n/
-(?s)\At(?!/)[a-g]n\z
+(?s)\At[a-g]n\z
ten
0: ten
/a[]]b/
-(?s)\Aa(?!/)[\]]b\z
+(?s)\Aa[]]b\z
a]b
0: a]b
/a[]a-]b/
-** Pattern conversion error at offset 7: missing terminating ] for character class
+(?s)\Aa[]a\-]b\z
/a[]-]b/
-** Pattern conversion error at offset 6: missing terminating ] for character class
+(?s)\Aa[]\-]b\z
a-b
+ 0: a-b
a]b
+ 0: a]b
\= Expect no match
aab
+No match
/a[]a-z]b/
-(?s)\Aa(?!/)[\]a-z]b\z
+(?s)\Aa[]a-z]b\z
aab
0: aab
@@ -176,12 +179,12 @@
No match
'[[:alpha:]][[:digit:]][[:upper:]]'
-(?s)\A(?!/)[[:alpha:]](?!/)[[:digit:]](?!/)[[:upper:]]\z
+(?s)\A[[:alpha:]](?<!/)[[:digit:]](?<!/)[[:upper:]](?<!/)\z
a1B
0: a1B
'[[:digit:][:upper:][:space:]]'
-(?s)\A(?!/)[[:digit:][:upper:][:space:]]\z
+(?s)\A[[:digit:][:upper:][:space:]](?<!/)\z
A
0: A
1
@@ -195,7 +198,7 @@
No match
'[a-c[:digit:]x-z]'
-(?s)\A(?!/)[a-c[:digit:]x-z]\z
+(?s)\A[a-c[:digit:]x-z](?<!/)\z
5
0: 5
b
@@ -221,7 +224,7 @@
No match
/A[+-0]B/
-(?s)\AA(?!/)[+-0]B\z
+(?s)\AA[+-0](?<!/)B\z
A+B
0: A+B
A.B
@@ -249,7 +252,7 @@
0: .xyz
"[,-0]x?z"
-(?s)\A(?!/)[,-0]x[^/]z\z
+(?s)\A[,-0](?<!/)x[^/]z\z
,xyz
0: ,xyz
\= Expect no match
@@ -259,12 +262,12 @@
0: .xyz
".x*"
-(?s)\A\.x(*COMMIT)[^/]*?\z
+(?s)\A\.x(*COMMIT)[^/]*+\z
.xabc
0: .xabc
/a[--0]z/
-(?s)\Aa(?!/)[\--0]z\z
+(?s)\Aa[\--0](?<!/)z\z
a-z
0: a-z
a.z
@@ -278,7 +281,7 @@
No match
/<[a-c-d]>/
-(?s)\A<(?!/)[a-c-d]>\z
+(?s)\A<[a-c\-d]>\z
<a>
0: <a>
<b>
@@ -291,7 +294,7 @@
0: <->
/a[[:digit:].]z/
-(?s)\Aa(?!/)[[:digit:].]z\z
+(?s)\Aa[[:digit:].](?<!/)z\z
a1z
0: a1z
a.z
@@ -334,20 +337,38 @@
(?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z
/*a*\/*b*/
-(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*+\z
/?a?\/?b?/
(?s)\A[^/]a[^/]/[^/]b[^/]\z
/[a\\b\c][]][-][\]\-]/
-(?s)\A(?!/)[a\\bc](?!/)[\]](?!/)[\-](?!/)[\]\-]\z
+(?s)\A[a\\bc][]][\-][\]\-]\z
/[^a\\b\c][!]][!-][^\]\-]/
-(?s)\A[^/a\\bc][^/\]][^/\-][^/\]\-]\z
+(?s)\A[^/a\\bc][^/]][^/\-][^/\]\-]\z
/[[:alpha:][:xdigit:][:word:]]/
-(?s)\A(?!/)[[:alpha:][:xdigit:][:word:]]\z
+(?s)\A[[:alpha:][:xdigit:][:word:]](?<!/)\z
+"[/-/]"
+(?s)\A[/-/](?<!/)\z
+
+/[-----]/
+(?s)\A[\--\-\-\-]\z
+
+/[------]/
+(?s)\A[\--\-\--\-]\z
+
+/[!------]/
+(?s)\A[^/\--\-\--\-]\z
+
+/[[:alpha:]-a]/
+(?s)\A[[:alpha:]\-a](?<!/)\z
+
+/[a-[:alpha:]]/
+** Pattern conversion error at offset 4: invalid syntax
+
/[[:alpha:/
** Pattern conversion error at offset 9: missing terminating ] for character class
@@ -386,7 +407,7 @@
0: /xax/
/**\/*a*/
-(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?\z)
+(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*+\z)
xx/xx/xx/xax
0: /xax
xx/xx/xx/xax/xx
@@ -393,7 +414,7 @@
No match
/**\/*a*\/**\/*b*/
-(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*?\z)
+(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*+\z)
xx/xx/xx/xax/xx/xb
0: /xax/xx/xb
xx/xx/xx/xax/xx/x
@@ -402,10 +423,10 @@
#pattern convert=glob:glob_no_starstar
/***/
-(?s)\A[^/]*?\z
+(?s)\A[^/]*+\z
/**a**/
-(?s)\A[^/]*?a(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?a(*COMMIT)[^/]*+\z
#pattern convert=unset
#pattern convert=glob:glob_no_wild_separator