[Pcre-svn] [802] code/trunk: Rework character range parsing in glob conversion.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [802] code/trunk: Rework character range parsing in glob conversion.
Revision: 802
          http://www.exim.org/viewvc/pcre2?view=rev&revision=802
Author:   zherczeg
Date:     2017-05-24 11:14:43 +0100 (Wed, 24 May 2017)
Log Message:
-----------
Rework character range parsing in glob conversion.


Modified Paths:
--------------
    code/trunk/src/pcre2_convert.c
    code/trunk/testdata/testinput24
    code/trunk/testdata/testoutput24


Modified: code/trunk/src/pcre2_convert.c
===================================================================
--- code/trunk/src/pcre2_convert.c    2017-05-24 09:10:29 UTC (rev 801)
+++ code/trunk/src/pcre2_convert.c    2017-05-24 10:14:43 UTC (rev 802)
@@ -423,6 +423,47 @@
 }



+/* Prints the separator into the output.
+
+Arguments:
+  out            output context
+  separator      glob separator
+  with_escape    backslash is needed before separator
+*/
+
+static void
+convert_glob_print_separator(pcre2_output_context *out,
+  PCRE2_UCHAR separator, BOOL with_escape)
+{
+if (with_escape)
+  convert_glob_write(out, CHAR_BACKSLASH);
+
+convert_glob_write(out, separator);
+}
+
+
+/* Prints a wildcard into the output.
+
+Arguments:
+  out            output context
+  separator      glob separator
+  with_escape    backslash is needed before separator
+*/
+
+static void
+convert_glob_print_wildcard(pcre2_output_context *out,
+  PCRE2_UCHAR separator, BOOL with_escape)
+{
+out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
+convert_glob_write_str(out, 2);
+
+convert_glob_print_separator(out, separator, with_escape);
+
+convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
+}
+
+
 /* Parse a posix class.


Arguments:
@@ -519,12 +560,19 @@

static int
convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
- pcre2_output_context *out, PCRE2_UCHAR separator, BOOL with_escape)
+ pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
+ BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
{
+BOOL is_negative = FALSE;
+BOOL separator_seen = FALSE;
+BOOL has_prev_c;
PCRE2_SPTR pattern = *from;
-PCRE2_UCHAR c;
+PCRE2_SPTR char_start = NULL;
+uint32_t c, prev_c;
int result, len;

+(void)utf; /* Avoid compiler warning. */
+
if (pattern >= pattern_end)
{
*from = pattern;
@@ -531,65 +579,70 @@
return ERROR_MISSING_SQUARE_BRACKET;
}

-c = *pattern;
-
-if (c == CHAR_EXCLAMATION_MARK
-    || c == CHAR_CIRCUMFLEX_ACCENT)
+if (*pattern == CHAR_EXCLAMATION_MARK
+    || *pattern == CHAR_CIRCUMFLEX_ACCENT)
   {
-  out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
-  out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
-  len = 2;
-  }
-else
-  {
-  out->out_str[0] = CHAR_LEFT_PARENTHESIS;
-  out->out_str[1] = CHAR_QUESTION_MARK;
-  out->out_str[2] = CHAR_EXCLAMATION_MARK;
-  len = 3;
-  }
+  pattern++;


-if (with_escape)
-  {
-  out->out_str[len] = CHAR_BACKSLASH;
-  len++;
-  }
-
-out->out_str[len] = (uint8_t) separator;
-
-convert_glob_write_str(out, len + 1);
-
-if (c == CHAR_EXCLAMATION_MARK
-    || c == CHAR_CIRCUMFLEX_ACCENT)
-  {
-  pattern++;
   if (pattern >= pattern_end)
     {
     *from = pattern;
     return ERROR_MISSING_SQUARE_BRACKET;
     }
-  c = *pattern;
+
+  is_negative = TRUE;
+
+  out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+  out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
+  len = 2;
+
+  if (!no_wildsep)
+    {
+    if (with_escape)
+      {
+      out->out_str[len] = CHAR_BACKSLASH;
+      len++;
+      }
+    out->out_str[len] = (uint8_t) separator;
+    }
+
+  convert_glob_write_str(out, len + 1);
   }
 else
-  {
-  out->out_str[0] = CHAR_RIGHT_PARENTHESIS;
-  out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET;
-  convert_glob_write_str(out, 2);
-  }
+  convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);


-if (c == CHAR_MINUS || c == CHAR_RIGHT_SQUARE_BRACKET)
+has_prev_c = FALSE;
+prev_c = 0;
+
+if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
{
- convert_glob_write(out, CHAR_BACKSLASH);
- convert_glob_write(out, c);
+ convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
+ has_prev_c = TRUE;
+ prev_c = CHAR_RIGHT_SQUARE_BRACKET;
pattern++;
}

while (pattern < pattern_end)
{
- c = *pattern++;
+ char_start = pattern;
+ GETCHARINCTEST(c, pattern);

   if (c == CHAR_RIGHT_SQUARE_BRACKET)
     {
     convert_glob_write(out, c);
+
+    if (!is_negative && !no_wildsep && separator_seen)
+      {
+      out->out_str[0] = CHAR_LEFT_PARENTHESIS;
+      out->out_str[1] = CHAR_QUESTION_MARK;
+      out->out_str[2] = CHAR_LESS_THAN_SIGN;
+      out->out_str[3] = CHAR_EXCLAMATION_MARK;
+      convert_glob_write_str(out, 4);
+
+      convert_glob_print_separator(out, separator, with_escape);
+      convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
+      }
+
     *from = pattern;
     return 0;
     }
@@ -605,32 +658,64 @@


     pattern = *from;


-    /* A dash after a character class is a normal character. */
-    if (pattern >= pattern_end || *pattern != CHAR_MINUS)
-      continue;
-
-    c = CHAR_MINUS;
-    pattern++;
+    has_prev_c = FALSE;
+    prev_c = 0;
+    separator_seen = TRUE;
+    continue;
     }
-  else if (c == CHAR_MINUS)
+  else if (c == CHAR_MINUS && has_prev_c &&
+           *pattern != CHAR_RIGHT_SQUARE_BRACKET)
     {
     convert_glob_write(out, CHAR_MINUS);
-    c = *pattern++;


-    if (c == CHAR_BACKSLASH)
+    char_start = pattern;
+    GETCHARINCTEST(c, pattern);
+
+    if (pattern >= pattern_end) break;
+
+    if (escape != 0 && c == escape)
       {
+      char_start = pattern;
+      GETCHARINCTEST(c, pattern);
+      }
+    else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
+      {
+      *from = pattern;
+      return PCRE2_ERROR_CONVERT_SYNTAX;
+      }
+
+    if (prev_c > c)
+      {
+      *from = pattern;
+      return PCRE2_ERROR_CONVERT_SYNTAX;
+      }
+
+    if (prev_c < separator && separator < c) separator_seen = TRUE;
+
+    has_prev_c = FALSE;
+    prev_c = 0;
+    }
+  else
+    {
+    if (escape != 0 && c == escape)
+      {
+      char_start = pattern;
+      GETCHARINCTEST(c, pattern);
+
       if (pattern >= pattern_end) break;
-        c = *pattern++;
       }
+
+    has_prev_c = TRUE;
+    prev_c = c;
     }
-  else if (c == CHAR_BACKSLASH)
-    c = *pattern++;


   if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
       c == CHAR_BACKSLASH || c == CHAR_MINUS)
     convert_glob_write(out, CHAR_BACKSLASH);


- convert_glob_write(out, c);
+ if (c == separator) separator_seen = TRUE;
+
+ do convert_glob_write(out, *char_start++); while (char_start < pattern);
}

*from = pattern;
@@ -638,47 +723,6 @@
}


-/* Prints the separator into the output.
-
-Arguments:
-  out            output context
-  separator      glob separator
-  with_escape    backslash is needed before separator
-*/
-
-static void
-convert_glob_print_separator(pcre2_output_context *out,
-  PCRE2_UCHAR separator, BOOL with_escape)
-{
-if (with_escape)
-  convert_glob_write(out, CHAR_BACKSLASH);
-
-convert_glob_write(out, separator);
-}
-
-
-/* Prints a wildcard into the output.
-
-Arguments:
-  out            output context
-  separator      glob separator
-  with_escape    backslash is needed before separator
-*/
-
-static void
-convert_glob_print_wildcard(pcre2_output_context *out,
-  PCRE2_UCHAR separator, BOOL with_escape)
-{
-out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
-out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
-convert_glob_write_str(out, 2);
-
-convert_glob_print_separator(out, separator, with_escape);
-
-convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
-}
-
-
 /* Prints a (*COMMIT) into the output.


Arguments:
@@ -727,8 +771,8 @@
PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength;
PCRE2_UCHAR separator = ccontext->glob_separator;
+PCRE2_UCHAR escape = ccontext->glob_escape;
PCRE2_UCHAR c;
-BOOL no_escape = ccontext->glob_escape == 0;
BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
BOOL in_atomic = FALSE;
@@ -736,14 +780,16 @@
BOOL with_escape, is_start;
int result, len;

-(void)utf; /* Avoid compiler warning */
+(void)utf; /* Avoid compiler warning. */

-if (separator >= 128)
+#ifdef SUPPORT_UNICODE
+if (utf && (separator >= 128 || escape >= 128))
{
- /* Currently only ASCII separators are supported. */
+ /* Currently only ASCII characters are supported. */
*bufflenptr = 0;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
+#endif

with_escape = strchr(pcre2_escaped_literals, separator) != NULL;

@@ -809,7 +855,7 @@
         break;
         }


-      if (!no_escape && *pattern == ccontext->glob_escape)
+      if (escape != 0 && *pattern == escape)
         {
         pattern++;
         if (pattern >= pattern_end)
@@ -908,6 +954,8 @@


     out.out_str[0] = CHAR_ASTERISK;
     out.out_str[1] = CHAR_QUESTION_MARK;
+    if (pattern >= pattern_end)
+      out.out_str[1] = CHAR_PLUS;
     convert_glob_write_str(&out, 2);
     continue;
     }
@@ -924,12 +972,12 @@
   if (c == CHAR_LEFT_SQUARE_BRACKET)
     {
     result = convert_glob_parse_range(&pattern, pattern_end,
-      &out, separator, with_escape);
+      &out, utf, separator, with_escape, escape, no_wildsep);
     if (result != 0) break;
     continue;
     }


-  if (!no_escape && c == ccontext->glob_escape)
+  if (escape != 0 && c == escape)
     {
     if (pattern >= pattern_end)
       {


Modified: code/trunk/testdata/testinput24
===================================================================
--- code/trunk/testdata/testinput24    2017-05-24 09:10:29 UTC (rev 801)
+++ code/trunk/testdata/testinput24    2017-05-24 10:14:43 UTC (rev 802)
@@ -227,6 +227,18 @@


/[[:alpha:][:xdigit:][:word:]]/

+"[/-/]"
+
+/[-----]/
+
+/[------]/
+
+/[!------]/
+
+/[[:alpha:]-a]/
+
+/[a-[:alpha:]]/
+
/[[:alpha:/

/[[:alpha:]/

Modified: code/trunk/testdata/testoutput24
===================================================================
--- code/trunk/testdata/testoutput24    2017-05-24 09:10:29 UTC (rev 801)
+++ code/trunk/testdata/testoutput24    2017-05-24 10:14:43 UTC (rev 802)
@@ -22,10 +22,10 @@
 # Can't have separator in a class


"[ab/cd]"
-(?s)\A(?!/)[ab/cd]\z
+(?s)\A[ab/cd](?<!/)\z

"[,-/]"
-(?s)\A(?!/)[,-/]\z
+(?s)\A[,-/](?<!/)\z

/[ab/
** Pattern conversion error at offset 3: missing terminating ] for character class
@@ -41,7 +41,7 @@
# Now some actual tests

 /a?b[]xy]*c/
-(?s)\Aa[^/]b(?!/)[\]xy](*COMMIT)[^/]*?c\z
+(?s)\Aa[^/]b[]xy](*COMMIT)[^/]*?c\z
     azb]1234c
  0: azb]1234c


@@ -70,7 +70,7 @@
No match

 /*/
-(?s)\A[^/]*?\z
+(?s)\A[^/]*+\z
     foo
  0: foo
     \
@@ -77,7 +77,7 @@
  0: 


 /f*/
-(?s)\Af(*COMMIT)[^/]*?\z
+(?s)\Af(*COMMIT)[^/]*+\z
     foo
  0: foo
     f
@@ -92,7 +92,7 @@
 No match


 /*foo*/
-(?s)\A[^/]*?foo(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?foo(*COMMIT)[^/]*+\z
     foo
  0: foo
     food
@@ -101,7 +101,7 @@
  0: aprilfool


 /*ob*a*r*/
-(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*+\z
     foobar
  0: foobar


@@ -127,38 +127,41 @@
0: f\oo

 /*[al]?/
-(?s)\A[^/]*?(?!/)[al][^/]\z
+(?s)\A[^/]*?[al][^/]\z
     ball
  0: ball


 /[ten]/
-(?s)\A(?!/)[ten]\z
+(?s)\A[ten]\z
 \= Expect no match
     ten
 No match


 /t[a-g]n/
-(?s)\At(?!/)[a-g]n\z
+(?s)\At[a-g]n\z
     ten
  0: ten


 /a[]]b/
-(?s)\Aa(?!/)[\]]b\z
+(?s)\Aa[]]b\z
     a]b
  0: a]b


/a[]a-]b/
-** Pattern conversion error at offset 7: missing terminating ] for character class
+(?s)\Aa[]a\-]b\z

 /a[]-]b/
-** Pattern conversion error at offset 6: missing terminating ] for character class
+(?s)\Aa[]\-]b\z
     a-b
+ 0: a-b
     a]b
+ 0: a]b
 \= Expect no match
     aab
+No match


 /a[]a-z]b/
-(?s)\Aa(?!/)[\]a-z]b\z
+(?s)\Aa[]a-z]b\z
     aab
  0: aab


@@ -176,12 +179,12 @@
No match

 '[[:alpha:]][[:digit:]][[:upper:]]'
-(?s)\A(?!/)[[:alpha:]](?!/)[[:digit:]](?!/)[[:upper:]]\z
+(?s)\A[[:alpha:]](?<!/)[[:digit:]](?<!/)[[:upper:]](?<!/)\z
     a1B
  0: a1B


 '[[:digit:][:upper:][:space:]]'
-(?s)\A(?!/)[[:digit:][:upper:][:space:]]\z
+(?s)\A[[:digit:][:upper:][:space:]](?<!/)\z
     A
  0: A
     1
@@ -195,7 +198,7 @@
 No match


 '[a-c[:digit:]x-z]'
-(?s)\A(?!/)[a-c[:digit:]x-z]\z
+(?s)\A[a-c[:digit:]x-z](?<!/)\z
     5
  0: 5
     b
@@ -221,7 +224,7 @@
 No match


 /A[+-0]B/
-(?s)\AA(?!/)[+-0]B\z
+(?s)\AA[+-0](?<!/)B\z
     A+B
  0: A+B
     A.B
@@ -249,7 +252,7 @@
  0: .xyz


 "[,-0]x?z"
-(?s)\A(?!/)[,-0]x[^/]z\z
+(?s)\A[,-0](?<!/)x[^/]z\z
     ,xyz
  0: ,xyz
 \= Expect no match
@@ -259,12 +262,12 @@
  0: .xyz


 ".x*"
-(?s)\A\.x(*COMMIT)[^/]*?\z
+(?s)\A\.x(*COMMIT)[^/]*+\z
     .xabc
  0: .xabc


 /a[--0]z/
-(?s)\Aa(?!/)[\--0]z\z
+(?s)\Aa[\--0](?<!/)z\z
     a-z
  0: a-z
     a.z
@@ -278,7 +281,7 @@
 No match


 /<[a-c-d]>/
-(?s)\A<(?!/)[a-c-d]>\z
+(?s)\A<[a-c\-d]>\z
     <a>
  0: <a>
     <b>
@@ -291,7 +294,7 @@
  0: <->


 /a[[:digit:].]z/
-(?s)\Aa(?!/)[[:digit:].]z\z
+(?s)\Aa[[:digit:].](?<!/)z\z
     a1z
  0: a1z
     a.z
@@ -334,20 +337,38 @@
 (?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z


/*a*\/*b*/
-(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*+\z

/?a?\/?b?/
(?s)\A[^/]a[^/]/[^/]b[^/]\z

/[a\\b\c][]][-][\]\-]/
-(?s)\A(?!/)[a\\bc](?!/)[\]](?!/)[\-](?!/)[\]\-]\z
+(?s)\A[a\\bc][]][\-][\]\-]\z

/[^a\\b\c][!]][!-][^\]\-]/
-(?s)\A[^/a\\bc][^/\]][^/\-][^/\]\-]\z
+(?s)\A[^/a\\bc][^/]][^/\-][^/\]\-]\z

/[[:alpha:][:xdigit:][:word:]]/
-(?s)\A(?!/)[[:alpha:][:xdigit:][:word:]]\z
+(?s)\A[[:alpha:][:xdigit:][:word:]](?<!/)\z

+"[/-/]"
+(?s)\A[/-/](?<!/)\z
+
+/[-----]/
+(?s)\A[\--\-\-\-]\z
+
+/[------]/
+(?s)\A[\--\-\--\-]\z
+
+/[!------]/
+(?s)\A[^/\--\-\--\-]\z
+
+/[[:alpha:]-a]/
+(?s)\A[[:alpha:]\-a](?<!/)\z
+
+/[a-[:alpha:]]/
+** Pattern conversion error at offset 4: invalid syntax
+
/[[:alpha:/
** Pattern conversion error at offset 9: missing terminating ] for character class

@@ -386,7 +407,7 @@
0: /xax/

/**\/*a*/
-(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?\z)
+(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*+\z)
xx/xx/xx/xax
0: /xax
xx/xx/xx/xax/xx
@@ -393,7 +414,7 @@
No match

/**\/*a*\/**\/*b*/
-(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*?\z)
+(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*+\z)
xx/xx/xx/xax/xx/xb
0: /xax/xx/xb
xx/xx/xx/xax/xx/x
@@ -402,10 +423,10 @@
#pattern convert=glob:glob_no_starstar

/***/
-(?s)\A[^/]*?\z
+(?s)\A[^/]*+\z

/**a**/
-(?s)\A[^/]*?a(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?a(*COMMIT)[^/]*+\z

#pattern convert=unset
#pattern convert=glob:glob_no_wild_separator