[Pcre-svn] [787] code/trunk: Support character classes in gl…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [787] code/trunk: Support character classes in glob conversion.
Revision: 787
          http://www.exim.org/viewvc/pcre2?view=rev&revision=787
Author:   zherczeg
Date:     2017-05-18 07:46:22 +0100 (Thu, 18 May 2017)
Log Message:
-----------
Support character classes in glob conversion.


Modified Paths:
--------------
    code/trunk/src/pcre2_convert.c
    code/trunk/testdata/testinput24
    code/trunk/testdata/testoutput24


Modified: code/trunk/src/pcre2_convert.c
===================================================================
--- code/trunk/src/pcre2_convert.c    2017-05-17 17:55:11 UTC (rev 786)
+++ code/trunk/src/pcre2_convert.c    2017-05-18 06:46:22 UTC (rev 787)
@@ -58,7 +58,7 @@
 #define ERROR_END_BACKSLASH 101
 #define ERROR_MISSING_SQUARE_BRACKET 106
 #define ERROR_MISSING_CLOSING_PARENTHESIS 114
-#define ERROR_TOO_DEEP_NESTING 119
+#define ERROR_UNKNOWN_POSIX_CLASS 130
 #define ERROR_NO_UNICODE 132


/* Generated pattern fragments */
@@ -651,17 +651,17 @@
/* Write a character into the output.

 Arguments:
-  context        the bash glob context
+  out            output context
   chr            the next character
 */


static void
-convert_glob_bash_write(pcre2_output_context *context, PCRE2_UCHAR chr)
+convert_glob_bash_write(pcre2_output_context *out, PCRE2_UCHAR chr)
{
-context->output_size++;
+out->output_size++;

-if (context->output < context->output_end)
- *context->output++ = chr;
+if (out->output < out->output_end)
+ *out->output++ = chr;
}


@@ -668,17 +668,17 @@
/* Write a string into the output.

 Arguments:
-  context        the bash glob context
-  length         length of context->out_str
+  out            output context
+  length         length of out->out_str
 */


static void
-convert_glob_bash_write_str(pcre2_output_context *context, PCRE2_SIZE length)
+convert_glob_bash_write_str(pcre2_output_context *out, PCRE2_SIZE length)
{
-uint8_t *out_str = context->out_str;
-PCRE2_UCHAR *output = context->output;
-PCRE2_SPTR output_end = context->output_end;
-PCRE2_SIZE output_size = context->output_size;
+uint8_t *out_str = out->out_str;
+PCRE2_UCHAR *output = out->output;
+PCRE2_SPTR output_end = out->output_end;
+PCRE2_SIZE output_size = out->output_size;

do
{
@@ -689,17 +689,219 @@
}
while (--length != 0);

-context->output = output;
-context->output_size = output_size;
+out->output = output;
+out->output_size = output_size;
}


+/* Parse a POSIX class name (the text after "[:") and copy it to the output.
+
+Arguments:
+  from           in: start of the class name; out: where scanning stopped
+  pattern_end    end of pattern
+  out            output context
+
+Returns:         0 => success
+                !0 => error code
+*/
+
+static int
+convert_glob_bash_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
+  pcre2_output_context *out)
+{
+static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:"
+  "graph:lower:print:punct:space:upper:word:xdigit:";  /* each name ends with ':' */
+PCRE2_SPTR pattern = *from;
+PCRE2_SPTR start;
+const char *class_ptr;
+PCRE2_UCHAR c;
+
+out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+out->out_str[1] = CHAR_COLON;
+convert_glob_bash_write_str(out, 2);  /* emit "[:" */
+
+while (TRUE)
+  {
+  if (pattern >= pattern_end)  /* input ended before ":]" was seen */
+    {
+    *from = pattern;
+    return ERROR_MISSING_SQUARE_BRACKET;
+    }
+
+  c = *pattern++;
+
+  if (c == CHAR_COLON && pattern < pattern_end &&
+      *pattern == CHAR_RIGHT_SQUARE_BRACKET)  /* ":]" terminates the name */
+    {
+      break;
+    }
+
+  if (c < CHAR_a || c > CHAR_z)
+    {
+    /* A POSIX class name may contain only characters in the a-z range */
+    *from = pattern;
+    return ERROR_MISSING_SQUARE_BRACKET;
+    }
+
+  convert_glob_bash_write(out, c);  /* copy the name character to the output */
+  }
+
+start = *from;  /* start of the name, used by the lookup below */
+*from = pattern + 1;  /* pattern is at the closing ']'; resume after it */
+class_ptr = posix_classes;
+
+while (TRUE)
+  {
+  if (*class_ptr == CHAR_NULL) return ERROR_UNKNOWN_POSIX_CLASS;  /* no candidate matched */
+
+  pattern = start;  /* compare the name against the next candidate */
+
+  while (*pattern == (PCRE2_UCHAR) *class_ptr)
+    {
+    if (*pattern == CHAR_COLON)  /* both reached ':' => complete match */
+      {
+      out->out_str[0] = CHAR_COLON;
+      out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;
+      convert_glob_bash_write_str(out, 2);  /* emit ":]" */
+      return 0;
+      }
+    pattern++;
+    class_ptr++;
+    }
+
+  while (*class_ptr != CHAR_COLON) class_ptr++;  /* skip the rest of this candidate */
+  class_ptr++;  /* step over ':' to the next candidate */
+  }
+}
+
+
+/* Parse a range of characters.
+
+Arguments:
+  from           in: where to start scanning; out: where scanning stopped
+  pattern_end    end of pattern
+  out            output context
+  separator      glob separator
+
+Returns:         0 => success
+                !0 => error code
+*/
+
+static int
+convert_glob_bash_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
+  pcre2_output_context *out, PCRE2_UCHAR separator)
+{
+PCRE2_SPTR pattern = *from;
+PCRE2_UCHAR c;
+int result, len;
+
+if (pattern >= pattern_end)  /* no input left to scan */
+  {
+  *from = pattern;
+  return ERROR_MISSING_SQUARE_BRACKET;
+  }
+
+c = *pattern;
+
+if (c == CHAR_EXCLAMATION_MARK
+    || c == CHAR_CIRCUMFLEX_ACCENT)  /* '!' or '^' first: emitted as "[^" */
+  {
+  out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+  out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
+  len = 2;  /* the separator is appended after "[^" below */
+  }
+else
+  {
+  out->out_str[0] = CHAR_LEFT_PARENTHESIS;
+  out->out_str[1] = CHAR_QUESTION_MARK;
+  out->out_str[2] = CHAR_EXCLAMATION_MARK;
+  len = 3;  /* "(?!" followed by the separator, closed with ")" below */
+  }
+
+if (separator < 128 && strchr(pcre2_escaped_literals, separator) != NULL)
+  {
+  out->out_str[len] = CHAR_BACKSLASH;  /* separator is in pcre2_escaped_literals: prefix a backslash */
+  len++;
+  }
+
+convert_glob_bash_write_str(out, len);
+convert_glob_bash_write(out, separator);
+
+if (c == CHAR_EXCLAMATION_MARK
+    || c == CHAR_CIRCUMFLEX_ACCENT)
+  {
+  pattern++;  /* step over the '!' or '^' */
+  if (pattern >= pattern_end)
+    {
+    *from = pattern;
+    return ERROR_MISSING_SQUARE_BRACKET;
+    }
+  c = *pattern;
+  }
+else
+  {
+  out->out_str[0] = CHAR_RIGHT_PARENTHESIS;
+  out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET;
+  convert_glob_bash_write_str(out, 2);  /* emit ")[" */
+  }
+
+if (c == CHAR_MINUS || c == CHAR_RIGHT_SQUARE_BRACKET)  /* a leading '-' or ']' is a literal */
+  {
+  convert_glob_bash_write(out, CHAR_BACKSLASH);
+  convert_glob_bash_write(out, c);
+  pattern++;
+  }
+
+while (pattern < pattern_end)
+  {
+  c = *pattern++;
+
+  if (c == CHAR_RIGHT_SQUARE_BRACKET)  /* unescaped ']' ends the range */
+    {
+    convert_glob_bash_write(out, c);
+    *from = pattern;
+    return 0;
+    }
+
+  if (c == CHAR_LEFT_SQUARE_BRACKET && pattern < pattern_end &&
+      *pattern == CHAR_COLON)  /* "[:" starts a POSIX class */
+    {
+    *from = pattern + 1;  /* first character after "[:" */
+
+    result = convert_glob_bash_parse_class(from, pattern_end, out);
+    if (result != 0) return result;
+
+    pattern = *from;  /* continue from the position the class parser left */
+    continue;
+    }
+
+  if (c == CHAR_BACKSLASH)
+    {
+    if (pattern >= pattern_end)  /* backslash is the last character of the pattern */
+      {
+      *from = pattern;
+      return ERROR_END_BACKSLASH;
+      }
+    c = *pattern++;  /* use the character after the backslash */
+    }
+
+  if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
+      c == CHAR_BACKSLASH || c == CHAR_MINUS)
+    convert_glob_bash_write(out, CHAR_BACKSLASH);  /* these four are written with a backslash */
+
+  convert_glob_bash_write(out, c);
+  }
+
+*from = pattern;  /* input ended without a closing ']' */
+return ERROR_MISSING_SQUARE_BRACKET;
+}
+
+
 /* Prints a wildcard into the output.


 Arguments:
-  context           the bash glob context
-  separator         glob separator
-  after_sep         whether the wildcard is right after a separator
+  out            output context
+  separator      glob separator
 */


static void
@@ -711,7 +913,7 @@
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;

-if (separator == CHAR_BACKSLASH)
+if (separator < 128 && strchr(pcre2_escaped_literals, separator) != NULL)
{
out->out_str[2] = CHAR_BACKSLASH;
len = 3;
@@ -749,8 +951,8 @@
pcre2_output_context out;
PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength;
+PCRE2_UCHAR c;
int result;
-PCRE2_UCHAR c;

 /* Initialize default for error offset as end of input. */
 out.output = use_buffer;
@@ -800,6 +1002,14 @@
     continue;
     }


+  if (c == CHAR_LEFT_SQUARE_BRACKET)
+    {
+    result = convert_glob_bash_parse_range(&pattern, pattern_end,
+      &out, ccontext->glob_separator);
+    if (result != 0) break;
+    continue;
+    }
+
   if (c == CHAR_BACKSLASH)
     {
     if (pattern >= pattern_end)


Modified: code/trunk/testdata/testinput24
===================================================================
--- code/trunk/testdata/testinput24    2017-05-17 17:55:11 UTC (rev 786)
+++ code/trunk/testdata/testinput24    2017-05-18 06:46:22 UTC (rev 787)
@@ -228,7 +228,23 @@


/?a?\/?b?/

+/[a\\b\c][]][-][\]\-]/

+/[^a\\b\c][!]][!-][^\]\-]/
+
+/[[:alpha:][:xdigit:][:word:]]/
+
+/[[:alpha:/
+
+/[[:alpha:]/
+
+/[[:alphaa:]]/
+
+/[[:xdigi:]]/
+
+/[[:xdigit::]]/
+
+
#pattern convert=unset
#pattern convert=posix_extended


Modified: code/trunk/testdata/testoutput24
===================================================================
--- code/trunk/testdata/testoutput24    2017-05-17 17:55:11 UTC (rev 786)
+++ code/trunk/testdata/testoutput24    2017-05-18 06:46:22 UTC (rev 787)
@@ -359,7 +359,31 @@
 /?a?\/?b?/
 (?s)\A[^/]a[^/]/[^/]b[^/]\z


+/[a\\b\c][]][-][\]\-]/
+(?s)\A(?!/)[a\\bc](?!/)[\]](?!/)[\-](?!/)[\]\-]\z

+/[^a\\b\c][!]][!-][^\]\-]/
+(?s)\A[^/a\\bc][^/\]][^/\-][^/\]\-]\z
+
+/[[:alpha:][:xdigit:][:word:]]/
+(?s)\A(?!/)[[:alpha:][:xdigit:][:word:]]\z
+
+/[[:alpha:/
+** Pattern conversion error at offset 9: missing terminating ] for character class
+
+/[[:alpha:]/
+** Pattern conversion error at offset 10: missing terminating ] for character class
+
+/[[:alphaa:]]/
+** Pattern conversion error at offset 11: unknown POSIX class name
+
+/[[:xdigi:]]/
+** Pattern conversion error at offset 10: unknown POSIX class name
+
+/[[:xdigit::]]/
+** Pattern conversion error at offset 10: missing terminating ] for character class
+
+
#pattern convert=unset
#pattern convert=posix_extended