[Pcre-svn] [785] code/trunk: Reworking bash glob conversion.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [785] code/trunk: Reworking bash glob conversion.
Revision: 785
          http://www.exim.org/viewvc/pcre2?view=rev&revision=785
Author:   zherczeg
Date:     2017-05-17 14:44:24 +0100 (Wed, 17 May 2017)
Log Message:
-----------
Reworking bash glob conversion.


Modified Paths:
--------------
    code/trunk/src/pcre2_convert.c
    code/trunk/testdata/testinput24
    code/trunk/testdata/testoutput24


Modified: code/trunk/src/pcre2_convert.c
===================================================================
--- code/trunk/src/pcre2_convert.c    2017-05-16 12:42:17 UTC (rev 784)
+++ code/trunk/src/pcre2_convert.c    2017-05-17 13:44:24 UTC (rev 785)
@@ -693,110 +693,7 @@
 context->output_size = output_size;
 }


-/* Bash glob reading modes. */

-#define PCRE2_BASH_GLOB_NORMAL           0
-#define PCRE2_BASH_GLOB_QUOTED           1
-#define PCRE2_BASH_GLOB_DOUBLE_QUOTED    2
-#define PCRE2_BASH_GLOB_BACKSLASH        3
-
-/* Maximum nesting level of enclosed groups. */
-
-#define PCRE2_BASH_GLOB_MAX_NESTING      16
-
-typedef struct pcre2_bash_glob_context {
-  PCRE2_SPTR pattern;
-  PCRE2_SPTR pattern_end;
-  pcre2_output_context out;
-  int read_mode;
-  BOOL is_control_char;
-} pcre2_bash_glob_context;
-
-/* Read the next character from the glob. If the character
-   is a control character context->is_control_char is set
-   to TRUE. Otherwise this field is FALSE.
-
-Arguments:
-  context        the bash glob context
-  utf            TRUE if UTF
-*/
-
-static BOOL
-convert_glob_bash_read(pcre2_bash_glob_context *context, BOOL utf)
-{
-while (TRUE)
-  {
-  if (context->pattern >= context->pattern_end)
-    return FALSE;
-
-  context->pattern++;
-
-#ifdef SUPPORT_UNICODE
-  /* Intermediate unicode octets are always normal characters. */
-  if (utf && NOT_FIRSTCU(context->pattern[-1]))
-    {
-    context->is_control_char = FALSE;
-    return TRUE;
-    }
-#endif
-
-  if (context->read_mode == PCRE2_BASH_GLOB_QUOTED)
-    {
-    if (context->pattern[-1] != CHAR_APOSTROPHE)
-      return TRUE;
-
-    context->read_mode = PCRE2_BASH_GLOB_NORMAL;
-    continue;
-    }
-  else if (context->read_mode == PCRE2_BASH_GLOB_DOUBLE_QUOTED)
-    {
-    if (context->pattern[-1] == CHAR_BACKSLASH &&
-        context->pattern < context->pattern_end &&
-        (context->pattern[0] == CHAR_QUOTATION_MARK ||
-         context->pattern[0] == CHAR_BACKSLASH))
-      {
-        context->pattern++;
-        return TRUE;
-      }
-    else if (context->pattern[-1] != CHAR_QUOTATION_MARK)
-      return TRUE;
-
-    context->read_mode = PCRE2_BASH_GLOB_NORMAL;
-    continue;
-    }
-
-  context->is_control_char = FALSE;
-
-  if (context->pattern[-1] == CHAR_APOSTROPHE)
-    {
-    context->read_mode = PCRE2_BASH_GLOB_QUOTED;
-    continue;
-    }
-
-  if (context->pattern[-1] == CHAR_QUOTATION_MARK)
-    {
-    context->read_mode = PCRE2_BASH_GLOB_DOUBLE_QUOTED;
-    continue;
-    }
-
-  if (context->pattern[-1] == CHAR_BACKSLASH)
-    {
-    if (context->pattern < context->pattern_end)
-      {
-      context->pattern++;
-      return TRUE;
-      }
-
-    context->read_mode = PCRE2_BASH_GLOB_BACKSLASH;
-    return FALSE;
-    }
-
-  context->is_control_char = TRUE;
-  return TRUE;
-  }
-}
-
-
 /* Prints a wildcard into the output.


Arguments:
@@ -806,30 +703,24 @@
*/

static void
-convert_glob_bash_wildcard(pcre2_bash_glob_context *context,
- PCRE2_UCHAR separator, BOOL after_sep)
+convert_glob_bash_wildcard(pcre2_output_context *out,
+ PCRE2_UCHAR separator)
{
int len = 2;

-context->out.out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
-context->out.out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
+out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
+out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;

if (separator == CHAR_BACKSLASH)
{
- context->out.out_str[2] = CHAR_BACKSLASH;
+ out->out_str[2] = CHAR_BACKSLASH;
len = 3;
}

-if (after_sep)
- {
- context->out.out_str[len] = CHAR_DOT;
- len++;
- }
+convert_glob_bash_write_str(out, len);

-convert_glob_bash_write_str(&context->out, len);
-
-convert_glob_bash_write(&context->out, separator);
-convert_glob_bash_write(&context->out, CHAR_RIGHT_SQUARE_BRACKET);
+convert_glob_bash_write(out, separator);
+convert_glob_bash_write(out, CHAR_RIGHT_SQUARE_BRACKET);
}


@@ -851,204 +742,98 @@
*/

static int
-convert_glob_bash(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength,
+convert_glob_bash(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length,
PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext)
{
-pcre2_bash_glob_context context;
-uint8_t group_types[PCRE2_BASH_GLOB_MAX_NESTING];
-int nesting_level, result;
-BOOL after_sep = TRUE;
+pcre2_output_context out;
+PCRE2_SPTR pattern_start = pattern;
+PCRE2_SPTR pattern_end = pattern + plength;
+int result;
PCRE2_UCHAR c;

/* Initialize default for error offset as end of input. */
-context.pattern = pattern;
-context.pattern_end = pattern + plength;
-context.read_mode = PCRE2_BASH_GLOB_NORMAL;
-context.out.output = use_buffer;
-context.out.output_end = use_buffer + use_length;
-context.out.output_size = 0;
+out.output = use_buffer;
+out.output_end = use_buffer + use_length;
+out.output_size = 0;

-context.out.out_str[0] = CHAR_BACKSLASH;
-context.out.out_str[1] = CHAR_A;
-convert_glob_bash_write_str(&context.out, 2);
+out.out_str[0] = CHAR_LEFT_PARENTHESIS;
+out.out_str[1] = CHAR_QUESTION_MARK;
+out.out_str[2] = CHAR_s;
+out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
+out.out_str[4] = CHAR_BACKSLASH;
+out.out_str[5] = CHAR_A;
+convert_glob_bash_write_str(&out, 6);

-nesting_level = 0;
result = 0;

-while (convert_glob_bash_read(&context, utf))
+while (pattern < pattern_end)
{
- c = context.pattern[-1];
+ c = *pattern++;

-  if (context.is_control_char)
+  if (c == CHAR_ASTERISK)
     {
-    if (c == CHAR_LEFT_PARENTHESIS)
+    if (pattern != pattern_start + 1)
       {
-      /* ! Unexpected open parenthesis ! */
-      result = ERROR_END_BACKSLASH;
-      break;
+      out.out_str[0] = CHAR_LEFT_PARENTHESIS;
+      out.out_str[1] = CHAR_ASTERISK;
+      out.out_str[2] = CHAR_C;
+      out.out_str[3] = CHAR_O;
+      out.out_str[4] = CHAR_M;
+      out.out_str[5] = CHAR_M;
+      out.out_str[6] = CHAR_I;
+      out.out_str[7] = CHAR_T;
+      convert_glob_bash_write_str(&out, 8);
+      convert_glob_bash_write(&out, CHAR_RIGHT_PARENTHESIS);
       }


-    if (c == CHAR_RIGHT_PARENTHESIS)
-      {
-      if (nesting_level == 0)
-        {
-        /* ! Unexpected open parenthesis ! */
-        result = ERROR_END_BACKSLASH;
-        break;
-        }
+    convert_glob_bash_wildcard(&out, ccontext->glob_separator);
+    out.out_str[0] = CHAR_ASTERISK;
+    out.out_str[1] = CHAR_QUESTION_MARK;
+    convert_glob_bash_write_str(&out, 2);
+    continue;
+    }


-      c = group_types[--nesting_level];
+  if (c == CHAR_QUESTION_MARK)
+    {
+    convert_glob_bash_wildcard(&out, ccontext->glob_separator);
+    continue;
+    }


-      convert_glob_bash_write(&context.out, CHAR_RIGHT_PARENTHESIS);
-      if (c != CHAR_COMMERCIAL_AT)
-        {
-        convert_glob_bash_write(&context.out, c);
-        convert_glob_bash_write(&context.out, CHAR_QUESTION_MARK);
-        }
-
-      after_sep = FALSE;
-      continue;
-      }
-
-    if (c == CHAR_VERTICAL_LINE && nesting_level > 0)
+  if (c == CHAR_BACKSLASH)
+    {
+    if (pattern >= pattern_end)
       {
-      convert_glob_bash_write(&context.out, CHAR_VERTICAL_LINE);
-
-      after_sep = FALSE;
-      continue;
+      result = ERROR_END_BACKSLASH;
+      break;
       }
-
-    if ((c == CHAR_QUESTION_MARK || c == CHAR_ASTERISK ||
-         c == CHAR_PLUS || c == CHAR_COMMERCIAL_AT) &&
-        context.pattern < context.pattern_end &&
-        context.pattern[0] == CHAR_LEFT_PARENTHESIS)
-      {
-      if (nesting_level >= PCRE2_BASH_GLOB_MAX_NESTING)
-        {
-        result = ERROR_TOO_DEEP_NESTING;
-        break;
-        }
-
-      if (after_sep)
-        {
-        context.out.out_str[0] = CHAR_LEFT_PARENTHESIS;
-        context.out.out_str[1] = CHAR_QUESTION_MARK;
-        context.out.out_str[2] = CHAR_EXCLAMATION_MARK;
-        context.out.out_str[3] = CHAR_BACKSLASH;
-        context.out.out_str[4] = CHAR_DOT;
-        context.out.out_str[5] = CHAR_RIGHT_PARENTHESIS;
-        convert_glob_bash_write_str(&context.out, 6);
-        }
-
-      context.pattern++;
-      group_types[nesting_level++] = (uint8_t) c;
-
-      context.out.out_str[0] = CHAR_LEFT_PARENTHESIS;
-      context.out.out_str[1] = CHAR_QUESTION_MARK;
-      context.out.out_str[2] = CHAR_COLON;
-      convert_glob_bash_write_str(&context.out, 3);
-
-      after_sep = FALSE;
-      continue;
-      }
-
-    if (c == CHAR_ASTERISK)
-      {
-      if (nesting_level == 0 && context.pattern != pattern + 1)
-        {
-        context.out.out_str[0] = CHAR_LEFT_PARENTHESIS;
-        context.out.out_str[1] = CHAR_ASTERISK;
-        context.out.out_str[2] = CHAR_C;
-        context.out.out_str[3] = CHAR_O;
-        context.out.out_str[4] = CHAR_M;
-        context.out.out_str[5] = CHAR_M;
-        context.out.out_str[6] = CHAR_I;
-        context.out.out_str[7] = CHAR_T;
-        convert_glob_bash_write_str(&context.out, 8);
-        convert_glob_bash_write(&context.out, CHAR_RIGHT_PARENTHESIS);
-        }
-
-      if (after_sep)
-        {
-        context.out.out_str[0] = CHAR_LEFT_PARENTHESIS;
-        context.out.out_str[1] = CHAR_QUESTION_MARK;
-        context.out.out_str[2] = CHAR_COLON;
-        convert_glob_bash_write_str(&context.out, 3);
-
-        convert_glob_bash_wildcard(&context, ccontext->glob_separator, TRUE);
-        convert_glob_bash_wildcard(&context, ccontext->glob_separator, FALSE);
-
-        context.out.out_str[0] = CHAR_ASTERISK;
-        context.out.out_str[1] = CHAR_QUESTION_MARK;
-        context.out.out_str[2] = CHAR_RIGHT_PARENTHESIS;
-        context.out.out_str[3] = CHAR_QUESTION_MARK;
-        context.out.out_str[4] = CHAR_QUESTION_MARK;
-        convert_glob_bash_write_str(&context.out, 5);
-        }
-      else
-        {
-        convert_glob_bash_wildcard(&context, ccontext->glob_separator, FALSE);
-        context.out.out_str[0] = CHAR_ASTERISK;
-        context.out.out_str[1] = CHAR_QUESTION_MARK;
-        convert_glob_bash_write_str(&context.out, 2);
-        }
-
-      after_sep = FALSE;
-      continue;
-      }
-
-    if (c == CHAR_QUESTION_MARK)
-      {
-      convert_glob_bash_wildcard(&context,
-        ccontext->glob_separator, after_sep);
-
-      after_sep = FALSE;
-      continue;
-      }
+    c = *pattern++;
     }


-  after_sep = (c == ccontext->glob_separator);
-
-  if (after_sep && nesting_level > 0)
-    {
-    context.out.out_str[0] = CHAR_LEFT_PARENTHESIS;
-    context.out.out_str[1] = CHAR_ASTERISK;
-    context.out.out_str[2] = CHAR_F;
-    context.out.out_str[3] = CHAR_RIGHT_PARENTHESIS;
-    convert_glob_bash_write_str(&context.out, 4);
-
-    after_sep = FALSE;
-    continue;
-    }
-
   if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL)
-    convert_glob_bash_write(&context.out, CHAR_BACKSLASH);
+    convert_glob_bash_write(&out, CHAR_BACKSLASH);


- convert_glob_bash_write(&context.out, c);
+ convert_glob_bash_write(&out, c);
}

 if (result == 0)
   {
-  /* ! Unexpected end of input ! */
-  if (nesting_level > 0 || context.read_mode != PCRE2_BASH_GLOB_NORMAL)
-    result = ERROR_MISSING_CLOSING_PARENTHESIS;
-  else
-    {
-    context.out.out_str[0] = CHAR_BACKSLASH;
-    context.out.out_str[1] = CHAR_z;
-    context.out.out_str[2] = CHAR_NULL;
-    convert_glob_bash_write_str(&context.out, 3);
-    }
+  out.out_str[0] = CHAR_BACKSLASH;
+  out.out_str[1] = CHAR_z;
+  out.out_str[2] = CHAR_NULL;
+  convert_glob_bash_write_str(&out, 3);
+
+  if (!dummyrun && out.output_size != (out.output - use_buffer))
+    result = PCRE2_ERROR_NOMEMORY;
   }


if (result != 0)
{
- *bufflenptr = context.out.output - use_buffer;
+ *bufflenptr = pattern - pattern_start;
return result;
}

-*bufflenptr = context.out.output_size - 1;
+*bufflenptr = out.output_size - 1;
return 0;
}

@@ -1139,7 +924,7 @@
     break;


     case PCRE2_CONVERT_GLOB_BASH:
-    rc = convert_glob_bash(pattype, pattern, plength, utf, use_buffer, use_length,
+    rc = convert_glob_bash(options, pattern, plength, utf, use_buffer, use_length,
       bufflenptr, dummyrun, ccontext);
     break;



Modified: code/trunk/testdata/testinput24
===================================================================
--- code/trunk/testdata/testinput24    2017-05-16 12:42:17 UTC (rev 784)
+++ code/trunk/testdata/testinput24    2017-05-17 13:44:24 UTC (rev 785)
@@ -222,21 +222,13 @@


/A\B\\C\D/

-/A'B'C'''\'''D''/
+/\\{}\?\*+\[\]()|.^$/

-/A""B"\\\"\C"''""/
-
-/'\{}?*+[]()|.^$'/
-
/*a*\/*b*/

/?a?\/?b?/

-/a|b@(a|b)*\/@(a|b)/

-/\/@(a\/|b\/)\//
-
-
#pattern convert=unset
#pattern convert=posix_extended


Modified: code/trunk/testdata/testoutput24
===================================================================
--- code/trunk/testdata/testoutput24    2017-05-16 12:42:17 UTC (rev 784)
+++ code/trunk/testdata/testoutput24    2017-05-17 13:44:24 UTC (rev 785)
@@ -345,33 +345,21 @@
 # Non control character checking


//
-\A\z
+(?s)\A\z

/A\B\\C\D/
-\AAB\\CD\z
+(?s)\AAB\\CD\z

-/A'B'C'''\'''D''/
-\AABC\\D\z
+/\\{}\?\*+\[\]()|.^$/
+(?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z

-/A""B"\\\"\C"''""/
-\AAB\\"\\C\z
-
-/'\{}?*+[]()|.^$'/
-\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z
-
/*a*\/*b*/
-\A(?:[^./][^/]*?)??a(*COMMIT)[^/]*?/(*COMMIT)(?:[^./][^/]*?)??b(*COMMIT)[^/]*?\z
+(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*?\z

/?a?\/?b?/
-\A[^./]a[^/]/[^./]b[^/]\z
+(?s)\A[^/]a[^/]/[^/]b[^/]\z

-/a|b@(a|b)*\/@(a|b)/
-\Aa\|b(?:a|b)(*COMMIT)[^/]*?/(?!\.)(?:a|b)\z

-/\/@(a\/|b\/)\//
-\A/(?!\.)(?:a(*F)|b(*F))/\z
-
-
#pattern convert=unset
#pattern convert=posix_extended