[Pcre-svn] [782] code/trunk: Experimental conversion: fix POSIX BRE starting ^*.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [782] code/trunk: Experimental conversion: fix POSIX BRE starting ^*.
Revision: 782
          http://www.exim.org/viewvc/pcre2?view=rev&revision=782
Author:   ph10
Date:     2017-05-14 13:04:13 +0100 (Sun, 14 May 2017)
Log Message:
-----------
Experimental conversion: fix POSIX BRE starting ^*.


Modified Paths:
--------------
    code/trunk/src/pcre2_convert.c
    code/trunk/testdata/testinput24
    code/trunk/testdata/testoutput24


Modified: code/trunk/src/pcre2_convert.c
===================================================================
--- code/trunk/src/pcre2_convert.c    2017-05-13 17:46:27 UTC (rev 781)
+++ code/trunk/src/pcre2_convert.c    2017-05-14 12:04:13 UTC (rev 782)
@@ -68,10 +68,11 @@
 #define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS
 #define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS


-/* States for range and POSIX class processing */
+/* States for range and POSIX processing */

 enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED };
-enum { POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };
+enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET, 
+       POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED };


/* Macro to add a character string to the output buffer, checking for overflow. */

@@ -119,10 +120,9 @@
PCRE2_SIZE convlength = 0;

uint32_t bracount = 0;
-uint32_t posix_class_state = POSIX_CLASS_NOT_STARTED;
+uint32_t posix_state = POSIX_START_REGEX;
uint32_t lastspecial = 0;
BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0;
-BOOL inclass = FALSE;
BOOL nextisliteral = FALSE;

 (void)utf;       /* Not used when Unicode not supported */
@@ -132,14 +132,8 @@


*bufflenptr = plength;

-/* Now scan the input. In non-extended patterns, an initial asterisk is treated
-as literal. Still figuring out what happens in extended patterns... */
+/* Now scan the input. */

-if (plength > 0 && *posix == CHAR_ASTERISK)
- {
- if (!extended) nextisliteral = TRUE;
- }
-
while (plength > 0)
{
uint32_t c, sc;
@@ -168,12 +162,12 @@

/* Handle a character within a class. */

-  if (inclass)
+  if (posix_state >= POSIX_CLASS_NOT_STARTED)
     {
     if (c == CHAR_RIGHT_SQUARE_BRACKET) 
       {
       PUTCHARS(STR_RIGHT_SQUARE_BRACKET); 
-      inclass = FALSE;
+      posix_state = POSIX_NOT_BRACKET;
       }


     /* Not the end of the class */
@@ -180,11 +174,11 @@


     else 
       {
-      switch (posix_class_state)
+      switch (posix_state)
         {
         case POSIX_CLASS_STARTED:
         if (c <= 127 && islower(c)) break;  /* Remain in started state */
-        posix_class_state = POSIX_CLASS_NOT_STARTED;       
+        posix_state = POSIX_CLASS_NOT_STARTED;       
         if (c == CHAR_COLON  && plength > 0 && 
             *posix == CHAR_RIGHT_SQUARE_BRACKET)
           {
@@ -197,11 +191,11 @@


         case POSIX_CLASS_NOT_STARTED:  
         if (c == CHAR_LEFT_SQUARE_BRACKET) 
-          posix_class_state = POSIX_CLASS_STARTING;
+          posix_state = POSIX_CLASS_STARTING;
         break;


         case POSIX_CLASS_STARTING:
-        if (c == CHAR_COLON) posix_class_state = POSIX_CLASS_STARTED;
+        if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED;
         break;
         }   


@@ -242,8 +236,7 @@

     /* Handle "normal" character classes */


-    posix_class_state = POSIX_CLASS_NOT_STARTED; 
-    inclass = TRUE;
+    posix_state = POSIX_CLASS_NOT_STARTED; 


     /* Handle ^ and ] as first characters */


@@ -295,6 +288,7 @@

     case CHAR_DOT:
     case CHAR_DOLLAR_SIGN:   
+    posix_state = POSIX_NOT_BRACKET; 
     COPY_SPECIAL:
     lastspecial = c; 
     if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY;
@@ -302,15 +296,22 @@
     break; 


     case CHAR_ASTERISK:
-    if (lastspecial != CHAR_ASTERISK) goto COPY_SPECIAL;
+    if (lastspecial != CHAR_ASTERISK) 
+      {
+      if (!extended && posix_state < POSIX_NOT_BRACKET)
+        goto ESCAPE_LITERAL; 
+      goto COPY_SPECIAL;
+      } 
     break;   /* Ignore second and subsequent asterisks */  


     case CHAR_CIRCUMFLEX_ACCENT:
-    if (extended || 
-          lastspecial == 0 || 
-          lastspecial == CHAR_LEFT_PARENTHESIS ||
-          lastspecial == CHAR_VERTICAL_LINE) 
+    if (extended) goto COPY_SPECIAL;
+    if (posix_state == POSIX_START_REGEX || 
+        lastspecial == CHAR_LEFT_PARENTHESIS) 
+      {
+      posix_state = POSIX_ANCHORED;
       goto COPY_SPECIAL;
+      } 
     /* Fall through */      


     default:
@@ -323,11 +324,13 @@
     if (p + clength > endp) return PCRE2_ERROR_NOMEMORY;
     memcpy(p, posix - clength, CU2BYTES(clength));
     p += clength;
+    posix_state = POSIX_NOT_BRACKET; 
     break;
     }
   }


-if (inclass) return ERROR_MISSING_SQUARE_BRACKET;
+if (posix_state >= POSIX_CLASS_NOT_STARTED) 
+  return ERROR_MISSING_SQUARE_BRACKET;
 convlength += p - pp;        /* Final segment */
 *bufflenptr = convlength;
 *p++ = 0;


Modified: code/trunk/testdata/testinput24
===================================================================
--- code/trunk/testdata/testinput24    2017-05-13 17:46:27 UTC (rev 781)
+++ code/trunk/testdata/testinput24    2017-05-14 12:04:13 UTC (rev 782)
@@ -266,6 +266,8 @@


/^how to \^how to/

+/^*abc/
+
 /*abc/
     X*abcY


@@ -272,7 +274,7 @@
 /**abc/
     XabcY
     X*abcY
-    X**abcY  
+    X**abcY


/^b\(c^d\)\(^e^f\)/


Modified: code/trunk/testdata/testoutput24
===================================================================
--- code/trunk/testdata/testoutput24    2017-05-13 17:46:27 UTC (rev 781)
+++ code/trunk/testdata/testoutput24    2017-05-14 12:04:13 UTC (rev 782)
@@ -426,6 +426,9 @@
 /^how to \^how to/
 ^how to \^how to


+/^*abc/
+^\*abc
+
 /*abc/
 \*abc
     X*abcY
@@ -437,7 +440,7 @@
  0: abc
     X*abcY
  0: *abc
-    X**abcY  
+    X**abcY
  0: **abc


/^b\(c^d\)\(^e^f\)/