[Pcre-svn] [390] code/trunk: Fix UCP with [[:<:]] and [[:>:]]…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [390] code/trunk: Fix UCP with [[:<:]] and [[:>:]] bad compile bug.
Revision: 390
          http://www.exim.org/viewvc/pcre2?view=rev&revision=390
Author:   ph10
Date:     2015-10-21 12:29:07 +0100 (Wed, 21 Oct 2015)
Log Message:
-----------
Fix UCP with [[:<:]] and [[:>:]] bad compile bug.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/src/pcre2_intmodedep.h
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/ChangeLog    2015-10-21 11:29:07 UTC (rev 390)
@@ -206,7 +206,10 @@
 59. Change 55 above introduced a bug by which certain patterns provoked the 
 erroneous error "\ at end of pattern".


+60. The special sequences [[:<:]] and [[:>:]] gave rise to incorrect compiling
+errors or other strange effects if compiled in UCP mode.

+
Version 10.20 30-June-2015
--------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/src/pcre2_compile.c    2015-10-21 11:29:07 UTC (rev 390)
@@ -1618,6 +1618,13 @@
 not relevant, but the options argument is the final value of the compiled
 pattern's options.


+There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
+processed, it is replaced by a nested alternative sequence. If this contains a
+backslash (which it usually does), ptrend does not point to its end - it still
+points to the end of the whole pattern. However, we can detect this case 
+because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
+terminated and there are only ever two levels of nesting.
+
 Arguments:
   ptrptr         points to the input position pointer
   ptrend         points to the end of the input
@@ -1643,10 +1650,14 @@
 int escape = 0;
 int i;


-/* If backslash is at the end of the string, it's an error. The check must be
-skipped when processing a nested insertion string during compilation. */
+/* Find the end of a nested insert. */

-if ((cb == NULL || cb->nestptr == NULL) && ptr >= ptrend)
+if (cb != NULL && cb->nestptr[0] != NULL)
+ ptrend = ptr + PRIV(strlen)(ptr);
+
+/* If backslash is at the end of the string, it's an error. */
+
+if (ptr >= ptrend)
{
*errorcodeptr = ERR1;
return 0;
@@ -3700,13 +3711,14 @@
c = *ptr;

/* If we are at the end of a nested substitution, revert to the outer level
- string. Nesting only happens one level deep, and the inserted string is
- always zero terminated. */
+ string. Nesting only happens one or two levels deep, and the inserted string
+ is always zero terminated. */

-  if (c == CHAR_NULL && cb->nestptr != NULL)
+  if (c == CHAR_NULL && cb->nestptr[0] != NULL)
     {
-    ptr = cb->nestptr;
-    cb->nestptr = NULL;
+    ptr = cb->nestptr[0];
+    cb->nestptr[0] = cb->nestptr[1]; 
+    cb->nestptr[1] = NULL;
     c = *ptr;
     }


@@ -3823,7 +3835,7 @@
/* Fill in length of a previous callout, except when the next thing is a
quantifier or when processing a property substitution string in UCP mode. */

-  if (!is_quantifier && previous_callout != NULL && cb->nestptr == NULL &&
+  if (!is_quantifier && previous_callout != NULL && cb->nestptr[0] == NULL &&
        after_manual_callout-- <= 0)
     {
     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
@@ -3834,7 +3846,8 @@
   /* Create auto callout, except for quantifiers, or while processing property
   strings that are substituted for \w etc in UCP mode. */


-  if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier && cb->nestptr == NULL)
+  if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier && 
+       cb->nestptr[0] == NULL)
     {
     previous_callout = code;
     code = auto_callout(code, ptr, cb);
@@ -3926,13 +3939,15 @@
     In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
     used for "start of word" and "end of word". As these are otherwise illegal
     sequences, we don't break anything by recognizing them. They are replaced
-    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
-    erroneous and are handled by the normal code below. */
+    by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
+    nesting level, as no other inserted sequences will contain these oddities.
+    Sequences like [a[:<:]] are erroneous and are handled by the normal code
+    below. */


     case CHAR_LEFT_SQUARE_BRACKET:
     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
       {
-      cb->nestptr = ptr + 7;
+      cb->nestptr[0] = ptr + 7;
       ptr = sub_start_of_word;  /* Do not combine these statements; clang's */
       ptr--;                    /* sanitizer moans about a negative index. */
       continue;
@@ -3940,7 +3955,7 @@


     if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
       {
-      cb->nestptr = ptr + 7;
+      cb->nestptr[0] = ptr + 7;
       ptr = sub_end_of_word;    /* Do not combine these statements; clang's */
       ptr--;                    /* sanitizer moans about a negative index. */
       continue;
@@ -4125,11 +4140,13 @@
           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);


           /* The posix_substitutes table specifies which POSIX classes can be
-          converted to \p or \P items. */
+          converted to \p or \P items. This can only happen at top nesting
+          level, as there will never be a POSIX class in a string that is 
+          substituted for something else. */


           if (posix_substitutes[pc] != NULL)
             {
-            cb->nestptr = tempptr + 1;
+            cb->nestptr[0] = tempptr + 1;
             ptr = posix_substitutes[pc] - 1;
             goto CONTINUE_CLASS;
             }
@@ -4263,9 +4280,10 @@
             case ESC_DU:     /* when PCRE2_UCP is set. We replace the */
             case ESC_wu:     /* escape sequence with an appropriate \p */
             case ESC_WU:     /* or \P to test Unicode properties instead */
-            case ESC_su:     /* of the default ASCII testing. */
-            case ESC_SU:
-            cb->nestptr = ptr;
+            case ESC_su:     /* of the default ASCII testing. This might be */
+            case ESC_SU:     /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
+            cb->nestptr[1] = cb->nestptr[0]; 
+            cb->nestptr[0] = ptr;
             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
             class_has_8bitchar--;                /* Undo! */
             break;
@@ -4607,10 +4625,11 @@


       CONTINUE_CLASS:
       c = *(++ptr);
-      if (c == 0 && cb->nestptr != NULL)
+      if (c == CHAR_NULL && cb->nestptr[0] != NULL)
         {
-        ptr = cb->nestptr;
-        cb->nestptr = NULL;
+        ptr = cb->nestptr[0];
+        cb->nestptr[0] = cb->nestptr[1]; 
+        cb->nestptr[1] = NULL;
         c = *(++ptr);
         }


@@ -7082,7 +7101,8 @@
 #ifdef SUPPORT_UNICODE
         if (escape >= ESC_DU && escape <= ESC_wu)
           {
-          cb->nestptr = ptr + 1;                   /* Where to resume */
+          cb->nestptr[1] = cb->nestptr[0];         /* Back up if at 2nd level */
+          cb->nestptr[0] = ptr + 1;                /* Where to resume */
           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
           }
         else
@@ -8079,7 +8099,7 @@
 cb.cx = ccontext;
 cb.dupnames = FALSE;
 cb.end_pattern = pattern + patlen;
-cb.nestptr = NULL;
+cb.nestptr[0] = cb.nestptr[1] = NULL;
 cb.external_flags = 0;
 cb.external_options = options;
 cb.had_recurse = FALSE;


Modified: code/trunk/src/pcre2_intmodedep.h
===================================================================
--- code/trunk/src/pcre2_intmodedep.h    2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/src/pcre2_intmodedep.h    2015-10-21 11:29:07 UTC (rev 390)
@@ -686,7 +686,7 @@
   PCRE2_SPTR start_code;           /* The start of the compiled code */
   PCRE2_SPTR start_pattern;        /* The start of the pattern */
   PCRE2_SPTR end_pattern;          /* The end of the pattern */
-  PCRE2_SPTR nestptr;              /* Pointer saved for string substitution */ 
+  PCRE2_SPTR nestptr[2];           /* Pointer(s) saved for string substitution */ 
   PCRE2_UCHAR *name_table;         /* The name/number table */
   size_t workspace_size;           /* Size of workspace */
   uint16_t names_found;            /* Number of entries so far */


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/testdata/testinput5    2015-10-21 11:29:07 UTC (rev 390)
@@ -1686,4 +1686,6 @@
 \= Expect no match
     123     


+/(*UCP)(*UTF)[[:>:]]X/B
+
# End of testinput5

Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2015-10-17 18:31:29 UTC (rev 389)
+++ code/trunk/testdata/testoutput5    2015-10-21 11:29:07 UTC (rev 390)
@@ -4047,4 +4047,17 @@
     123     
 No match


+/(*UCP)(*UTF)[[:>:]]X/B
+------------------------------------------------------------------
+        Bra
+        \b
+        AssertB
+        Reverse
+        prop Xwd
+        Ket
+        X
+        Ket
+        End
+------------------------------------------------------------------
+
 # End of testinput5