[Pcre-svn] [626] code/trunk: Fix mis-parsing of a conditiona…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [626] code/trunk: Fix mis-parsing of a conditional group with callout but a question mark where
Revision: 626
          http://www.exim.org/viewvc/pcre2?view=rev&revision=626
Author:   ph10
Date:     2016-12-23 18:34:10 +0000 (Fri, 23 Dec 2016)
Log Message:
-----------
Fix mis-parsing of a conditional group with callout but a question mark where 
the assertion should start.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/ChangeLog    2016-12-23 18:34:10 UTC (rev 626)
@@ -110,6 +110,10 @@


   (p) A buffer overflow could occur while sorting the names in the group name 
       list (depending on the order in which the names were seen). 
+      
+  (q) A conditional group that started with a callout was not doing the right
+      check for a following assertion, leading to compiling bad code. Example:
+      /(?(C'XX))?!XX/   


4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the

Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/src/pcre2_compile.c    2016-12-23 18:34:10 UTC (rev 626)
@@ -2325,7 +2325,7 @@
         parsed_pattern = manage_callouts(thisptr, &previous_callout, options,
           parsed_pattern, cb);
       PARSED_LITERAL(c, parsed_pattern);
-      meta_quantifier = 0; 
+      meta_quantifier = 0;
       }
     continue;  /* Next character */
     }
@@ -2475,13 +2475,16 @@
   /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
   assertion, possibly preceded by a callout. If the value is 1, we have just
   had the callout and expect an assertion. There must be at least 3 more
-  characters in all cases. We know that the current character is an opening
-  parenthesis, as otherwise we wouldn't be here. Note that expect_cond_assert
-  may be negative, since all callouts just decrement it. */
+  characters in all cases. When expect_cond_assert is 2, we know that the
+  current character is an opening parenthesis, as otherwise we wouldn't be
+  here. However, when it is 1, we need to check, and it's easiest just to check
+  always. Note that expect_cond_assert may be negative, since all callouts just
+  decrement it. */


   if (expect_cond_assert > 0)
     {
-    BOOL ok = ptrend - ptr >= 3 && ptr[0] == CHAR_QUESTION_MARK;
+    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
+              ptr[0] == CHAR_QUESTION_MARK;
     if (ok) switch(ptr[1])
       {
       case CHAR_C:
@@ -4527,7 +4530,7 @@


 Returns:            0 There's been an error, *errorcodeptr is non-zero
                    +1 Success, this branch must match at least one character
-                   -1 Success, this branch may match an empty string 
+                   -1 Success, this branch may match an empty string
 */


 static int
@@ -4538,7 +4541,7 @@
 {
 int bravalue = 0;
 int okreturn = -1;
-int group_return = 0; 
+int group_return = 0;
 uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
 uint32_t greedy_default, greedy_non_default;
 uint32_t repeat_type, op_type;
@@ -4622,7 +4625,7 @@
   BOOL should_flip_negation;
   BOOL match_all_or_no_wide_chars;
   BOOL possessive_quantifier;
-  BOOL note_group_empty; 
+  BOOL note_group_empty;
   int class_has_8bitchar;
   int i;
   uint32_t mclength;
@@ -4687,15 +4690,15 @@
   Checking for the legality of quantifiers happens in parse_regex(), except for
   a quantifier after an assertion that is a condition. */


-  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) 
+  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
     {
     previous = code;
     if (matched_char) okreturn = 1;
-    } 
+    }


   previous_matched_char = matched_char;
   matched_char = FALSE;
-  note_group_empty = FALSE; 
+  note_group_empty = FALSE;
   skipunits = 0;         /* Default value for most subgroups */


   switch(meta)
@@ -4737,7 +4740,7 @@
     repeats. The value of reqcu doesn't change either. */


     case META_DOT:
-    matched_char = TRUE; 
+    matched_char = TRUE;
     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
     zerofirstcu = firstcu;
     zerofirstcuflags = firstcuflags;
@@ -4755,7 +4758,7 @@


     case META_CLASS_EMPTY:
     case META_CLASS_EMPTY_NOT:
-    matched_char = TRUE; 
+    matched_char = TRUE;
     *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
     if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
     zerofirstcu = firstcu;
@@ -4778,7 +4781,7 @@


     case META_CLASS_NOT:
     case META_CLASS:
-    matched_char = TRUE; 
+    matched_char = TRUE;
     negate_class = meta == META_CLASS_NOT;


     /* We can optimize the case of a single character in a class by generating
@@ -5502,7 +5505,7 @@
     goto GROUP_PROCESS_NOTE_EMPTY;


     /* The DEFINE condition is always false. It's internal groups may never
-    be called, so matched_char must remain false, hence the jump to 
+    be called, so matched_char must remain false, hence the jump to
     GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */


     case META_COND_DEFINE:
@@ -5603,12 +5606,12 @@


     /* Process nested bracketed regex. The nesting depth is maintained for the
     benefit of the stackguard function. The test for too deep nesting is now
-    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS; 
-    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take 
+    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
+    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
     note of whether or not they may match an empty string. */
-    
+
     GROUP_PROCESS_NOTE_EMPTY:
-    note_group_empty = TRUE; 
+    note_group_empty = TRUE;


     GROUP_PROCESS:
     cb->parens_depth += 1;
@@ -5619,7 +5622,7 @@
     templastcapture = cb->lastcapture;    /* Save value before group */
     length_prevgroup = 0;                 /* Initialize for pre-compile phase */


-    if ((group_return = 
+    if ((group_return =
          compile_regex(
          options,                         /* The option state */
          &tempcode,                       /* Where to put code (updated) */
@@ -5636,14 +5639,14 @@
            &length_prevgroup              /* Pre-compile phase */
          )) == 0)
       return 0;  /* Error */
-       
+
     cb->parens_depth -= 1;


-    /* If that was a non-conditional significant group (not an assertion, not a 
+    /* If that was a non-conditional significant group (not an assertion, not a
     DEFINE) that matches at least one character, then the current item matches
     a character. Conditionals are handled below. */


-    if (note_group_empty && bravalue != OP_COND && group_return > 0) 
+    if (note_group_empty && bravalue != OP_COND && group_return > 0)
       matched_char = TRUE;


     /* If that was an atomic group and there are no capturing groups within it,
@@ -5676,7 +5679,7 @@
          tc += GET(tc,1);
          }
       while (*tc != OP_KET);
-      
+
       /* A DEFINE group is never obeyed inline (the "condition" is always
       false). It must have only one branch. Having checked this, change the
       opcode to OP_FALSE. */
@@ -5695,7 +5698,7 @@


       /* A "normal" conditional group. If there is just one branch, we must not
       make use of its firstcu or reqcu, because this is equivalent to an
-      empty second branch. Also, it may match an empty string. If there are two 
+      empty second branch. Also, it may match an empty string. If there are two
       branches, this item must match a character if the group must. */


       else
@@ -6018,7 +6021,7 @@
     repeat_max = 1;


     REPEAT:
-    if (previous_matched_char && repeat_min > 0) matched_char = TRUE; 
+    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;


     /* Remember whether this is a variable length repeat, and default to
     single-char opcodes. */
@@ -6087,7 +6090,7 @@
       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
       code += 2 + 2 * LINK_SIZE;
       length_prevgroup = 3 + 3*LINK_SIZE;
-      group_return = -1;  /* Set "may match empty string" */ 
+      group_return = -1;  /* Set "may match empty string" */
       }


     /* Now handle repetition for the different types of item. */
@@ -6444,9 +6447,9 @@


           else
             {
-            /* In the compile phase, adjust the opcode if the group can match 
+            /* In the compile phase, adjust the opcode if the group can match
             an empty string. For a conditional group with only one branch, the
-            value of group_return will not show "could be empty", so we must 
+            value of group_return will not show "could be empty", so we must
             check that separately. */


             if (lengthptr == NULL)
@@ -6877,10 +6880,10 @@
     character if it hasn't already been set. */


     if (meta_arg > ESC_b && meta_arg < ESC_Z)
-      { 
-      matched_char = TRUE; 
+      {
+      matched_char = TRUE;
       if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
-      } 
+      }


     /* Set values to reset to if this is followed by a zero repeat. */


@@ -6946,7 +6949,7 @@
     NORMAL_CHAR:
     meta = *pptr;     /* Get the full 32 bits */
     NORMAL_CHAR_SET:  /* Character is already in meta */
-    matched_char = TRUE; 
+    matched_char = TRUE;


     /* For caseless UTF mode, check whether this character has more than one
     other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
@@ -7071,7 +7074,7 @@
 Returns:            0 There has been an error
                    +1 Success, this group must match at least one character
                    -1 Success, this group may match an empty string
-*/                    
+*/


 static int
 compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
@@ -7156,7 +7159,7 @@
 for (;;)
   {
   int branch_return;
-    
+
   /* Insert OP_REVERSE if this is as lookbehind assertion. */


if (lookbehind && lookbehindlength > 0)
@@ -7169,14 +7172,14 @@
/* Now compile the branch; in the pre-compile phase its length gets added
into the length. */

-  if ((branch_return = 
+  if ((branch_return =
         compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
           &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
           cb, (lengthptr == NULL)? NULL : &length)) == 0)
     return 0;
-    
+
   /* If a branch can match an empty string, so can the whole group. */
-    
+
   if (branch_return < 0) okreturn = -1;


   /* In the real compile phase, there is some post-processing to be done. */
@@ -7308,7 +7311,7 @@
         }
       *lengthptr += length;
       }
-// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);       
+// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);
     return okreturn;
     }


@@ -9127,7 +9130,7 @@
pptr = cb.parsed_pattern;
code = (PCRE2_UCHAR *)codestart;
*code = OP_BRA;
-regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
+regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
re->top_bracket = cb.bracount;

Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/testdata/testinput2    2016-12-23 18:34:10 UTC (rev 626)
@@ -4944,4 +4944,10 @@


/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand

+# This one used to compile rubbish instead of a compile error, and then
+# behave unpredictably at match time.
+
+/.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X/
+    .+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
+
 # End of testinput2 


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2016-12-23 17:36:22 UTC (rev 625)
+++ code/trunk/testdata/testoutput2    2016-12-23 18:34:10 UTC (rev 626)
@@ -15424,6 +15424,13 @@


/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand

+# This one used to compile rubbish instead of a compile error, and then
+# behave unpredictably at match time.
+
+/.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X/
+Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
+    .+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
+
 # End of testinput2 
 Error -63: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data