[Pcre-svn] [912] code/trunk: Fix auto-possessification bug a…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [912] code/trunk: Fix auto-possessification bug at the end of a capturing group that is called recursively
Revision: 912
          http://www.exim.org/viewvc/pcre2?view=rev&revision=912
Author:   ph10
Date:     2018-01-31 17:53:56 +0000 (Wed, 31 Jan 2018)
Log Message:
-----------
Fix auto-possessification bug at the end of a capturing group that is called 
recursively.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_auto_possess.c
    code/trunk/testdata/testinput1
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput1
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/ChangeLog    2018-01-31 17:53:56 UTC (rev 912)
@@ -134,7 +134,16 @@
 131072, which allows for the maximum number of captures (65535) plus the 
 overall match. This fixes oss-fuzz issue 5415.


+31. Auto-possessification at the end of a capturing group was dependent on what
+follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused
+incorrect behaviour when the group was called recursively from elsewhere in the
+pattern where something different might follow. This bug is an unforeseen
+consequence of change #1 for 10.30 - the implementation of backtracking into
+recursions. Iterators at the ends of capturing groups are no longer considered
+for auto-possessification if the pattern contains any recursions. Fixes
+Bugzilla #2232.

+
Version 10.30 14-August-2017
----------------------------


Modified: code/trunk/src/pcre2_auto_possess.c
===================================================================
--- code/trunk/src/pcre2_auto_possess.c    2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/src/pcre2_auto_possess.c    2018-01-31 17:53:56 UTC (rev 912)
@@ -558,6 +558,8 @@
     continue;
     }


+  /* At the end of a branch, skip to the end of the group. */
+
   if (c == OP_ALT)
     {
     do code += GET(code, 1); while (*code == OP_ALT);
@@ -564,25 +566,49 @@
     c = *code;
     }


+  /* Inspect the next opcode. */
+
   switch(c)
     {
+    /* We can always possessify a greedy iterator at the end of the pattern,
+    which is reached after skipping over the final OP_KET. A non-greedy
+    iterator must never be possessified. */
+
     case OP_END:
-    case OP_KETRPOS:
-    /* TRUE only in greedy case. The non-greedy case could be replaced by
-    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
-    uses more memory, which we cannot get at this stage.) */
-
     return base_list[1] != 0;


+    /* When an iterator is at the end of certain kinds of group we can inspect
+    what follows the group by skipping over the closing ket. Note that this
+    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
+    iteration is variable (could be another iteration or could be the next
+    item). As these two opcodes are not listed in the next switch, they will
+    end up as the next code to inspect, and return FALSE by virtue of being
+    unsupported. */
+
     case OP_KET:
-    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
-    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
-    cannot be converted to a possessive form. */
+    case OP_KETRPOS:
+    /* The non-greedy case cannot be converted to a possessive form. */


     if (base_list[1] == 0) return FALSE;


+    /* If the bracket is capturing it might be referenced by an OP_RECURSE
+    so its last iterator can never be possessified if the pattern contains
+    recursions. (This could be improved by keeping a list of group numbers that
+    are called by recursion.) */
+
     switch(*(code - GET(code, 1)))
       {
+      case OP_CBRA:
+      case OP_SCBRA:
+      case OP_CBRAPOS:
+      case OP_SCBRAPOS:
+      if (cb->had_recurse) return FALSE;
+      break;
+
+      /* Atomic sub-patterns and assertions can always auto-possessify their
+      last iterator. However, if the group was entered as a result of checking
+      a previous iterator, this is not possible. */
+
       case OP_ASSERT:
       case OP_ASSERT_NOT:
       case OP_ASSERTBACK:
@@ -589,16 +615,16 @@
       case OP_ASSERTBACK_NOT:
       case OP_ONCE:


-      /* Atomic sub-patterns and assertions can always auto-possessify their
-      last iterator. However, if the group was entered as a result of checking
-      a previous iterator, this is not possible. */
-
       return !entered_a_group;
       }


+    /* Skip over the bracket and inspect what comes next. */
+
     code += PRIV(OP_lengths)[c];
     continue;


+    /* Handle cases where the next item is a group. */
+
     case OP_ONCE:
     case OP_BRA:
     case OP_CBRA:
@@ -637,11 +663,15 @@
     code += PRIV(OP_lengths)[c];
     continue;


+    /* The next opcode does not need special handling; fall through and use it
+    to see if the base can be possessified. */
+
     default:
     break;
     }


- /* Check for a supported opcode, and load its properties. */
+ /* We now have the next appropriate opcode to compare with the base. Check
+ for a supported opcode, and load its properties. */

   code = get_chr_property_list(code, utf, cb->fcc, list);
   if (code == NULL) return FALSE;    /* Unsupported */


Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1    2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testinput1    2018-01-31 17:53:56 UTC (rev 912)
@@ -6159,4 +6159,34 @@
 /((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
     XYYZ


+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+    aa
+    a
+
+/^(a?)b(?1)a/
+    abaa
+    aba 
+    baa
+    ba  
+
+/^(a?)+b(?1)a/
+    abaa
+    aba 
+    baa
+    ba  
+
+/^(a?)++b(?1)a/
+    abaa
+    aba 
+    baa
+    ba  
+
+/^(a?)+b/
+    b
+    ab
+    aaab 
+
+/(?=a+)a(a+)++b/
+    aab
+
 # End of testinput1 


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testinput2    2018-01-31 17:53:56 UTC (rev 912)
@@ -5412,4 +5412,21 @@
 \= Expect no match
     \na


+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified. 
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+    
+/^(a?)b(?1)a/B
+
+/^(a?)+b(?1)a/B
+
+/^(a?)++b(?1)a/B
+
+/^(a?)+b/B
+
+/(?=a+)a(a+)++b/B
+
 # End of testinput2


Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1    2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testoutput1    2018-01-31 17:53:56 UTC (rev 912)
@@ -9758,4 +9758,68 @@
  1: Y
  2: 


+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+    aa
+ 0: aa
+    a
+ 0: a
+
+/^(a?)b(?1)a/
+    abaa
+ 0: abaa
+ 1: a
+    aba 
+ 0: aba
+ 1: a
+    baa
+ 0: baa
+ 1: 
+    ba  
+ 0: ba
+ 1: 
+
+/^(a?)+b(?1)a/
+    abaa
+ 0: abaa
+ 1: 
+    aba 
+ 0: aba
+ 1: 
+    baa
+ 0: baa
+ 1: 
+    ba  
+ 0: ba
+ 1: 
+
+/^(a?)++b(?1)a/
+    abaa
+ 0: abaa
+ 1: 
+    aba 
+ 0: aba
+ 1: 
+    baa
+ 0: baa
+ 1: 
+    ba  
+ 0: ba
+ 1: 
+
+/^(a?)+b/
+    b
+ 0: b
+ 1: 
+    ab
+ 0: ab
+ 1: 
+    aaab 
+ 0: aaab
+ 1: 
+
+/(?=a+)a(a+)++b/
+    aab
+ 0: aab
+ 1: a
+
 # End of testinput1 


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2018-01-16 16:50:40 UTC (rev 911)
+++ code/trunk/testdata/testoutput2    2018-01-31 17:53:56 UTC (rev 912)
@@ -12701,7 +12701,7 @@
         Ket
         a
         CBraPos 1
-        a++
+        a+
         KetRpos
         a
         Ket
@@ -16468,6 +16468,113 @@
     \na
 No match


+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified. 
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+------------------------------------------------------------------
+        Bra
+        Cond
+        Cond false
+        CBra 1
+        a?
+        Ket
+        Ket
+        ^
+        Recurse
+        a
+        $
+        Ket
+        End
+------------------------------------------------------------------
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+------------------------------------------------------------------
+        Bra
+        Cond
+        Cond false
+        CBra 1
+        a?
+        Ket
+        X
+        Ket
+        ^
+        Recurse
+        a
+        $
+        Ket
+        End
+------------------------------------------------------------------
+    
+/^(a?)b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        CBra 1
+        a?
+        Ket
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)+b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBra 1
+        a?
+        KetRmax
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)++b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBraPos 1
+        a?
+        KetRpos
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)+b/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBra 1
+        a?
+        KetRmax
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
+/(?=a+)a(a+)++b/B
+------------------------------------------------------------------
+        Bra
+        Assert
+        a++
+        Ket
+        a
+        CBraPos 1
+        a++
+        KetRpos
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data