[Pcre-svn] [1133] code/trunk: Fix lookbehind within lookahead within lookbehind misbehaviour bug.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1133] code/trunk: Fix lookbehind within lookahead within lookbehind misbehaviour bug.
Revision: 1133
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1133
Author:   ph10
Date:     2019-07-16 16:06:21 +0100 (Tue, 16 Jul 2019)
Log Message:
-----------
Fix lookbehind within lookahead within lookbehind misbehaviour bug.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/testdata/testinput1
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput1
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/ChangeLog    2019-07-16 15:06:21 UTC (rev 1133)
@@ -90,7 +90,12 @@


18. Implement non-atomic positive lookaround assertions.

+19. If a lookbehind contained a lookahead that contained another lookbehind
+within it, the nested lookbehind was not correctly processed. For example, if
+/(?<=(?=(?<=a)))b/ was matched to "ab" it gave no match instead of matching
+"b".

+
Version 10.33 16-April-2019
---------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/src/pcre2_compile.c    2019-07-16 15:06:21 UTC (rev 1133)
@@ -135,6 +135,8 @@
   set_lookbehind_lengths(uint32_t **, int *, int *, int *,
     parsed_recurse_check *, compile_block *);


+static int
+ check_lookbehinds(uint32_t *, uint32_t **, compile_block *);


 /*************************************************
@@ -651,7 +653,7 @@
   STRING_positive_lookahead0
   STRING_positive_lookbehind0
   STRING_non_atomic_positive_lookahead0
-  STRING_non_atomic_positive_lookbehind0  
+  STRING_non_atomic_positive_lookbehind0
   STRING_negative_lookahead0
   STRING_negative_lookbehind0
   STRING_atomic0
@@ -670,7 +672,7 @@
   { 18, META_LOOKAHEAD         },
   { 19, META_LOOKBEHIND        },
   { 29, META_LOOKAHEAD_NA      },
-  { 30, META_LOOKBEHIND_NA     }, 
+  { 30, META_LOOKBEHIND_NA     },
   { 18, META_LOOKAHEADNOT      },
   { 19, META_LOOKBEHINDNOT     },
   {  6, META_ATOMIC            },
@@ -4738,7 +4740,7 @@
     case OP_ASSERT_NOT:
     case OP_ASSERTBACK:
     case OP_ASSERTBACK_NOT:
-    case OP_ASSERTBACK_NA: 
+    case OP_ASSERTBACK_NA:
     if (!skipassert) return code;
     do code += GET(code, 1); while (*code == OP_ALT);
     code += PRIV(OP_lengths)[*code];
@@ -6579,7 +6581,7 @@
     we must only take the reqcu when the group also set a firstcu. Otherwise,
     in that example, 'X' ends up set for both. */


-    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) && 
+    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
              subreqcuflags >= 0 && subfirstcuflags >= 0)
       {
       reqcu = subreqcu;
@@ -7014,10 +7016,10 @@


       case OP_ASSERT:
       case OP_ASSERT_NOT:
-      case OP_ASSERT_NA: 
+      case OP_ASSERT_NA:
       case OP_ASSERTBACK:
       case OP_ASSERTBACK_NOT:
-      case OP_ASSERTBACK_NA: 
+      case OP_ASSERTBACK_NA:
       case OP_ONCE:
       case OP_SCRIPT_RUN:
       case OP_BRA:
@@ -7973,7 +7975,7 @@
 /* Remember if this is a lookbehind assertion, and if it is, save its length
 and skip over the pattern offset. */


-lookbehind = *code == OP_ASSERTBACK || 
+lookbehind = *code == OP_ASSERTBACK ||
              *code == OP_ASSERTBACK_NOT ||
              *code == OP_ASSERTBACK_NA;


@@ -8649,10 +8651,10 @@
      case OP_CBRAPOS:
      case OP_SCBRAPOS:
      case OP_ASSERT:
-     case OP_ASSERT_NA: 
+     case OP_ASSERT_NA:
      case OP_ONCE:
      case OP_SCRIPT_RUN:
-     d = find_firstassertedcu(scode, &dflags, inassert + 
+     d = find_firstassertedcu(scode, &dflags, inassert +
        ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
      if (dflags < 0)
        return 0;
@@ -9108,16 +9110,16 @@
       }
     break;


-    /* Lookaheads can be ignored, but we must start the skip inside the group
-    so that it isn't treated as a group within the branch. */
+    /* Lookaheads do not contribute to the length of this branch, but they may
+    contain lookbehinds within them whose lengths need to be set. */


     case META_LOOKAHEAD:
     case META_LOOKAHEADNOT:
     case META_LOOKAHEAD_NA:
-    pptr = parsed_skip(pptr + 1, PSKIP_KET);
-    if (pptr == NULL) goto PARSED_SKIP_FAILED;
+    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, cb);
+    if (*errcodeptr != 0) return -1;


-    /* Also ignore any qualifiers that follow a lookahead assertion. */
+    /* Ignore any qualifiers that follow a lookahead assertion. */


     switch (pptr[1])
       {
@@ -9454,21 +9456,29 @@
 the error offset is marked unset. This enables the functions above not to
 override settings from deeper nestings.


-Arguments cb      points to the compile block
-Returns:          0 on success, or an errorcode (cb->erroroffset will be set)
+This function is called recursively from get_branchlength() for lookaheads in
+order to process any lookbehinds that they may contain. It stops when it hits a
+non-nested closing parenthesis in this case, returning a pointer to it.
+
+Arguments
+  pptr    points to where to start (start of pattern or start of lookahead)
+  retptr  if not NULL, return the ket pointer here
+  cb      points to the compile block
+
+Returns:  0 on success, or an errorcode (cb->erroroffset will be set)
 */


static int
-check_lookbehinds(compile_block *cb)
+check_lookbehinds(uint32_t *pptr, uint32_t **retptr, compile_block *cb)
{
-uint32_t *pptr;
int max;
int errorcode = 0;
int loopcount = 0;
+int nestlevel = 0;

cb->erroroffset = PCRE2_UNSET;

-for (pptr = cb->parsed_pattern; *pptr != META_END; pptr++)
+for (; *pptr != META_END; pptr++)
{
if (*pptr < META_END) continue; /* Literal */

@@ -9482,14 +9492,31 @@
       pptr += 1;
     break;


+    case META_KET:
+    if (--nestlevel < 0)
+      {
+      if (retptr != NULL) *retptr = pptr;
+      return 0;
+      }
+    break;
+
+    case META_ATOMIC:
+    case META_CAPTURE:
+    case META_COND_ASSERT:
+    case META_LOOKAHEAD:
+    case META_LOOKAHEADNOT:
+    case META_LOOKAHEAD_NA:
+    case META_NOCAPTURE:
+    case META_SCRIPT_RUN:
+    nestlevel++;
+    break;
+
     case META_ACCEPT:
     case META_ALT:
     case META_ASTERISK:
     case META_ASTERISK_PLUS:
     case META_ASTERISK_QUERY:
-    case META_ATOMIC:
     case META_BACKREF:
-    case META_CAPTURE:
     case META_CIRCUMFLEX:
     case META_CLASS:
     case META_CLASS_EMPTY:
@@ -9497,15 +9524,9 @@
     case META_CLASS_END:
     case META_CLASS_NOT:
     case META_COMMIT:
-    case META_COND_ASSERT:
     case META_DOLLAR:
     case META_DOT:
     case META_FAIL:
-    case META_KET:
-    case META_LOOKAHEAD:
-    case META_LOOKAHEADNOT:
-    case META_LOOKAHEAD_NA:
-    case META_NOCAPTURE:
     case META_PLUS:
     case META_PLUS_PLUS:
     case META_PLUS_QUERY:
@@ -9515,7 +9536,6 @@
     case META_QUERY_QUERY:
     case META_RANGE_ESCAPED:
     case META_RANGE_LITERAL:
-    case META_SCRIPT_RUN:
     case META_SKIP:
     case META_THEN:
     break;
@@ -10021,7 +10041,7 @@


if (has_lookbehind)
{
- errorcode = check_lookbehinds(&cb);
+ errorcode = check_lookbehinds(cb.parsed_pattern, NULL, &cb);
if (errorcode != 0) goto HAD_CB_ERROR;
}


Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1    2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testinput1    2019-07-16 15:06:21 UTC (rev 1133)
@@ -6377,4 +6377,10 @@
 /(?<=a(*SKIP)x)|d/
     abcd


+/(?<=(?=.(?<=x)))/aftertext
+    abx
+
+/(?<=(?=(?<=a)))b/
+    ab
+
 # End of testinput1 


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testinput2    2019-07-16 15:06:21 UTC (rev 1133)
@@ -5690,4 +5690,7 @@


# ----

+/(?<=(?=.(?<=x)))/
+    ab\=ph
+
 # End of testinput2


Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1    2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testoutput1    2019-07-16 15:06:21 UTC (rev 1133)
@@ -10097,4 +10097,13 @@
     abcd
  0: d


+/(?<=(?=.(?<=x)))/aftertext
+    abx
+ 0: 
+ 0+ x
+
+/(?<=(?=(?<=a)))b/
+    ab
+ 0: b
+
 # End of testinput1 


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2019-07-15 16:04:13 UTC (rev 1132)
+++ code/trunk/testdata/testoutput2    2019-07-16 15:06:21 UTC (rev 1133)
@@ -17185,6 +17185,10 @@


# ----

+/(?<=(?=.(?<=x)))/
+    ab\=ph
+No match
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data