[Pcre-svn] [243] code/trunk: Fix handling of global matching…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [243] code/trunk: Fix handling of global matching in pcre2test when a lookbehind assertion contains \K.
Revision: 243
          http://www.exim.org/viewvc/pcre2?view=rev&revision=243
Author:   ph10
Date:     2015-04-06 13:16:36 +0100 (Mon, 06 Apr 2015)


Log Message:
-----------
Fix handling of global matching in pcre2test when a lookbehind assertion
contains \K.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2test.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-04-03 11:14:19 UTC (rev 242)
+++ code/trunk/ChangeLog    2015-04-06 12:16:36 UTC (rev 243)
@@ -70,7 +70,10 @@
 17. The use of \K in a positive lookbehind assertion in a non-anchored pattern
 (e.g. /(?<=\Ka)/) could make pcre2grep loop.


+18. There was a similar problem to 17 in pcre2test for global matches, though
+the code there did catch the loop.

+
 Version 10.10 06-March-2015
 ---------------------------


Modified: code/trunk/src/pcre2test.c
===================================================================
--- code/trunk/src/pcre2test.c    2015-04-03 11:14:19 UTC (rev 242)
+++ code/trunk/src/pcre2test.c    2015-04-06 12:16:36 UTC (rev 243)
@@ -3557,14 +3557,14 @@
 callout argument string point to strings of the appropriate width. Casts can be
 used to deal with this.


-Argument:   
+Argument:
   cb            pointer to enumerate block
   callout_data  user data


-Returns:    0 
+Returns:    0
 */


-static int callout_callback(pcre2_callout_enumerate_block_8 *cb, 
+static int callout_callback(pcre2_callout_enumerate_block_8 *cb,
   void *callout_data)
 {
 uint32_t i;
@@ -3587,13 +3587,13 @@
       }
   fprintf(outfile, "%c  ", delimiter);
   }
-else fprintf(outfile, "%d  ", cb->callout_number); 
+else fprintf(outfile, "%d  ", cb->callout_number);


 fprintf(outfile, "%.*s\n",
   (int)((cb->next_item_length == 0)? 1 : cb->next_item_length),
   pbuffer8 + cb->pattern_position);
-
-return 0;
+
+return 0;
 }


@@ -3879,10 +3879,10 @@
     int len;
     fprintf(outfile, "Callout enumerate failed: error %d: ", errorcode);
     if (errorcode < 0)
-      {  
+      {
       PCRE2_GET_ERROR_MESSAGE(len, errorcode, pbuffer);
       PCHARSV(CASTVAR(void *, pbuffer), 0, len, FALSE, outfile);
-      } 
+      }
     fprintf(outfile, "\n");
     return PR_SKIP;
     }
@@ -5684,20 +5684,20 @@


   ovector = FLD(match_data, ovector);

-  /* After the first time round a global loop, save the current ovector[0,1] so
-  that we can check that they do change each time. Otherwise a matching bug
-  that returns the same string causes an infinite loop. It has happened! */
+  /* After the first time round a global loop, for a normal global (/g)
+  iteration, save the current ovector[0,1] so that we can check that they do
+  change each time. Otherwise a matching bug that returns the same string
+  causes an infinite loop. It has happened! */

-  if (gmatched > 0)
+  if (gmatched > 0 && (dat_datctl.control & CTL_GLOBAL) != 0)
     {
     ovecsave[0] = ovector[0];
     ovecsave[1] = ovector[1];
     }


-  /* Set the variables on the first iteration, just to stop a compiler warning
-  when ovecsave[] is referenced below. */
+  /* For altglobal (or first time round the loop), set an "unset" value. */

-  else ovecsave[0] = ovecsave[1] = 0;
+  else ovecsave[0] = ovecsave[1] = PCRE2_UNSET;

   /* Fill the ovector with junk to detect elements that do not get set
   when they should be. */
@@ -6169,14 +6169,49 @@
       if (end_offset == ulen) break;      /* End of subject */
       g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
       }
-    else g_notempty = 0;


-    /* For /g, update the start offset, leaving the rest alone */
+    /* However, even after matching a non-empty string, there is still one
+    tricky case. If a pattern contains \K within a lookbehind assertion at the
+    start, the end of the matched string can be at the offset where the match
+    started. In the case of a normal /g iteration without special action, this
+    leads to a loop that keeps on returning the same substring. The loop would
+    be caught above, but we really want to move on to the next match. */


-    if ((dat_datctl.control & CTL_GLOBAL) != 0) dat_datctl.offset = end_offset;
+    else
+      {
+      g_notempty = 0;   /* Set for a "normal" repeat */
+      if ((dat_datctl.control & CTL_GLOBAL) != 0)
+        {
+        PCRE2_SIZE startchar;
+        PCRE2_GET_STARTCHAR(startchar, match_data);
+        if (end_offset <= startchar)
+          {
+          if (startchar >= ulen) break;       /* End of subject */
+          end_offset = startchar + 1;
+          if (utf && test_mode != PCRE32_MODE)
+            {
+            if (test_mode == PCRE8_MODE)
+              {
+              for (; end_offset < ulen; end_offset++)
+                if ((((PCRE2_SPTR8)pp)[end_offset] & 0xc0) != 0x80) break;
+              }
+            else  /* 16-bit mode */
+              {
+              for (; end_offset < ulen; end_offset++)
+                if ((((PCRE2_SPTR16)pp)[end_offset] & 0xfc00) != 0xdc00) break;
+              }
+            }
+          }
+        }
+      }


-    /* For /G, update the pointer and length */
+    /* For /g (global), update the start offset, leaving the rest alone. */


+    if ((dat_datctl.control & CTL_GLOBAL) != 0)
+      dat_datctl.offset = end_offset;
+
+    /* For altglobal, just update the pointer and length. */
+
     else
       {
       pp += end_offset * code_unit_size;


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2015-04-03 11:14:19 UTC (rev 242)
+++ code/trunk/testdata/testinput2    2015-04-06 12:16:36 UTC (rev 243)
@@ -4255,4 +4255,12 @@


";(?<=()((?3))((?2)))"

+# Perl loops on this (PCRE2 used to!)
+
+/(?<=\Ka)/g,aftertext
+    aaaaa
+
+/(?<=\Ka)/altglobal,aftertext
+    aaaaa
+
 # End of testinput2 


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2015-04-03 11:14:19 UTC (rev 242)
+++ code/trunk/testdata/testinput5    2015-04-06 12:16:36 UTC (rev 243)
@@ -1643,4 +1643,10 @@
 /[A-`]/iB,utf
     abcdefghijklmno


+/(?<=\K\x{17f})/g,utf,aftertext
+    \x{17f}\x{17f}\x{17f}\x{17f}\x{17f}
+
+/(?<=\K\x{17f})/altglobal,utf,aftertext
+    \x{17f}\x{17f}\x{17f}\x{17f}\x{17f}
+
 # End of testinput5 


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2015-04-03 11:14:19 UTC (rev 242)
+++ code/trunk/testdata/testoutput2    2015-04-06 12:16:36 UTC (rev 243)
@@ -14260,4 +14260,32 @@
 ";(?<=()((?3))((?2)))"
 Failed: error 125 at offset 20: lookbehind assertion is not fixed length


+# Perl loops on this (PCRE2 used to!)
+
+/(?<=\Ka)/g,aftertext
+    aaaaa
+ 0: a
+ 0+ aaaa
+ 0: a
+ 0+ aaa
+ 0: a
+ 0+ aa
+ 0: a
+ 0+ a
+ 0: a
+ 0+ 
+
+/(?<=\Ka)/altglobal,aftertext
+    aaaaa
+ 0: a
+ 0+ aaaa
+ 0: a
+ 0+ aaa
+ 0: a
+ 0+ aa
+ 0: a
+ 0+ a
+ 0: a
+ 0+ 
+
 # End of testinput2 


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2015-04-03 11:14:19 UTC (rev 242)
+++ code/trunk/testdata/testoutput5    2015-04-06 12:16:36 UTC (rev 243)
@@ -4021,4 +4021,30 @@
     abcdefghijklmno
  0: a


+/(?<=\K\x{17f})/g,utf,aftertext
+    \x{17f}\x{17f}\x{17f}\x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}\x{17f}\x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}\x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}
+ 0: \x{17f}
+ 0+ 
+
+/(?<=\K\x{17f})/altglobal,utf,aftertext
+    \x{17f}\x{17f}\x{17f}\x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}\x{17f}\x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}\x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}\x{17f}
+ 0: \x{17f}
+ 0+ \x{17f}
+ 0: \x{17f}
+ 0+ 
+
 # End of testinput5