[Pcre-svn] [1543] code/trunk: Fix pcregrep loop when \K is u…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1543] code/trunk: Fix pcregrep loop when \K is used in a lookbehind assertion.
Revision: 1543
          http://vcs.pcre.org/viewvc?view=rev&revision=1543
Author:   ph10
Date:     2015-04-07 16:52:11 +0100 (Tue, 07 Apr 2015)


Log Message:
-----------
Fix pcregrep loop when \K is used in a lookbehind assertion.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/RunGrepTest
    code/trunk/pcregrep.c
    code/trunk/testdata/grepoutput


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-04-01 15:43:53 UTC (rev 1542)
+++ code/trunk/ChangeLog    2015-04-07 15:52:11 UTC (rev 1543)
@@ -145,6 +145,9 @@
 35. A mutual recursion within a lookbehind assertion such as (?<=((?2))((?1)))
     caused a stack overflow instead of the diagnosis of a non-fixed length
     lookbehind assertion. This bug was discovered by the LLVM fuzzer.
+    
+36. The use of \K in a positive lookbehind assertion in a non-anchored pattern
+    (e.g. /(?<=\Ka)/) could make pcregrep loop.



Version 8.36 26-September-2014

Modified: code/trunk/RunGrepTest
===================================================================
--- code/trunk/RunGrepTest    2015-04-01 15:43:53 UTC (rev 1542)
+++ code/trunk/RunGrepTest    2015-04-07 15:52:11 UTC (rev 1543)
@@ -506,6 +506,11 @@
 (cd $srcdir; echo "a" | $valgrind $pcregrep -M "|a" ) >>testtrygrep 2>&1
 echo "RC=$?" >>testtrygrep


+echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
+echo "a" >testtemp1grep
+echo "aaaaa" >>testtemp1grep
+(cd $srcdir; $valgrind $pcregrep --line-offsets '(?<=\Ka)' testtemp1grep) >>testtrygrep 2>&1
+echo "RC=$?" >>testtrygrep

# Now compare the results.


Modified: code/trunk/pcregrep.c
===================================================================
--- code/trunk/pcregrep.c    2015-04-01 15:43:53 UTC (rev 1542)
+++ code/trunk/pcregrep.c    2015-04-07 15:52:11 UTC (rev 1543)
@@ -1582,11 +1582,14 @@
   int endlinelength;
   int mrc = 0;
   int startoffset = 0;
+  int prevoffsets[2]; 
   unsigned int options = 0;
   BOOL match;
   char *matchptr = ptr;
   char *t = ptr;
   size_t length, linelength;
+  
+  prevoffsets[0] = prevoffsets[1] = -1; 


   /* At this point, ptr is at the start of a line. We need to find the length
   of the subject string to pass to pcre_exec(). In multiline mode, it is the
@@ -1729,55 +1732,86 @@
       {
       if (!invert)
         {
-        if (printname != NULL) fprintf(stdout, "%s:", printname);
-        if (number) fprintf(stdout, "%d:", linenumber);
-
-        /* Handle --line-offsets */
-
-        if (line_offsets)
-          fprintf(stdout, "%d,%d\n", (int)(matchptr + offsets[0] - ptr),
-            offsets[1] - offsets[0]);
-
-        /* Handle --file-offsets */
-
-        else if (file_offsets)
-          fprintf(stdout, "%d,%d\n",
-            (int)(filepos + matchptr + offsets[0] - ptr),
-            offsets[1] - offsets[0]);
-
-        /* Handle --only-matching, which may occur many times */
-
-        else
+        int oldstartoffset = startoffset;
+        
+        /* It is possible, when a lookbehind assertion contains \K, for the 
+        same string to be found again. The code below advances startoffset, but 
+        until it is past the "bumpalong" offset that gave the match, the same
+        substring will be returned. The PCRE1 library does not return the
+        bumpalong offset, so all we can do is ignore repeated strings. (PCRE2
+        does this better.) */
+         
+        if (prevoffsets[0] != offsets[0] || prevoffsets[1] != offsets[1])
           {
-          BOOL printed = FALSE;
-          omstr *om;
-
-          for (om = only_matching; om != NULL; om = om->next)
+          prevoffsets[0] = offsets[0];
+          prevoffsets[1] = offsets[1]; 
+            
+          if (printname != NULL) fprintf(stdout, "%s:", printname);
+          if (number) fprintf(stdout, "%d:", linenumber);
+          
+          /* Handle --line-offsets */
+          
+          if (line_offsets)
+            fprintf(stdout, "%d,%d\n", (int)(matchptr + offsets[0] - ptr),
+              offsets[1] - offsets[0]);
+          
+          /* Handle --file-offsets */
+          
+          else if (file_offsets)
+            fprintf(stdout, "%d,%d\n",
+              (int)(filepos + matchptr + offsets[0] - ptr),
+              offsets[1] - offsets[0]);
+          
+          /* Handle --only-matching, which may occur many times */
+          
+          else
             {
-            int n = om->groupnum;
-            if (n < mrc)
+            BOOL printed = FALSE;
+            omstr *om;
+          
+            for (om = only_matching; om != NULL; om = om->next)
               {
-              int plen = offsets[2*n + 1] - offsets[2*n];
-              if (plen > 0)
+              int n = om->groupnum;
+              if (n < mrc)
                 {
-                if (printed) fprintf(stdout, "%s", om_separator);
-                if (do_colour) fprintf(stdout, "%c[%sm", 0x1b, colour_string);
-                FWRITE(matchptr + offsets[n*2], 1, plen, stdout);
-                if (do_colour) fprintf(stdout, "%c[00m", 0x1b);
-                printed = TRUE;
+                int plen = offsets[2*n + 1] - offsets[2*n];
+                if (plen > 0)
+                  {
+                  if (printed) fprintf(stdout, "%s", om_separator);
+                  if (do_colour) fprintf(stdout, "%c[%sm", 0x1b, colour_string);
+                  FWRITE(matchptr + offsets[n*2], 1, plen, stdout);
+                  if (do_colour) fprintf(stdout, "%c[00m", 0x1b);
+                  printed = TRUE;
+                  }
                 }
               }
+          
+            if (printed || printname != NULL || number) fprintf(stdout, "\n");
             }
+          }   


-          if (printed || printname != NULL || number) fprintf(stdout, "\n");
-          }
+        /* Prepare to repeat to find the next match. If the pattern contained 
+        a lookbehind that included \K, it is possible that the end of the match 
+        might be at or before the actual starting offset we have just used. We 
+        need to start one character further on. Unfortunately, for unanchored 
+        patterns, the actual start offset can be greater than the one that was 
+        set as a result of "bumpalong". PCRE1 does not return the actual start 
+        offset, so we have to check against the original start offset. This may 
+        lead to duplicates - so we need the fudge above to avoid printing them. 
+        (PCRE2 does this better.) */


-        /* Prepare to repeat to find the next match */
-
         match = FALSE;
         if (line_buffered) fflush(stdout);
         rc = 0;                      /* Had some success */
         startoffset = offsets[1];    /* Restart after the match */
+        if (startoffset <= oldstartoffset)
+          {
+          if ((size_t)startoffset >= length) 
+            goto END_ONE_MATCH;              /* We were at the end */
+          startoffset = oldstartoffset + 1;
+          if (utf8)
+            while ((matchptr[startoffset] & 0xc0) == 0x80) startoffset++;    
+          }   
         goto ONLY_MATCHING_RESTART;
         }
       }
@@ -1974,6 +2008,7 @@
   /* Advance to after the newline and increment the line number. The file
   offset to the current line is maintained in filepos. */


+ END_ONE_MATCH:
ptr += linelength + endlinelength;
filepos += (int)(linelength + endlinelength);
linenumber++;

Modified: code/trunk/testdata/grepoutput
===================================================================
--- code/trunk/testdata/grepoutput    2015-04-01 15:43:53 UTC (rev 1542)
+++ code/trunk/testdata/grepoutput    2015-04-07 15:52:11 UTC (rev 1543)
@@ -743,3 +743,11 @@
 ---------------------------- Test 106 -----------------------------
 a
 RC=0
+---------------------------- Test 107 -----------------------------
+1:0,1
+2:0,1
+2:1,1
+2:2,1
+2:3,1
+2:4,1
+RC=0