[Pcre-svn] [900] code/trunk: Previous FIRSTLINE patch was broken.

Inizio della pagina
Delete this message
Autore: Subversion repository
Data:  
To: pcre-svn
Oggetto: [Pcre-svn] [900] code/trunk: Previous FIRSTLINE patch was broken.
Revision: 900
          http://www.exim.org/viewvc/pcre2?view=rev&revision=900
Author:   ph10
Date:     2018-01-01 14:54:06 +0000 (Mon, 01 Jan 2018)
Log Message:
-----------
Previous FIRSTLINE patch was broken. Fix it.


Modified Paths:
--------------
    code/trunk/src/pcre2_dfa_match.c
    code/trunk/src/pcre2_match.c
    code/trunk/testdata/testinput2
    code/trunk/testdata/testinput6
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput6


Modified: code/trunk/src/pcre2_dfa_match.c
===================================================================
--- code/trunk/src/pcre2_dfa_match.c    2018-01-01 14:12:35 UTC (rev 899)
+++ code/trunk/src/pcre2_dfa_match.c    2018-01-01 14:54:06 UTC (rev 900)
@@ -3363,8 +3363,6 @@
   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
       (options & PCRE2_DFA_RESTART) == 0)
     {
-    PCRE2_SPTR save_end_subject = end_subject;
-
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
     first newline following the start of matching. Temporarily adjust
@@ -3388,13 +3386,6 @@
       else
 #endif
       while (t < end_subject && !IS_NEWLINE(t)) t++;
-
-      /* Note that we only need to advance by one code unit if we found a
-      newline. If the newline is CRLF, a first code unit of LF should not
-      match, because it is not at or before the newline. Similarly, only the
-      first code unit of a Unicode newline might be relevant. */
-
-      if (t < end_subject) t++;
       end_subject = t;
       }


@@ -3466,14 +3457,18 @@
 #endif
           }


-        /* If we can't find the required code unit, break the bumpalong loop,
-        to force a match failure, except when doing partial matching, when we
-        let the next cycle run at the end of the subject. To see why, consider
-        the pattern /(?<=abc)def/, which partially matches "abc", even though
-        the string does not contain the starting character "d". */
+        /* If we can't find the required code unit, having reached the true end
+        of the subject, break the bumpalong loop, to force a match failure,
+        except when doing partial matching, when we let the next cycle run at
+        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+        which partially matches "abc", even though the string does not contain
+        the starting character "d". If we have not reached the true end of the
+        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+        we also let the cycle run, because the matching string is legitimately
+        allowed to start with the first code unit of a newline. */


         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
-            start_match >= end_subject)
+            start_match >= mb->end_subject)
           break;
         }


@@ -3532,7 +3527,7 @@

     /* Restore fudged end_subject */


-    end_subject = save_end_subject;
+    end_subject = mb->end_subject;


     /* The following two optimizations are disabled for partial matching. */



Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c    2018-01-01 14:12:35 UTC (rev 899)
+++ code/trunk/src/pcre2_match.c    2018-01-01 14:54:06 UTC (rev 900)
@@ -7,7 +7,7 @@


                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2015-2017 University of Cambridge
+          New API code Copyright (c) 2015-2018 University of Cambridge


-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -6363,15 +6363,11 @@

   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
     {
-    PCRE2_SPTR save_end_subject = end_subject;
-
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
     first newline following the start of matching. Temporarily adjust
-    end_subject so that we stop the optimization scans for a first code unit
-    immediately after the first character of a newline (the first code unit can
-    legitimately be a newline). If the match fails at the newline, later code
-    breaks this loop. */
+    end_subject so that we stop the scans for a first code unit at a newline.
+    If the match fails at the newline, later code breaks the loop. */


     if (firstline)
       {
@@ -6388,13 +6384,6 @@
       else
 #endif
       while (t < end_subject && !IS_NEWLINE(t)) t++;
-
-      /* Note that we only need to advance by one code unit if we found a
-      newline. If the newline is CRLF, a first code unit of LF should not
-      match, because it is not at or before the newline. Similarly, only the
-      first code unit of a Unicode newline might be relevant. */
-
-      if (t < end_subject) t++;
       end_subject = t;
       }


@@ -6470,13 +6459,17 @@
 #endif
           }


-        /* If we can't find the required code unit, break the bumpalong loop,
-        to force a match failure, except when doing partial matching, when we
-        let the next cycle run at the end of the subject. To see why, consider
-        the pattern /(?<=abc)def/, which partially matches "abc", even though
-        the string does not contain the starting character "d". */
+        /* If we can't find the required code unit, having reached the true end
+        of the subject, break the bumpalong loop, to force a match failure,
+        except when doing partial matching, when we let the next cycle run at
+        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+        which partially matches "abc", even though the string does not contain
+        the starting character "d". If we have not reached the true end of the
+        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+        we also let the cycle run, because the matching string is legitimately
+        allowed to start with the first code unit of a newline. */


-        if (!mb->partial && start_match >= end_subject)
+        if (!mb->partial && start_match >= mb->end_subject)
           {
           rc = MATCH_NOMATCH;
           break;
@@ -6538,7 +6531,7 @@


     /* Restore fudged end_subject */


-    end_subject = save_end_subject;
+    end_subject = mb->end_subject;


     /* The following two optimizations must be disabled for partial matching. */



Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2018-01-01 14:12:35 UTC (rev 899)
+++ code/trunk/testdata/testinput2    2018-01-01 14:54:06 UTC (rev 900)
@@ -5405,4 +5405,8 @@
 \= Expect no match
     xyz\r\nabc


+/[abc]/firstline
+\= Expect no match
+    \na
+    
 # End of testinput2


Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6    2018-01-01 14:12:35 UTC (rev 899)
+++ code/trunk/testdata/testinput6    2018-01-01 14:54:06 UTC (rev 900)
@@ -4942,4 +4942,8 @@
 \= Expect no match
     xyz\r\nabc


+/[abc]/firstline
+\= Expect no match
+    \na
+    
 # End of testinput6


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2018-01-01 14:12:35 UTC (rev 899)
+++ code/trunk/testdata/testoutput2    2018-01-01 14:54:06 UTC (rev 900)
@@ -16453,6 +16453,11 @@
     xyz\r\nabc
 No match


+/[abc]/firstline
+\= Expect no match
+    \na
+No match
+    
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data


Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6    2018-01-01 14:12:35 UTC (rev 899)
+++ code/trunk/testdata/testoutput6    2018-01-01 14:54:06 UTC (rev 900)
@@ -7766,4 +7766,9 @@
     xyz\r\nabc
 No match


+/[abc]/firstline
+\= Expect no match
+    \na
+No match
+    
 # End of testinput6