[Pcre-svn] [365] code/trunk: Further fixes for bumpalong processing in UTF-8 mode.

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [365] code/trunk: Further fixes for bumpalong processing in UTF-8 mode.
Revision: 365
          http://vcs.pcre.org/viewvc?view=rev&revision=365
Author:   ph10
Date:     2008-07-11 18:06:55 +0100 (Fri, 11 Jul 2008)


Log Message:
-----------
Further fixes for bumpalong processing in UTF-8 mode.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_dfa_exec.c
    code/trunk/pcre_exec.c
    code/trunk/testdata/testinput8
    code/trunk/testdata/testoutput8


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/ChangeLog    2008-07-11 17:06:55 UTC (rev 365)
@@ -61,7 +61,7 @@


 14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
     the data contained the byte 0x85 as part of a UTF-8 character within its 
-    first line.  
+    first line. This applied both to normal and DFA matching. 



Version 7.7 07-May-08

Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c    2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/pcre_dfa_exec.c    2008-07-11 17:06:55 UTC (rev 365)
@@ -2733,7 +2733,18 @@


     if (firstline)
       {
-      const uschar *t = current_subject;
+      USPTR t = current_subject;
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {     
+        while (t < md->end_subject && !IS_NEWLINE(t)) 
+          {
+          t++;
+          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+          } 
+        }
+      else
+#endif        
       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
       end_subject = t;
       }
@@ -2755,9 +2766,22 @@
       {
       if (current_subject > md->start_subject + start_offset)
         {
+#ifdef SUPPORT_UTF8
+        if (utf8)
+          {
+          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
+            {
+            current_subject++;       
+            while(current_subject < end_subject && 
+                  (*current_subject & 0xc0) == 0x80) 
+              current_subject++;
+            } 
+          }
+        else
+#endif                  
         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
           current_subject++;
-
+          
         /* If we have just passed a CR and the newline option is ANY or
         ANYCRLF, and we are now at a LF, advance the match position by one more
         character. */


Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/pcre_exec.c    2008-07-11 17:06:55 UTC (rev 365)
@@ -4695,7 +4695,7 @@


   if (firstline)
     {
-    USPTR *t = start_match;
+    USPTR t = start_match;
 #ifdef SUPPORT_UTF8
     if (utf8)
       {     
@@ -4716,39 +4716,11 @@
   if (first_byte >= 0)
     {
     if (first_byte_caseless)
-      {
-#ifdef SUPPORT_UTF8
-      if (utf8)
-        {
-        while (start_match < end_subject && md->lcc[*start_match] != first_byte)
-          {
-          start_match++;       
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
-            start_match++;
-          } 
-        }
-      else
-#endif                  
       while (start_match < end_subject && md->lcc[*start_match] != first_byte)
         start_match++;
-      }   
-    else    /* Caseful case */
-      { 
-#ifdef SUPPORT_UTF8
-      if (utf8)
-        {
-        while (start_match < end_subject && *start_match != first_byte)
-          {
-          start_match++;       
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
-            start_match++;
-          } 
-        }
-      else
-#endif                  
+    else
       while (start_match < end_subject && *start_match != first_byte)
         start_match++;
-      }   
     }


/* Or to just after a linebreak for a multiline match */
@@ -4788,23 +4760,6 @@

   else if (start_bits != NULL)
     {
-#ifdef SUPPORT_UTF8    
-    if (utf8)
-      { 
-      while (start_match < end_subject)
-        {
-        register unsigned int c = *start_match;
-        if ((start_bits[c/8] & (1 << (c&7))) == 0)
-          { 
-          start_match++;       
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
-            start_match++;
-          } 
-        else break;
-        }
-      }
-    else
-#endif           
     while (start_match < end_subject)
       {
       register unsigned int c = *start_match;


Modified: code/trunk/testdata/testinput8
===================================================================
--- code/trunk/testdata/testinput8    2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/testdata/testinput8    2008-07-11 17:06:55 UTC (rev 365)
@@ -664,4 +664,7 @@
     a\x{85}b\<bsr_anycrlf>
     a\x0bb\<bsr_anycrlf>


+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+
 / End of testinput 8 / 


Modified: code/trunk/testdata/testoutput8
===================================================================
--- code/trunk/testdata/testoutput8    2008-07-11 14:53:41 UTC (rev 364)
+++ code/trunk/testdata/testoutput8    2008-07-11 17:06:55 UTC (rev 365)
@@ -1284,4 +1284,8 @@
     a\x0bb\<bsr_anycrlf>
 No match


+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+ 0: X
+
 / End of testinput 8 /