[Pcre-svn] [364] code/trunk: Several bugs concerned with ski…

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [364] code/trunk: Several bugs concerned with skipping over UTF-8 characters at the start of
Revision: 364
          http://vcs.pcre.org/viewvc?view=rev&revision=364
Author:   ph10
Date:     2008-07-11 15:53:41 +0100 (Fri, 11 Jul 2008)


Log Message:
-----------
Several bugs concerned with skipping over UTF-8 characters at the start of
matching (8.0/13, 8.0/14).

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/pcre_internal.h
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/ChangeLog    2008-07-11 14:53:41 UTC (rev 364)
@@ -55,6 +55,13 @@
     pcre_dfa_exec() could read past the end of the passed subject if there was 
     no match. To help with detecting such bugs (e.g. with valgrind), I modified
     pcretest so that it places the subject at the end of its malloc-ed buffer.
+    
+13. The change to pcretest in 12 above threw up a couple more cases when pcre_
+    exec() might read past the end of the data buffer in UTF-8 mode. 
+    
+14. A similar bug to 7.3/2 existed when the PCRE_FIRSTLINE option was set and
+    the data contained the byte 0x85 as part of a UTF-8 character within its 
+    first line.  



Version 7.7 07-May-08

Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/pcre_exec.c    2008-07-11 14:53:41 UTC (rev 364)
@@ -4695,32 +4695,82 @@


   if (firstline)
     {
-    USPTR t = start_match;
+    USPTR t = start_match;
+#ifdef SUPPORT_UTF8
+    if (utf8)
+      {     
+      while (t < md->end_subject && !IS_NEWLINE(t)) 
+        {
+        t++;
+        while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+        } 
+      }
+    else
+#endif        
     while (t < md->end_subject && !IS_NEWLINE(t)) t++;
     end_subject = t;
     }


- /* Now test for a unique first byte */
+ /* Now advance to a unique first byte if there is one. */

   if (first_byte >= 0)
     {
     if (first_byte_caseless)
-      while (start_match < end_subject &&
-             md->lcc[*start_match] != first_byte)
-        { NEXTCHAR(start_match); }
-    else
+      {
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+          {
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        }
+      else
+#endif                  
+      while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+        start_match++;
+      }   
+    else    /* Caseful case */
+      { 
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (start_match < end_subject && *start_match != first_byte)
+          {
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        }
+      else
+#endif                  
       while (start_match < end_subject && *start_match != first_byte)
-        { NEXTCHAR(start_match); }
+        start_match++;
+      }   
     }


- /* Or to just after a linebreak for a multiline match if possible */
+ /* Or to just after a linebreak for a multiline match */

   else if (startline)
     {
     if (start_match > md->start_subject + start_offset)
       {
+#ifdef SUPPORT_UTF8
+      if (utf8)
+        {
+        while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          {
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        }
+      else
+#endif                  
       while (start_match < end_subject && !WAS_NEWLINE(start_match))
-        { NEXTCHAR(start_match); }
+        start_match++;


       /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
       and we are now at a LF, advance the match position by one more character.
@@ -4734,16 +4784,32 @@
       }
     }


- /* Or to a non-unique first char after study */
+ /* Or to a non-unique first byte after study */

   else if (start_bits != NULL)
     {
+#ifdef SUPPORT_UTF8    
+    if (utf8)
+      { 
+      while (start_match < end_subject)
+        {
+        register unsigned int c = *start_match;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0)
+          { 
+          start_match++;       
+          while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 
+            start_match++;
+          } 
+        else break;
+        }
+      }
+    else
+#endif           
     while (start_match < end_subject)
       {
       register unsigned int c = *start_match;
-      if ((start_bits[c/8] & (1 << (c&7))) == 0)
-        { NEXTCHAR(start_match); }
-      else break;
+      if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
+        else break;
       }
     }



Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/pcre_internal.h    2008-07-11 14:53:41 UTC (rev 364)
@@ -381,7 +381,6 @@
 support is omitted, we don't even define it. */


#ifndef SUPPORT_UTF8
-#define NEXTCHAR(p) p++;
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@@ -391,13 +390,6 @@

#else /* SUPPORT_UTF8 */

-/* Advance a character pointer one byte in non-UTF-8 mode and by one character
-in UTF-8 mode. */
-
-#define NEXTCHAR(p) \
- p++; \
- if (utf8) { while((*p & 0xc0) == 0x80) p++; }
-
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/testdata/testinput5    2008-07-11 14:53:41 UTC (rev 364)
@@ -477,4 +477,7 @@
     \x{de}\x{de}
     \x{123} 


+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+
 / End of testinput5 /


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2008-07-10 16:24:33 UTC (rev 363)
+++ code/trunk/testdata/testoutput5    2008-07-11 14:53:41 UTC (rev 364)
@@ -1637,4 +1637,8 @@
 ** Truncation will probably give the wrong result.
 No match


+/X/8f<any> 
+    A\x{1ec5}ABCXYZ
+ 0: X
+
 / End of testinput5 /