[Pcre-svn] [556] code/trunk: Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE

Autor: Subversion repository
Data:
Para: pcre-svn
Assunto: [Pcre-svn] [556] code/trunk: Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.

Revision: 556

          http://vcs.pcre.org/viewvc?view=rev&revision=556
Author:   ph10
Date:     2010-10-26 12:06:44 +0100 (Tue, 26 Oct 2010)

Log Message:
-----------
Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/doc/pcrepattern.3
    code/trunk/pcre_compile.c
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput5

Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/ChangeLog    2010-10-26 11:06:44 UTC (rev 556)
@@ -39,6 +39,19 @@
     /t\b/ matched against "cat" with PCRE_PARTIAL_HARD set did return a partial
     match rather than a full match, which was wrong by the old rules, but is 
     now correct.] 
+    
+6.  There was a bug in the handling of #-introduced comments, recognized when
+    PCRE_EXTENDED is set, when PCRE_NEWLINE_ANY and PCRE_UTF8 were also set.
+    If a UTF-8 multi-byte character included the byte 0x85 (e.g. U+0445, whose
+    UTF-8 encoding is 0xd1,0x85), this was misinterpreted as a newline when
+    scanning for the end of the comment. (*Character* 0x85 is an "any" newline,
+    but *byte* 0x85 is not, in UTF-8 mode). This bug was present in several 
+    places in pcre_compile().
+    
+7.  Related to (6) above, when pcre_compile() was skipping #-introduced 
+    comments when looking ahead for named forward references to subpatterns, 
+    the only newline sequence it recognized was NL. It now handles newlines 
+    according to the set newline convention.

Version 8.10 25-Jun-2010

Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3    2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/doc/pcrepattern.3    2010-10-26 11:06:44 UTC (rev 556)
@@ -66,6 +66,7 @@
 page.
 .
 .
+.\" HTML <a name="newlines"></a>
 .SH "NEWLINE CONVENTIONS"
 .rs
 .sp
@@ -2109,7 +2110,25 @@
 .P
 If the PCRE_EXTENDED option is set, an unescaped # character outside a
 character class introduces a comment that continues to immediately after the
-next newline in the pattern.
+next newline character or character sequence in the pattern. Which characters 
+are interpreted as newlines is controlled by the options passed to 
+\fBpcre_compile()\fP or by a special sequence at the start of the pattern, as
+described in the section entitled
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above. Note that the end of a comment is a literal newline sequence in the pattern; 
+escape sequences that happen to represent a newline do not terminate a comment. 
+For example, consider this pattern when PCRE_EXTENDED is set, and the default 
+newline convention is in force:
+.sp
+  abc #comment \en still comment
+.sp
+On encountering the # character, \fBpcre_compile()\fP skips along, looking for 
+a newline in the pattern. The sequence \en is still literal at this stage, so
+it does not terminate the comment. Only an actual character with the code value
+0x0a does so.
 .
 .
 .\" HTML <a name="recursion"></a>

Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/pcre_compile.c    2010-10-26 11:06:44 UTC (rev 556)
@@ -1110,6 +1110,7 @@
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
+  utf8         TRUE if we are in UTF-8 mode 
   count        pointer to the current capturing subpattern number (updated)

 Returns:       the number of the named subpattern, or -1 if not found
@@ -1117,7 +1118,7 @@

static int
find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
+ BOOL xmode, BOOL utf8, int *count)
{
uschar *ptr = *ptrptr;
int start_count = *count;
@@ -1278,7 +1279,15 @@

   if (xmode && *ptr == CHAR_NUMBER_SIGN)
     {
-    while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+    ptr++; 
+    while (*ptr != 0)
+      {
+      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+      ptr++;
+#ifdef SUPPORT_UTF8         
+      if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+      }
     if (*ptr == 0) goto FAIL_EXIT;
     continue;
     }
@@ -1287,7 +1296,7 @@

   if (*ptr == CHAR_LEFT_PARENTHESIS)
     {
-    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
     if (rc > 0) return rc;
     if (*ptr == 0) goto FAIL_EXIT;
     }
@@ -1333,12 +1342,14 @@
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
+  utf8         TRUE if we are in UTF-8 mode

 Returns:       the number of the found subpattern, or -1 if not found
 */

static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+ BOOL utf8)
{
uschar *ptr = (uschar *)cd->start_pattern;
int count = 0;
@@ -1351,7 +1362,7 @@

for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
if (rc > 0 || *ptr++ == 0) break;
}

@@ -2515,8 +2526,15 @@
     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
     if (*ptr == CHAR_NUMBER_SIGN)
       {
-      while (*(++ptr) != 0)
+      ptr++; 
+      while (*ptr != 0)
+        {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8         
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+        }
       }
     else break;
     }
@@ -2552,8 +2570,15 @@
     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
     if (*ptr == CHAR_NUMBER_SIGN)
       {
-      while (*(++ptr) != 0)
+      ptr++; 
+      while (*ptr != 0)
+        {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8         
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+        }
       }
     else break;
     }
@@ -3126,9 +3151,14 @@
     if ((cd->ctypes[c] & ctype_space) != 0) continue;
     if (c == CHAR_NUMBER_SIGN)
       {
-      while (*(++ptr) != 0)
+      ptr++; 
+      while (*ptr != 0)
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+        ptr++;
+#ifdef SUPPORT_UTF8         
+        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
         }
       if (*ptr != 0) continue;

@@ -5036,7 +5066,7 @@
         /* Search the pattern for a forward reference */

         else if ((i = find_parens(cd, name, namelen,
-                        (options & PCRE_EXTENDED) != 0)) > 0)
+                        (options & PCRE_EXTENDED) != 0, utf8)) > 0)
           {
           PUT2(code, 2+LINK_SIZE, i);
           code[1+LINK_SIZE]++;
@@ -5382,7 +5412,7 @@
             }
           else if ((recno =                /* Forward back reference */
                     find_parens(cd, name, namelen,
-                      (options & PCRE_EXTENDED) != 0)) <= 0)
+                      (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
             {
             *errorcodeptr = ERR15;
             goto FAILED;
@@ -5493,7 +5523,7 @@
             if (called == NULL)
               {
               if (find_parens(cd, NULL, recno,
-                    (options & PCRE_EXTENDED) != 0) < 0)
+                    (options & PCRE_EXTENDED) != 0, utf8) < 0)
                 {
                 *errorcodeptr = ERR15;
                 goto FAILED;

Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/testdata/testinput5    2010-10-26 11:06:44 UTC (rev 556)
@@ -794,4 +794,21 @@
     \x{a2} \x{84} 
     A Z

+'A#хц'8x<any>BZ
+
+'A#хц
+ PQ'8x<any>BZ
+
+/a+#хaa
+ z#XX?/8x<any>BZ
+
+/a+#хaa
+ z#х?/8x<any>BZ
+
+/\g{A}xxx#bXX(?'A'123)
+(?'A'456)/8x<any>BZ
+
+/\g{A}xxx#bх(?'A'123)
+(?'A'456)/8x<any>BZ
+
/-- End of testinput5 --/

Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/testdata/testoutput5    2010-10-26 11:06:44 UTC (rev 556)
@@ -2222,4 +2222,67 @@
     A Z 
  0: A Z

+'A#хц'8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        A
+        Ket
+        End
+------------------------------------------------------------------
+
+'A#хц
+  PQ'8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        APQ
+        Ket
+        End
+------------------------------------------------------------------
+  
+/a+#хaa
+  z#XX?/8x<any>BZ 
+------------------------------------------------------------------
+        Bra
+        a++
+        z
+        Ket
+        End
+------------------------------------------------------------------
+
+/a+#хaa
+  z#х?/8x<any>BZ 
+------------------------------------------------------------------
+        Bra
+        a++
+        z
+        Ket
+        End
+------------------------------------------------------------------
+
+/\g{A}xxx#bXX(?'A'123)
+(?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        \1
+        xxx
+        CBra 1
+        456
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+
+/\g{A}xxx#bх(?'A'123)
+(?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+        Bra
+        \1
+        xxx
+        CBra 1
+        456
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+
 /-- End of testinput5 --/

Esta mensagem é parte da seguinte discussão:
	Árvore completa da discussão ordenada por data

[Pcre-svn] [556] code/trunk: Fix #-comment bugs in UTF-8 mod…