Revision: 556
http://vcs.pcre.org/viewvc?view=rev&revision=556
Author: ph10
Date: 2010-10-26 12:06:44 +0100 (Tue, 26 Oct 2010)
Log Message:
-----------
Fix #-comment bugs in UTF-8 mode with PCRE_NEWLINE_ANY.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcrepattern.3
code/trunk/pcre_compile.c
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/ChangeLog 2010-10-26 11:06:44 UTC (rev 556)
@@ -39,6 +39,19 @@
/t\b/ matched against "cat" with PCRE_PARTIAL_HARD set did return a partial
match rather than a full match, which was wrong by the old rules, but is
now correct.]
+
+6. There was a bug in the handling of #-introduced comments, recognized when
+ PCRE_EXTENDED is set, when PCRE_NEWLINE_ANY and PCRE_UTF8 were also set.
+ If a UTF-8 multi-byte character included the byte 0x85 (e.g. U+0445, whose
+ UTF-8 encoding is 0xd1,0x85), this was misinterpreted as a newline when
+ scanning for the end of the comment. (*Character* 0x85 is an "any" newline,
+ but *byte* 0x85 is not, in UTF-8 mode). This bug was present in several
+ places in pcre_compile().
+
+7. Related to (6) above, when pcre_compile() was skipping #-introduced
+ comments when looking ahead for named forward references to subpatterns,
+ the only newline sequence it recognized was NL. It now handles newlines
+ according to the set newline convention.
Version 8.10 25-Jun-2010
Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3 2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/doc/pcrepattern.3 2010-10-26 11:06:44 UTC (rev 556)
@@ -66,6 +66,7 @@
page.
.
.
+.\" HTML <a name="newlines"></a>
.SH "NEWLINE CONVENTIONS"
.rs
.sp
@@ -2109,7 +2110,25 @@
.P
If the PCRE_EXTENDED option is set, an unescaped # character outside a
character class introduces a comment that continues to immediately after the
-next newline in the pattern.
+next newline character or character sequence in the pattern. Which characters
+are interpreted as newlines is controlled by the options passed to
+\fBpcre_compile()\fP or by a special sequence at the start of the pattern, as
+described in the section entitled
+.\" HTML <a href="#newlines">
+.\" </a>
+"Newline conventions"
+.\"
+above. Note that the end of a comment is a literal newline sequence in the pattern;
+escape sequences that happen to represent a newline do not terminate a comment.
+For example, consider this pattern when PCRE_EXTENDED is set, and the default
+newline convention is in force:
+.sp
+ abc #comment \en still comment
+.sp
+On encountering the # character, \fBpcre_compile()\fP skips along, looking for
+a newline in the pattern. The sequence \en is still literal at this stage, so
+it does not terminate the comment. Only an actual character with the code value
+0x0a does so.
.
.
.\" HTML <a name="recursion"></a>
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/pcre_compile.c 2010-10-26 11:06:44 UTC (rev 556)
@@ -1110,6 +1110,7 @@
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
count pointer to the current capturing subpattern number (updated)
Returns: the number of the named subpattern, or -1 if not found
@@ -1117,7 +1118,7 @@
static int
find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode, int *count)
+ BOOL xmode, BOOL utf8, int *count)
{
uschar *ptr = *ptrptr;
int start_count = *count;
@@ -1278,7 +1279,15 @@
if (xmode && *ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+ ptr++;
+ while (*ptr != 0)
+ {
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
if (*ptr == 0) goto FAIL_EXIT;
continue;
}
@@ -1287,7 +1296,7 @@
if (*ptr == CHAR_LEFT_PARENTHESIS)
{
- int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+ int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
if (rc > 0) return rc;
if (*ptr == 0) goto FAIL_EXIT;
}
@@ -1333,12 +1342,14 @@
name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode
+ utf8 TRUE if we are in UTF-8 mode
Returns: the number of the found subpattern, or -1 if not found
*/
static int
-find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
+ BOOL utf8)
{
uschar *ptr = (uschar *)cd->start_pattern;
int count = 0;
@@ -1351,7 +1362,7 @@
for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
if (rc > 0 || *ptr++ == 0) break;
}
@@ -2515,8 +2526,15 @@
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -2552,8 +2570,15 @@
while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
if (*ptr == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
+ {
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
+ }
}
else break;
}
@@ -3126,9 +3151,14 @@
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == CHAR_NUMBER_SIGN)
{
- while (*(++ptr) != 0)
+ ptr++;
+ while (*ptr != 0)
{
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
+ ptr++;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+#endif
}
if (*ptr != 0) continue;
@@ -5036,7 +5066,7 @@
/* Search the pattern for a forward reference */
else if ((i = find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) > 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) > 0)
{
PUT2(code, 2+LINK_SIZE, i);
code[1+LINK_SIZE]++;
@@ -5382,7 +5412,7 @@
}
else if ((recno = /* Forward back reference */
find_parens(cd, name, namelen,
- (options & PCRE_EXTENDED) != 0)) <= 0)
+ (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
@@ -5493,7 +5523,7 @@
if (called == NULL)
{
if (find_parens(cd, NULL, recno,
- (options & PCRE_EXTENDED) != 0) < 0)
+ (options & PCRE_EXTENDED) != 0, utf8) < 0)
{
*errorcodeptr = ERR15;
goto FAILED;
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/testdata/testinput5 2010-10-26 11:06:44 UTC (rev 556)
@@ -794,4 +794,21 @@
\x{a2} \x{84}
A Z
+'A#хц'8x<any>BZ
+
+'A#хц
+ PQ'8x<any>BZ
+
+/a+#хaa
+ z#XX?/8x<any>BZ
+
+/a+#хaa
+ z#х?/8x<any>BZ
+
+/\g{A}xxx#bXX(?'A'123)
+(?'A'456)/8x<any>BZ
+
+/\g{A}xxx#bх(?'A'123)
+(?'A'456)/8x<any>BZ
+
/-- End of testinput5 --/
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2010-10-26 08:26:20 UTC (rev 555)
+++ code/trunk/testdata/testoutput5 2010-10-26 11:06:44 UTC (rev 556)
@@ -2222,4 +2222,67 @@
A Z
0: A Z
+'A#хц'8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ A
+ Ket
+ End
+------------------------------------------------------------------
+
+'A#хц
+ PQ'8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ APQ
+ Ket
+ End
+------------------------------------------------------------------
+
+/a+#хaa
+ z#XX?/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ a++
+ z
+ Ket
+ End
+------------------------------------------------------------------
+
+/a+#хaa
+ z#х?/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ a++
+ z
+ Ket
+ End
+------------------------------------------------------------------
+
+/\g{A}xxx#bXX(?'A'123)
+(?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ \1
+ xxx
+ CBra 1
+ 456
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
+/\g{A}xxx#bх(?'A'123)
+(?'A'456)/8x<any>BZ
+------------------------------------------------------------------
+ Bra
+ \1
+ xxx
+ CBra 1
+ 456
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput5 --/