Revision: 384
http://vcs.pcre.org/viewvc?view=rev&revision=384
Author: ph10
Date: 2009-03-08 16:27:43 +0000 (Sun, 08 Mar 2009)
Log Message:
-----------
Fix Unicode property support in character classes for chars > 127 in non-UTF-8
mode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/pcre_xclass.c
code/trunk/testdata/testinput6
code/trunk/testdata/testoutput6
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/ChangeLog 2009-03-08 16:27:43 UTC (rev 384)
@@ -47,6 +47,9 @@
10. The PCRE_DOLLAR_ENDONLY option was not working when pcre_dfa_exec() was
used for matching.
+
+11. Unicode property support in character classes was not working for
+ characters (bytes) greater than 127 when not in UTF-8 mode.
Version 7.8 05-Sep-08
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/pcre_exec.c 2009-03-08 16:27:43 UTC (rev 384)
@@ -1707,7 +1707,7 @@
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
GETCHARINCTEST(c, eptr);
{
- const ucd_record * prop = GET_UCD(c);
+ const ucd_record *prop = GET_UCD(c);
switch(ecode[1])
{
@@ -2075,7 +2075,8 @@
/* Match an extended character class. This opcode is encountered only
- in UTF-8 mode, because that's the only time it is compiled. */
+ when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
+ mode, because Unicode properties are supported in non-UTF-8 mode. */
#ifdef SUPPORT_UTF8
case OP_XCLASS:
@@ -2117,7 +2118,7 @@
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
}
@@ -2136,7 +2137,7 @@
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
+ GETCHARINCTEST(c, eptr);
if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@@ -2151,7 +2152,7 @@
{
int len = 1;
if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
+ GETCHARLENTEST(c, eptr, len);
if (!_pcre_xclass(c, data)) break;
eptr += len;
}
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/pcre_internal.h 2009-03-08 16:27:43 UTC (rev 384)
@@ -478,6 +478,26 @@
len += gcaa; \
}
+/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
+pointer, incrementing length if there are extra bytes. This is called when we
+do not know if we are in UTF-8 mode. */
+
+#define GETCHARLENTEST(c, eptr, len) \
+ c = *eptr; \
+ if (utf8 && c >= 0xc0) \
+ { \
+ int gcii; \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; \
+ } \
+ len += gcaa; \
+ }
+
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */
Modified: code/trunk/pcre_xclass.c
===================================================================
--- code/trunk/pcre_xclass.c 2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/pcre_xclass.c 2009-03-08 16:27:43 UTC (rev 384)
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
+ Copyright (c) 1997-2009 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@
/* This module contains an internal function that is used to match an extended
-class (one that contains characters whose values are > 255). It is used by both
-pcre_exec() and pcre_def_exec(). */
+class. It is used by both pcre_exec() and pcre_dfa_exec(). */
#ifdef HAVE_CONFIG_H
@@ -55,7 +54,7 @@
*************************************************/
/* This function is called to match a character against an extended class that
-might contain values > 255.
+might contain values > 255 and/or Unicode properties.
Arguments:
c the character
@@ -104,7 +103,7 @@
#ifdef SUPPORT_UCP
else /* XCL_PROP & XCL_NOTPROP */
{
- const ucd_record * prop = GET_UCD(c);
+ const ucd_record *prop = GET_UCD(c);
switch(*data)
{
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/testdata/testinput6 2009-03-08 16:27:43 UTC (rev 384)
@@ -942,5 +942,13 @@
\x{10a}\x{10b}
\x{10b}\x{10b}
\x{10b}\x{10a}
+
+/The next two tests are for property support in non-UTF-8 mode/
+/(?:\p{Lu}|\x20)+/
+ \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+
+/[\p{Lu}\x20]+/
+ \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+
/ End of testinput6 /
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/testdata/testoutput6 2009-03-08 16:27:43 UTC (rev 384)
@@ -1746,5 +1746,15 @@
\x{10b}\x{10a}
0: \x{10b}\x{10a}
1: \x{10b}
+
+/The next two tests are for property support in non-UTF-8 mode/
+/(?:\p{Lu}|\x20)+/
+ \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+ 0: A P\xc2T\xc9 TODAY
+
+/[\p{Lu}\x20]+/
+ \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+ 0: A P\xc2T\xc9 TODAY
+
/ End of testinput6 /