[Pcre-svn] [384] code/trunk: Fix Unicode property support in…

Página Inicial
Delete this message
Autor: Subversion repository
Data:  
Para: pcre-svn
Assunto: [Pcre-svn] [384] code/trunk: Fix Unicode property support in character classes for chars > 127 in non-UTF-8
Revision: 384
          http://vcs.pcre.org/viewvc?view=rev&revision=384
Author:   ph10
Date:     2009-03-08 16:27:43 +0000 (Sun, 08 Mar 2009)


Log Message:
-----------
Fix Unicode property support in character classes for chars > 127 in non-UTF-8
mode.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_exec.c
    code/trunk/pcre_internal.h
    code/trunk/pcre_xclass.c
    code/trunk/testdata/testinput6
    code/trunk/testdata/testoutput6


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/ChangeLog    2009-03-08 16:27:43 UTC (rev 384)
@@ -47,6 +47,9 @@


 10. The PCRE_DOLLAR_ENDONLY option was not working when pcre_dfa_exec() was 
     used for matching. 
+    
+11. Unicode property support in character classes was not working for 
+    characters (bytes) greater than 127 when not in UTF-8 mode.



Version 7.8 05-Sep-08

Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c    2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/pcre_exec.c    2009-03-08 16:27:43 UTC (rev 384)
@@ -1707,7 +1707,7 @@
     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
     GETCHARINCTEST(c, eptr);
       {
-      const ucd_record * prop = GET_UCD(c);
+      const ucd_record *prop = GET_UCD(c);


       switch(ecode[1])
         {
@@ -2075,7 +2075,8 @@



     /* Match an extended character class. This opcode is encountered only
-    in UTF-8 mode, because that's the only time it is compiled. */
+    when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
+    mode, because Unicode properties are supported in non-UTF-8 mode. */


 #ifdef SUPPORT_UTF8
     case OP_XCLASS:
@@ -2117,7 +2118,7 @@
       for (i = 1; i <= min; i++)
         {
         if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
-        GETCHARINC(c, eptr);
+        GETCHARINCTEST(c, eptr);
         if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
         }


@@ -2136,7 +2137,7 @@
           RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
-          GETCHARINC(c, eptr);
+          GETCHARINCTEST(c, eptr);
           if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
           }
         /* Control never gets here */
@@ -2151,7 +2152,7 @@
           {
           int len = 1;
           if (eptr >= md->end_subject) break;
-          GETCHARLEN(c, eptr, len);
+          GETCHARLENTEST(c, eptr, len);
           if (!_pcre_xclass(c, data)) break;
           eptr += len;
           }


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/pcre_internal.h    2009-03-08 16:27:43 UTC (rev 384)
@@ -478,6 +478,26 @@
     len += gcaa; \
     }


+/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
+pointer, incrementing length if there are extra bytes. This is called when we
+do not know if we are in UTF-8 mode. */
+
+#define GETCHARLENTEST(c, eptr, len) \
+  c = *eptr; \
+  if (utf8 && c >= 0xc0) \
+    { \
+    int gcii; \
+    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
+    int gcss = 6*gcaa; \
+    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+    for (gcii = 1; gcii <= gcaa; gcii++) \
+      { \
+      gcss -= 6; \
+      c |= (eptr[gcii] & 0x3f) << gcss; \
+      } \
+    len += gcaa; \
+    }
+
 /* If the pointer is not at the start of a character, move it back until
 it is. This is called only in UTF-8 mode - we don't put a test within the macro
 because almost all calls are already within a block of UTF-8 only code. */


Modified: code/trunk/pcre_xclass.c
===================================================================
--- code/trunk/pcre_xclass.c    2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/pcre_xclass.c    2009-03-08 16:27:43 UTC (rev 384)
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.


                        Written by Philip Hazel
-           Copyright (c) 1997-2008 University of Cambridge
+           Copyright (c) 1997-2009 University of Cambridge


-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -39,8 +39,7 @@


/* This module contains an internal function that is used to match an extended
-class (one that contains characters whose values are > 255). It is used by both
-pcre_exec() and pcre_def_exec(). */
+class. It is used by both pcre_exec() and pcre_dfa_exec(). */


#ifdef HAVE_CONFIG_H
@@ -55,7 +54,7 @@
*************************************************/

/* This function is called to match a character against an extended class that
-might contain values > 255.
+might contain values > 255 and/or Unicode properties.

 Arguments:
   c           the character
@@ -104,7 +103,7 @@
 #ifdef SUPPORT_UCP
   else  /* XCL_PROP & XCL_NOTPROP */
     {
-    const ucd_record * prop = GET_UCD(c);
+    const ucd_record *prop = GET_UCD(c);


     switch(*data)
       {


Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6    2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/testdata/testinput6    2009-03-08 16:27:43 UTC (rev 384)
@@ -942,5 +942,13 @@
     \x{10a}\x{10b}
     \x{10b}\x{10b}
     \x{10b}\x{10a}
+    
+/The next two tests are for property support in non-UTF-8 mode/


+/(?:\p{Lu}|\x20)+/
+    \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+
+/[\p{Lu}\x20]+/
+    \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+
 / End of testinput6 /


Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6    2009-03-08 15:26:59 UTC (rev 383)
+++ code/trunk/testdata/testoutput6    2009-03-08 16:27:43 UTC (rev 384)
@@ -1746,5 +1746,15 @@
     \x{10b}\x{10a}
  0: \x{10b}\x{10a}
  1: \x{10b}
+    
+/The next two tests are for property support in non-UTF-8 mode/


+/(?:\p{Lu}|\x20)+/
+    \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+ 0: A P\xc2T\xc9 TODAY
+
+/[\p{Lu}\x20]+/
+    \x41\x20\x50\xC2\x54\xC9\x20\x54\x4F\x44\x41\x59
+ 0: A P\xc2T\xc9 TODAY
+
 / End of testinput6 /