[Pcre-svn] [1045] code/trunk: Update character class handlin…

Startseite
Nachricht löschen
Autor: Subversion repository
Datum:  
To: pcre-svn
Betreff: [Pcre-svn] [1045] code/trunk: Update character class handling to use new character case information; rework
Revision: 1045
          http://vcs.pcre.org/viewvc?view=rev&revision=1045
Author:   ph10
Date:     2012-09-23 17:50:00 +0100 (Sun, 23 Sep 2012)


Log Message:
-----------
Update character class handling to use new character case information; rework
\h, \H, \v, and \V to use the same apparatus with centrally defined lists.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/maint/MultiStage2.py
    code/trunk/maint/ucptest.c
    code/trunk/pcre_compile.c
    code/trunk/pcre_internal.h
    code/trunk/pcre_printint.c
    code/trunk/pcre_tables.c
    code/trunk/pcre_ucd.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testinput6
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput11-16
    code/trunk/testdata/testoutput11-8
    code/trunk/testdata/testoutput15
    code/trunk/testdata/testoutput17
    code/trunk/testdata/testoutput18
    code/trunk/testdata/testoutput5
    code/trunk/testdata/testoutput6
    code/trunk/testdata/testoutput7


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/ChangeLog    2012-09-23 16:50:00 UTC (rev 1045)
@@ -85,6 +85,29 @@


 20. Turn case lists for horizontal and vertical white space into macros so that
     they are defined only once.
+    
+21. This set of changes together give more compatible Unicode case-folding
+    behaviour for characters that have more than one other case.
+    
+    (a) The Unicode property table now has offsets into a new table of sets of
+        three or more characters that are case-equivalent. The MultiStage2.py 
+        script that generates these tables (the pcre_ucd.c file) now scans 
+        CaseFolding.txt instead of UnicodeData.txt for character case 
+        information.
+        
+    (b) The code for adding characters or ranges of characters to a character
+        class has been abstracted into a generalized function that also handles
+        case-independence. In UTF-mode with UCP support, this uses the new data
+        to handle characters with more than one other case.
+        
+    (c) A bug that is fixed as a result of (b) is that codepoints less than 256
+        whose other case is greater than 256 are now correctly matched
+        caselessly. Previously, the high codepoint matched the low one, but not 
+        vice versa. 
+        
+    (d) The processing of \h, \H, \v, and \V in character classes now makes use
+        of the new class addition function, using character lists defined as
+        macros alongside the case definitions of 20 above.



Version 8.31 06-July-2012

Modified: code/trunk/maint/MultiStage2.py
===================================================================
--- code/trunk/maint/MultiStage2.py    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/maint/MultiStage2.py    2012-09-23 16:50:00 UTC (rev 1045)
@@ -56,9 +56,9 @@
 #
 # This script constructs four tables. The ucd_caseless_sets table contains
 # lists of characters that all match each other caselessly. Each list is
-# in order, and is terminated by 0xffffffff, which is of course larger than any
-# valid character. The first list is empty; this is used for characters that
-# are not part of any list.
+# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
+# any valid character. The first list is empty; this is used for characters
+# that are not part of any list.
 #
 # The ucd_records table contains one instance of every unique record that is
 # required. The ucd_stage1 table is indexed by a character's block number, and
@@ -435,12 +435,12 @@
 # --- Added by PH: output the table of caseless character sets ---


 print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {"
-print "  0xffffffff,"
+print "  NOTACHAR,"
 for s in sets:
   s = sorted(s)
   for x in s:
     print '  0x%04x,' % x,
-  print '  0xffffffff,'   
+  print '  NOTACHAR,'   
 print '};'
 print



Modified: code/trunk/maint/ucptest.c
===================================================================
--- code/trunk/maint/ucptest.c    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/maint/ucptest.c    2012-09-23 16:50:00 UTC (rev 1045)
@@ -243,7 +243,7 @@
   if (caseset != 0)
     {
     const pcre_uint32 *p = PRIV(ucd_caseless_sets) + caseset - 1;
-    while (*(++p) < 0xffffffff)
+    while (*(++p) < NOTACHAR)
       if (*p != othercase && *p != c) printf(", %04x", *p);
     }   
   } 


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_compile.c    2012-09-23 16:50:00 UTC (rev 1045)
@@ -68,7 +68,7 @@


/* Macro for setting individual bits in class bitmaps. */

-#define SETBIT(a,b) a[b/8] |= (1 << (b%8))
+#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
@@ -77,7 +77,18 @@

#define OFLOW_MAX (INT_MAX - 20)

+/* Definitions to allow mutual recursion */

+static int
+  add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *, 
+    const pcre_uint32 *, unsigned int);
+
+static BOOL
+  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, 
+    int, int, int *, int *, branch_chain *, compile_data *, int *);
+
+
+
 /*************************************************
 *      Code parameters and static tables         *
 *************************************************/
@@ -631,14 +642,8 @@
 #endif



-/* Definition to allow mutual recursion */

-static BOOL
-  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
-    int *, int *, branch_chain *, compile_data *, int *);


-
-
 /*************************************************
 *            Find an error text                  *
 *************************************************/
@@ -2871,9 +2876,10 @@
 *************************************************/


/* This function is passed the start and end of a class range, in UTF-8 mode
-with UCP support. It searches up the characters, looking for internal ranges of
+with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
-start address.
+start address. A character with multiple other cases is returned on its own
+with a special return value.

 Arguments:
   cptr        points to starting character value; updated
@@ -2881,19 +2887,34 @@
   ocptr       where to put start of othercase range
   odptr       where to put end of othercase range


-Yield:        TRUE when range returned; FALSE when no more
+Yield:        -1 when no more
+               0 when a range is returned
+              >0 the CASESET offset for char with multiple other cases
+                in this case, ocptr contains the original character
 */


-static BOOL
+static int
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
unsigned int *odptr)
{
unsigned int c, othercase, next;
+int co;

+/* Find the first character that has an other case. If it has multiple other 
+cases, return its case offset value. */
+
 for (c = *cptr; c <= d; c++)
-  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
+  { 
+  if ((co = UCD_CASESET(c)) != 0)
+    {
+    *ocptr = c++;   /* Character that has the set */
+    *cptr = c;      /* Rest of input range */
+    return co;
+    }    
+  if ((othercase = UCD_OTHERCASE(c)) != c) break; 
+  }


-if (c > d) return FALSE;
+if (c > d) return -1; /* Reached end of range */

*ocptr = othercase;
next = othercase + 1;
@@ -2904,10 +2925,9 @@
next++;
}

-*odptr = next - 1;
-*cptr = c;
-
-return TRUE;
+*odptr = next - 1;     /* End of othercase range */
+*cptr = c;             /* Rest of input range */
+return 0;
 }



@@ -3357,6 +3377,243 @@


 /*************************************************
+*        Add a character or range to a class     *
+*************************************************/
+
+/* This function packages up the logic of adding a character or range of
+characters to a class. The character values in the arguments will be within the 
+valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is 
+mutually recursive with the function immediately below.
+
+Arguments:
+  classbits     the bit map for characters < 256
+  uchardptr     points to the pointer for extra data
+  options       the options word
+  cd            contains pointers to tables etc. 
+  start         start of range character
+  end           end of range character
+  
+Returns:        the number of < 256 characters added
+                the pointer to extra data is updated
+*/
+
+static int
+add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
+  compile_data *cd, unsigned int start, unsigned int end)
+{
+unsigned int c;
+int n8 = 0;
+
+/* If caseless matching is required, scan the range and process alternate 
+cases. In Unicode, there are 8-bit characters that have alternate cases that 
+are greater than 255 and vice-versa. Sometimes we can just extend the original 
+range. */
+
+if ((options & PCRE_CASELESS) != 0)
+  {
+#ifdef SUPPORT_UCP
+  if ((options & PCRE_UTF8) != 0)
+    { 
+    int rc; 
+    unsigned int oc, od;
+     
+    options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
+    c = start;
+    
+    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
+      {
+      /* Handle a single character that has more than one other case. */
+        
+      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
+        PRIV(ucd_caseless_sets) + rc, oc);
+    
+      /* Do nothing if the other case range is within the original range. */
+       
+      else if (oc >= start && od <= end) continue;
+      
+      /* Extend the original range if there is overlap, noting that if oc < c, we
+      can't have od > end because a subrange is always shorter than the basic
+      range. Otherwise, use a recursive call to add the additional range. */
+       
+      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
+      else if (od > end && oc <= end + 1) end = od;       /* Extend upwards */
+      else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
+      }
+    }
+  else
+#endif  /* SUPPORT_UCP */
+
+  /* Not UTF-mode, or no UCP */
+   
+  for (c = start; c <= end && c < 256; c++) 
+    {        
+    SETBIT(classbits, cd->fcc[c]);
+    n8++; 
+    }   
+  }   
+ 
+/* Now handle the original range. Adjust the final value according to the bit
+length - this means that the same lists of (e.g.) horizontal spaces can be used
+in all cases. */
+
+#ifdef COMPILE_PCRE8
+#ifdef SUPPORT_UTF
+  if ((options & PCRE_UTF8) == 0)
+#endif
+  if (end > 0xff) end = 0xff;
+#endif
+
+#ifdef COMPILE_PCRE16
+#ifdef SUPPORT_UTF
+  if ((options & PCRE_UTF16) == 0)
+#endif
+  if (end > 0xffff) end = 0xffff;
+#endif
+
+/* If all characters are less than 256, use the bit map. Otherwise use extra
+data. */
+
+if (end < 0x100)
+  {
+  for (c = start; c <= end; c++)
+    {
+    n8++; 
+    SETBIT(classbits, c);
+    } 
+  }
+   
+else
+  {      
+  pcre_uchar *uchardata = *uchardptr;
+   
+#ifdef SUPPORT_UTF
+  if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
+    {
+    if (start < end)
+      {
+      *uchardata++ = XCL_RANGE;
+      uchardata += PRIV(ord2utf)(start, uchardata);  
+      uchardata += PRIV(ord2utf)(end, uchardata);  
+      }
+    else if (start == end)
+      {
+      *uchardata++ = XCL_SINGLE;
+      uchardata += PRIV(ord2utf)(start, uchardata);  
+      }
+    }
+  else
+#endif  /* SUPPORT_UTF */  
+   
+  /* Without UTF support, character values are constrained by the bit length,
+  and can only be > 255 for 16-bit and 32-bit libraries. */
+  
+#ifdef COMPILE_PCRE8
+    {}
+#else   
+  if (start < end)
+    {
+    *uchardata++ = XCL_RANGE;
+    *uchardata++ = start;
+    *uchardata++ = end;
+    }
+  else if (start == end)
+    {
+    *uchardata++ = XCL_SINGLE;
+    *uchardata++ = start;
+    }        
+#endif
+
+  *uchardptr = uchardata;   /* Update extra data pointer */
+  } 
+
+return n8;    /* Number of 8-bit characters */
+}   
+ 
+ 
+
+
+/*************************************************
+*        Add a list of characters to a class     *
+*************************************************/
+
+/* This function is used for adding a list of case-equivalent characters to a 
+class, and also for adding a list of horizontal or vertical whitespace. If the
+list is in order (which it should be), ranges of characters are detected and
+handled appropriately. This function is mutually recursive with the function
+above.
+
+Arguments:
+  classbits     the bit map for characters < 256
+  uchardptr     points to the pointer for extra data
+  options       the options word
+  cd            contains pointers to tables etc. 
+  p             points to row of 32-bit values, terminated by NOTACHAR 
+  except        character to omit; this is used when adding lists of
+                  case-equivalent characters to avoid including the one we
+                  already know about   
+  
+Returns:        the number of < 256 characters added
+                the pointer to extra data is updated
+*/
+
+static int
+add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
+  compile_data *cd, const pcre_uint32 *p, unsigned int except)
+{
+int n8 = 0;
+while (p[0] < NOTACHAR)
+  {
+  int n = 0;
+  if (p[0] != except)
+    {  
+    while(p[n+1] == p[0] + n + 1) n++;
+    n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
+    } 
+  p += n + 1; 
+  } 
+return n8;
+}   
+
+
+
+/*************************************************
+*    Add characters not in a list to a class     *
+*************************************************/
+
+/* This function is used for adding the complement of a list of horizontal or
+vertical whitespace to a class. The list must be in order.
+
+Arguments:
+  classbits     the bit map for characters < 256
+  uchardptr     points to the pointer for extra data
+  options       the options word
+  cd            contains pointers to tables etc. 
+  p             points to row of 32-bit values, terminated by NOTACHAR 
+  
+Returns:        the number of < 256 characters added
+                the pointer to extra data is updated
+*/
+
+static int
+add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, 
+  int options, compile_data *cd, const pcre_uint32 *p)
+{
+int n8 = 0;
+if (p[0] > 0)
+  n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
+while (p[0] < NOTACHAR)
+  {
+  while (p[1] == p[0] + 1) p++;
+  n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1, 
+    (p[1] == NOTACHAR)? 0x10ffff : p[1] - 1);
+  p++; 
+  } 
+return n8;
+}   
+
+
+
+/*************************************************
 *           Compile one branch                   *
 *************************************************/


@@ -3474,7 +3731,7 @@
BOOL is_recurse;
BOOL reset_bracount;
int class_has_8bitchar;
- int class_single_char;
+ int class_one_char;
int newoptions;
int recno;
int refsign;
@@ -3772,25 +4029,25 @@

     should_flip_negation = FALSE;


-    /* For optimization purposes, we track some properties of the class.
-    class_has_8bitchar will be non-zero, if the class contains at least one
-    < 256 character. class_single_char will be 1 if the class contains only
-    a single character. */
+    /* For optimization purposes, we track some properties of the class:
+    class_has_8bitchar will be non-zero if the class contains at least one <
+    256 character; class_one_char will be 1 if the class contains just one
+    character. */


     class_has_8bitchar = 0;
-    class_single_char = 0;
+    class_one_char = 0;


     /* Initialize the 32-char bit map to all zeros. We build the map in a
-    temporary bit of memory, in case the class contains only 1 character (less
-    than 256), because in that case the compiled code doesn't use the bit map.
-    */
+    temporary bit of memory, in case the class contains fewer than two
+    8-bit characters because in that case the compiled code doesn't use the bit
+    map. */


     memset(classbits, 0, 32 * sizeof(pcre_uint8));


 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
-    xclass = FALSE;                           /* No chars >= 256 */
-    class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
-    class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
+    xclass = FALSE;
+    class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
+    class_uchardata_base = class_uchardata;   /* Save the start */
 #endif


     /* Process characters until ] is reached. By writing this as a "do" it
@@ -3812,10 +4069,12 @@
       /* In the pre-compile phase, accumulate the length of any extra
       data and reset the pointer. This is so that very large classes that
       contain a zillion > 255 characters no longer overwrite the work space
-      (which is on the stack). */
+      (which is on the stack). We have to remember that there was XCLASS data, 
+      however. */


-      if (lengthptr != NULL)
+      if (lengthptr != NULL && class_uchardata > class_uchardata_base)
         {
+        xclass = TRUE;
         *lengthptr += class_uchardata - class_uchardata_base;
         class_uchardata = class_uchardata_base;
         }
@@ -3917,7 +4176,7 @@
             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
           }


-        /* Not see if we need to remove any special characters. An option
+        /* Now see if we need to remove any special characters. An option
         value of 1 removes vertical space and 2 removes underscore. */


         if (tabopt < 0) tabopt = -tabopt;
@@ -3933,10 +4192,10 @@
           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];


         ptr = tempptr + 1;
-        /* Every class contains at least one < 256 characters. */
+        /* Every class contains at least one < 256 character. */
         class_has_8bitchar = 1;
         /* Every class contains at least two characters. */
-        class_single_char = 2;
+        class_one_char = 2;
         continue;    /* End of POSIX syntax handling */
         }


@@ -3944,7 +4203,7 @@
       of the specials, which just set a flag. The sequence \b is a special
       case. Inside a class (and only there) it is treated as backspace. We
       assume that other escapes have more than one character in them, so
-      speculatively set both class_has_8bitchar and class_single_char bigger
+      speculatively set both class_has_8bitchar and class_one_char bigger
       than one. Unrecognized escapes fall through and are either treated
       as literal characters (by default), or are faulted if
       PCRE_EXTRA is set. */
@@ -3977,7 +4236,7 @@
           /* Every class contains at least two < 256 characters. */
           class_has_8bitchar++;
           /* Every class contains at least two characters. */
-          class_single_char += 2;
+          class_one_char += 2;


           switch (-c)
             {
@@ -4027,191 +4286,27 @@
             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
             continue;
+            
+            /* The rest apply in both UCP and non-UCP cases. */


             case ESC_h:
-            SETBIT(classbits, CHAR_HT);
-            SETBIT(classbits, CHAR_SPACE);
-#ifndef EBCDIC             
-            SETBIT(classbits, 0xa0); /* NSBP */
-#ifndef COMPILE_PCRE8
-            xclass = TRUE;
-            *class_uchardata++ = XCL_SINGLE;
-            *class_uchardata++ = 0x1680;
-            *class_uchardata++ = XCL_SINGLE;
-            *class_uchardata++ = 0x180e;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x2000;
-            *class_uchardata++ = 0x200a;
-            *class_uchardata++ = XCL_SINGLE;
-            *class_uchardata++ = 0x202f;
-            *class_uchardata++ = XCL_SINGLE;
-            *class_uchardata++ = 0x205f;
-            *class_uchardata++ = XCL_SINGLE;
-            *class_uchardata++ = 0x3000;
-#elif defined SUPPORT_UTF
-            if (utf)
-              {
-              xclass = TRUE;
-              *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
-              *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
-              *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
-              *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
-              *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
-              }
-#endif
-#endif  /* Not EBCDIC */
+            (void)add_list_to_class(classbits, &class_uchardata, options, cd,
+              PRIV(hspace_list), NOTACHAR); 
             continue;


             case ESC_H:
-            for (c = 0; c < 32; c++)
-              {
-              int x = 0xff;
-              switch (c)
-                {
-                case CHAR_HT/8:    x ^= 1 << (CHAR_HT%8); break;
-                case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break;
-#ifndef EBCDIC  
-                case 0xa0/8: x ^= 1 << (0xa0%8); break;  /* NSBSP */
-#endif 
-                default: break;
-                }
-              classbits[c] |= x;
-              }
-#ifndef EBCDIC               
-#ifndef COMPILE_PCRE8
-            xclass = TRUE;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x0100;
-            *class_uchardata++ = 0x167f;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x1681;
-            *class_uchardata++ = 0x180d;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x180f;
-            *class_uchardata++ = 0x1fff;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x200b;
-            *class_uchardata++ = 0x202e;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x2030;
-            *class_uchardata++ = 0x205e;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x2060;
-            *class_uchardata++ = 0x2fff;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x3001;
-#ifdef SUPPORT_UTF
-            if (utf)
-              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
-            else
-#endif   /* SUPPORT_UTF */
-              *class_uchardata++ = 0xffff;
-#elif defined SUPPORT_UTF
-            if (utf)
-              {
-              xclass = TRUE;
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
-              }
-#endif
-#endif  /* Not EBCDIC */
+            (void)add_not_list_to_class(classbits, &class_uchardata, options, 
+              cd, PRIV(hspace_list)); 
             continue;


             case ESC_v:
-            SETBIT(classbits, CHAR_LF);
-            SETBIT(classbits, CHAR_VT);
-            SETBIT(classbits, CHAR_FF);
-            SETBIT(classbits, CHAR_CR);
-            SETBIT(classbits, CHAR_NEL);
-#ifndef EBCDIC             
-#ifndef COMPILE_PCRE8
-            xclass = TRUE;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x2028;
-            *class_uchardata++ = 0x2029;
-#elif defined SUPPORT_UTF
-            if (utf)
-              {
-              xclass = TRUE;
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
-              }
-#endif
-#endif  /* Not EBCDIC */
+            (void)add_list_to_class(classbits, &class_uchardata, options, cd,
+              PRIV(vspace_list), NOTACHAR); 
             continue;


             case ESC_V:
-            for (c = 0; c < 32; c++)
-              {
-              int x = 0xff;
-              switch (c)
-                {
-                case CHAR_LF/8: x ^= 1 << (CHAR_LF%8);
-                                x ^= 1 << (CHAR_VT%8);
-                                x ^= 1 << (CHAR_FF%8);
-                                x ^= 1 << (CHAR_CR%8);
-                                break;
-                case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break;
-                default: break;
-                }
-              classbits[c] |= x;
-              }
-
-#ifndef EBCDIC
-#ifndef COMPILE_PCRE8
-            xclass = TRUE;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x0100;
-            *class_uchardata++ = 0x2027;
-            *class_uchardata++ = XCL_RANGE;
-            *class_uchardata++ = 0x202a;
-#ifdef SUPPORT_UTF
-            if (utf)
-              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
-            else
-#endif
-              *class_uchardata++ = 0xffff;
-#elif defined SUPPORT_UTF
-            if (utf)
-              {
-              xclass = TRUE;
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
-              *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
-              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
-              }
-#endif
-#endif  /* Not EBCDIC */
+            (void)add_not_list_to_class(classbits, &class_uchardata, options, 
+              cd, PRIV(vspace_list)); 
             continue;


 #ifdef SUPPORT_UCP
@@ -4222,7 +4317,6 @@
               int pdata;
               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
               if (ptype < 0) goto FAILED;
-              xclass = TRUE;
               *class_uchardata++ = ((-c == ESC_p) != negated)?
                 XCL_PROP : XCL_NOTPROP;
               *class_uchardata++ = ptype;
@@ -4242,21 +4336,21 @@
               goto FAILED;
               }
             class_has_8bitchar--;    /* Undo the speculative increase. */
-            class_single_char -= 2;  /* Undo the speculative increase. */
+            class_one_char -= 2;     /* Undo the speculative increase. */
             c = *ptr;                /* Get the final character and fall through */
             break;
             }
           }


-        /* Fall through if we have a single character (c >= 0). This may be
-        greater than 256. */
-
+        /* Fall through if the escape just defined a single character (c >= 0).
+        This may be greater than 256. */
+         
         }   /* End of backslash handling */


-      /* A single character may be followed by '-' to form a range. However,
-      Perl does not permit ']' to be the end of the range. A '-' character
-      at the end is treated as a literal. Perl ignores orphaned \E sequences
-      entirely. The code for handling \Q and \E is messy. */
+      /* A character may be followed by '-' to form a range. However, Perl does
+      not permit ']' to be the end of the range. A '-' character at the end is
+      treated as a literal. Perl ignores orphaned \E sequences entirely. The
+      code for handling \Q and \E is messy. */


       CHECK_RANGE:
       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
@@ -4264,10 +4358,9 @@
         inescq = FALSE;
         ptr += 2;
         }
-
       oldptr = ptr;


-      /* Remember \r or \n */
+      /* Remember if \r or \n were explicitly used */


       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;


@@ -4290,12 +4383,17 @@
           inescq = TRUE;
           break;
           }
+          
+        /* Minus (hyphen) at the end of a class is treated as a literal, so put
+        back the pointer and jump to handle the character that preceded it. */


         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
           {
           ptr = oldptr;
-          goto LONE_SINGLE_CHARACTER;
+          goto CLASS_SINGLE_CHARACTER;
           }
+          
+        /* Otherwise, we have a potential range; pick up the next character */


 #ifdef SUPPORT_UTF
         if (utf)
@@ -4315,203 +4413,63 @@
           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
           if (*errorcodeptr != 0) goto FAILED;


-          /* \b is backspace; any other special means the '-' was literal */
+          /* \b is backspace; any other special means the '-' was literal. */


           if (d < 0)
             {
             if (d == -ESC_b) d = CHAR_BS; else
               {
               ptr = oldptr;
-              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
+              goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
               }
             }
           }


         /* Check that the two values are in the correct order. Optimize
-        one-character ranges */
+        one-character ranges. */


         if (d < c)
           {
           *errorcodeptr = ERR8;
           goto FAILED;
           }
+        if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */


-        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
+        /* We have found a character range, so single character optimizations
+        cannot be done anymore. Any value greater than 1 indicates that there
+        is more than one character. */
+         
+        class_one_char = 2;


-        /* Remember \r or \n */
+        /* Remember an explicit \r or \n, and add the range to the class. */


         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
-
-        /* Since we found a character range, single character optimizations
-        cannot be done anymore. */
-        class_single_char = 2;
-
-        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
-        matching, we have to use an XCLASS with extra data items. Caseless
-        matching for characters > 127 is available only if UCP support is
-        available. */
-
-#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
-        if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
-#elif defined  SUPPORT_UTF
-        if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
-#elif !(defined COMPILE_PCRE8)
-        if (d > 255)
-#endif
-#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
-          {
-          xclass = TRUE;
-
-          /* With UCP support, we can find the other case equivalents of
-          the relevant characters. There may be several ranges. Optimize how
-          they fit with the basic range. */
-
-#ifdef SUPPORT_UCP
-#ifndef COMPILE_PCRE8
-          if (utf && (options & PCRE_CASELESS) != 0)
-#else
-          if ((options & PCRE_CASELESS) != 0)
-#endif
-            {
-            unsigned int occ, ocd;
-            unsigned int cc = c;
-            unsigned int origd = d;
-            while (get_othercase_range(&cc, origd, &occ, &ocd))
-              {
-              if (occ >= (unsigned int)c &&
-                  ocd <= (unsigned int)d)
-                continue;                          /* Skip embedded ranges */
-
-              if (occ < (unsigned int)c  &&
-                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
-                {                                  /* if there is overlap,   */
-                c = occ;                           /* noting that if occ < c */
-                continue;                          /* we can't have ocd > d  */
-                }                                  /* because a subrange is  */
-              if (ocd > (unsigned int)d &&
-                  occ <= (unsigned int)d + 1)      /* always shorter than    */
-                {                                  /* the basic range.       */
-                d = ocd;
-                continue;
-                }
-
-              if (occ == ocd)
-                {
-                *class_uchardata++ = XCL_SINGLE;
-                }
-              else
-                {
-                *class_uchardata++ = XCL_RANGE;
-                class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
-                }
-              class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
-              }
-            }
-#endif  /* SUPPORT_UCP */
-
-          /* Now record the original range, possibly modified for UCP caseless
-          overlapping ranges. */
-
-          *class_uchardata++ = XCL_RANGE;
-#ifdef SUPPORT_UTF
-#ifndef COMPILE_PCRE8
-          if (utf)
-            {
-            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
-            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
-            }
-          else
-            {
-            *class_uchardata++ = c;
-            *class_uchardata++ = d;
-            }
-#else
-          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
-          class_uchardata += PRIV(ord2utf)(d, class_uchardata);
-#endif
-#else /* SUPPORT_UTF */
-          *class_uchardata++ = c;
-          *class_uchardata++ = d;
-#endif /* SUPPORT_UTF */
-
-          /* With UCP support, we are done. Without UCP support, there is no
-          caseless matching for UTF characters > 127; we can use the bit map
-          for the smaller ones. As for 16 bit characters without UTF, we
-          can still use  */
-
-#ifdef SUPPORT_UCP
-#ifndef COMPILE_PCRE8
-          if (utf)
-#endif
-            continue;    /* With next character in the class */
-#endif  /* SUPPORT_UCP */
-
-#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
-          if (utf)
-            {
-            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
-            /* Adjust upper limit and fall through to set up the map */
-            d = 127;
-            }
-          else
-            {
-            if (c > 255) continue;
-            /* Adjust upper limit and fall through to set up the map */
-            d = 255;
-            }
-#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
-          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
-          /* Adjust upper limit and fall through to set up the map */
-          d = 127;
-#else
-          if (c > 255) continue;
-          /* Adjust upper limit and fall through to set up the map */
-          d = 255;
-#endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
-          }
-#endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
-
-        /* We use the bit map for 8 bit mode, or when the characters fall
-        partially or entirely to [0-255] ([0-127] for UCP) ranges. */
-
-        class_has_8bitchar = 1;
-
-        /* We can save a bit of time by skipping this in the pre-compile. */
-
-        if (lengthptr == NULL) for (; c <= d; c++)
-          {
-          classbits[c/8] |= (1 << (c&7));
-          if ((options & PCRE_CASELESS) != 0)
-            {
-            int uc = cd->fcc[c]; /* flip case */
-            classbits[uc/8] |= (1 << (uc&7));
-            }
-          }
-
+         
+        class_has_8bitchar += 
+          add_to_class(classbits, &class_uchardata, options, cd, c, d);
+           
         continue;   /* Go get the next char in the class */
         }


-      /* Handle a lone single character - we can get here for a normal
-      non-escape char, or after \ that introduces a single character or for an
-      apparent range that isn't. */
+      /* Handle a single character - we can get here for a normal non-escape
+      char, or after \ that introduces a single character or for an apparent
+      range that isn't. Only the value 1 matters for class_one_char, so don't 
+      increase it if it is already 2 or more ... just in case there's a class 
+      with a zillion characters in it. */


-      LONE_SINGLE_CHARACTER:
+      CLASS_SINGLE_CHARACTER:
+      if (class_one_char < 2) class_one_char++;


-      /* Only the value of 1 matters for class_single_char. */
+      /* If class_one_char is 1, we have the first single character in the
+      class, and there have been no prior ranges, or XCLASS items generated by
+      escapes. If this is the final character in the class, we can optimize by
+      turning the item into a 1-character OP_CHAR[I] if it's positive, or
+      OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
+      to be set. Otherwise, there can be no first char if this item is first,
+      whatever repeat count may follow. In the case of reqchar, save the
+      previous value for reinstating. */


-      if (class_single_char < 2) class_single_char++;
-
-      /* If class_charcount is 1, we saw precisely one character. As long as
-      there was no use of \p or \P, in other words, no use of any XCLASS
-      features, we can optimize.
-
-      The optimization throws away the bit map. We turn the item into a
-      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
-      In the positive case, it can cause firstchar to be set. Otherwise, there
-      can be no first char if this item is first, whatever repeat count may
-      follow. In the case of reqchar, save the previous value for reinstating. */
-
-      if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+      if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
         {
         ptr++;
         zeroreqchar = reqchar;
@@ -4544,64 +4502,12 @@
           }
         goto ONE_CHAR;
         }       /* End of 1-char optimization */
-
-      /* Handle a character that cannot go in the bit map. */
-
-#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
-      if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
-#elif defined SUPPORT_UTF
-      if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
-#elif !(defined COMPILE_PCRE8)
-      if (c > 255)
-#endif
-
-#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
-        {
-        xclass = TRUE;
-        *class_uchardata++ = XCL_SINGLE;
-#ifdef SUPPORT_UTF
-#ifndef COMPILE_PCRE8
-        /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
-        if (!utf)
-          *class_uchardata++ = c;
-        else
-#endif
-          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
-#else /* SUPPORT_UTF */
-        *class_uchardata++ = c;
-#endif /* SUPPORT_UTF */
-
-#ifdef SUPPORT_UCP
-#ifdef COMPILE_PCRE8
-        if ((options & PCRE_CASELESS) != 0)
-#else
-        /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
-        if (utf && (options & PCRE_CASELESS) != 0)
-#endif
-          {
-          unsigned int othercase;
-          if ((int)(othercase = UCD_OTHERCASE(c)) != c)
-            {
-            *class_uchardata++ = XCL_SINGLE;
-            class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
-            }
-          }
-#endif  /* SUPPORT_UCP */
-
-        }
-      else
-#endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
-
-      /* Handle a single-byte character */
-        {
-        class_has_8bitchar = 1;
-        classbits[c/8] |= (1 << (c&7));
-        if ((options & PCRE_CASELESS) != 0)
-          {
-          c = cd->fcc[c]; /* flip case */
-          classbits[c/8] |= (1 << (c&7));
-          }
-        }
+        
+      /* There is more than one character in the class, or an XCLASS item
+      has been generated. Add this character to the class. */
+ 
+      class_has_8bitchar += 
+        add_to_class(classbits, &class_uchardata, options, cd, c, c);
       }


     /* Loop until ']' reached. This "while" is the end of the "do" far above.
@@ -4621,6 +4527,18 @@
       goto FAILED;
       }


+    /* We will need an XCLASS if data has been placed in class_uchardata. In
+    the second phase this is a sufficient test. However, in the pre-compile
+    phase, class_uchardata gets emptied to prevent workspace overflow, so
+    only if the very last character in the class needs XCLASS will it contain
+    anything at this point. For this reason, xclass gets set TRUE above when
+    class_uchardata is emptied, and that's why this code is the way it is here
+    instead of just doing a test on class_uchardata below. */
+     
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+    if (class_uchardata > class_uchardata_base) xclass = TRUE;
+#endif
+
     /* If this is the first thing in the branch, there can be no first char
     setting, whatever the repeat count. Any reqchar setting must remain
     unchanged after any kind of repeat. */


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_internal.h    2012-09-23 16:50:00 UTC (rev 1045)
@@ -834,78 +834,105 @@


/* Tests for Unicode horizontal and vertical whitespace characters must check a
number of different values. Using a switch statement for this generates the
-fastest code (no loop, no memory access), and there are several places where
-this happens. In order to ensure that all the case lists remain in step, we use
-macros so that there is only one place where the lists are defined.
+fastest code (no loop, no memory access), and there are several places in the
+interpreter code where this happens. In order to ensure that all the case lists
+remain in step, we use macros so that there is only one place where the lists
+are defined.

-NOTE: These values are also used explicitly in pcre_compile.c when processing
-\h, \H, \v and \V in a character class, so any changes here should be
-duplicated there as well. They also appear in pcre_jit_compile.c. */
+These values are also required as lists in pcre_compile.c when processing \h,
+\H, \v and \V in a character class. The lists are defined in pcre_tables.c, but
+macros that define the values are here so that all the definitions are
+together. The lists must be in ascending character order, terminated by
+NOTACHAR (which is 0xffffffff).

+Any changes should ensure that the various macros are kept in step with each
+other. NOTE: The values also appear in pcre_jit_compile.c. */
+
/* ------ ASCII/Unicode environments ------ */

 #ifndef EBCDIC
+
+#define HSPACE_LIST \
+  CHAR_HT, CHAR_SPACE, 0xa0, \
+  0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
+  0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
+  NOTACHAR 
+
 #define HSPACE_MULTIBYTE_CASES \
-      case 0x1680:    /* OGHAM SPACE MARK */ \
-      case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */ \
-      case 0x2000:    /* EN QUAD */ \
-      case 0x2001:    /* EM QUAD */ \
-      case 0x2002:    /* EN SPACE */ \
-      case 0x2003:    /* EM SPACE */ \
-      case 0x2004:    /* THREE-PER-EM SPACE */ \
-      case 0x2005:    /* FOUR-PER-EM SPACE */ \
-      case 0x2006:    /* SIX-PER-EM SPACE */ \
-      case 0x2007:    /* FIGURE SPACE */ \
-      case 0x2008:    /* PUNCTUATION SPACE */ \
-      case 0x2009:    /* THIN SPACE */ \
-      case 0x200A:    /* HAIR SPACE */ \
-      case 0x202f:    /* NARROW NO-BREAK SPACE */ \
-      case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */ \
-      case 0x3000     /* IDEOGRAPHIC SPACE */
+  case 0x1680:  /* OGHAM SPACE MARK */ \
+  case 0x180e:  /* MONGOLIAN VOWEL SEPARATOR */ \
+  case 0x2000:  /* EN QUAD */ \
+  case 0x2001:  /* EM QUAD */ \
+  case 0x2002:  /* EN SPACE */ \
+  case 0x2003:  /* EM SPACE */ \
+  case 0x2004:  /* THREE-PER-EM SPACE */ \
+  case 0x2005:  /* FOUR-PER-EM SPACE */ \
+  case 0x2006:  /* SIX-PER-EM SPACE */ \
+  case 0x2007:  /* FIGURE SPACE */ \
+  case 0x2008:  /* PUNCTUATION SPACE */ \
+  case 0x2009:  /* THIN SPACE */ \
+  case 0x200A:  /* HAIR SPACE */ \
+  case 0x202f:  /* NARROW NO-BREAK SPACE */ \
+  case 0x205f:  /* MEDIUM MATHEMATICAL SPACE */ \
+  case 0x3000   /* IDEOGRAPHIC SPACE */


 #define HSPACE_BYTE_CASES \
-      case CHAR_HT: \
-      case CHAR_SPACE: \
-      case 0xa0       /* NBSP */
+  case CHAR_HT: \
+  case CHAR_SPACE: \
+  case 0xa0     /* NBSP */
+      
+#define HSPACE_CASES \
+  HSPACE_BYTE_CASES: \
+  HSPACE_MULTIBYTE_CASES


+#define VSPACE_LIST \
+  CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR 
+
 #define VSPACE_MULTIBYTE_CASES \
-      case 0x2028:    /* LINE SEPARATOR */ \
-      case 0x2029     /* PARAGRAPH SEPARATOR */
+  case 0x2028:    /* LINE SEPARATOR */ \
+  case 0x2029     /* PARAGRAPH SEPARATOR */


 #define VSPACE_BYTE_CASES \
-      case CHAR_LF: \
-      case CHAR_VT: \
-      case CHAR_FF: \
-      case CHAR_CR: \
-      case CHAR_NEL
+  case CHAR_LF: \
+  case CHAR_VT: \
+  case CHAR_FF: \
+  case CHAR_CR: \
+  case CHAR_NEL


-#define HSPACE_CASES \
-        HSPACE_BYTE_CASES: \
-        HSPACE_MULTIBYTE_CASES
-
 #define VSPACE_CASES \
-        VSPACE_BYTE_CASES: \
-        VSPACE_MULTIBYTE_CASES
+  VSPACE_BYTE_CASES: \
+  VSPACE_MULTIBYTE_CASES


/* ------ EBCDIC environments ------ */

 #else
+#define HSPACE_LIST CHAR_HT, CHAR_SPACE
+
 #define HSPACE_BYTE_CASES \
-      case CHAR_HT: \
-      case CHAR_SPACE
-      
+  case CHAR_HT: \
+  case CHAR_SPACE
+
+#define HSPACE_CASES HSPACE_BYTE_CASES
+
+#ifdef EBCDIC_NL25
+#define VSPACE_LIST \
+  CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR 
+#else
+#define VSPACE_LIST \
+  CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR 
+#endif   
+
 #define VSPACE_BYTE_CASES \
-      case CHAR_LF: \
-      case CHAR_VT: \
-      case CHAR_FF: \
-      case CHAR_CR: \
-      case CHAR_NEL
-      
-#define HSPACE_CASES HSPACE_BYTE_CASES
+  case CHAR_LF: \
+  case CHAR_VT: \
+  case CHAR_FF: \
+  case CHAR_CR: \
+  case CHAR_NEL
+
 #define VSPACE_CASES VSPACE_BYTE_CASES
 #endif  /* EBCDIC */


-/* ------ End of whitespace case macros ------ */
+/* ------ End of whitespace macros ------ */


/* In case there is no definition of offsetof() provided - though any proper
@@ -2351,22 +2378,22 @@
pcre_tables.c module. */

 #ifdef COMPILE_PCRE8
-
 extern const int            PRIV(utf8_table1)[];
 extern const int            PRIV(utf8_table1_size);
 extern const int            PRIV(utf8_table2)[];
 extern const int            PRIV(utf8_table3)[];
 extern const pcre_uint8     PRIV(utf8_table4)[];
-
 #endif /* COMPILE_PCRE8 */


 extern const char           PRIV(utt_names)[];
 extern const ucp_type_table PRIV(utt)[];
 extern const int            PRIV(utt_size);


+extern const pcre_uint8     PRIV(OP_lengths)[];
 extern const pcre_uint8     PRIV(default_tables)[];


-extern const pcre_uint8     PRIV(OP_lengths)[];
+extern const pcre_uint32    PRIV(hspace_list)[];
+extern const pcre_uint32    PRIV(vspace_list)[];



 /* Internal shared functions. These are functions that are used by more than
@@ -2435,7 +2462,7 @@
   pcre_uint8 script;     /* ucp_Arabic, etc. */
   pcre_uint8 chartype;   /* ucp_Cc, etc. (general categories) */
   pcre_uint8 gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
-  pcre_uint8 caseset;    /* offset to multichar other cases or zero */ 
+  pcre_uint8 caseset;    /* offset to multichar other cases or zero */
   pcre_int32 other_case; /* offset to other case, or zero if none */
 } ucd_record;



Modified: code/trunk/pcre_printint.c
===================================================================
--- code/trunk/pcre_printint.c    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_printint.c    2012-09-23 16:50:00 UTC (rev 1045)
@@ -130,7 +130,9 @@


if (!utf || (c & 0xc0) != 0xc0)
{
- if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ if (PRINTABLE(c)) fprintf(f, "%c", c);
+ else if (c < 0x80) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%02x}", c);
return 0;
}
else
@@ -167,8 +169,8 @@
if (!utf || (c & 0xfc00) != 0xd800)
{
if (PRINTABLE(c)) fprintf(f, "%c", c);
- else if (c <= 0xff) fprintf(f, "\\x%02x", c);
- else fprintf(f, "\\x{%x}", c);
+ else if (c <= 0x80) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%02x}", c);
return 0;
}
else

Modified: code/trunk/pcre_tables.c
===================================================================
--- code/trunk/pcre_tables.c    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_tables.c    2012-09-23 16:50:00 UTC (rev 1045)
@@ -58,8 +58,14 @@


const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS };

+/* Tables of horizontal and vertical whitespace characters, suitable for
+adding to classes. */

+const pcre_uint32 PRIV(hspace_list)[] = { HSPACE_LIST };
+const pcre_uint32 PRIV(vspace_list)[] = { VSPACE_LIST };

+
+
 /*************************************************
 *           Tables for UTF-8 support             *
 *************************************************/


Modified: code/trunk/pcre_ucd.c
===================================================================
--- code/trunk/pcre_ucd.c    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_ucd.c    2012-09-23 16:50:00 UTC (rev 1045)
@@ -39,26 +39,26 @@



const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
- 0xffffffff,
- 0x0053, 0x0073, 0x017f, 0xffffffff,
- 0x01c4, 0x01c5, 0x01c6, 0xffffffff,
- 0x01c7, 0x01c8, 0x01c9, 0xffffffff,
- 0x01ca, 0x01cb, 0x01cc, 0xffffffff,
- 0x01f1, 0x01f2, 0x01f3, 0xffffffff,
- 0x0345, 0x0399, 0x03b9, 0x1fbe, 0xffffffff,
- 0x00b5, 0x039c, 0x03bc, 0xffffffff,
- 0x03a3, 0x03c2, 0x03c3, 0xffffffff,
- 0x0392, 0x03b2, 0x03d0, 0xffffffff,
- 0x0398, 0x03b8, 0x03d1, 0x03f4, 0xffffffff,
- 0x03a6, 0x03c6, 0x03d5, 0xffffffff,
- 0x03a0, 0x03c0, 0x03d6, 0xffffffff,
- 0x039a, 0x03ba, 0x03f0, 0xffffffff,
- 0x03a1, 0x03c1, 0x03f1, 0xffffffff,
- 0x0395, 0x03b5, 0x03f5, 0xffffffff,
- 0x1e60, 0x1e61, 0x1e9b, 0xffffffff,
- 0x03a9, 0x03c9, 0x2126, 0xffffffff,
- 0x004b, 0x006b, 0x212a, 0xffffffff,
- 0x00c5, 0x00e5, 0x212b, 0xffffffff,
+ NOTACHAR,
+ 0x0053, 0x0073, 0x017f, NOTACHAR,
+ 0x01c4, 0x01c5, 0x01c6, NOTACHAR,
+ 0x01c7, 0x01c8, 0x01c9, NOTACHAR,
+ 0x01ca, 0x01cb, 0x01cc, NOTACHAR,
+ 0x01f1, 0x01f2, 0x01f3, NOTACHAR,
+ 0x0345, 0x0399, 0x03b9, 0x1fbe, NOTACHAR,
+ 0x00b5, 0x039c, 0x03bc, NOTACHAR,
+ 0x03a3, 0x03c2, 0x03c3, NOTACHAR,
+ 0x0392, 0x03b2, 0x03d0, NOTACHAR,
+ 0x0398, 0x03b8, 0x03d1, 0x03f4, NOTACHAR,
+ 0x03a6, 0x03c6, 0x03d5, NOTACHAR,
+ 0x03a0, 0x03c0, 0x03d6, NOTACHAR,
+ 0x039a, 0x03ba, 0x03f0, NOTACHAR,
+ 0x03a1, 0x03c1, 0x03f1, NOTACHAR,
+ 0x0395, 0x03b5, 0x03f5, NOTACHAR,
+ 0x1e60, 0x1e61, 0x1e9b, NOTACHAR,
+ 0x03a9, 0x03c9, 0x2126, NOTACHAR,
+ 0x004b, 0x006b, 0x212a, NOTACHAR,
+ 0x00c5, 0x00e5, 0x212b, NOTACHAR,
};

const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */

Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testinput10    2012-09-23 16:50:00 UTC (rev 1045)
@@ -1090,4 +1090,22 @@


/-- --/

+/\x{1e9e}+/8i
+    \x{1e9e}\x{00df}
+
+/[z\x{1e9e}]+/8i
+    \x{1e9e}\x{00df}
+
+/\x{00df}+/8i
+    \x{1e9e}\x{00df}
+
+/[z\x{00df}]+/8i
+    \x{1e9e}\x{00df}
+
+/\x{1f88}+/8i
+    \x{1f88}\x{1f80} 
+
+/[z\x{1f88}]+/8i
+    \x{1f88}\x{1f80} 
+
 /-- End of testinput10 --/ 


Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testinput6    2012-09-23 16:50:00 UTC (rev 1045)
@@ -1,6 +1,5 @@
 /-- This set of tests is for Unicode property support. It is compatible with
-    Perl >= 5.10, but not 5.8 because it tests some extra properties that are
-    not in the earlier release. --/ 
+    Perl >= 5.15. --/


 /^\pC\pL\pM\pN\pP\pS\pZ</8
     \x7f\x{c0}\x{30f}\x{660}\x{66c}\x{f01}\x{1680}<
@@ -885,4 +884,205 @@


/-- --/

+/\x{1e9e}+/8i
+    \x{1e9e}\x{00df}
+
+/[z\x{1e9e}]+/8i
+    \x{1e9e}\x{00df}
+
+/\x{00df}+/8i
+    \x{1e9e}\x{00df}
+
+/[z\x{00df}]+/8i
+    \x{1e9e}\x{00df}
+
+/\x{1f88}+/8i
+    \x{1f88}\x{1f80} 
+
+/[z\x{1f88}]+/8i
+    \x{1f88}\x{1f80} 
+    
+/-- Characters with more than one other case; test in classes --/
+
+/[z\x{00b5}]+/8i
+    \x{00b5}\x{039c}\x{03bc}
+
+/[z\x{039c}]+/8i
+    \x{00b5}\x{039c}\x{03bc}
+
+/[z\x{03bc}]+/8i
+    \x{00b5}\x{039c}\x{03bc}
+
+/[z\x{00c5}]+/8i
+    \x{00c5}\x{00e5}\x{212b}
+
+/[z\x{00e5}]+/8i
+    \x{00c5}\x{00e5}\x{212b}
+
+/[z\x{212b}]+/8i
+    \x{00c5}\x{00e5}\x{212b}
+
+/[z\x{01c4}]+/8i
+    \x{01c4}\x{01c5}\x{01c6}
+
+/[z\x{01c5}]+/8i
+    \x{01c4}\x{01c5}\x{01c6}
+
+/[z\x{01c6}]+/8i
+    \x{01c4}\x{01c5}\x{01c6}
+
+/[z\x{01c7}]+/8i
+    \x{01c7}\x{01c8}\x{01c9}
+
+/[z\x{01c8}]+/8i
+    \x{01c7}\x{01c8}\x{01c9}
+
+/[z\x{01c9}]+/8i
+    \x{01c7}\x{01c8}\x{01c9}
+
+/[z\x{01ca}]+/8i
+    \x{01ca}\x{01cb}\x{01cc}
+
+/[z\x{01cb}]+/8i
+    \x{01ca}\x{01cb}\x{01cc}
+
+/[z\x{01cc}]+/8i
+    \x{01ca}\x{01cb}\x{01cc}
+
+/[z\x{01f1}]+/8i
+    \x{01f1}\x{01f2}\x{01f3}
+
+/[z\x{01f2}]+/8i
+    \x{01f1}\x{01f2}\x{01f3}
+
+/[z\x{01f3}]+/8i
+    \x{01f1}\x{01f2}\x{01f3}
+
+/[z\x{0345}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{0399}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{03b9}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{1fbe}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{0392}]+/8i
+    \x{0392}\x{03b2}\x{03d0}
+
+/[z\x{03b2}]+/8i
+    \x{0392}\x{03b2}\x{03d0}
+
+/[z\x{03d0}]+/8i
+    \x{0392}\x{03b2}\x{03d0}
+
+/[z\x{0395}]+/8i
+    \x{0395}\x{03b5}\x{03f5}
+
+/[z\x{03b5}]+/8i
+    \x{0395}\x{03b5}\x{03f5}
+
+/[z\x{03f5}]+/8i
+    \x{0395}\x{03b5}\x{03f5}
+
+/[z\x{0398}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{03b8}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{03d1}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{03f4}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{039a}]+/8i
+    \x{039a}\x{03ba}\x{03f0}
+
+/[z\x{03ba}]+/8i
+    \x{039a}\x{03ba}\x{03f0}
+
+/[z\x{03f0}]+/8i
+    \x{039a}\x{03ba}\x{03f0}
+
+/[z\x{03a0}]+/8i
+    \x{03a0}\x{03c0}\x{03d6} 
+
+/[z\x{03c0}]+/8i
+    \x{03a0}\x{03c0}\x{03d6} 
+
+/[z\x{03d6}]+/8i
+    \x{03a0}\x{03c0}\x{03d6} 
+
+/[z\x{03a1}]+/8i
+    \x{03a1}\x{03c1}\x{03f1}
+
+/[z\x{03c1}]+/8i
+    \x{03a1}\x{03c1}\x{03f1}
+
+/[z\x{03f1}]+/8i
+    \x{03a1}\x{03c1}\x{03f1}
+
+/[z\x{03a3}]+/8i
+    \x{03A3}\x{03C2}\x{03C3}
+
+/[z\x{03c2}]+/8i
+    \x{03A3}\x{03C2}\x{03C3}
+
+/[z\x{03c3}]+/8i
+    \x{03A3}\x{03C2}\x{03C3}
+
+/[z\x{03a6}]+/8i
+    \x{03a6}\x{03c6}\x{03d5} 
+
+/[z\x{03c6}]+/8i
+    \x{03a6}\x{03c6}\x{03d5} 
+
+/[z\x{03d5}]+/8i
+    \x{03a6}\x{03c6}\x{03d5} 
+
+/[z\x{03c9}]+/8i
+    \x{03c9}\x{03a9}\x{2126}
+
+/[z\x{03a9}]+/8i
+    \x{03c9}\x{03a9}\x{2126}
+
+/[z\x{2126}]+/8i
+    \x{03c9}\x{03a9}\x{2126}
+
+/[z\x{1e60}]+/8i
+    \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e61}]+/8i
+    \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e9b}]+/8i
+    \x{1e60}\x{1e61}\x{1e9b}
+
+/-- Perl 5.12.4 gets these wrong, but 5.15.3 is OK --/
+
+/[z\x{004b}]+/8i
+    \x{004b}\x{006b}\x{212a}
+
+/[z\x{006b}]+/8i
+    \x{004b}\x{006b}\x{212a}
+
+/[z\x{212a}]+/8i
+    \x{004b}\x{006b}\x{212a}
+
+/[z\x{0053}]+/8i
+    \x{0053}\x{0073}\x{017f}
+
+/[z\x{0073}]+/8i
+    \x{0053}\x{0073}\x{017f}
+
+/[z\x{017f}]+/8i
+    \x{0053}\x{0073}\x{017f}
+    
+/-- --/ 
+
 /-- End of testinput6 --/


Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput10    2012-09-23 16:50:00 UTC (rev 1045)
@@ -2268,4 +2268,34 @@


/-- --/

+/\x{1e9e}+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/[z\x{1e9e}]+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/\x{00df}+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/[z\x{00df}]+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/\x{1f88}+/8i
+    \x{1f88}\x{1f80} 
+ 0: \x{1f88}\x{1f80}
+ 1: \x{1f88}
+
+/[z\x{1f88}]+/8i
+    \x{1f88}\x{1f80} 
+ 0: \x{1f88}\x{1f80}
+ 1: \x{1f88}
+
 /-- End of testinput10 --/ 


Modified: code/trunk/testdata/testoutput11-16
===================================================================
--- code/trunk/testdata/testoutput11-16    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput11-16    2012-09-23 16:50:00 UTC (rev 1045)
@@ -333,7 +333,7 @@
 Memory allocation (code space): 14
 ------------------------------------------------------------------
   0   4 Bra
-  2     \xff
+  2     \x{ff}
   4   4 Ket
   6     End
 ------------------------------------------------------------------
@@ -360,7 +360,7 @@
 Memory allocation (code space): 14
 ------------------------------------------------------------------
   0   4 Bra
-  2     \xff
+  2     \x{ff}
   4   4 Ket
   6     End
 ------------------------------------------------------------------
@@ -591,7 +591,7 @@
 Memory allocation (code space): 14
 ------------------------------------------------------------------
   0   4 Bra
-  2     \xaa
+  2     \x{aa}
   4   4 Ket
   6     End
 ------------------------------------------------------------------
@@ -600,7 +600,7 @@
 Memory allocation (code space): 14
 ------------------------------------------------------------------
   0   4 Bra
-  2     \xaa
+  2     \x{aa}
   4   4 Ket
   6     End
 ------------------------------------------------------------------
@@ -627,7 +627,7 @@
 Memory allocation (code space): 14
 ------------------------------------------------------------------
   0   4 Bra
-  2     [^\xaa]
+  2     [^\x{aa}]
   4   4 Ket
   6     End
 ------------------------------------------------------------------
@@ -636,7 +636,7 @@
 Memory allocation (code space): 14
 ------------------------------------------------------------------
   0   4 Bra
-  2     [^\xaa]
+  2     [^\x{aa}]
   4   4 Ket
   6     End
 ------------------------------------------------------------------


Modified: code/trunk/testdata/testoutput11-8
===================================================================
--- code/trunk/testdata/testoutput11-8    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput11-8    2012-09-23 16:50:00 UTC (rev 1045)
@@ -591,7 +591,7 @@
 Memory allocation (code space): 9
 ------------------------------------------------------------------
   0   5 Bra
-  3     \xaa
+  3     \x{aa}
   5   5 Ket
   8     End
 ------------------------------------------------------------------
@@ -627,7 +627,7 @@
 Memory allocation (code space): 9
 ------------------------------------------------------------------
   0   5 Bra
-  3     [^\xaa]
+  3     [^\x{aa}]
   5   5 Ket
   8     End
 ------------------------------------------------------------------


Modified: code/trunk/testdata/testoutput15
===================================================================
--- code/trunk/testdata/testoutput15    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput15    2012-09-23 16:50:00 UTC (rev 1045)
@@ -560,7 +560,7 @@
 /[^\x{c4}]/DZ
 ------------------------------------------------------------------
         Bra
-        [^\xc4]
+        [^\x{c4}]
         Ket
         End
 ------------------------------------------------------------------


Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput17    2012-09-23 16:50:00 UTC (rev 1045)
@@ -20,7 +20,7 @@
 /[^\x{c4}]/DZ
 ------------------------------------------------------------------
         Bra
-        [^\xc4]
+        [^\x{c4}]
         Ket
         End
 ------------------------------------------------------------------
@@ -271,7 +271,7 @@
 /[\H]/BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}]
+        [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -287,7 +287,7 @@
 /[\V]/BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}]
+        [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -295,7 +295,7 @@
 /[\x0a\V]/BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x0a\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}]
+        [\x00-\x0a\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -349,7 +349,7 @@
 /[\H\x{d800}]+/BZSI
 ------------------------------------------------------------------
         Bra
-        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}\x{d800}]+
+        [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}\x{d800}]+
         Ket
         End
 ------------------------------------------------------------------
@@ -413,7 +413,7 @@
 /[\V\x{d800}]+/BZSI
 ------------------------------------------------------------------
         Bra
-        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}\x{d800}]+
+        [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{ffff}\x{d800}]+
         Ket
         End
 ------------------------------------------------------------------
@@ -452,7 +452,7 @@
 ------------------------------------------------------------------
         Bra
         [^\x80]
-        [^\xff]
+        [^\x{ff}]
         [^\x{100}]
         [^\x{1000}]
         [^\x{ffff}]
@@ -464,7 +464,7 @@
 ------------------------------------------------------------------
         Bra
      /i [^\x80]
-     /i [^\xff]
+     /i [^\x{ff}]
      /i [^\x{100}]
      /i [^\x{1000}]
      /i [^\x{ffff}]


Modified: code/trunk/testdata/testoutput18
===================================================================
--- code/trunk/testdata/testoutput18    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput18    2012-09-23 16:50:00 UTC (rev 1045)
@@ -161,7 +161,7 @@
 /[\x{ff}]/8DZ
 ------------------------------------------------------------------
         Bra
-        \xff
+        \x{ff}
         Ket
         End
 ------------------------------------------------------------------
@@ -197,7 +197,7 @@
 /\xff/8DZ
 ------------------------------------------------------------------
         Bra
-        \xff
+        \x{ff}
         Ket
         End
 ------------------------------------------------------------------
@@ -249,7 +249,7 @@
 /\x{084}/DZ8
 ------------------------------------------------------------------
         Bra
-        \x84
+        \x{84}
         Ket
         End
 ------------------------------------------------------------------
@@ -489,7 +489,7 @@
 /[^\x{c4}]/DZ
 ------------------------------------------------------------------
         Bra
-        [^\xc4]
+        [^\x{c4}]
         Ket
         End
 ------------------------------------------------------------------
@@ -521,7 +521,7 @@
 /[\xff]/DZ8
 ------------------------------------------------------------------
         Bra
-        \xff
+        \x{ff}
         Ket
         End
 ------------------------------------------------------------------
@@ -535,7 +535,7 @@
 /[^\xff]/8DZ
 ------------------------------------------------------------------
         Bra
-        [^\xff]
+        [^\x{ff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -812,7 +812,7 @@
 /[^\x{c4}]/8DZ
 ------------------------------------------------------------------
         Bra
-        [^\xc4]
+        [^\x{c4}]
         Ket
         End
 ------------------------------------------------------------------
@@ -863,7 +863,7 @@
 ------------------------------------------------------------------
         Bra
         \w++
-        \xc4
+        \x{c4}
         Ket
         End
 ------------------------------------------------------------------
@@ -874,7 +874,7 @@
 ------------------------------------------------------------------
         Bra
         \w+
-        \xc4
+        \x{c4}
         Ket
         End
 ------------------------------------------------------------------
@@ -885,7 +885,7 @@
 ------------------------------------------------------------------
         Bra
         \W+
-        \xc4
+        \x{c4}
         Ket
         End
 ------------------------------------------------------------------
@@ -896,7 +896,7 @@
 ------------------------------------------------------------------
         Bra
         \W++
-        \xc4
+        \x{c4}
         Ket
         End
 ------------------------------------------------------------------
@@ -907,7 +907,7 @@
 ------------------------------------------------------------------
         Bra
         \W+
-        \xa1
+        \x{a1}
         Ket
         End
 ------------------------------------------------------------------
@@ -918,7 +918,7 @@
 ------------------------------------------------------------------
         Bra
         \W+
-        \xa1
+        \x{a1}
         Ket
         End
 ------------------------------------------------------------------
@@ -930,7 +930,7 @@
         Bra
         X
         \s++
-        \xa0
+        \x{a0}
         Ket
         End
 ------------------------------------------------------------------
@@ -942,7 +942,7 @@
         Bra
         X
         \s+
-        \xa0
+        \x{a0}
         Ket
         End
 ------------------------------------------------------------------
@@ -953,7 +953,7 @@
 ------------------------------------------------------------------
         Bra
         \S+
-        \xa0
+        \x{a0}
         Ket
         End
 ------------------------------------------------------------------
@@ -964,7 +964,7 @@
 ------------------------------------------------------------------
         Bra
         \S++
-        \xa0
+        \x{a0}
         Ket
         End
 ------------------------------------------------------------------
@@ -974,7 +974,7 @@
 /\x{a0}+\s!/8BZ
 ------------------------------------------------------------------
         Bra
-        \xa0++
+        \x{a0}++
         \s
         !
         Ket
@@ -986,7 +986,7 @@
 /\x{a0}+\s!/8BZT1
 ------------------------------------------------------------------
         Bra
-        \xa0+
+        \x{a0}+
         \s
         !
         Ket


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput5    2012-09-23 16:50:00 UTC (rev 1045)
@@ -276,7 +276,7 @@
 /[\xFF]/DZ
 ------------------------------------------------------------------
         Bra
-        \xff
+        \x{ff}
         Ket
         End
 ------------------------------------------------------------------
@@ -290,7 +290,7 @@
 /[^\xFF]/DZ
 ------------------------------------------------------------------
         Bra
-        [^\xff]
+        [^\x{ff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -786,7 +786,7 @@
 /[\H]/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
+        [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -794,7 +794,7 @@
 /[\V]/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{10ffff}]
+        [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{10ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -1596,7 +1596,7 @@
 /[\H\x{d7ff}]+/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}\x{d7ff}]+
+        [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}\x{d7ff}]+
         Ket
         End
 ------------------------------------------------------------------
@@ -1636,7 +1636,7 @@
 /[\V\x{d7ff}]+/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{10ffff}\x{d7ff}]+
+        [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{10ffff}\x{d7ff}]+
         Ket
         End
 ------------------------------------------------------------------


Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput6    2012-09-23 16:50:00 UTC (rev 1045)
@@ -1,6 +1,5 @@
 /-- This set of tests is for Unicode property support. It is compatible with
-    Perl >= 5.10, but not 5.8 because it tests some extra properties that are
-    not in the earlier release. --/ 
+    Perl >= 5.15. --/


 /^\pC\pL\pM\pN\pP\pS\pZ</8
     \x7f\x{c0}\x{30f}\x{660}\x{66c}\x{f01}\x{1680}<
@@ -1551,4 +1550,270 @@


/-- --/

+/\x{1e9e}+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/[z\x{1e9e}]+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/\x{00df}+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/[z\x{00df}]+/8i
+    \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/\x{1f88}+/8i
+    \x{1f88}\x{1f80} 
+ 0: \x{1f88}\x{1f80}
+
+/[z\x{1f88}]+/8i
+    \x{1f88}\x{1f80} 
+ 0: \x{1f88}\x{1f80}
+    
+/-- Characters with more than one other case; test in classes --/
+
+/[z\x{00b5}]+/8i
+    \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/[z\x{039c}]+/8i
+    \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/[z\x{03bc}]+/8i
+    \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/[z\x{00c5}]+/8i
+    \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/[z\x{00e5}]+/8i
+    \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/[z\x{212b}]+/8i
+    \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/[z\x{01c4}]+/8i
+    \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/[z\x{01c5}]+/8i
+    \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/[z\x{01c6}]+/8i
+    \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/[z\x{01c7}]+/8i
+    \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/[z\x{01c8}]+/8i
+    \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/[z\x{01c9}]+/8i
+    \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/[z\x{01ca}]+/8i
+    \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/[z\x{01cb}]+/8i
+    \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/[z\x{01cc}]+/8i
+    \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/[z\x{01f1}]+/8i
+    \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/[z\x{01f2}]+/8i
+    \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/[z\x{01f3}]+/8i
+    \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/[z\x{0345}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{0399}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{03b9}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{1fbe}]+/8i
+    \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{0392}]+/8i
+    \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/[z\x{03b2}]+/8i
+    \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/[z\x{03d0}]+/8i
+    \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/[z\x{0395}]+/8i
+    \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/[z\x{03b5}]+/8i
+    \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/[z\x{03f5}]+/8i
+    \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/[z\x{0398}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{03b8}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{03d1}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{03f4}]+/8i
+    \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{039a}]+/8i
+    \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/[z\x{03ba}]+/8i
+    \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/[z\x{03f0}]+/8i
+    \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/[z\x{03a0}]+/8i
+    \x{03a0}\x{03c0}\x{03d6} 
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/[z\x{03c0}]+/8i
+    \x{03a0}\x{03c0}\x{03d6} 
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/[z\x{03d6}]+/8i
+    \x{03a0}\x{03c0}\x{03d6} 
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/[z\x{03a1}]+/8i
+    \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/[z\x{03c1}]+/8i
+    \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/[z\x{03f1}]+/8i
+    \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/[z\x{03a3}]+/8i
+    \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/[z\x{03c2}]+/8i
+    \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/[z\x{03c3}]+/8i
+    \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/[z\x{03a6}]+/8i
+    \x{03a6}\x{03c6}\x{03d5} 
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/[z\x{03c6}]+/8i
+    \x{03a6}\x{03c6}\x{03d5} 
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/[z\x{03d5}]+/8i
+    \x{03a6}\x{03c6}\x{03d5} 
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/[z\x{03c9}]+/8i
+    \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/[z\x{03a9}]+/8i
+    \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/[z\x{2126}]+/8i
+    \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/[z\x{1e60}]+/8i
+    \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e61}]+/8i
+    \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e9b}]+/8i
+    \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/-- Perl 5.12.4 gets these wrong, but 5.15.3 is OK --/
+
+/[z\x{004b}]+/8i
+    \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/[z\x{006b}]+/8i
+    \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/[z\x{212a}]+/8i
+    \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/[z\x{0053}]+/8i
+    \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/[z\x{0073}]+/8i
+    \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/[z\x{017f}]+/8i
+    \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+    
+/-- --/ 
+
 /-- End of testinput6 --/


Modified: code/trunk/testdata/testoutput7
===================================================================
--- code/trunk/testdata/testoutput7    2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput7    2012-09-23 16:50:00 UTC (rev 1045)
@@ -124,7 +124,7 @@
 /[z-\x{100}]/8iDZ 
 ------------------------------------------------------------------
         Bra
-        [Z\x{39c}\x{178}z-\x{101}]
+        [Z\x{39c}\x{3bc}\x{1e9e}\x{178}z-\x{101}]
         Ket
         End
 ------------------------------------------------------------------
@@ -162,7 +162,7 @@
 /[z-\x{100}]/8DZi
 ------------------------------------------------------------------
         Bra
-        [Z\x{39c}\x{178}z-\x{101}]
+        [Z\x{39c}\x{3bc}\x{1e9e}\x{178}z-\x{101}]
         Ket
         End
 ------------------------------------------------------------------