Revision: 1045
http://vcs.pcre.org/viewvc?view=rev&revision=1045
Author: ph10
Date: 2012-09-23 17:50:00 +0100 (Sun, 23 Sep 2012)
Log Message:
-----------
Update character class handling to use new character case information; rework
\h, \H, \v, and \V to use the same apparatus with centrally defined lists.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/maint/MultiStage2.py
code/trunk/maint/ucptest.c
code/trunk/pcre_compile.c
code/trunk/pcre_internal.h
code/trunk/pcre_printint.c
code/trunk/pcre_tables.c
code/trunk/pcre_ucd.c
code/trunk/testdata/testinput10
code/trunk/testdata/testinput6
code/trunk/testdata/testoutput10
code/trunk/testdata/testoutput11-16
code/trunk/testdata/testoutput11-8
code/trunk/testdata/testoutput15
code/trunk/testdata/testoutput17
code/trunk/testdata/testoutput18
code/trunk/testdata/testoutput5
code/trunk/testdata/testoutput6
code/trunk/testdata/testoutput7
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/ChangeLog 2012-09-23 16:50:00 UTC (rev 1045)
@@ -85,6 +85,29 @@
20. Turn case lists for horizontal and vertical white space into macros so that
they are defined only once.
+
+21. This set of changes together gives more compatible Unicode case-folding
+ behaviour for characters that have more than one other case.
+
+ (a) The Unicode property table now has offsets into a new table of sets of
+ three or more characters that are case-equivalent. The MultiStage2.py
+ script that generates these tables (the pcre_ucd.c file) now scans
+ CaseFolding.txt instead of UnicodeData.txt for character case
+ information.
+
+ (b) The code for adding characters or ranges of characters to a character
+ class has been abstracted into a generalized function that also handles
+ case-independence. In UTF-mode with UCP support, this uses the new data
+ to handle characters with more than one other case.
+
+ (c) A bug that is fixed as a result of (b) is that codepoints less than 256
+ whose other case is greater than 256 are now correctly matched
+ caselessly. Previously, the high codepoint matched the low one, but not
+ vice versa.
+
+ (d) The processing of \h, \H, \v, and \V in character classes now makes use
+ of the new class addition function, using character lists defined as
+ macros alongside the case definitions of 20 above.
Version 8.31 06-July-2012
Modified: code/trunk/maint/MultiStage2.py
===================================================================
--- code/trunk/maint/MultiStage2.py 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/maint/MultiStage2.py 2012-09-23 16:50:00 UTC (rev 1045)
@@ -56,9 +56,9 @@
#
# This script constructs four tables. The ucd_caseless_sets table contains
# lists of characters that all match each other caselessly. Each list is
-# in order, and is terminated by 0xffffffff, which is of course larger than any
-# valid character. The first list is empty; this is used for characters that
-# are not part of any list.
+# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
+# any valid character. The first list is empty; this is used for characters
+# that are not part of any list.
#
# The ucd_records table contains one instance of every unique record that is
# required. The ucd_stage1 table is indexed by a character's block number, and
@@ -435,12 +435,12 @@
# --- Added by PH: output the table of caseless character sets ---
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {"
-print " 0xffffffff,"
+print " NOTACHAR,"
for s in sets:
s = sorted(s)
for x in s:
print ' 0x%04x,' % x,
- print ' 0xffffffff,'
+ print ' NOTACHAR,'
print '};'
print
Modified: code/trunk/maint/ucptest.c
===================================================================
--- code/trunk/maint/ucptest.c 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/maint/ucptest.c 2012-09-23 16:50:00 UTC (rev 1045)
@@ -243,7 +243,7 @@
if (caseset != 0)
{
const pcre_uint32 *p = PRIV(ucd_caseless_sets) + caseset - 1;
- while (*(++p) < 0xffffffff)
+ while (*(++p) < NOTACHAR)
if (*p != othercase && *p != c) printf(", %04x", *p);
}
}
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_compile.c 2012-09-23 16:50:00 UTC (rev 1045)
@@ -68,7 +68,7 @@
/* Macro for setting individual bits in class bitmaps. */
-#define SETBIT(a,b) a[b/8] |= (1 << (b%8))
+#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7))
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
@@ -77,7 +77,18 @@
#define OFLOW_MAX (INT_MAX - 20)
+/* Definitions to allow mutual recursion */
+static int
+ add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
+ const pcre_uint32 *, unsigned int);
+
+static BOOL
+ compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL,
+ int, int, int *, int *, branch_chain *, compile_data *, int *);
+
+
+
/*************************************************
* Code parameters and static tables *
*************************************************/
@@ -631,14 +642,8 @@
#endif
-/* Definition to allow mutual recursion */
-static BOOL
- compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
- int *, int *, branch_chain *, compile_data *, int *);
-
-
/*************************************************
* Find an error text *
*************************************************/
@@ -2871,9 +2876,10 @@
*************************************************/
/* This function is passed the start and end of a class range, in UTF-8 mode
-with UCP support. It searches up the characters, looking for internal ranges of
+with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
-start address.
+start address. A character with multiple other cases is returned on its own
+with a special return value.
Arguments:
cptr points to starting character value; updated
@@ -2881,19 +2887,34 @@
ocptr where to put start of othercase range
odptr where to put end of othercase range
-Yield: TRUE when range returned; FALSE when no more
+Yield: -1 when no more
+ 0 when a range is returned
+ >0 the CASESET offset for char with multiple other cases
+ in this case, ocptr contains the original
*/
-static BOOL
+static int
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
unsigned int *odptr)
{
unsigned int c, othercase, next;
+int co;
+/* Find the first character that has an other case. If it has multiple other
+cases, return its case offset value. */
+
for (c = *cptr; c <= d; c++)
- { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
+ {
+ if ((co = UCD_CASESET(c)) != 0)
+ {
+ *ocptr = c++; /* Character that has the set */
+ *cptr = c; /* Rest of input range */
+ return co;
+ }
+ if ((othercase = UCD_OTHERCASE(c)) != c) break;
+ }
-if (c > d) return FALSE;
+if (c > d) return -1; /* Reached end of range */
*ocptr = othercase;
next = othercase + 1;
@@ -2904,10 +2925,9 @@
next++;
}
-*odptr = next - 1;
-*cptr = c;
-
-return TRUE;
+*odptr = next - 1; /* End of othercase range */
+*cptr = c; /* Rest of input range */
+return 0;
}
@@ -3357,6 +3377,243 @@
/*************************************************
+* Add a character or range to a class *
+*************************************************/
+
+/* This function packages up the logic of adding a character or range of
+characters to a class. The character values in the arguments will be within the
+valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
+mutually recursive with the function immediately below.
+
+Arguments:
+ classbits the bit map for characters < 256
+ uchardptr points to the pointer for extra data
+ options the options word
+ cd contains pointers to tables etc.
+ start start of range character
+ end end of range character
+
+Returns: the number of < 256 characters added
+ the pointer to extra data is updated
+*/
+
+static int
+add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
+ compile_data *cd, unsigned int start, unsigned int end)
+{
+unsigned int c;
+int n8 = 0;
+
+/* If caseless matching is required, scan the range and process alternate
+cases. In Unicode, there are 8-bit characters that have alternate cases that
+are greater than 255 and vice-versa. Sometimes we can just extend the original
+range. */
+
+if ((options & PCRE_CASELESS) != 0)
+ {
+#ifdef SUPPORT_UCP
+ if ((options & PCRE_UTF8) != 0)
+ {
+ int rc;
+ unsigned int oc, od;
+
+ options &= ~PCRE_CASELESS; /* Remove for recursive calls */
+ c = start;
+
+ while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
+ {
+ /* Handle a single character that has more than one other case. */
+
+ if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
+ PRIV(ucd_caseless_sets) + rc, oc);
+
+ /* Do nothing if the other case range is within the original range. */
+
+ else if (oc >= start && od <= end) continue;
+
+ /* Extend the original range if there is overlap, noting that if oc < start,
+ we can't have od > end because a subrange is always shorter than the basic
+ range. Otherwise, use a recursive call to add the additional range. */
+
+ else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
+ else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
+ else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
+ }
+ }
+ else
+#endif /* SUPPORT_UCP */
+
+ /* Not UTF-mode, or no UCP */
+
+ for (c = start; c <= end && c < 256; c++)
+ {
+ SETBIT(classbits, cd->fcc[c]);
+ n8++;
+ }
+ }
+
+/* Now handle the original range. Adjust the final value according to the bit
+length - this means that the same lists of (e.g.) horizontal spaces can be used
+in all cases. */
+
+#ifdef COMPILE_PCRE8
+#ifdef SUPPORT_UTF
+ if ((options & PCRE_UTF8) == 0)
+#endif
+ if (end > 0xff) end = 0xff;
+#endif
+
+#ifdef COMPILE_PCRE16
+#ifdef SUPPORT_UTF
+ if ((options & PCRE_UTF16) == 0)
+#endif
+ if (end > 0xffff) end = 0xffff;
+#endif
+
+/* If all characters are less than 256, use the bit map. Otherwise use extra
+data. */
+
+if (end < 0x100)
+ {
+ for (c = start; c <= end; c++)
+ {
+ n8++;
+ SETBIT(classbits, c);
+ }
+ }
+
+else
+ {
+ pcre_uchar *uchardata = *uchardptr;
+
+#ifdef SUPPORT_UTF
+ if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
+ {
+ if (start < end)
+ {
+ *uchardata++ = XCL_RANGE;
+ uchardata += PRIV(ord2utf)(start, uchardata);
+ uchardata += PRIV(ord2utf)(end, uchardata);
+ }
+ else if (start == end)
+ {
+ *uchardata++ = XCL_SINGLE;
+ uchardata += PRIV(ord2utf)(start, uchardata);
+ }
+ }
+ else
+#endif /* SUPPORT_UTF */
+
+ /* Without UTF support, character values are constrained by the bit length,
+ and can only be > 255 for 16-bit and 32-bit libraries. */
+
+#ifdef COMPILE_PCRE8
+ {}
+#else
+ if (start < end)
+ {
+ *uchardata++ = XCL_RANGE;
+ *uchardata++ = start;
+ *uchardata++ = end;
+ }
+ else if (start == end)
+ {
+ *uchardata++ = XCL_SINGLE;
+ *uchardata++ = start;
+ }
+#endif
+
+ *uchardptr = uchardata; /* Update extra data pointer */
+ }
+
+return n8; /* Number of 8-bit characters */
+}
+
+
+
+
+/*************************************************
+* Add a list of characters to a class *
+*************************************************/
+
+/* This function is used for adding a list of case-equivalent characters to a
+class, and also for adding a list of horizontal or vertical whitespace. If the
+list is in order (which it should be), ranges of characters are detected and
+handled appropriately. This function is mutually recursive with the function
+above.
+
+Arguments:
+ classbits the bit map for characters < 256
+ uchardptr points to the pointer for extra data
+ options the options word
+ cd contains pointers to tables etc.
+ p points to row of 32-bit values, terminated by NOTACHAR
+ except character to omit; this is used when adding lists of
+ case-equivalent characters to avoid including the one we
+ already know about
+
+Returns: the number of < 256 characters added
+ the pointer to extra data is updated
+*/
+
+static int
+add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
+ compile_data *cd, const pcre_uint32 *p, unsigned int except)
+{
+int n8 = 0;
+while (p[0] < NOTACHAR)
+ {
+ int n = 0;
+ if (p[0] != except)
+ {
+ while(p[n+1] == p[0] + n + 1) n++;
+ n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
+ }
+ p += n + 1;
+ }
+return n8;
+}
+
+
+
+/*************************************************
+* Add characters not in a list to a class *
+*************************************************/
+
+/* This function is used for adding the complement of a list of horizontal or
+vertical whitespace to a class. The list must be in order.
+
+Arguments:
+ classbits the bit map for characters < 256
+ uchardptr points to the pointer for extra data
+ options the options word
+ cd contains pointers to tables etc.
+ p points to row of 32-bit values, terminated by NOTACHAR
+
+Returns: the number of < 256 characters added
+ the pointer to extra data is updated
+*/
+
+static int
+add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
+ int options, compile_data *cd, const pcre_uint32 *p)
+{
+int n8 = 0;
+if (p[0] > 0)
+ n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
+while (p[0] < NOTACHAR)
+ {
+ while (p[1] == p[0] + 1) p++;
+ n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
+ (p[1] == NOTACHAR)? 0x10ffff : p[1] - 1);
+ p++;
+ }
+return n8;
+}
+
+
+
+/*************************************************
* Compile one branch *
*************************************************/
@@ -3474,7 +3731,7 @@
BOOL is_recurse;
BOOL reset_bracount;
int class_has_8bitchar;
- int class_single_char;
+ int class_one_char;
int newoptions;
int recno;
int refsign;
@@ -3772,25 +4029,25 @@
should_flip_negation = FALSE;
- /* For optimization purposes, we track some properties of the class.
- class_has_8bitchar will be non-zero, if the class contains at least one
- < 256 character. class_single_char will be 1 if the class contains only
- a single character. */
+ /* For optimization purposes, we track some properties of the class:
+ class_has_8bitchar will be non-zero if the class contains at least one <
+ 256 character; class_one_char will be 1 if the class contains just one
+ character. */
class_has_8bitchar = 0;
- class_single_char = 0;
+ class_one_char = 0;
/* Initialize the 32-char bit map to all zeros. We build the map in a
- temporary bit of memory, in case the class contains only 1 character (less
- than 256), because in that case the compiled code doesn't use the bit map.
- */
+ temporary bit of memory, in case the class contains fewer than two
+ 8-bit characters because in that case the compiled code doesn't use the bit
+ map. */
memset(classbits, 0, 32 * sizeof(pcre_uint8));
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
- xclass = FALSE; /* No chars >= 256 */
- class_uchardata = code + LINK_SIZE + 2; /* For UTF-8 items */
- class_uchardata_base = class_uchardata; /* For resetting in pass 1 */
+ xclass = FALSE;
+ class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
+ class_uchardata_base = class_uchardata; /* Save the start */
#endif
/* Process characters until ] is reached. By writing this as a "do" it
@@ -3812,10 +4069,12 @@
/* In the pre-compile phase, accumulate the length of any extra
data and reset the pointer. This is so that very large classes that
contain a zillion > 255 characters no longer overwrite the work space
- (which is on the stack). */
+ (which is on the stack). We have to remember that there was XCLASS data,
+ however. */
- if (lengthptr != NULL)
+ if (lengthptr != NULL && class_uchardata > class_uchardata_base)
{
+ xclass = TRUE;
*lengthptr += class_uchardata - class_uchardata_base;
class_uchardata = class_uchardata_base;
}
@@ -3917,7 +4176,7 @@
for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
}
- /* Not see if we need to remove any special characters. An option
+ /* Now see if we need to remove any special characters. An option
value of 1 removes vertical space and 2 removes underscore. */
if (tabopt < 0) tabopt = -tabopt;
@@ -3933,10 +4192,10 @@
for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
ptr = tempptr + 1;
- /* Every class contains at least one < 256 characters. */
+ /* Every class contains at least one < 256 character. */
class_has_8bitchar = 1;
/* Every class contains at least two characters. */
- class_single_char = 2;
+ class_one_char = 2;
continue; /* End of POSIX syntax handling */
}
@@ -3944,7 +4203,7 @@
of the specials, which just set a flag. The sequence \b is a special
case. Inside a class (and only there) it is treated as backspace. We
assume that other escapes have more than one character in them, so
- speculatively set both class_has_8bitchar and class_single_char bigger
+ speculatively set both class_has_8bitchar and class_one_char bigger
than one. Unrecognized escapes fall through and are either treated
as literal characters (by default), or are faulted if
PCRE_EXTRA is set. */
@@ -3977,7 +4236,7 @@
/* Every class contains at least two < 256 characters. */
class_has_8bitchar++;
/* Every class contains at least two characters. */
- class_single_char += 2;
+ class_one_char += 2;
switch (-c)
{
@@ -4027,191 +4286,27 @@
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
+
+ /* The rest apply in both UCP and non-UCP cases. */
case ESC_h:
- SETBIT(classbits, CHAR_HT);
- SETBIT(classbits, CHAR_SPACE);
-#ifndef EBCDIC
- SETBIT(classbits, 0xa0); /* NSBP */
-#ifndef COMPILE_PCRE8
- xclass = TRUE;
- *class_uchardata++ = XCL_SINGLE;
- *class_uchardata++ = 0x1680;
- *class_uchardata++ = XCL_SINGLE;
- *class_uchardata++ = 0x180e;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x2000;
- *class_uchardata++ = 0x200a;
- *class_uchardata++ = XCL_SINGLE;
- *class_uchardata++ = 0x202f;
- *class_uchardata++ = XCL_SINGLE;
- *class_uchardata++ = 0x205f;
- *class_uchardata++ = XCL_SINGLE;
- *class_uchardata++ = 0x3000;
-#elif defined SUPPORT_UTF
- if (utf)
- {
- xclass = TRUE;
- *class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
- *class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
- *class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
- *class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
- *class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
- }
-#endif
-#endif /* Not EBCDIC */
+ (void)add_list_to_class(classbits, &class_uchardata, options, cd,
+ PRIV(hspace_list), NOTACHAR);
continue;
case ESC_H:
- for (c = 0; c < 32; c++)
- {
- int x = 0xff;
- switch (c)
- {
- case CHAR_HT/8: x ^= 1 << (CHAR_HT%8); break;
- case CHAR_SPACE/8: x ^= 1 << (CHAR_SPACE%8); break;
-#ifndef EBCDIC
- case 0xa0/8: x ^= 1 << (0xa0%8); break; /* NSBSP */
-#endif
- default: break;
- }
- classbits[c] |= x;
- }
-#ifndef EBCDIC
-#ifndef COMPILE_PCRE8
- xclass = TRUE;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x0100;
- *class_uchardata++ = 0x167f;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x1681;
- *class_uchardata++ = 0x180d;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x180f;
- *class_uchardata++ = 0x1fff;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x200b;
- *class_uchardata++ = 0x202e;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x2030;
- *class_uchardata++ = 0x205e;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x2060;
- *class_uchardata++ = 0x2fff;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x3001;
-#ifdef SUPPORT_UTF
- if (utf)
- class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
- else
-#endif /* SUPPORT_UTF */
- *class_uchardata++ = 0xffff;
-#elif defined SUPPORT_UTF
- if (utf)
- {
- xclass = TRUE;
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
- }
-#endif
-#endif /* Not EBCDIC */
+ (void)add_not_list_to_class(classbits, &class_uchardata, options,
+ cd, PRIV(hspace_list));
continue;
case ESC_v:
- SETBIT(classbits, CHAR_LF);
- SETBIT(classbits, CHAR_VT);
- SETBIT(classbits, CHAR_FF);
- SETBIT(classbits, CHAR_CR);
- SETBIT(classbits, CHAR_NEL);
-#ifndef EBCDIC
-#ifndef COMPILE_PCRE8
- xclass = TRUE;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x2028;
- *class_uchardata++ = 0x2029;
-#elif defined SUPPORT_UTF
- if (utf)
- {
- xclass = TRUE;
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
- }
-#endif
-#endif /* Not EBCDIC */
+ (void)add_list_to_class(classbits, &class_uchardata, options, cd,
+ PRIV(vspace_list), NOTACHAR);
continue;
case ESC_V:
- for (c = 0; c < 32; c++)
- {
- int x = 0xff;
- switch (c)
- {
- case CHAR_LF/8: x ^= 1 << (CHAR_LF%8);
- x ^= 1 << (CHAR_VT%8);
- x ^= 1 << (CHAR_FF%8);
- x ^= 1 << (CHAR_CR%8);
- break;
- case CHAR_NEL/8: x ^= 1 << (CHAR_NEL%8); break;
- default: break;
- }
- classbits[c] |= x;
- }
-
-#ifndef EBCDIC
-#ifndef COMPILE_PCRE8
- xclass = TRUE;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x0100;
- *class_uchardata++ = 0x2027;
- *class_uchardata++ = XCL_RANGE;
- *class_uchardata++ = 0x202a;
-#ifdef SUPPORT_UTF
- if (utf)
- class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
- else
-#endif
- *class_uchardata++ = 0xffff;
-#elif defined SUPPORT_UTF
- if (utf)
- {
- xclass = TRUE;
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
- class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
- }
-#endif
-#endif /* Not EBCDIC */
+ (void)add_not_list_to_class(classbits, &class_uchardata, options,
+ cd, PRIV(vspace_list));
continue;
#ifdef SUPPORT_UCP
@@ -4222,7 +4317,6 @@
int pdata;
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
if (ptype < 0) goto FAILED;
- xclass = TRUE;
*class_uchardata++ = ((-c == ESC_p) != negated)?
XCL_PROP : XCL_NOTPROP;
*class_uchardata++ = ptype;
@@ -4242,21 +4336,21 @@
goto FAILED;
}
class_has_8bitchar--; /* Undo the speculative increase. */
- class_single_char -= 2; /* Undo the speculative increase. */
+ class_one_char -= 2; /* Undo the speculative increase. */
c = *ptr; /* Get the final character and fall through */
break;
}
}
- /* Fall through if we have a single character (c >= 0). This may be
- greater than 256. */
-
+ /* Fall through if the escape just defined a single character (c >= 0).
+ This may be greater than 256. */
+
} /* End of backslash handling */
- /* A single character may be followed by '-' to form a range. However,
- Perl does not permit ']' to be the end of the range. A '-' character
- at the end is treated as a literal. Perl ignores orphaned \E sequences
- entirely. The code for handling \Q and \E is messy. */
+ /* A character may be followed by '-' to form a range. However, Perl does
+ not permit ']' to be the end of the range. A '-' character at the end is
+ treated as a literal. Perl ignores orphaned \E sequences entirely. The
+ code for handling \Q and \E is messy. */
CHECK_RANGE:
while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
@@ -4264,10 +4358,9 @@
inescq = FALSE;
ptr += 2;
}
-
oldptr = ptr;
- /* Remember \r or \n */
+ /* Remember if \r or \n were explicitly used */
if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
@@ -4290,12 +4383,17 @@
inescq = TRUE;
break;
}
+
+ /* Minus (hyphen) at the end of a class is treated as a literal, so put
+ back the pointer and jump to handle the character that preceded it. */
if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
{
ptr = oldptr;
- goto LONE_SINGLE_CHARACTER;
+ goto CLASS_SINGLE_CHARACTER;
}
+
+ /* Otherwise, we have a potential range; pick up the next character */
#ifdef SUPPORT_UTF
if (utf)
@@ -4315,203 +4413,63 @@
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
- /* \b is backspace; any other special means the '-' was literal */
+ /* \b is backspace; any other special means the '-' was literal. */
if (d < 0)
{
if (d == -ESC_b) d = CHAR_BS; else
{
ptr = oldptr;
- goto LONE_SINGLE_CHARACTER; /* A few lines below */
+ goto CLASS_SINGLE_CHARACTER; /* A few lines below */
}
}
}
/* Check that the two values are in the correct order. Optimize
- one-character ranges */
+ one-character ranges. */
if (d < c)
{
*errorcodeptr = ERR8;
goto FAILED;
}
+ if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */
- if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
+ /* We have found a character range, so single character optimizations
+ cannot be done anymore. Any value greater than 1 indicates that there
+ is more than one character. */
+
+ class_one_char = 2;
- /* Remember \r or \n */
+ /* Remember an explicit \r or \n, and add the range to the class. */
if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
-
- /* Since we found a character range, single character optimizations
- cannot be done anymore. */
- class_single_char = 2;
-
- /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
- matching, we have to use an XCLASS with extra data items. Caseless
- matching for characters > 127 is available only if UCP support is
- available. */
-
-#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
- if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
-#elif defined SUPPORT_UTF
- if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
-#elif !(defined COMPILE_PCRE8)
- if (d > 255)
-#endif
-#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
- {
- xclass = TRUE;
-
- /* With UCP support, we can find the other case equivalents of
- the relevant characters. There may be several ranges. Optimize how
- they fit with the basic range. */
-
-#ifdef SUPPORT_UCP
-#ifndef COMPILE_PCRE8
- if (utf && (options & PCRE_CASELESS) != 0)
-#else
- if ((options & PCRE_CASELESS) != 0)
-#endif
- {
- unsigned int occ, ocd;
- unsigned int cc = c;
- unsigned int origd = d;
- while (get_othercase_range(&cc, origd, &occ, &ocd))
- {
- if (occ >= (unsigned int)c &&
- ocd <= (unsigned int)d)
- continue; /* Skip embedded ranges */
-
- if (occ < (unsigned int)c &&
- ocd >= (unsigned int)c - 1) /* Extend the basic range */
- { /* if there is overlap, */
- c = occ; /* noting that if occ < c */
- continue; /* we can't have ocd > d */
- } /* because a subrange is */
- if (ocd > (unsigned int)d &&
- occ <= (unsigned int)d + 1) /* always shorter than */
- { /* the basic range. */
- d = ocd;
- continue;
- }
-
- if (occ == ocd)
- {
- *class_uchardata++ = XCL_SINGLE;
- }
- else
- {
- *class_uchardata++ = XCL_RANGE;
- class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
- }
- class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
- }
- }
-#endif /* SUPPORT_UCP */
-
- /* Now record the original range, possibly modified for UCP caseless
- overlapping ranges. */
-
- *class_uchardata++ = XCL_RANGE;
-#ifdef SUPPORT_UTF
-#ifndef COMPILE_PCRE8
- if (utf)
- {
- class_uchardata += PRIV(ord2utf)(c, class_uchardata);
- class_uchardata += PRIV(ord2utf)(d, class_uchardata);
- }
- else
- {
- *class_uchardata++ = c;
- *class_uchardata++ = d;
- }
-#else
- class_uchardata += PRIV(ord2utf)(c, class_uchardata);
- class_uchardata += PRIV(ord2utf)(d, class_uchardata);
-#endif
-#else /* SUPPORT_UTF */
- *class_uchardata++ = c;
- *class_uchardata++ = d;
-#endif /* SUPPORT_UTF */
-
- /* With UCP support, we are done. Without UCP support, there is no
- caseless matching for UTF characters > 127; we can use the bit map
- for the smaller ones. As for 16 bit characters without UTF, we
- can still use */
-
-#ifdef SUPPORT_UCP
-#ifndef COMPILE_PCRE8
- if (utf)
-#endif
- continue; /* With next character in the class */
-#endif /* SUPPORT_UCP */
-
-#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
- if (utf)
- {
- if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
- /* Adjust upper limit and fall through to set up the map */
- d = 127;
- }
- else
- {
- if (c > 255) continue;
- /* Adjust upper limit and fall through to set up the map */
- d = 255;
- }
-#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
- if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
- /* Adjust upper limit and fall through to set up the map */
- d = 127;
-#else
- if (c > 255) continue;
- /* Adjust upper limit and fall through to set up the map */
- d = 255;
-#endif /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
- }
-#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
-
- /* We use the bit map for 8 bit mode, or when the characters fall
- partially or entirely to [0-255] ([0-127] for UCP) ranges. */
-
- class_has_8bitchar = 1;
-
- /* We can save a bit of time by skipping this in the pre-compile. */
-
- if (lengthptr == NULL) for (; c <= d; c++)
- {
- classbits[c/8] |= (1 << (c&7));
- if ((options & PCRE_CASELESS) != 0)
- {
- int uc = cd->fcc[c]; /* flip case */
- classbits[uc/8] |= (1 << (uc&7));
- }
- }
-
+
+ class_has_8bitchar +=
+ add_to_class(classbits, &class_uchardata, options, cd, c, d);
+
continue; /* Go get the next char in the class */
}
- /* Handle a lone single character - we can get here for a normal
- non-escape char, or after \ that introduces a single character or for an
- apparent range that isn't. */
+ /* Handle a single character - we can get here for a normal non-escape
+ char, or after \ that introduces a single character or for an apparent
+ range that isn't. Only the value 1 matters for class_one_char, so don't
+ increase it if it is already 2 or more ... just in case there's a class
+ with a zillion characters in it. */
- LONE_SINGLE_CHARACTER:
+ CLASS_SINGLE_CHARACTER:
+ if (class_one_char < 2) class_one_char++;
- /* Only the value of 1 matters for class_single_char. */
+ /* If class_one_char is 1, we have the first single character in the
+ class, and there have been no prior ranges, or XCLASS items generated by
+ escapes. If this is the final character in the class, we can optimize by
+ turning the item into a 1-character OP_CHAR[I] if it's positive, or
+ OP_NOT[I] if it's negative. In the positive case, it can cause firstchar
+ to be set. Otherwise, there can be no first char if this item is first,
+ whatever repeat count may follow. In the case of reqchar, save the
+ previous value for reinstating. */
- if (class_single_char < 2) class_single_char++;
-
- /* If class_charcount is 1, we saw precisely one character. As long as
- there was no use of \p or \P, in other words, no use of any XCLASS
- features, we can optimize.
-
- The optimization throws away the bit map. We turn the item into a
- 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
- In the positive case, it can cause firstchar to be set. Otherwise, there
- can be no first char if this item is first, whatever repeat count may
- follow. In the case of reqchar, save the previous value for reinstating. */
-
- if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+ if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
{
ptr++;
zeroreqchar = reqchar;
@@ -4544,64 +4502,12 @@
}
goto ONE_CHAR;
} /* End of 1-char optimization */
-
- /* Handle a character that cannot go in the bit map. */
-
-#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
- if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
-#elif defined SUPPORT_UTF
- if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
-#elif !(defined COMPILE_PCRE8)
- if (c > 255)
-#endif
-
-#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
- {
- xclass = TRUE;
- *class_uchardata++ = XCL_SINGLE;
-#ifdef SUPPORT_UTF
-#ifndef COMPILE_PCRE8
- /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
- if (!utf)
- *class_uchardata++ = c;
- else
-#endif
- class_uchardata += PRIV(ord2utf)(c, class_uchardata);
-#else /* SUPPORT_UTF */
- *class_uchardata++ = c;
-#endif /* SUPPORT_UTF */
-
-#ifdef SUPPORT_UCP
-#ifdef COMPILE_PCRE8
- if ((options & PCRE_CASELESS) != 0)
-#else
- /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
- if (utf && (options & PCRE_CASELESS) != 0)
-#endif
- {
- unsigned int othercase;
- if ((int)(othercase = UCD_OTHERCASE(c)) != c)
- {
- *class_uchardata++ = XCL_SINGLE;
- class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
- }
- }
-#endif /* SUPPORT_UCP */
-
- }
- else
-#endif /* SUPPORT_UTF || COMPILE_PCRE16 */
-
- /* Handle a single-byte character */
- {
- class_has_8bitchar = 1;
- classbits[c/8] |= (1 << (c&7));
- if ((options & PCRE_CASELESS) != 0)
- {
- c = cd->fcc[c]; /* flip case */
- classbits[c/8] |= (1 << (c&7));
- }
- }
+
+ /* There is more than one character in the class, or an XCLASS item
+ has been generated. Add this character to the class. */
+
+ class_has_8bitchar +=
+ add_to_class(classbits, &class_uchardata, options, cd, c, c);
}
/* Loop until ']' reached. This "while" is the end of the "do" far above.
@@ -4621,6 +4527,18 @@
goto FAILED;
}
+ /* We will need an XCLASS if data has been placed in class_uchardata. In
+ the second phase this is a sufficient test. However, in the pre-compile
+ phase, class_uchardata gets emptied to prevent workspace overflow, so
+ only if the very last character in the class needs XCLASS will it contain
+ anything at this point. For this reason, xclass gets set TRUE above when
+ class_uchardata is emptied, and that's why this code is the way it is here
+ instead of just doing a test on class_uchardata below. */
+
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ if (class_uchardata > class_uchardata_base) xclass = TRUE;
+#endif
+
/* If this is the first thing in the branch, there can be no first char
setting, whatever the repeat count. Any reqchar setting must remain
unchanged after any kind of repeat. */
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_internal.h 2012-09-23 16:50:00 UTC (rev 1045)
@@ -834,78 +834,105 @@
/* Tests for Unicode horizontal and vertical whitespace characters must check a
number of different values. Using a switch statement for this generates the
-fastest code (no loop, no memory access), and there are several places where
-this happens. In order to ensure that all the case lists remain in step, we use
-macros so that there is only one place where the lists are defined.
+fastest code (no loop, no memory access), and there are several places in the
+interpreter code where this happens. In order to ensure that all the case lists
+remain in step, we use macros so that there is only one place where the lists
+are defined.
-NOTE: These values are also used explicitly in pcre_compile.c when processing
-\h, \H, \v and \V in a character class, so any changes here should be
-duplicated there as well. They also appear in pcre_jit_compile.c. */
+These values are also required as lists in pcre_compile.c when processing \h,
+\H, \v and \V in a character class. The lists are defined in pcre_tables.c, but
+macros that define the values are here so that all the definitions are
+together. The lists must be in ascending character order, terminated by
+NOTACHAR (which is 0xffffffff).
+Any changes should ensure that the various macros are kept in step with each
+other. NOTE: The values also appear in pcre_jit_compile.c. */
+
/* ------ ASCII/Unicode environments ------ */
#ifndef EBCDIC
+
+#define HSPACE_LIST \
+ CHAR_HT, CHAR_SPACE, 0xa0, \
+ 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
+ 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
+ NOTACHAR
+
#define HSPACE_MULTIBYTE_CASES \
- case 0x1680: /* OGHAM SPACE MARK */ \
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \
- case 0x2000: /* EN QUAD */ \
- case 0x2001: /* EM QUAD */ \
- case 0x2002: /* EN SPACE */ \
- case 0x2003: /* EM SPACE */ \
- case 0x2004: /* THREE-PER-EM SPACE */ \
- case 0x2005: /* FOUR-PER-EM SPACE */ \
- case 0x2006: /* SIX-PER-EM SPACE */ \
- case 0x2007: /* FIGURE SPACE */ \
- case 0x2008: /* PUNCTUATION SPACE */ \
- case 0x2009: /* THIN SPACE */ \
- case 0x200A: /* HAIR SPACE */ \
- case 0x202f: /* NARROW NO-BREAK SPACE */ \
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \
- case 0x3000 /* IDEOGRAPHIC SPACE */
+ case 0x1680: /* OGHAM SPACE MARK */ \
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \
+ case 0x2000: /* EN QUAD */ \
+ case 0x2001: /* EM QUAD */ \
+ case 0x2002: /* EN SPACE */ \
+ case 0x2003: /* EM SPACE */ \
+ case 0x2004: /* THREE-PER-EM SPACE */ \
+ case 0x2005: /* FOUR-PER-EM SPACE */ \
+ case 0x2006: /* SIX-PER-EM SPACE */ \
+ case 0x2007: /* FIGURE SPACE */ \
+ case 0x2008: /* PUNCTUATION SPACE */ \
+ case 0x2009: /* THIN SPACE */ \
+ case 0x200A: /* HAIR SPACE */ \
+ case 0x202f: /* NARROW NO-BREAK SPACE */ \
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \
+ case 0x3000 /* IDEOGRAPHIC SPACE */
#define HSPACE_BYTE_CASES \
- case CHAR_HT: \
- case CHAR_SPACE: \
- case 0xa0 /* NBSP */
+ case CHAR_HT: \
+ case CHAR_SPACE: \
+ case 0xa0 /* NBSP */
+
+#define HSPACE_CASES \
+ HSPACE_BYTE_CASES: \
+ HSPACE_MULTIBYTE_CASES
+#define VSPACE_LIST \
+ CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR
+
#define VSPACE_MULTIBYTE_CASES \
- case 0x2028: /* LINE SEPARATOR */ \
- case 0x2029 /* PARAGRAPH SEPARATOR */
+ case 0x2028: /* LINE SEPARATOR */ \
+ case 0x2029 /* PARAGRAPH SEPARATOR */
#define VSPACE_BYTE_CASES \
- case CHAR_LF: \
- case CHAR_VT: \
- case CHAR_FF: \
- case CHAR_CR: \
- case CHAR_NEL
+ case CHAR_LF: \
+ case CHAR_VT: \
+ case CHAR_FF: \
+ case CHAR_CR: \
+ case CHAR_NEL
-#define HSPACE_CASES \
- HSPACE_BYTE_CASES: \
- HSPACE_MULTIBYTE_CASES
-
#define VSPACE_CASES \
- VSPACE_BYTE_CASES: \
- VSPACE_MULTIBYTE_CASES
+ VSPACE_BYTE_CASES: \
+ VSPACE_MULTIBYTE_CASES
/* ------ EBCDIC environments ------ */
#else
+#define HSPACE_LIST CHAR_HT, CHAR_SPACE, NOTACHAR /* NOTACHAR ends the list */
+
#define HSPACE_BYTE_CASES \
- case CHAR_HT: \
- case CHAR_SPACE
-
+ case CHAR_HT: \
+ case CHAR_SPACE
+
+#define HSPACE_CASES HSPACE_BYTE_CASES
+
+#ifdef EBCDIC_NL25
+#define VSPACE_LIST \
+ CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR
+#else
+#define VSPACE_LIST \
+ CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR
+#endif
+
#define VSPACE_BYTE_CASES \
- case CHAR_LF: \
- case CHAR_VT: \
- case CHAR_FF: \
- case CHAR_CR: \
- case CHAR_NEL
-
-#define HSPACE_CASES HSPACE_BYTE_CASES
+ case CHAR_LF: \
+ case CHAR_VT: \
+ case CHAR_FF: \
+ case CHAR_CR: \
+ case CHAR_NEL
+
#define VSPACE_CASES VSPACE_BYTE_CASES
#endif /* EBCDIC */
-/* ------ End of whitespace case macros ------ */
+/* ------ End of whitespace macros ------ */
/* In case there is no definition of offsetof() provided - though any proper
@@ -2351,22 +2378,22 @@
pcre_tables.c module. */
#ifdef COMPILE_PCRE8
-
extern const int PRIV(utf8_table1)[];
extern const int PRIV(utf8_table1_size);
extern const int PRIV(utf8_table2)[];
extern const int PRIV(utf8_table3)[];
extern const pcre_uint8 PRIV(utf8_table4)[];
-
#endif /* COMPILE_PCRE8 */
extern const char PRIV(utt_names)[];
extern const ucp_type_table PRIV(utt)[];
extern const int PRIV(utt_size);
+extern const pcre_uint8 PRIV(OP_lengths)[];
extern const pcre_uint8 PRIV(default_tables)[];
-extern const pcre_uint8 PRIV(OP_lengths)[];
+extern const pcre_uint32 PRIV(hspace_list)[];
+extern const pcre_uint32 PRIV(vspace_list)[];
/* Internal shared functions. These are functions that are used by more than
@@ -2435,7 +2462,7 @@
pcre_uint8 script; /* ucp_Arabic, etc. */
pcre_uint8 chartype; /* ucp_Cc, etc. (general categories) */
pcre_uint8 gbprop; /* ucp_gbControl, etc. (grapheme break property) */
- pcre_uint8 caseset; /* offset to multichar other cases or zero */
+ pcre_uint8 caseset; /* offset to multichar other cases or zero */
pcre_int32 other_case; /* offset to other case, or zero if none */
} ucd_record;
Modified: code/trunk/pcre_printint.c
===================================================================
--- code/trunk/pcre_printint.c 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_printint.c 2012-09-23 16:50:00 UTC (rev 1045)
@@ -130,7 +130,9 @@
if (!utf || (c & 0xc0) != 0xc0)
{
- if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ if (PRINTABLE(c)) fprintf(f, "%c", c);
+ else if (c < 0x80) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%02x}", c);
return 0;
}
else
@@ -167,8 +169,8 @@
if (!utf || (c & 0xfc00) != 0xd800)
{
if (PRINTABLE(c)) fprintf(f, "%c", c);
- else if (c <= 0xff) fprintf(f, "\\x%02x", c);
- else fprintf(f, "\\x{%x}", c);
+ else if (c <= 0x80) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%02x}", c);
return 0;
}
else
Modified: code/trunk/pcre_tables.c
===================================================================
--- code/trunk/pcre_tables.c 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_tables.c 2012-09-23 16:50:00 UTC (rev 1045)
@@ -58,8 +58,14 @@
const pcre_uint8 PRIV(OP_lengths)[] = { OP_LENGTHS };
+/* Tables of horizontal and vertical whitespace characters, suitable for
+adding to classes. */
+const pcre_uint32 PRIV(hspace_list)[] = { HSPACE_LIST };
+const pcre_uint32 PRIV(vspace_list)[] = { VSPACE_LIST };
+
+
/*************************************************
* Tables for UTF-8 support *
*************************************************/
Modified: code/trunk/pcre_ucd.c
===================================================================
--- code/trunk/pcre_ucd.c 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/pcre_ucd.c 2012-09-23 16:50:00 UTC (rev 1045)
@@ -39,26 +39,26 @@
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
- 0xffffffff,
- 0x0053, 0x0073, 0x017f, 0xffffffff,
- 0x01c4, 0x01c5, 0x01c6, 0xffffffff,
- 0x01c7, 0x01c8, 0x01c9, 0xffffffff,
- 0x01ca, 0x01cb, 0x01cc, 0xffffffff,
- 0x01f1, 0x01f2, 0x01f3, 0xffffffff,
- 0x0345, 0x0399, 0x03b9, 0x1fbe, 0xffffffff,
- 0x00b5, 0x039c, 0x03bc, 0xffffffff,
- 0x03a3, 0x03c2, 0x03c3, 0xffffffff,
- 0x0392, 0x03b2, 0x03d0, 0xffffffff,
- 0x0398, 0x03b8, 0x03d1, 0x03f4, 0xffffffff,
- 0x03a6, 0x03c6, 0x03d5, 0xffffffff,
- 0x03a0, 0x03c0, 0x03d6, 0xffffffff,
- 0x039a, 0x03ba, 0x03f0, 0xffffffff,
- 0x03a1, 0x03c1, 0x03f1, 0xffffffff,
- 0x0395, 0x03b5, 0x03f5, 0xffffffff,
- 0x1e60, 0x1e61, 0x1e9b, 0xffffffff,
- 0x03a9, 0x03c9, 0x2126, 0xffffffff,
- 0x004b, 0x006b, 0x212a, 0xffffffff,
- 0x00c5, 0x00e5, 0x212b, 0xffffffff,
+ NOTACHAR,
+ 0x0053, 0x0073, 0x017f, NOTACHAR,
+ 0x01c4, 0x01c5, 0x01c6, NOTACHAR,
+ 0x01c7, 0x01c8, 0x01c9, NOTACHAR,
+ 0x01ca, 0x01cb, 0x01cc, NOTACHAR,
+ 0x01f1, 0x01f2, 0x01f3, NOTACHAR,
+ 0x0345, 0x0399, 0x03b9, 0x1fbe, NOTACHAR,
+ 0x00b5, 0x039c, 0x03bc, NOTACHAR,
+ 0x03a3, 0x03c2, 0x03c3, NOTACHAR,
+ 0x0392, 0x03b2, 0x03d0, NOTACHAR,
+ 0x0398, 0x03b8, 0x03d1, 0x03f4, NOTACHAR,
+ 0x03a6, 0x03c6, 0x03d5, NOTACHAR,
+ 0x03a0, 0x03c0, 0x03d6, NOTACHAR,
+ 0x039a, 0x03ba, 0x03f0, NOTACHAR,
+ 0x03a1, 0x03c1, 0x03f1, NOTACHAR,
+ 0x0395, 0x03b5, 0x03f5, NOTACHAR,
+ 0x1e60, 0x1e61, 0x1e9b, NOTACHAR,
+ 0x03a9, 0x03c9, 0x2126, NOTACHAR,
+ 0x004b, 0x006b, 0x212a, NOTACHAR,
+ 0x00c5, 0x00e5, 0x212b, NOTACHAR,
};
const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testinput10 2012-09-23 16:50:00 UTC (rev 1045)
@@ -1090,4 +1090,22 @@
/-- --/
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+
+/[z\x{1e9e}]+/8i
+ \x{1e9e}\x{00df}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+
+/[z\x{00df}]+/8i
+ \x{1e9e}\x{00df}
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+
+/[z\x{1f88}]+/8i
+ \x{1f88}\x{1f80}
+
/-- End of testinput10 --/
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testinput6 2012-09-23 16:50:00 UTC (rev 1045)
@@ -1,6 +1,5 @@
/-- This set of tests is for Unicode property support. It is compatible with
- Perl >= 5.10, but not 5.8 because it tests some extra properties that are
- not in the earlier release. --/
+ Perl >= 5.15. --/
/^\pC\pL\pM\pN\pP\pS\pZ</8
\x7f\x{c0}\x{30f}\x{660}\x{66c}\x{f01}\x{1680}<
@@ -885,4 +884,205 @@
/-- --/
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+
+/[z\x{1e9e}]+/8i
+ \x{1e9e}\x{00df}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+
+/[z\x{00df}]+/8i
+ \x{1e9e}\x{00df}
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+
+/[z\x{1f88}]+/8i
+ \x{1f88}\x{1f80}
+
+/-- Characters with more than one other case; test in classes --/
+
+/[z\x{00b5}]+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/[z\x{039c}]+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/[z\x{03bc}]+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/[z\x{00c5}]+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/[z\x{00e5}]+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/[z\x{212b}]+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/[z\x{01c4}]+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/[z\x{01c5}]+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/[z\x{01c6}]+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/[z\x{01c7}]+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/[z\x{01c8}]+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/[z\x{01c9}]+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/[z\x{01ca}]+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/[z\x{01cb}]+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/[z\x{01cc}]+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/[z\x{01f1}]+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/[z\x{01f2}]+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/[z\x{01f3}]+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/[z\x{0345}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{0399}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{03b9}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{1fbe}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/[z\x{0392}]+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/[z\x{03b2}]+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/[z\x{03d0}]+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/[z\x{0395}]+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/[z\x{03b5}]+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/[z\x{03f5}]+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/[z\x{0398}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{03b8}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{03d1}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{03f4}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/[z\x{039a}]+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/[z\x{03ba}]+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/[z\x{03f0}]+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/[z\x{03a0}]+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/[z\x{03c0}]+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/[z\x{03d6}]+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/[z\x{03a1}]+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/[z\x{03c1}]+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/[z\x{03f1}]+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/[z\x{03a3}]+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/[z\x{03c2}]+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/[z\x{03c3}]+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/[z\x{03a6}]+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/[z\x{03c6}]+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/[z\x{03d5}]+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/[z\x{03c9}]+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/[z\x{03a9}]+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/[z\x{2126}]+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/[z\x{1e60}]+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e61}]+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e9b}]+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/-- Perl 5.12.4 gets these wrong, but 5.15.3 is OK --/
+
+/[z\x{004b}]+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/[z\x{006b}]+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/[z\x{212a}]+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/[z\x{0053}]+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/[z\x{0073}]+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/[z\x{017f}]+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/-- --/
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput10 2012-09-23 16:50:00 UTC (rev 1045)
@@ -2268,4 +2268,34 @@
/-- --/
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/[z\x{1e9e}]+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/[z\x{00df}]+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+ 1: \x{1f88}
+
+/[z\x{1f88}]+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+ 1: \x{1f88}
+
/-- End of testinput10 --/
Modified: code/trunk/testdata/testoutput11-16
===================================================================
--- code/trunk/testdata/testoutput11-16 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput11-16 2012-09-23 16:50:00 UTC (rev 1045)
@@ -333,7 +333,7 @@
Memory allocation (code space): 14
------------------------------------------------------------------
0 4 Bra
- 2 \xff
+ 2 \x{ff}
4 4 Ket
6 End
------------------------------------------------------------------
@@ -360,7 +360,7 @@
Memory allocation (code space): 14
------------------------------------------------------------------
0 4 Bra
- 2 \xff
+ 2 \x{ff}
4 4 Ket
6 End
------------------------------------------------------------------
@@ -591,7 +591,7 @@
Memory allocation (code space): 14
------------------------------------------------------------------
0 4 Bra
- 2 \xaa
+ 2 \x{aa}
4 4 Ket
6 End
------------------------------------------------------------------
@@ -600,7 +600,7 @@
Memory allocation (code space): 14
------------------------------------------------------------------
0 4 Bra
- 2 \xaa
+ 2 \x{aa}
4 4 Ket
6 End
------------------------------------------------------------------
@@ -627,7 +627,7 @@
Memory allocation (code space): 14
------------------------------------------------------------------
0 4 Bra
- 2 [^\xaa]
+ 2 [^\x{aa}]
4 4 Ket
6 End
------------------------------------------------------------------
@@ -636,7 +636,7 @@
Memory allocation (code space): 14
------------------------------------------------------------------
0 4 Bra
- 2 [^\xaa]
+ 2 [^\x{aa}]
4 4 Ket
6 End
------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput11-8
===================================================================
--- code/trunk/testdata/testoutput11-8 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput11-8 2012-09-23 16:50:00 UTC (rev 1045)
@@ -591,7 +591,7 @@
Memory allocation (code space): 9
------------------------------------------------------------------
0 5 Bra
- 3 \xaa
+ 3 \x{aa}
5 5 Ket
8 End
------------------------------------------------------------------
@@ -627,7 +627,7 @@
Memory allocation (code space): 9
------------------------------------------------------------------
0 5 Bra
- 3 [^\xaa]
+ 3 [^\x{aa}]
5 5 Ket
8 End
------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput15
===================================================================
--- code/trunk/testdata/testoutput15 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput15 2012-09-23 16:50:00 UTC (rev 1045)
@@ -560,7 +560,7 @@
/[^\x{c4}]/DZ
------------------------------------------------------------------
Bra
- [^\xc4]
+ [^\x{c4}]
Ket
End
------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput17 2012-09-23 16:50:00 UTC (rev 1045)
@@ -20,7 +20,7 @@
/[^\x{c4}]/DZ
------------------------------------------------------------------
Bra
- [^\xc4]
+ [^\x{c4}]
Ket
End
------------------------------------------------------------------
@@ -271,7 +271,7 @@
/[\H]/BZ
------------------------------------------------------------------
Bra
- [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}]
+ [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}]
Ket
End
------------------------------------------------------------------
@@ -287,7 +287,7 @@
/[\V]/BZ
------------------------------------------------------------------
Bra
- [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}]
+ [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{ffff}]
Ket
End
------------------------------------------------------------------
@@ -295,7 +295,7 @@
/[\x0a\V]/BZ
------------------------------------------------------------------
Bra
- [\x00-\x0a\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}]
+ [\x00-\x0a\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{ffff}]
Ket
End
------------------------------------------------------------------
@@ -349,7 +349,7 @@
/[\H\x{d800}]+/BZSI
------------------------------------------------------------------
Bra
- [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}\x{d800}]+
+ [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{ffff}\x{d800}]+
Ket
End
------------------------------------------------------------------
@@ -413,7 +413,7 @@
/[\V\x{d800}]+/BZSI
------------------------------------------------------------------
Bra
- [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{ffff}\x{d800}]+
+ [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{ffff}\x{d800}]+
Ket
End
------------------------------------------------------------------
@@ -452,7 +452,7 @@
------------------------------------------------------------------
Bra
[^\x80]
- [^\xff]
+ [^\x{ff}]
[^\x{100}]
[^\x{1000}]
[^\x{ffff}]
@@ -464,7 +464,7 @@
------------------------------------------------------------------
Bra
/i [^\x80]
- /i [^\xff]
+ /i [^\x{ff}]
/i [^\x{100}]
/i [^\x{1000}]
/i [^\x{ffff}]
Modified: code/trunk/testdata/testoutput18
===================================================================
--- code/trunk/testdata/testoutput18 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput18 2012-09-23 16:50:00 UTC (rev 1045)
@@ -161,7 +161,7 @@
/[\x{ff}]/8DZ
------------------------------------------------------------------
Bra
- \xff
+ \x{ff}
Ket
End
------------------------------------------------------------------
@@ -197,7 +197,7 @@
/\xff/8DZ
------------------------------------------------------------------
Bra
- \xff
+ \x{ff}
Ket
End
------------------------------------------------------------------
@@ -249,7 +249,7 @@
/\x{084}/DZ8
------------------------------------------------------------------
Bra
- \x84
+ \x{84}
Ket
End
------------------------------------------------------------------
@@ -489,7 +489,7 @@
/[^\x{c4}]/DZ
------------------------------------------------------------------
Bra
- [^\xc4]
+ [^\x{c4}]
Ket
End
------------------------------------------------------------------
@@ -521,7 +521,7 @@
/[\xff]/DZ8
------------------------------------------------------------------
Bra
- \xff
+ \x{ff}
Ket
End
------------------------------------------------------------------
@@ -535,7 +535,7 @@
/[^\xff]/8DZ
------------------------------------------------------------------
Bra
- [^\xff]
+ [^\x{ff}]
Ket
End
------------------------------------------------------------------
@@ -812,7 +812,7 @@
/[^\x{c4}]/8DZ
------------------------------------------------------------------
Bra
- [^\xc4]
+ [^\x{c4}]
Ket
End
------------------------------------------------------------------
@@ -863,7 +863,7 @@
------------------------------------------------------------------
Bra
\w++
- \xc4
+ \x{c4}
Ket
End
------------------------------------------------------------------
@@ -874,7 +874,7 @@
------------------------------------------------------------------
Bra
\w+
- \xc4
+ \x{c4}
Ket
End
------------------------------------------------------------------
@@ -885,7 +885,7 @@
------------------------------------------------------------------
Bra
\W+
- \xc4
+ \x{c4}
Ket
End
------------------------------------------------------------------
@@ -896,7 +896,7 @@
------------------------------------------------------------------
Bra
\W++
- \xc4
+ \x{c4}
Ket
End
------------------------------------------------------------------
@@ -907,7 +907,7 @@
------------------------------------------------------------------
Bra
\W+
- \xa1
+ \x{a1}
Ket
End
------------------------------------------------------------------
@@ -918,7 +918,7 @@
------------------------------------------------------------------
Bra
\W+
- \xa1
+ \x{a1}
Ket
End
------------------------------------------------------------------
@@ -930,7 +930,7 @@
Bra
X
\s++
- \xa0
+ \x{a0}
Ket
End
------------------------------------------------------------------
@@ -942,7 +942,7 @@
Bra
X
\s+
- \xa0
+ \x{a0}
Ket
End
------------------------------------------------------------------
@@ -953,7 +953,7 @@
------------------------------------------------------------------
Bra
\S+
- \xa0
+ \x{a0}
Ket
End
------------------------------------------------------------------
@@ -964,7 +964,7 @@
------------------------------------------------------------------
Bra
\S++
- \xa0
+ \x{a0}
Ket
End
------------------------------------------------------------------
@@ -974,7 +974,7 @@
/\x{a0}+\s!/8BZ
------------------------------------------------------------------
Bra
- \xa0++
+ \x{a0}++
\s
!
Ket
@@ -986,7 +986,7 @@
/\x{a0}+\s!/8BZT1
------------------------------------------------------------------
Bra
- \xa0+
+ \x{a0}+
\s
!
Ket
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput5 2012-09-23 16:50:00 UTC (rev 1045)
@@ -276,7 +276,7 @@
/[\xFF]/DZ
------------------------------------------------------------------
Bra
- \xff
+ \x{ff}
Ket
End
------------------------------------------------------------------
@@ -290,7 +290,7 @@
/[^\xFF]/DZ
------------------------------------------------------------------
Bra
- [^\xff]
+ [^\x{ff}]
Ket
End
------------------------------------------------------------------
@@ -786,7 +786,7 @@
/[\H]/8BZ
------------------------------------------------------------------
Bra
- [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
+ [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
@@ -794,7 +794,7 @@
/[\V]/8BZ
------------------------------------------------------------------
Bra
- [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{10ffff}]
+ [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
@@ -1596,7 +1596,7 @@
/[\H\x{d7ff}]+/8BZ
------------------------------------------------------------------
Bra
- [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}\x{d7ff}]+
+ [\x00-\x08\x0a-\x1f!-\x9f\x{a1}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}\x{d7ff}]+
Ket
End
------------------------------------------------------------------
@@ -1636,7 +1636,7 @@
/[\V\x{d7ff}]+/8BZ
------------------------------------------------------------------
Bra
- [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{202a}-\x{10ffff}\x{d7ff}]+
+ [\x00-\x09\x0e-\x84\x{86}-\x{2027}\x{202a}-\x{10ffff}\x{d7ff}]+
Ket
End
------------------------------------------------------------------
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput6 2012-09-23 16:50:00 UTC (rev 1045)
@@ -1,6 +1,5 @@
/-- This set of tests is for Unicode property support. It is compatible with
- Perl >= 5.10, but not 5.8 because it tests some extra properties that are
- not in the earlier release. --/
+ Perl >= 5.15. --/
/^\pC\pL\pM\pN\pP\pS\pZ</8
\x7f\x{c0}\x{30f}\x{660}\x{66c}\x{f01}\x{1680}<
@@ -1551,4 +1550,270 @@
/-- --/
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/[z\x{1e9e}]+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/[z\x{00df}]+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+
+/[z\x{1f88}]+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+
+/-- Characters with more than one other case; test in classes --/
+
+/[z\x{00b5}]+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/[z\x{039c}]+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/[z\x{03bc}]+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/[z\x{00c5}]+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/[z\x{00e5}]+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/[z\x{212b}]+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/[z\x{01c4}]+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/[z\x{01c5}]+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/[z\x{01c6}]+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/[z\x{01c7}]+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/[z\x{01c8}]+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/[z\x{01c9}]+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/[z\x{01ca}]+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/[z\x{01cb}]+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/[z\x{01cc}]+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/[z\x{01f1}]+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/[z\x{01f2}]+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/[z\x{01f3}]+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/[z\x{0345}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{0399}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{03b9}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{1fbe}]+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/[z\x{0392}]+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/[z\x{03b2}]+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/[z\x{03d0}]+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/[z\x{0395}]+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/[z\x{03b5}]+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/[z\x{03f5}]+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/[z\x{0398}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{03b8}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{03d1}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{03f4}]+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/[z\x{039a}]+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/[z\x{03ba}]+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/[z\x{03f0}]+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/[z\x{03a0}]+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/[z\x{03c0}]+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/[z\x{03d6}]+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/[z\x{03a1}]+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/[z\x{03c1}]+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/[z\x{03f1}]+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/[z\x{03a3}]+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/[z\x{03c2}]+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/[z\x{03c3}]+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/[z\x{03a6}]+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/[z\x{03c6}]+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/[z\x{03d5}]+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/[z\x{03c9}]+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/[z\x{03a9}]+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/[z\x{2126}]+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/[z\x{1e60}]+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e61}]+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/[z\x{1e9b}]+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/-- Perl 5.12.4 gets these wrong, but 5.15.3 is OK --/
+
+/[z\x{004b}]+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/[z\x{006b}]+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/[z\x{212a}]+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/[z\x{0053}]+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/[z\x{0073}]+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/[z\x{017f}]+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/-- --/
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testoutput7
===================================================================
--- code/trunk/testdata/testoutput7 2012-09-20 16:23:57 UTC (rev 1044)
+++ code/trunk/testdata/testoutput7 2012-09-23 16:50:00 UTC (rev 1045)
@@ -124,7 +124,7 @@
/[z-\x{100}]/8iDZ
------------------------------------------------------------------
Bra
- [Z\x{39c}\x{178}z-\x{101}]
+ [Z\x{39c}\x{3bc}\x{1e9e}\x{178}z-\x{101}]
Ket
End
------------------------------------------------------------------
@@ -162,7 +162,7 @@
/[z-\x{100}]/8DZi
------------------------------------------------------------------
Bra
- [Z\x{39c}\x{178}z-\x{101}]
+ [Z\x{39c}\x{3bc}\x{1e9e}\x{178}z-\x{101}]
Ket
End
------------------------------------------------------------------