Revision: 1046
http://vcs.pcre.org/viewvc?view=rev&revision=1046
Author: ph10
Date: 2012-09-25 17:27:58 +0100 (Tue, 25 Sep 2012)
Log Message:
-----------
All the remaining changes for handling characters with more than one other
case.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcreunicode.3
code/trunk/maint/MultiStage2.py
code/trunk/pcre_compile.c
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/pcre_printint.c
code/trunk/pcre_ucd.c
code/trunk/pcretest.c
code/trunk/perltest.pl
code/trunk/testdata/testinput10
code/trunk/testdata/testinput6
code/trunk/testdata/testinput7
code/trunk/testdata/testoutput10
code/trunk/testdata/testoutput6
code/trunk/testdata/testoutput7
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/ChangeLog 2012-09-25 16:27:58 UTC (rev 1046)
@@ -87,7 +87,8 @@
they are defined only once.
21. This set of changes together give more compatible Unicode case-folding
- behaviour for characters that have more than one other case.
+ behaviour for characters that have more than one other case when UCP
+ support is available.
(a) The Unicode property table now has offsets into a new table of sets of
three or more characters that are case-equivalent. The MultiStage2.py
@@ -108,6 +109,12 @@
(d) The processing of \h, \H, \v, and \ in character classes now makes use
of the new class addition function, using character lists defined as
macros alongside the case definitions of 20 above.
+
+ (e) Caseless back references now work with characters that have more than
+ one other case.
+
+ (f) General caseless matching of characters with more than one other case
+ is supported.
Version 8.31 06-July-2012
Modified: code/trunk/doc/pcreunicode.3
===================================================================
--- code/trunk/doc/pcreunicode.3 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/doc/pcreunicode.3 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1,4 +1,4 @@
-.TH PCREUNICODE 3 "14 April 2012" "PCRE 8.30"
+.TH PCREUNICODE 3 "25 September 2012" "PCRE 8.32"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "UTF-8, UTF-16, AND UNICODE PROPERTY SUPPORT"
@@ -197,13 +197,11 @@
PCRE_UCP is set.
.P
9. Case-insensitive matching applies only to characters whose values are less
-than 128, unless PCRE is built with Unicode property support. Even when Unicode
-property support is available, PCRE still uses its own character tables when
-checking the case of low-valued characters, so as not to degrade performance.
-The Unicode property information is used only for characters with higher
-values. Furthermore, PCRE supports case-insensitive matching only when there is
-a one-to-one mapping between a letter's cases. There are a small number of
-many-to-one mappings in Unicode; these are not supported by PCRE.
+than 128, unless PCRE is built with Unicode property support. A few Unicode
+characters such as Greek sigma have more than two codepoints that are
+case-equivalent. Up to and including PCRE release 8.31, only one-to-one case
+mappings were supported, but later releases (with Unicode property support) do
+treat as case-equivalent all versions of characters such as Greek sigma.
.
.
.SH AUTHOR
@@ -220,6 +218,6 @@
.rs
.sp
.nf
-Last updated: 14 April 2012
+Last updated: 25 September 2012
Copyright (c) 1997-2012 University of Cambridge.
.fi
Modified: code/trunk/maint/MultiStage2.py
===================================================================
--- code/trunk/maint/MultiStage2.py 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/maint/MultiStage2.py 2012-09-25 16:27:58 UTC (rev 1046)
@@ -404,12 +404,26 @@
min_stage1, min_stage2 = stage1, stage2
min_block_size = block_size
+print "/* This module is generated by the maint/MultiStage2.py script."
+print "Do not modify it by hand. Instead modify the script and run it"
+print "to regenerate this code."
+print
+print "As well as being part of the PCRE library, this module is #included"
+print "by the pcretest program, which redefines the PRIV macro to change"
+print "table names from _pcre_xxx to xxxx, thereby avoiding name clashes"
+print "with the library. At present, just one of these tables is actually"
+print "needed. */"
+print
+print "#ifndef PCRE_INCLUDED"
+print
print "#ifdef HAVE_CONFIG_H"
print "#include \"config.h\""
print "#endif"
print
print "#include \"pcre_internal.h\""
print
+print "#endif /* PCRE_INCLUDED */"
+print
print "/* Unicode character database. */"
print "/* This file was autogenerated by the MultiStage2.py script. */"
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
@@ -424,7 +438,7 @@
print "Instead, just supply small dummy tables. */"
print
print "#ifndef SUPPORT_UCP"
-print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0 }};"
+print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"
print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
@@ -446,6 +460,10 @@
# ------
+print "/* When #included in pcretest, we don't need this large table. */"
+print
+print "#ifndef PCRE_INCLUDED"
+print
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
@@ -453,6 +471,8 @@
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
print "#endif"
print "#endif /* SUPPORT_UCP */"
+print
+print "#endif /* PCRE_INCLUDED */"
"""
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcre_compile.c 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1859,7 +1859,8 @@
case OP_TYPEEXACT:
branchlength += GET2(cc,1);
- if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
+ if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
+ cc += 2;
cc += 1 + IMM2_SIZE + 1;
break;
@@ -2097,8 +2098,8 @@
case OP_TYPEMINUPTO:
case OP_TYPEEXACT:
case OP_TYPEPOSUPTO:
- if (code[1 + IMM2_SIZE] == OP_PROP
- || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
+ if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
+ code += 2;
break;
case OP_MARK:
@@ -2217,8 +2218,8 @@
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEEXACT:
- if (code[1 + IMM2_SIZE] == OP_PROP
- || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
+ if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
+ code += 2;
break;
case OP_MARK:
@@ -2543,8 +2544,8 @@
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- if (code[1 + IMM2_SIZE] == OP_PROP
- || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
+ if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
+ code += 2;
break;
/* End of branch */
@@ -2951,7 +2952,12 @@
static BOOL
check_char_prop(int c, int ptype, int pdata, BOOL negated)
{
+#ifdef SUPPORT_UCP
+const pcre_uint32 *p;
+#endif
+
const ucd_record *prop = GET_UCD(c);
+
switch(ptype)
{
case PT_LAMP:
@@ -2989,7 +2995,19 @@
return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE) == negated;
+
+#ifdef SUPPORT_UCP
+ case PT_CLIST:
+ p = PRIV(ucd_caseless_sets) + prop->caseset;
+ for (;;)
+ {
+ if ((unsigned int)c < *p) return !negated;
+ if ((unsigned int)c == *p++) return negated;
+ }
+ break; /* Control never reaches here */
+#endif
}
+
return FALSE;
}
#endif /* SUPPORT_UCP */
@@ -3091,135 +3109,139 @@
if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
return FALSE;
+
+/* If the previous item is a character, get its value. */
-/* Now compare the next item with the previous opcode. First, handle cases when
-the next item is a character. */
-
-if (next >= 0) switch(op_code)
- {
- case OP_CHAR:
+if (op_code == OP_CHAR || op_code == OP_CHARI ||
+ op_code == OP_NOT || op_code == OP_NOTI)
+ {
#ifdef SUPPORT_UTF
GETCHARTEST(c, previous);
#else
c = *previous;
#endif
- return c != next;
+ }
- /* For CHARI (caseless character) we must check the other case. If we have
- Unicode property support, we can use it to test the other case of
- high-valued characters. */
+/* Now compare the next item with the previous opcode. First, handle cases when
+the next item is a character. For a caseless UTF match, the next character may
+have more than one other case; convert this to a special property. */
- case OP_CHARI:
-#ifdef SUPPORT_UTF
- GETCHARTEST(c, previous);
-#else
- c = *previous;
+if (next >= 0)
+ {
+#ifdef SUPPORT_UCP
+ if (utf && (options & PCRE_CASELESS) != 0)
+ {
+ int ocs = UCD_CASESET(next);
+ if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, FALSE);
+ }
#endif
- if (c == next) return FALSE;
+
+ switch(op_code)
+ {
+ case OP_CHAR:
+ return c != next;
+
+ /* For CHARI (caseless character) we must check the other case. If we have
+ Unicode property support, we can use it to test the other case of
+ high-valued characters. We know that next can have only one other case,
+ because multi-other-case characters are dealt with above. */
+
+ case OP_CHARI:
+ if (c == next) return FALSE;
#ifdef SUPPORT_UTF
- if (utf)
- {
- unsigned int othercase;
- if (next < 128) othercase = cd->fcc[next]; else
+ if (utf)
+ {
+ unsigned int othercase;
+ if (next < 128) othercase = cd->fcc[next]; else
#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE((unsigned int)next);
+ othercase = UCD_OTHERCASE((unsigned int)next);
#else
- othercase = NOTACHAR;
+ othercase = NOTACHAR;
#endif
- return (unsigned int)c != othercase;
- }
- else
+ return (unsigned int)c != othercase;
+ }
+ else
#endif /* SUPPORT_UTF */
- return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
-
- case OP_NOT:
+ return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Not UTF */
+
+ case OP_NOT:
+ return c == next;
+
+ case OP_NOTI:
+ if (c == next) return TRUE;
#ifdef SUPPORT_UTF
- GETCHARTEST(c, previous);
+ if (utf)
+ {
+ unsigned int othercase;
+ if (next < 128) othercase = cd->fcc[next]; else
+#ifdef SUPPORT_UCP
+ othercase = UCD_OTHERCASE((unsigned int)next);
#else
- c = *previous;
+ othercase = NOTACHAR;
#endif
- return c == next;
-
- case OP_NOTI:
-#ifdef SUPPORT_UTF
- GETCHARTEST(c, previous);
-#else
- c = *previous;
-#endif
- if (c == next) return TRUE;
-#ifdef SUPPORT_UTF
- if (utf)
- {
- unsigned int othercase;
- if (next < 128) othercase = cd->fcc[next]; else
+ return (unsigned int)c == othercase;
+ }
+ else
+#endif /* SUPPORT_UTF */
+ return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Not UTF */
+
+ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
+ When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
+
+ case OP_DIGIT:
+ return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
+
+ case OP_NOT_DIGIT:
+ return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
+
+ case OP_WHITESPACE:
+ return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
+
+ case OP_NOT_WHITESPACE:
+ return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
+
+ case OP_WORDCHAR:
+ return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
+
+ case OP_NOT_WORDCHAR:
+ return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
+
+ case OP_HSPACE:
+ case OP_NOT_HSPACE:
+ switch(next)
+ {
+ HSPACE_CASES:
+ return op_code == OP_NOT_HSPACE;
+
+ default:
+ return op_code != OP_NOT_HSPACE;
+ }
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ case OP_NOT_VSPACE:
+ switch(next)
+ {
+ VSPACE_CASES:
+ return op_code == OP_NOT_VSPACE;
+
+ default:
+ return op_code != OP_NOT_VSPACE;
+ }
+
#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE((unsigned int)next);
-#else
- othercase = NOTACHAR;
+ case OP_PROP:
+ return check_char_prop(next, previous[0], previous[1], FALSE);
+
+ case OP_NOTPROP:
+ return check_char_prop(next, previous[0], previous[1], TRUE);
#endif
- return (unsigned int)c == othercase;
- }
- else
-#endif /* SUPPORT_UTF */
- return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
-
- /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
- When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
-
- case OP_DIGIT:
- return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
-
- case OP_NOT_DIGIT:
- return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
-
- case OP_WHITESPACE:
- return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
-
- case OP_NOT_WHITESPACE:
- return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
-
- case OP_WORDCHAR:
- return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
-
- case OP_NOT_WORDCHAR:
- return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
-
- case OP_HSPACE:
- case OP_NOT_HSPACE:
- switch(next)
- {
- HSPACE_CASES:
- return op_code == OP_NOT_HSPACE;
-
+
default:
- return op_code != OP_NOT_HSPACE;
+ return FALSE;
}
+ }
- case OP_ANYNL:
- case OP_VSPACE:
- case OP_NOT_VSPACE:
- switch(next)
- {
- VSPACE_CASES:
- return op_code == OP_NOT_VSPACE;
-
- default:
- return op_code != OP_NOT_VSPACE;
- }
-
-#ifdef SUPPORT_UCP
- case OP_PROP:
- return check_char_prop(next, previous[0], previous[1], FALSE);
-
- case OP_NOTPROP:
- return check_char_prop(next, previous[0], previous[1], TRUE);
-#endif
-
- default:
- return FALSE;
- }
-
-
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
generated only when PCRE_UCP is *not* set, that is, when only ASCII
@@ -3230,11 +3252,6 @@
{
case OP_CHAR:
case OP_CHARI:
-#ifdef SUPPORT_UTF
- GETCHARTEST(c, previous);
-#else
- c = *previous;
-#endif
switch(-next)
{
case ESC_d:
@@ -3683,11 +3700,14 @@
BOOL utf = FALSE;
#endif
-/* Helper variables for OP_XCLASS opcode (for characters > 255). */
+/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
+class_uchardata always so that it can be passed to add_to_class() always,
+though it will not be used in non-UTF 8-bit cases. This avoids having to supply
+alternative calls for the different cases. */
+pcre_uchar *class_uchardata;
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
BOOL xclass;
-pcre_uchar *class_uchardata;
pcre_uchar *class_uchardata_base;
#endif
@@ -4133,7 +4153,7 @@
alpha. This relies on the fact that the class table starts with
alpha, lower, upper as the first 3 entries. */
- if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
+ if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
/* When PCRE_UCP is set, some of the POSIX classes are converted to
@@ -4476,16 +4496,41 @@
if (negate_class)
{
+#ifdef SUPPORT_UCP
+ int d;
+#endif
if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
zerofirstchar = firstchar;
- *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
+
+ /* For caseless UTF-8 mode when UCP support is available, check
+ whether this character has more than one other case. If so, generate
+ a special OP_NOTPROP item instead of OP_NOTI. */
+
+#ifdef SUPPORT_UCP
+ if (utf && (options & PCRE_CASELESS) != 0 &&
+ (d = UCD_CASESET(c)) != 0)
+ {
+ *code++ = OP_NOTPROP;
+ *code++ = PT_CLIST;
+ *code++ = d;
+ }
+ else
+#endif
+ /* Char has only one other case, or UCP not available */
+
+ {
+ *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
#ifdef SUPPORT_UTF
- if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
- code += PRIV(ord2utf)(c, code);
- else
+ if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
+ code += PRIV(ord2utf)(c, code);
+ else
#endif
- *code++ = c;
- goto NOT_CHAR;
+ *code++ = c;
+ }
+
+ /* We are finished with this character class */
+
+ goto END_CLASS;
}
/* For a single, positive character, get the value into mcbuffer, and
@@ -4601,7 +4646,8 @@
memcpy(code, classbits, 32);
}
code += 32 / sizeof(pcre_uchar);
- NOT_CHAR:
+
+ END_CLASS:
break;
@@ -6836,6 +6882,28 @@
ONE_CHAR:
previous = code;
+
+ /* For caseless UTF-8 mode when UCP support is available, check whether
+ this character has more than one other case. If so, generate a special
+ OP_PROP item instead of OP_CHARI. */
+
+#ifdef SUPPORT_UCP
+ if (utf && (options & PCRE_CASELESS) != 0)
+ {
+ GETCHAR(c, mcbuffer);
+ if ((c = UCD_CASESET(c)) != 0)
+ {
+ *code++ = OP_PROP;
+ *code++ = PT_CLIST;
+ *code++ = c;
+ if (firstchar == REQ_UNSET) firstchar = zerofirstchar = REQ_NONE;
+ break;
+ }
+ }
+#endif
+
+ /* Caseful matches, or not one of the multicase characters. */
+
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcre_dfa_exec.c 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1060,6 +1060,7 @@
if (clen > 0)
{
BOOL OK;
+ const pcre_uint32 *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[1])
{
@@ -1107,6 +1108,15 @@
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
break;
+
+ case PT_CLIST:
+ cp = PRIV(ucd_caseless_sets) + prop->caseset;
+ for (;;)
+ {
+ if (c < *cp) { OK = FALSE; break; }
+ if (c == *cp++) { OK = TRUE; break; }
+ }
+ break;
/* Should never occur, but keep compilers from grumbling. */
@@ -1294,6 +1304,7 @@
if (clen > 0)
{
BOOL OK;
+ const pcre_uint32 *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[2])
{
@@ -1342,6 +1353,15 @@
c == CHAR_UNDERSCORE;
break;
+ case PT_CLIST:
+ cp = PRIV(ucd_caseless_sets) + prop->caseset;
+ for (;;)
+ {
+ if (c < *cp) { OK = FALSE; break; }
+ if (c == *cp++) { OK = TRUE; break; }
+ }
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1522,6 +1542,7 @@
if (clen > 0)
{
BOOL OK;
+ const pcre_uint32 *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[2])
{
@@ -1570,6 +1591,15 @@
c == CHAR_UNDERSCORE;
break;
+ case PT_CLIST:
+ cp = PRIV(ucd_caseless_sets) + prop->caseset;
+ for (;;)
+ {
+ if (c < *cp) { OK = FALSE; break; }
+ if (c == *cp++) { OK = TRUE; break; }
+ }
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
@@ -1775,6 +1805,7 @@
if (clen > 0)
{
BOOL OK;
+ const pcre_uint32 *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[1 + IMM2_SIZE + 1])
{
@@ -1823,6 +1854,15 @@
c == CHAR_UNDERSCORE;
break;
+ case PT_CLIST:
+ cp = PRIV(ucd_caseless_sets) + prop->caseset;
+ for (;;)
+ {
+ if (c < *cp) { OK = FALSE; break; }
+ if (c == *cp++) { OK = TRUE; break; }
+ }
+ break;
+
/* Should never occur, but keep compilers from grumbling. */
default:
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcre_exec.c 2012-09-25 16:27:58 UTC (rev 1046)
@@ -180,21 +180,32 @@
if (md->utf)
{
/* Match characters up to the end of the reference. NOTE: the number of
- bytes matched may differ, because there are some characters whose upper and
- lower case versions code as different numbers of bytes. For example, U+023A
- (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
- a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
- the latter. It is important, therefore, to check the length along the
- reference, not along the subject (earlier code did this wrong). */
+ data units matched may differ, because in UTF-8 there are some characters
+ whose upper and lower case versions have different numbers of bytes.
+ For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
+ (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
+ sequence of two of the latter. It is important, therefore, to check the
+ length along the reference, not along the subject (earlier code did this
+ wrong). */
PCRE_PUCHAR endptr = p + length;
while (p < endptr)
{
- int c, d;
+ unsigned int c, d;
+ const ucd_record *ur;
if (eptr >= md->end_subject) return -2; /* Partial match */
GETCHARINC(c, eptr);
GETCHARINC(d, p);
- if (c != d && c != UCD_OTHERCASE(d)) return -1;
+ ur = GET_UCD(d);
+ if (c != d && c != d + ur->other_case)
+ {
+ const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
+ for (;;)
+ {
+ if (c < *pp) return -1;
+ if (c == *pp++) break;
+ }
+ }
}
}
else
@@ -2512,6 +2523,7 @@
}
GETCHARINCTEST(c, eptr);
{
+ const pcre_uint32 *cp;
const ucd_record *prop = GET_UCD(c);
switch(ecode[1])
@@ -2571,6 +2583,17 @@
c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
break;
+
+ case PT_CLIST:
+ cp = PRIV(ucd_caseless_sets) + prop->caseset;
+ for (;;)
+ {
+ if (c < *cp)
+ { if (op == OP_PROP) RRETURN(MATCH_NOMATCH); else break; }
+ if (c == *cp++)
+ { if (op == OP_PROP) break; else RRETURN(MATCH_NOMATCH); }
+ }
+ break;
/* This should never occur */
@@ -2609,7 +2632,7 @@
CHECK_PARTIAL();
ecode++;
break;
-#endif
+#endif /* SUPPORT_UCP */
/* Match a back reference, possibly repeatedly. Look past the end of the
@@ -4162,7 +4185,28 @@
RRETURN(MATCH_NOMATCH);
}
break;
-
+
+ case PT_CLIST:
+ for (i = 1; i <= min; i++)
+ {
+ const pcre_uint32 *cp;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
+ for (;;)
+ {
+ if (c < *cp)
+ { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
+ if (c == *cp++)
+ { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
+ }
+ }
+ break;
+
/* This should not occur */
default:
@@ -4875,8 +4919,31 @@
}
/* Control never gets here */
+ case PT_CLIST:
+ for (fi = min;; fi++)
+ {
+ const pcre_uint32 *cp;
+ RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) RRETURN(MATCH_NOMATCH);
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+ GETCHARINCTEST(c, eptr);
+ cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
+ for (;;)
+ {
+ if (c < *cp)
+ { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
+ if (c == *cp++)
+ { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
+ }
+ }
+ /* Control never gets here */
+
/* This should never occur */
-
default:
RRETURN(PCRE_ERROR_INTERNAL);
}
@@ -5345,6 +5412,30 @@
eptr+= len;
}
break;
+
+ case PT_CLIST:
+ for (i = min; i < max; i++)
+ {
+ const pcre_uint32 *cp;
+ int len = 1;
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ break;
+ }
+ GETCHARLENTEST(c, eptr, len);
+ cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
+ for (;;)
+ {
+ if (c < *cp)
+ { if (prop_fail_result) break; else goto GOT_MAX; }
+ if (c == *cp++)
+ { if (prop_fail_result) goto GOT_MAX; else break; }
+ }
+ eptr += len;
+ }
+ GOT_MAX:
+ break;
default:
RRETURN(PCRE_ERROR_INTERNAL);
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcre_internal.h 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1654,6 +1654,7 @@
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
#define PT_WORD 8 /* Word - L plus N plus underscore */
+#define PT_CLIST 9 /* Pseudo-property: match character list */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain characters with values greater than 255. */
@@ -1676,7 +1677,7 @@
non-DOTALL mode, "." behaves like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
-when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
+when PCRE_UCP is set and replacement of \d etc by \p sequences is required.
They must be contiguous, and remain in order so that the replacements can be
looked up from a table.
@@ -1718,7 +1719,7 @@
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
- OP_ANY, /* 12 Match any character except newline */
+ OP_ANY, /* 12 Match any character except newline (\N) */
OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 15 \P (not Unicode property) */
@@ -1729,8 +1730,8 @@
OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
OP_VSPACE, /* 21 \v (vertical whitespace) */
OP_EXTUNI, /* 22 \X (extended Unicode sequence */
- OP_EODN, /* 23 End of data or \n at end of data: \Z. */
- OP_EOD, /* 24 End of data: \z */
+ OP_EODN, /* 23 End of data or \n at end of data (\Z) */
+ OP_EOD, /* 24 End of data (\z) */
OP_CIRC, /* 25 Start of line - not multiline */
OP_CIRCM, /* 26 Start of line - multiline */
Modified: code/trunk/pcre_printint.c
===================================================================
--- code/trunk/pcre_printint.c 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcre_printint.c 2012-09-25 16:27:58 UTC (rev 1046)
@@ -120,7 +120,7 @@
(void)utf; /* Avoid compiler warning */
if (PRINTABLE(c)) fprintf(f, "%c", c);
-else if (c <= 0xff) fprintf(f, "\\x%02x", c);
+else if (c <= 0x80) fprintf(f, "\\x%02x", c);
else fprintf(f, "\\x{%x}", c);
return 0;
@@ -233,7 +233,41 @@
}
+/*************************************************
+* Print Unicode property value *
+*************************************************/
+/* "Normal" properties can be printed from tables. The PT_CLIST property is a
+pseudo-property that contains a pointer to a list of case-equivalent
+characters. This is used only when UCP support is available and UTF mode is
+selected. It should never occur otherwise, but just in case it does, have
+something ready to print. */
+
+static void
+print_prop(FILE *f, pcre_uchar *code, const char *before, const char *after)
+{
+if (code[1] != PT_CLIST)
+ {
+ fprintf(f, "%s%s %s%s", before, priv_OP_names[*code], get_ucpname(code[1],
+ code[2]), after);
+ }
+else
+ {
+ const char *not = (*code == OP_PROP)? "" : "not ";
+#ifndef SUPPORT_UCP
+ fprintf(f, "%s%sclist %d%s", before, not, code[2], after);
+#else
+ const pcre_uint32 *p = PRIV(ucd_caseless_sets) + code[2];
+ fprintf (f, "%s%sclist", before, not);
+ while (*p < NOTACHAR) fprintf(f, " %04x", *p++);
+ fprintf(f, "%s", after);
+#endif
+ }
+}
+
+
+
+
/*************************************************
* Print compiled regex *
*************************************************/
@@ -427,12 +461,12 @@
fprintf(f, " %s ", flag);
if (*code >= OP_TYPESTAR)
{
- fprintf(f, "%s", priv_OP_names[code[1]]);
if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
{
- fprintf(f, " %s ", get_ucpname(code[2], code[3]));
+ print_prop(f, code + 1, "", " ");
extra = 2;
}
+ else fprintf(f, "%s", priv_OP_names[code[1]]);
}
else extra = print_char(f, code+1, utf);
fprintf(f, "%s", priv_OP_names[*code]);
@@ -461,13 +495,12 @@
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- fprintf(f, " %s", priv_OP_names[code[1 + IMM2_SIZE]]);
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
{
- fprintf(f, " %s ", get_ucpname(code[1 + IMM2_SIZE + 1],
- code[1 + IMM2_SIZE + 2]));
+ print_prop(f, code + IMM2_SIZE + 1, " ", " ");
extra = 2;
}
+ else fprintf(f, " %s", priv_OP_names[code[1 + IMM2_SIZE]]);
fprintf(f, "{");
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
@@ -552,7 +585,7 @@
case OP_PROP:
case OP_NOTPROP:
- fprintf(f, " %s %s", priv_OP_names[*code], get_ucpname(code[1], code[2]));
+ print_prop(f, code, " ", "");
break;
/* OP_XCLASS can only occur in UTF or PCRE16 modes. However, there's no
Modified: code/trunk/pcre_ucd.c
===================================================================
--- code/trunk/pcre_ucd.c 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcre_ucd.c 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1,3 +1,15 @@
+/* This module is generated by the maint/MultiStage2.py script.
+Do not modify it by hand. Instead modify the script and run it
+to regenerate this code.
+
+As well as being part of the PCRE library, this module is #included
+by the pcretest program, which redefines the PRIV macro to change
+table names from _pcre_xxx to xxxx, thereby avoiding name clashes
+with the library. At present, just one of these tables is actually
+needed. */
+
+#ifndef PCRE_INCLUDED
+
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
@@ -4,6 +16,8 @@
#include "pcre_internal.h"
+#endif /* PCRE_INCLUDED */
+
/* Unicode character database. */
/* This file was autogenerated by the MultiStage2.py script. */
/* Total size: 65688 bytes, block size: 128. */
@@ -18,7 +32,7 @@
Instead, just supply small dummy tables. */
#ifndef SUPPORT_UCP
-const ucd_record PRIV(ucd_records)[] = {{0,0,0,0 }};
+const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};
const pcre_uint8 PRIV(ucd_stage1)[] = {0};
const pcre_uint16 PRIV(ucd_stage2)[] = {0};
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
@@ -61,6 +75,10 @@
0x00c5, 0x00e5, 0x212b, NOTACHAR,
};
+/* When #included in pcretest, we don't need this large table. */
+
+#ifndef PCRE_INCLUDED
+
const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
{ 9, 0, 2, 0, 0, }, /* 0 */
{ 9, 0, 1, 0, 0, }, /* 1 */
@@ -3275,3 +3293,5 @@
#error Please correct UCD_BLOCK_SIZE in pcre_internal.h
#endif
#endif /* SUPPORT_UCP */
+
+#endif /* PCRE_INCLUDED */
Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/pcretest.c 2012-09-25 16:27:58 UTC (rev 1046)
@@ -154,12 +154,13 @@
#endif
/* We need access to some of the data tables that PCRE uses. So as not to have
-to keep two copies, we include the source file here, changing the names of the
+to keep two copies, we include the source files here, changing the names of the
external symbols to prevent clashes. */
#define PCRE_INCLUDED
#include "pcre_tables.c"
+#include "pcre_ucd.c"
/* The definition of the macro PRINTABLE, which determines whether to print an
output character as-is or as a hex value when showing compiled patterns, is
Modified: code/trunk/perltest.pl
===================================================================
--- code/trunk/perltest.pl 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/perltest.pl 2012-09-25 16:27:58 UTC (rev 1046)
@@ -12,6 +12,7 @@
# Function for turning a string into a string of printing chars.
+#use utf8;
#require Encode;
sub pchars {
Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/testdata/testinput10 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1108,4 +1108,214 @@
/[z\x{1f88}]+/8i
\x{1f88}\x{1f80}
+/-- Perl matches these --/
+
+/\x{00b5}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/\x{039c}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/\x{03bc}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+
+/\x{00c5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/\x{00e5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/\x{212b}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+
+/\x{01c4}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/\x{01c5}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/\x{01c6}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+
+/\x{01c7}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/\x{01c8}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/\x{01c9}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+
+/\x{01ca}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/\x{01cb}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/\x{01cc}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+
+/\x{01f1}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/\x{01f2}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/\x{01f3}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+
+/\x{0345}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/\x{0399}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/\x{03b9}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/\x{1fbe}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+
+/\x{0392}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/\x{03b2}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/\x{03d0}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+
+/\x{0395}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/\x{03b5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/\x{03f5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+
+/\x{0398}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/\x{03b8}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/\x{03d1}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/\x{03f4}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+
+/\x{039a}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/\x{03ba}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/\x{03f0}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+
+/\x{03a0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/\x{03c0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/\x{03d6}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+
+/\x{03a1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/\x{03c1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/\x{03f1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+
+/\x{03a3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/\x{03c2}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/\x{03c3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+
+/\x{03a6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/\x{03c6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/\x{03d5}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+
+/\x{03c9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/\x{03a9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/\x{2126}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+
+/\x{1e60}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/\x{1e61}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/\x{1e9b}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+
+/\x{1f80}+/8i
+ \x{1f88}\x{1f80}
+
+/\x{004b}+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/\x{006b}+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/\x{212a}+/8i
+ \x{004b}\x{006b}\x{212a}
+
+
+/\x{0053}+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/\x{0073}+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/\x{017f}+/8i
+ \x{0053}\x{0073}\x{017f}
+
/-- End of testinput10 --/
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/testdata/testinput6 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1085,4 +1085,235 @@
/-- --/
+/(ΣΆΜΟΣ) \1/8i
+ ΣΆΜΟΣ ΣΆΜΟΣ
+ ΣΆΜΟΣ σάμος
+ σάμος σάμος
+ σάμος σάμοσ
+ σάμος ΣΆΜΟΣ
+
+/(σάμος) \1/8i
+ ΣΆΜΟΣ ΣΆΜΟΣ
+ ΣΆΜΟΣ σάμος
+ σάμος σάμος
+ σάμος σάμοσ
+ σάμος ΣΆΜΟΣ
+
+/(ΣΆΜΟΣ) \1*/8i
+ ΣΆΜΟΣ\x20
+ ΣΆΜΟΣ ΣΆΜΟΣσάμοςσάμος
+
+/-- Perl matches these --/
+
+/\x{00b5}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/\x{039c}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+/\x{03bc}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+
+
+/\x{00c5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/\x{00e5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+/\x{212b}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+
+
+/\x{01c4}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/\x{01c5}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+/\x{01c6}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+
+
+/\x{01c7}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/\x{01c8}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+/\x{01c9}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+
+
+/\x{01ca}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/\x{01cb}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+/\x{01cc}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+
+
+/\x{01f1}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/\x{01f2}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+/\x{01f3}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+
+
+/\x{0345}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/\x{0399}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/\x{03b9}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+/\x{1fbe}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+
+
+/\x{0392}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/\x{03b2}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+/\x{03d0}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+
+
+/\x{0395}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/\x{03b5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+/\x{03f5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+
+
+/\x{0398}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/\x{03b8}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/\x{03d1}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+/\x{03f4}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+
+
+/\x{039a}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/\x{03ba}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+/\x{03f0}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+
+
+/\x{03a0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/\x{03c0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+/\x{03d6}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+
+
+/\x{03a1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/\x{03c1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+/\x{03f1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+
+
+/\x{03a3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/\x{03c2}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+/\x{03c3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+
+
+/\x{03a6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/\x{03c6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+/\x{03d5}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+
+
+/\x{03c9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/\x{03a9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+/\x{2126}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+
+
+/\x{1e60}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/\x{1e61}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+/\x{1e9b}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+
+
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+
+/\x{1f80}+/8i
+ \x{1f88}\x{1f80}
+
+
+/-- Perl 5.12.4 gets these wrong, but 5.15.3 is OK --/
+
+/\x{004b}+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/\x{006b}+/8i
+ \x{004b}\x{006b}\x{212a}
+
+/\x{212a}+/8i
+ \x{004b}\x{006b}\x{212a}
+
+
+/\x{0053}+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/\x{0073}+/8i
+ \x{0053}\x{0073}\x{017f}
+
+/\x{017f}+/8i
+ \x{0053}\x{0073}\x{017f}
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testinput7
===================================================================
--- code/trunk/testdata/testinput7 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/testdata/testinput7 2012-09-25 16:27:58 UTC (rev 1046)
@@ -613,4 +613,38 @@
AA\P
AA\P\P
+/A\x{3a3}B/8iDZ
+
+/\x{3a3}B/8iDZ
+
+/[\x{3a3}]/8iBZ
+
+/[^\x{3a3}]/8iBZ
+
+/[\x{3a3}]+/8iBZ
+
+/[^\x{3a3}]+/8iBZ
+
+/a*\x{3a3}/8iBZ
+
+/\x{3a3}+a/8iBZ
+
+/\x{3a3}*\x{3c2}/8iBZ
+
+/\x{3a3}{3}/8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+
+/\x{3a3}{2,4}/8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+
+/\x{3a3}{2,4}?/8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+
+/\x{3a3}+./8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+
+/\x{3a3}++./8i+
+ ** Failers
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+
/-- End of testinput7 --/
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/testdata/testoutput10 2012-09-25 16:27:58 UTC (rev 1046)
@@ -2298,4 +2298,407 @@
0: \x{1f88}\x{1f80}
1: \x{1f88}
+/-- Perl matches these --/
+
+/\x{00b5}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+ 1: \x{b5}\x{39c}
+ 2: \x{b5}
+
+/\x{039c}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+ 1: \x{b5}\x{39c}
+ 2: \x{b5}
+
+/\x{03bc}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+ 1: \x{b5}\x{39c}
+ 2: \x{b5}
+
+
+/\x{00c5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+ 1: \x{c5}\x{e5}
+ 2: \x{c5}
+
+/\x{00e5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+ 1: \x{c5}\x{e5}
+ 2: \x{c5}
+
+/\x{212b}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+ 1: \x{c5}\x{e5}
+ 2: \x{c5}
+
+
+/\x{01c4}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+ 1: \x{1c4}\x{1c5}
+ 2: \x{1c4}
+
+/\x{01c5}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+ 1: \x{1c4}\x{1c5}
+ 2: \x{1c4}
+
+/\x{01c6}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+ 1: \x{1c4}\x{1c5}
+ 2: \x{1c4}
+
+
+/\x{01c7}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+ 1: \x{1c7}\x{1c8}
+ 2: \x{1c7}
+
+/\x{01c8}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+ 1: \x{1c7}\x{1c8}
+ 2: \x{1c7}
+
+/\x{01c9}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+ 1: \x{1c7}\x{1c8}
+ 2: \x{1c7}
+
+
+/\x{01ca}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+ 1: \x{1ca}\x{1cb}
+ 2: \x{1ca}
+
+/\x{01cb}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+ 1: \x{1ca}\x{1cb}
+ 2: \x{1ca}
+
+/\x{01cc}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+ 1: \x{1ca}\x{1cb}
+ 2: \x{1ca}
+
+
+/\x{01f1}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+ 1: \x{1f1}\x{1f2}
+ 2: \x{1f1}
+
+/\x{01f2}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+ 1: \x{1f1}\x{1f2}
+ 2: \x{1f1}
+
+/\x{01f3}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+ 1: \x{1f1}\x{1f2}
+ 2: \x{1f1}
+
+
+/\x{0345}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+ 1: \x{345}\x{399}\x{3b9}
+ 2: \x{345}\x{399}
+ 3: \x{345}
+
+/\x{0399}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+ 1: \x{345}\x{399}\x{3b9}
+ 2: \x{345}\x{399}
+ 3: \x{345}
+
+/\x{03b9}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+ 1: \x{345}\x{399}\x{3b9}
+ 2: \x{345}\x{399}
+ 3: \x{345}
+
+/\x{1fbe}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+ 1: \x{345}\x{399}\x{3b9}
+ 2: \x{345}\x{399}
+ 3: \x{345}
+
+
+/\x{0392}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+ 1: \x{392}\x{3b2}
+ 2: \x{392}
+
+/\x{03b2}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+ 1: \x{392}\x{3b2}
+ 2: \x{392}
+
+/\x{03d0}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+ 1: \x{392}\x{3b2}
+ 2: \x{392}
+
+
+/\x{0395}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+ 1: \x{395}\x{3b5}
+ 2: \x{395}
+
+/\x{03b5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+ 1: \x{395}\x{3b5}
+ 2: \x{395}
+
+/\x{03f5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+ 1: \x{395}\x{3b5}
+ 2: \x{395}
+
+
+/\x{0398}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+ 1: \x{398}\x{3b8}\x{3d1}
+ 2: \x{398}\x{3b8}
+ 3: \x{398}
+
+/\x{03b8}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+ 1: \x{398}\x{3b8}\x{3d1}
+ 2: \x{398}\x{3b8}
+ 3: \x{398}
+
+/\x{03d1}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+ 1: \x{398}\x{3b8}\x{3d1}
+ 2: \x{398}\x{3b8}
+ 3: \x{398}
+
+/\x{03f4}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+ 1: \x{398}\x{3b8}\x{3d1}
+ 2: \x{398}\x{3b8}
+ 3: \x{398}
+
+
+/\x{039a}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+ 1: \x{39a}\x{3ba}
+ 2: \x{39a}
+
+/\x{03ba}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+ 1: \x{39a}\x{3ba}
+ 2: \x{39a}
+
+/\x{03f0}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+ 1: \x{39a}\x{3ba}
+ 2: \x{39a}
+
+
+/\x{03a0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+ 1: \x{3a0}\x{3c0}
+ 2: \x{3a0}
+
+/\x{03c0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+ 1: \x{3a0}\x{3c0}
+ 2: \x{3a0}
+
+/\x{03d6}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+ 1: \x{3a0}\x{3c0}
+ 2: \x{3a0}
+
+
+/\x{03a1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+ 1: \x{3a1}\x{3c1}
+ 2: \x{3a1}
+
+/\x{03c1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+ 1: \x{3a1}\x{3c1}
+ 2: \x{3a1}
+
+/\x{03f1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+ 1: \x{3a1}\x{3c1}
+ 2: \x{3a1}
+
+
+/\x{03a3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+ 1: \x{3a3}\x{3c2}
+ 2: \x{3a3}
+
+/\x{03c2}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+ 1: \x{3a3}\x{3c2}
+ 2: \x{3a3}
+
+/\x{03c3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+ 1: \x{3a3}\x{3c2}
+ 2: \x{3a3}
+
+
+/\x{03a6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+ 1: \x{3a6}\x{3c6}
+ 2: \x{3a6}
+
+/\x{03c6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+ 1: \x{3a6}\x{3c6}
+ 2: \x{3a6}
+
+/\x{03d5}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+ 1: \x{3a6}\x{3c6}
+ 2: \x{3a6}
+
+
+/\x{03c9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+ 1: \x{3c9}\x{3a9}
+ 2: \x{3c9}
+
+/\x{03a9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+ 1: \x{3c9}\x{3a9}
+ 2: \x{3c9}
+
+/\x{2126}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+ 1: \x{3c9}\x{3a9}
+ 2: \x{3c9}
+
+
+/\x{1e60}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+ 1: \x{1e60}\x{1e61}
+ 2: \x{1e60}
+
+/\x{1e61}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+ 1: \x{1e60}\x{1e61}
+ 2: \x{1e60}
+
+/\x{1e9b}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+ 1: \x{1e60}\x{1e61}
+ 2: \x{1e60}
+
+
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+ 1: \x{1e9e}
+
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+ 1: \x{1f88}
+
+/\x{1f80}+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+ 1: \x{1f88}
+
+/\x{004b}+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+ 1: Kk
+ 2: K
+
+/\x{006b}+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+ 1: Kk
+ 2: K
+
+/\x{212a}+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+ 1: Kk
+ 2: K
+
+
+/\x{0053}+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+ 1: Ss
+ 2: S
+
+/\x{0073}+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+ 1: Ss
+ 2: S
+
+/\x{017f}+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+ 1: Ss
+ 2: S
+
/-- End of testinput10 --/
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/testdata/testoutput6 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1816,4 +1816,322 @@
/-- --/
+/(ΣΆΜΟΣ) \1/8i
+ ΣΆΜΟΣ ΣΆΜΟΣ
+ 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ ΣΆΜΟΣ σάμος
+ 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ σάμος σάμος
+ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ σάμος σάμοσ
+ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c3}
+ 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ σάμος ΣΆΜΟΣ
+ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+
+/(σάμος) \1/8i
+ ΣΆΜΟΣ ΣΆΜΟΣ
+ 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ ΣΆΜΟΣ σάμος
+ 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ σάμος σάμος
+ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ σάμος σάμοσ
+ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c3}
+ 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ σάμος ΣΆΜΟΣ
+ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+
+/(ΣΆΜΟΣ) \1*/8i
+ ΣΆΜΟΣ\x20
+ 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+ ΣΆΜΟΣ ΣΆΜΟΣσάμοςσάμος
+ 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}\x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}\x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}
+ 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}
+
+/-- Perl matches these --/
+
+/\x{00b5}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/\x{039c}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+/\x{03bc}+/8i
+ \x{00b5}\x{039c}\x{03bc}
+ 0: \x{b5}\x{39c}\x{3bc}
+
+
+/\x{00c5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/\x{00e5}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+/\x{212b}+/8i
+ \x{00c5}\x{00e5}\x{212b}
+ 0: \x{c5}\x{e5}\x{212b}
+
+
+/\x{01c4}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/\x{01c5}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+/\x{01c6}+/8i
+ \x{01c4}\x{01c5}\x{01c6}
+ 0: \x{1c4}\x{1c5}\x{1c6}
+
+
+/\x{01c7}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/\x{01c8}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+/\x{01c9}+/8i
+ \x{01c7}\x{01c8}\x{01c9}
+ 0: \x{1c7}\x{1c8}\x{1c9}
+
+
+/\x{01ca}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/\x{01cb}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+/\x{01cc}+/8i
+ \x{01ca}\x{01cb}\x{01cc}
+ 0: \x{1ca}\x{1cb}\x{1cc}
+
+
+/\x{01f1}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/\x{01f2}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+/\x{01f3}+/8i
+ \x{01f1}\x{01f2}\x{01f3}
+ 0: \x{1f1}\x{1f2}\x{1f3}
+
+
+/\x{0345}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/\x{0399}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/\x{03b9}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+/\x{1fbe}+/8i
+ \x{0345}\x{0399}\x{03b9}\x{1fbe}
+ 0: \x{345}\x{399}\x{3b9}\x{1fbe}
+
+
+/\x{0392}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/\x{03b2}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+/\x{03d0}+/8i
+ \x{0392}\x{03b2}\x{03d0}
+ 0: \x{392}\x{3b2}\x{3d0}
+
+
+/\x{0395}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/\x{03b5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+/\x{03f5}+/8i
+ \x{0395}\x{03b5}\x{03f5}
+ 0: \x{395}\x{3b5}\x{3f5}
+
+
+/\x{0398}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/\x{03b8}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/\x{03d1}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+/\x{03f4}+/8i
+ \x{0398}\x{03b8}\x{03d1}\x{03f4}
+ 0: \x{398}\x{3b8}\x{3d1}\x{3f4}
+
+
+/\x{039a}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/\x{03ba}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+/\x{03f0}+/8i
+ \x{039a}\x{03ba}\x{03f0}
+ 0: \x{39a}\x{3ba}\x{3f0}
+
+
+/\x{03a0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/\x{03c0}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+/\x{03d6}+/8i
+ \x{03a0}\x{03c0}\x{03d6}
+ 0: \x{3a0}\x{3c0}\x{3d6}
+
+
+/\x{03a1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/\x{03c1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+/\x{03f1}+/8i
+ \x{03a1}\x{03c1}\x{03f1}
+ 0: \x{3a1}\x{3c1}\x{3f1}
+
+
+/\x{03a3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/\x{03c2}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+/\x{03c3}+/8i
+ \x{03A3}\x{03C2}\x{03C3}
+ 0: \x{3a3}\x{3c2}\x{3c3}
+
+
+/\x{03a6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/\x{03c6}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+/\x{03d5}+/8i
+ \x{03a6}\x{03c6}\x{03d5}
+ 0: \x{3a6}\x{3c6}\x{3d5}
+
+
+/\x{03c9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/\x{03a9}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+/\x{2126}+/8i
+ \x{03c9}\x{03a9}\x{2126}
+ 0: \x{3c9}\x{3a9}\x{2126}
+
+
+/\x{1e60}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/\x{1e61}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+/\x{1e9b}+/8i
+ \x{1e60}\x{1e61}\x{1e9b}
+ 0: \x{1e60}\x{1e61}\x{1e9b}
+
+
+/\x{1e9e}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+/\x{00df}+/8i
+ \x{1e9e}\x{00df}
+ 0: \x{1e9e}\x{df}
+
+
+/\x{1f88}+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+
+/\x{1f80}+/8i
+ \x{1f88}\x{1f80}
+ 0: \x{1f88}\x{1f80}
+
+
+/-- Perl 5.12.4 gets these wrong, but 5.15.3 is OK --/
+
+/\x{004b}+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/\x{006b}+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+/\x{212a}+/8i
+ \x{004b}\x{006b}\x{212a}
+ 0: Kk\x{212a}
+
+
+/\x{0053}+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/\x{0073}+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
+/\x{017f}+/8i
+ \x{0053}\x{0073}\x{017f}
+ 0: Ss\x{17f}
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testoutput7
===================================================================
--- code/trunk/testdata/testoutput7 2012-09-23 16:50:00 UTC (rev 1045)
+++ code/trunk/testdata/testoutput7 2012-09-25 16:27:58 UTC (rev 1046)
@@ -1301,4 +1301,116 @@
AA\P\P
Partial match: AA
+/A\x{3a3}B/8iDZ
+------------------------------------------------------------------
+ Bra
+ /i A
+ clist 03a3 03c2 03c3
+ /i B
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: caseless utf
+First char = 'A' (caseless)
+Need char = 'B' (caseless)
+
+/\x{3a3}B/8iDZ
+------------------------------------------------------------------
+ Bra
+ clist 03a3 03c2 03c3
+ /i B
+ Ket
+ End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+Need char = 'B' (caseless)
+
+/[\x{3a3}]/8iBZ
+------------------------------------------------------------------
+ Bra
+ clist 03a3 03c2 03c3
+ Ket
+ End
+------------------------------------------------------------------
+
+/[^\x{3a3}]/8iBZ
+------------------------------------------------------------------
+ Bra
+ not clist 03a3 03c2 03c3
+ Ket
+ End
+------------------------------------------------------------------
+
+/[\x{3a3}]+/8iBZ
+------------------------------------------------------------------
+ Bra
+ clist 03a3 03c2 03c3 +
+ Ket
+ End
+------------------------------------------------------------------
+
+/[^\x{3a3}]+/8iBZ
+------------------------------------------------------------------
+ Bra
+ not clist 03a3 03c2 03c3 +
+ Ket
+ End
+------------------------------------------------------------------
+
+/a*\x{3a3}/8iBZ
+------------------------------------------------------------------
+ Bra
+ /i a*+
+ clist 03a3 03c2 03c3
+ Ket
+ End
+------------------------------------------------------------------
+
+/\x{3a3}+a/8iBZ
+------------------------------------------------------------------
+ Bra
+ clist 03a3 03c2 03c3 ++
+ /i a
+ Ket
+ End
+------------------------------------------------------------------
+
+/\x{3a3}*\x{3c2}/8iBZ
+------------------------------------------------------------------
+ Bra
+ clist 03a3 03c2 03c3 *
+ clist 03a3 03c2 03c3
+ Ket
+ End
+------------------------------------------------------------------
+
+/\x{3a3}{3}/8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+ 0: \x{3a3}\x{3c3}\x{3c2}
+ 0+ \x{3a3}\x{3c3}\x{3c2}
+
+/\x{3a3}{2,4}/8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+ 0: \x{3a3}\x{3c3}\x{3c2}\x{3a3}
+ 0+ \x{3c3}\x{3c2}
+
+/\x{3a3}{2,4}?/8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+ 0: \x{3a3}\x{3c3}
+ 0+ \x{3c2}\x{3a3}\x{3c3}\x{3c2}
+
+/\x{3a3}+./8i+
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+ 0: \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+ 0+
+
+/\x{3a3}++./8i+
+ ** Failers
+No match
+ \x{3a3}\x{3c3}\x{3c2}\x{3a3}\x{3c3}\x{3c2}
+No match
+
/-- End of testinput7 --/