Revision: 1224
http://www.exim.org/viewvc/pcre2?view=rev&revision=1224
Author: ph10
Date: 2020-02-23 16:40:05 +0000 (Sun, 23 Feb 2020)
Log Message:
-----------
Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.
This is not yet documented, and it not yet implemented in JIT.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/maint/ManyConfigTests
code/trunk/src/pcre2_auto_possess.c
code/trunk/src/pcre2_compile.c
code/trunk/src/pcre2_dfa_match.c
code/trunk/src/pcre2_internal.h
code/trunk/src/pcre2_match.c
code/trunk/src/pcre2_study.c
code/trunk/src/pcre2_substitute.c
code/trunk/testdata/testinput10
code/trunk/testdata/testinput12
code/trunk/testdata/testinput14
code/trunk/testdata/testoutput10
code/trunk/testdata/testoutput12-16
code/trunk/testdata/testoutput12-32
code/trunk/testdata/testoutput14-16
code/trunk/testdata/testoutput14-32
code/trunk/testdata/testoutput14-8
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/ChangeLog 2020-02-23 16:40:05 UTC (rev 1224)
@@ -66,7 +66,12 @@
17. Fix a crash which occurs when the character type of an invalid UTF
character is decoded in JIT.
+18. Changes in many areas of the code so that when Unicode is supported and
+PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
+upper/lower case computations on characters whose code points are greater than
+127. Documentation is not yet updated. JIT is not yet updated.
+
Version 10.34 21-November-2019
------------------------------
Modified: code/trunk/maint/ManyConfigTests
===================================================================
--- code/trunk/maint/ManyConfigTests 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/maint/ManyConfigTests 2020-02-23 16:40:05 UTC (rev 1224)
@@ -28,8 +28,6 @@
# The -v option causes a call to 'pcre2test -C' to happen for each
# configuration.
-# Currently -fsanitize=undefined is not working (locks machine).
-
useasan=1
useusan=1
usedebug=1
Modified: code/trunk/src/pcre2_auto_possess.c
===================================================================
--- code/trunk/src/pcre2_auto_possess.c 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_auto_possess.c 2020-02-23 16:40:05 UTC (rev 1224)
@@ -7,7 +7,7 @@
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2019 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -292,6 +292,7 @@
Arguments:
code points to start of expression
utf TRUE if in UTF mode
+ ucp TRUE if in UCP mode
fcc points to the case-flipping table
list points to output list
list[0] will be filled with the opcode
@@ -304,7 +305,7 @@
*/
static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list)
{
PCRE2_UCHAR c = *code;
@@ -316,7 +317,8 @@
uint32_t *clist_dest;
const uint32_t *clist_src;
#else
-(void)utf; /* Suppress "unused parameter" compiler warning */
+(void)utf; /* Suppress "unused parameter" compiler warnings */
+(void)ucp;
#endif
list[0] = c;
@@ -396,7 +398,7 @@
list[2] = chr;
#ifdef SUPPORT_UNICODE
- if (chr < 128 || (chr < 256 && !utf))
+ if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr];
else
list[3] = UCD_OTHERCASE(chr);
@@ -503,6 +505,7 @@
Arguments:
code points to the byte code
utf TRUE in UTF mode
+ ucp TRUE in UCP mode
cb compile data block
base_list the data list of the base opcode
base_end the end of the base opcode
@@ -512,7 +515,7 @@
*/
static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{
PCRE2_UCHAR c;
@@ -651,7 +654,7 @@
while (*next_code == OP_ALT)
{
- if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+ if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE;
code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1);
@@ -672,7 +675,8 @@
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE;
- if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+ if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
+ rec_limit))
return FALSE;
code += PRIV(OP_lengths)[c];
@@ -688,7 +692,7 @@
/* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */
- code = get_chr_property_list(code, utf, cb->fcc, list);
+ code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing
@@ -1100,7 +1104,6 @@
Arguments:
code points to start of the byte code
- utf TRUE in UTF mode
cb compile data block
Returns: 0 for success
@@ -1108,7 +1111,7 @@
*/
int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SPTR end;
@@ -1115,6 +1118,8 @@
PCRE2_UCHAR *repeat_opcode;
uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
+BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
+BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;)
{
@@ -1126,10 +1131,11 @@
{
c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ?
- get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+ get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
- if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+ if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
+ &rec_limit))
{
switch(c)
{
@@ -1181,11 +1187,11 @@
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{
/* end must not be NULL. */
- end = get_chr_property_list(code, utf, cb->fcc, list);
+ end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0;
- if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+ if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{
switch (c)
{
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_compile.c 2020-02-23 16:40:05 UTC (rev 1224)
@@ -4904,7 +4904,7 @@
if ((options & PCRE2_CASELESS) != 0)
{
#ifdef SUPPORT_UNICODE
- if ((options & PCRE2_UTF) != 0)
+ if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{
int rc;
uint32_t oc, od;
@@ -5319,7 +5319,8 @@
#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
-#else /* No UTF support */
+BOOL ucp = (options & PCRE2_UCP) != 0;
+#else /* No Unicode support */
BOOL utf = FALSE;
#endif
@@ -5602,7 +5603,7 @@
uint32_t d;
#ifdef SUPPORT_UNICODE
- if (utf && c > 127) d = UCD_OTHERCASE(c); else
+ if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
#endif
{
#if PCRE2_CODE_UNIT_WIDTH != 8
@@ -9632,6 +9633,7 @@
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{
BOOL utf; /* Set TRUE for UTF mode */
+BOOL ucp; /* Set TRUE for UCP mode */
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
pcre2_real_code *re = NULL; /* What we will return */
@@ -9919,8 +9921,8 @@
/* Check UCP lockout. */
-if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
- (PCRE2_UCP|PCRE2_NEVER_UCP))
+ucp = (cb.external_options & PCRE2_UCP) != 0;
+if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
{
errorcode = ERR75;
goto HAD_EARLY_ERROR;
@@ -10296,7 +10298,7 @@
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
- if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+ if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
}
/* Failed to compile, or error while post-processing. */
@@ -10344,21 +10346,25 @@
if ((firstcuflags & REQ_CASELESS) != 0)
{
- if (firstcu < 128 || (!utf && firstcu < 255))
+ if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
{
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
}
- /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
- 8-bit UTF mode, codepoints in the range 128-255 are introductory code
- points and cannot have another case. In 16-bit and 32-bit modes, we can
- check wide characters when UTF (and therefore UCP) is supported. */
+ /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
+ In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
+ points and cannot have another case, but if UCP is set they may do. */
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- else if (firstcu <= MAX_UTF_CODE_POINT &&
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
+ re->flags |= PCRE2_FIRSTCASELESS;
+#else
+ else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#endif
+#endif /* SUPPORT_UNICODE */
}
}
@@ -10407,14 +10413,20 @@
if ((reqcuflags & REQ_CASELESS) != 0)
{
- if (reqcu < 128 || (!utf && reqcu < 255))
+ if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
- re->flags |= PCRE2_LASTCASELESS;
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
+ re->flags |= PCRE2_LASTCASELESS;
+#else
+ else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
+ UCD_OTHERCASE(reqcu) != reqcu)
+ re->flags |= PCRE2_LASTCASELESS;
#endif
+#endif /* SUPPORT_UNICODE */
}
}
}
Modified: code/trunk/src/pcre2_dfa_match.c
===================================================================
--- code/trunk/src/pcre2_dfa_match.c 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_dfa_match.c 2020-02-23 16:40:05 UTC (rev 1224)
@@ -7,7 +7,7 @@
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2019 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -548,6 +548,7 @@
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
#endif
@@ -2190,7 +2191,7 @@
if (clen == 0) break;
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf_or_ucp)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
@@ -2204,7 +2205,7 @@
}
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF mode */
+ /* Not UTF or UCP mode */
{
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); }
@@ -2339,7 +2340,7 @@
{
uint32_t otherd;
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2374,7 +2375,7 @@
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2417,7 +2418,7 @@
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2458,7 +2459,7 @@
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2491,7 +2492,7 @@
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2531,7 +2532,7 @@
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -3526,10 +3527,15 @@
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && first_cu > 127)
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
-#endif
+#else
+ if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+ first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+#endif
+#endif /* SUPPORT_UNICODE */
}
}
else
@@ -3545,9 +3551,15 @@
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
+ req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#else
+ if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+ req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
Modified: code/trunk/src/pcre2_internal.h
===================================================================
--- code/trunk/src/pcre2_internal.h 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_internal.h 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1952,7 +1952,7 @@
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
-extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
const compile_block *);
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
int *, uint32_t, uint32_t, BOOL, compile_block *);
Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_match.c 2020-02-23 16:40:05 UTC (rev 1224)
@@ -7,7 +7,7 @@
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2015-2019 University of Cambridge
+ New API code Copyright (c) 2015-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -598,12 +598,13 @@
BOOL cur_is_word; /* Used in "word" tests */
BOOL prev_is_word; /* Used in "word" tests */
-/* UTF flag */
+/* UTF and UCP flags */
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else
-BOOL utf = FALSE;
+BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif
/* This is the length of the last part of a backtracking frame that must be
@@ -928,6 +929,7 @@
}
else
#endif
+
/* Not UTF mode */
{
if (mb->end_subject - Feptr < 1)
@@ -987,10 +989,30 @@
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
}
+
+ /* If UCP is set without UTF we must do the same as above, but with one
+ character per code unit. */
+
+ else if (ucp)
+ {
+ uint32_t cc = UCHAR21(Feptr);
+ fc = Fecode[1];
+ if (fc < 128)
+ {
+ if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
+ }
+ Feptr++;
+ Fecode += 2;
+ }
+
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF mode; use the table for characters < 256. */
+ /* Not UTF or UCP mode; use the table for characters < 256. */
{
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@@ -1010,6 +1032,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
+
#ifdef SUPPORT_UNICODE
if (utf)
{
@@ -1026,15 +1049,42 @@
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
- ch = TABLE_GET(ch, mb->fcc, ch);
+ ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
+
+ /* UCP without UTF is as above, but with one character per code unit. */
+
+ else if (ucp)
+ {
+ uint32_t ch;
+ fc = UCHAR21INC(Feptr);
+ ch = Fecode[1];
+ Fecode += 2;
+
+ if (ch == fc)
+ {
+ RRETURN(MATCH_NOMATCH); /* Caseful match */
+ }
+ else if (Fop == OP_NOTI) /* If caseless */
+ {
+ if (ch > 127)
+ ch = UCD_OTHERCASE(ch);
+ else
+ ch = (mb->fcc)[ch];
+ if (ch == fc) RRETURN(MATCH_NOMATCH);
+ }
+ }
+
else
#endif /* SUPPORT_UNICODE */
+
+ /* Neither UTF nor UCP is set */
+
{
uint32_t ch = Fecode[1];
- fc = *Feptr++;
+ fc = UCHAR21INC(Feptr);
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
RRETURN(MATCH_NOMATCH);
Fecode += 2;
@@ -1244,7 +1294,7 @@
#endif /* SUPPORT_UNICODE */
/* When not in UTF mode, load a single-code-unit character. Then proceed as
- above. */
+ above, using Unicode casing if either UTF or UCP is set. */
Lc = *Fecode++;
@@ -1253,11 +1303,15 @@
if (Fop >= OP_STARI)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
- /* Lc must be < 128 in UTF-8 mode. */
+#ifdef SUPPORT_UNICODE
+ if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+ else
+#endif /* SUPPORT_UNICODE */
+ /* Lc will be < 128 in UTF-8 mode. */
Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE
- if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+ if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
Loc = TABLE_GET(Lc, mb->fcc, Lc);
@@ -1490,7 +1544,7 @@
if (Fop >= OP_NOTSTARI) /* Caseless */
{
#ifdef SUPPORT_UNICODE
- if (utf && Lc > 127)
+ if ((utf || ucp) && Lc > 127)
Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
@@ -6045,7 +6099,6 @@
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
BOOL startline;
-BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
@@ -6069,13 +6122,19 @@
BOOL use_jit;
#endif
+/* This flag is needed even when Unicode is not supported for convenience
+(it is used by the IS_NEWLINE macro). */
+
+BOOL utf = FALSE;
+
#ifdef SUPPORT_UNICODE
+BOOL ucp = FALSE;
BOOL allow_invalid;
uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE;
#endif
-#endif
+#endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size;
@@ -6147,12 +6206,13 @@
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
#endif
-/* Initialize UTF parameters. */
+/* Initialize UTF/UCP parameters. */
+#ifdef SUPPORT_UNICODE
utf = (re->overall_options & PCRE2_UTF) != 0;
-#ifdef SUPPORT_UNICODE
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
-#endif
+ucp = (re->overall_options & PCRE2_UCP) != 0;
+#endif /* SUPPORT_UNICODE */
/* Convert the partial matching flags into an integer. */
@@ -6589,9 +6649,13 @@
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
+#else
+ if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
else
@@ -6607,9 +6671,13 @@
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
+#else
+ if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
@@ -6756,15 +6824,16 @@
#endif
}
- /* If we can't find the required code unit, having reached the true end
- of the subject, break the bumpalong loop, to force a match failure,
- except when doing partial matching, when we let the next cycle run at
- the end of the subject. To see why, consider the pattern /(?<=abc)def/,
- which partially matches "abc", even though the string does not contain
- the starting character "d". If we have not reached the true end of the
- subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
- we also let the cycle run, because the matching string is legitimately
- allowed to start with the first code unit of a newline. */
+ /* If we can't find the required first code unit, having reached the
+ true end of the subject, break the bumpalong loop, to force a match
+ failure, except when doing partial matching, when we let the next cycle
+ run at the end of the subject. To see why, consider the pattern
+ /(?<=abc)def/, which partially matches "abc", even though the string
+ does not contain the starting character "d". If we have not reached the
+ true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
+ temporarily modified) we also let the cycle run, because the matching
+ string is legitimately allowed to start with the first code unit of a
+ newline. */
if (mb->partial == 0 && start_match >= mb->end_subject)
{
Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_study.c 2020-02-23 16:40:05 UTC (rev 1224)
@@ -772,16 +772,20 @@
p points to the first code unit of the character
caseless TRUE if caseless
utf TRUE for UTF mode
+ ucp TRUE for UCP mode
Returns: pointer after the character
*/
static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
+ BOOL ucp)
{
uint32_t c = *p++; /* First code unit */
-(void)utf; /* Stop compiler warning when UTF not supported */
+(void)utf; /* Stop compiler warnings when UTF not supported */
+(void)ucp;
+
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */
@@ -810,22 +814,26 @@
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
+ c = UCD_OTHERCASE(c);
#if PCRE2_CODE_UNIT_WIDTH == 8
- PCRE2_UCHAR buff[6];
- c = UCD_OTHERCASE(c);
- (void)PRIV(ord2utf)(c, buff);
- SET_BIT(buff[0]);
+ if (utf)
+ {
+ PCRE2_UCHAR buff[6];
+ (void)PRIV(ord2utf)(c, buff);
+ SET_BIT(buff[0]);
+ }
+ else SET_BIT(c);
#else /* 16-bit or 32-bit mode */
- c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
+
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF */
+ /* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
}
@@ -931,6 +939,7 @@
re points to the compiled regex block
code points to an expression
utf TRUE if in UTF mode
+ ucp TRUE if in UCP mode
depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units
@@ -941,7 +950,8 @@
*/
static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
+ int *depthptr)
{
uint32_t c;
int yield = SSB_DONE;
@@ -1111,7 +1121,7 @@
case OP_SCRIPT_RUN:
case OP_ASSERT:
case OP_ASSERT_NA:
- rc = set_start_bits(re, tcode, utf, depthptr);
+ rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_DONE)
{
try_next = FALSE;
@@ -1167,7 +1177,7 @@
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
- rc = set_start_bits(re, ++tcode, utf, depthptr);
+ rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
@@ -1189,7 +1199,7 @@
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+ tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break;
case OP_STARI:
@@ -1198,7 +1208,7 @@
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
- tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+ tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break;
/* Single-char upto sets the bit and tries the next */
@@ -1206,13 +1216,13 @@
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+ tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
- tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+ tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break;
/* At least one single char sets the bit and stops */
@@ -1224,7 +1234,7 @@
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- (void)set_table_bit(re, tcode + 1, FALSE, utf);
+ (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE;
break;
@@ -1235,7 +1245,7 @@
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
- (void)set_table_bit(re, tcode + 1, TRUE, utf);
+ (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE;
break;
@@ -1664,6 +1674,7 @@
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */
@@ -1677,7 +1688,7 @@
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int depth = 0;
- int rc = set_start_bits(re, code, utf, &depth);
+ int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1;
/* If a list of starting code units was set up, scan the list to see if only
@@ -1695,7 +1706,7 @@
int b = -1;
uint8_t *p = re->start_bitmap;
uint32_t flags = PCRE2_FIRSTMAPSET;
-
+
for (i = 0; i < 256; p++, i += 8)
{
uint8_t x = *p;
@@ -1725,27 +1736,27 @@
}
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
- mode, only values < 128 can be used. */
+ mode, only values < 128 can be used. In all the other cases, c is a
+ character value. */
#if PCRE2_CODE_UNIT_WIDTH == 8
- if (c > 127) goto DONE;
+ if (utf && c > 127) goto DONE;
#endif
- if (a < 0) a = c; /* First one found */
+ if (a < 0) a = c; /* First one found, save in a */
else if (b < 0) /* Second one found */
{
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
-
+
#ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
- if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
-#else /* 16-bit or 32-bit */
- if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
- if (utf && c > 127) d = UCD_OTHERCASE(c);
-#endif /* Code width */
+ if (utf || ucp)
+ {
+ if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
+ if (c > 127) d = UCD_OTHERCASE(c);
+ }
#endif /* SUPPORT_UNICODE */
- if (d != a) goto DONE; /* Not other case of a */
- b = c;
+ if (d != a) goto DONE; /* Not the other case of a */
+ b = c; /* Save second in b */
}
else goto DONE; /* More than two characters found */
}
Modified: code/trunk/src/pcre2_substitute.c
===================================================================
--- code/trunk/src/pcre2_substitute.c 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_substitute.c 2020-02-23 16:40:05 UTC (rev 1224)
@@ -236,6 +236,7 @@
BOOL replacement_only;
#ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif
PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr;
@@ -758,7 +759,7 @@
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
@@ -860,7 +861,7 @@
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testinput10 2020-02-23 16:40:05 UTC (rev 1224)
@@ -570,8 +570,10 @@
/[\xff\x{ffff}]/I,utf
/[\xff\x{ff}]/I,utf
+ abc\x{ff}def
/[\xff\x{ff}]/I
+ abc\x{ff}def
/[Ss]/I
@@ -585,4 +587,31 @@
abc\x80\=startchar
abc\x80\=startchar,offset=3
+#subject no_jit
+
+/\x{c1}+\x{e1}/iIB,ucp
+ \x{c1}\x{c1}\x{c1}
+ \x{e1}\x{e1}\x{e1}
+
+/a|\x{c1}/iI,ucp
+ \x{e1}xxx
+
+/a|\x{c1}/iI,utf
+ \x{e1}xxx
+
+/\x{c1}|\x{e1}/iI,ucp
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+
+/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
+ X\x{c1}Y
+
+# Without UTF or UCP characters > 127 have only one case in the default locale.
+
+/X(\x{e1})Y/replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+
+#subject
+
# End of testinput10
Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testinput12 2020-02-23 16:40:05 UTC (rev 1224)
@@ -463,4 +463,71 @@
/(?:\x{ff}|\x{3000})/I,utf
+# ----------------------------------------------------
+# UCP and casing tests
+
+/\x{120}/i,I
+
+/\x{c1}/i,I,ucp
+
+/[\x{120}\x{121}]/iB,ucp
+
+/[ab\x{120}]+/iB,ucp
+ aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+ \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+ \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+ \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+ \x{121}
+
+/[^\x{120}]/i
+ \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+ \x{121}
+
+/\x{120}{2}/i,ucp
+ \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+ \x{121}\x{121}
+
+/\x{c1}+\x{e1}/iB,ucp
+ \x{c1}\x{c1}\x{c1}
+
+/\x{c1}+\x{e1}/iIB,ucp
+ \x{c1}\x{c1}\x{c1}
+ \x{e1}\x{e1}\x{e1}
+
+/a|\x{c1}/iI,ucp
+ \x{e1}xxx
+
+/\x{c1}|\x{e1}/iI,ucp
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{121}Y
+
+#subject
+
+# ----------------------------------------------------
+
# End of testinput12
Modified: code/trunk/testdata/testinput14
===================================================================
--- code/trunk/testdata/testinput14 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testinput14 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.
#subject dfa
+# ----------------------------------------------------
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
/X/utf
XX\x{d800}
XX\x{d800}\=offset=3
@@ -33,5 +36,46 @@
XX\xef\x80\=ph
\xf7\=ph
\xf7\x80\=ph
+
+# ----------------------------------------------------
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+
+/\x{c1}+\x{e1}/iB,ucp
+ \x{c1}\x{c1}\x{c1}
+ \x{e1}\x{e1}\x{e1}
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+ \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+ \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+ \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+ \x{121}
+
+/[^\x{120}]/i
+ \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+ \x{121}
+
+/\x{120}{2}/i,ucp
+ \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+ \x{121}\x{121}
+
+# ----------------------------------------------------
+
# End of testinput14
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput10 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1780,11 +1780,15 @@
Options: utf
Starting code units: \xc3
Subject length lower bound = 1
+ abc\x{ff}def
+ 0: \x{ff}
/[\xff\x{ff}]/I
Capture group count = 0
-Starting code units: \xff
+First code unit = \xff
Subject length lower bound = 1
+ abc\x{ff}def
+ 0: \xff
/[Ss]/I
Capture group count = 0
@@ -1813,4 +1817,62 @@
abc\x80\=startchar,offset=3
Error -36 (bad UTF-8 offset)
+#subject no_jit
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ \x{e1}\x{e1}\x{e1}
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1
+Subject length lower bound = 1
+ \x{e1}xxx
+ 0: \xe1
+
+/a|\x{c1}/iI,utf
+Capture group count = 0
+Options: caseless utf
+Starting code units: A a \xc3
+Subject length lower bound = 1
+ \x{e1}xxx
+ 0: \x{e1}
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
+ X\x{c1}Y
+ 1: >\xe1<
+
+# Without UTF or UCP characters > 127 have only one case in the default locale.
+
+/X(\x{e1})Y/replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+ 1: >\xe1<
+
+#subject
+
# End of testinput10
Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput12-16 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1613,7 +1613,7 @@
/[Ss]/I
Capture group count = 0
-Starting code units: S s
+First code unit = 'S' (caseless)
Subject length lower bound = 1
/[Ss]/I,utf
@@ -1628,4 +1628,134 @@
Starting code units: \xff
Subject length lower bound = 1
+# ----------------------------------------------------
+# UCP and casing tests
+
+/\x{120}/i,I
+Capture group count = 0
+Options: caseless
+First code unit = \x{120}
+Subject length lower bound = 1
+
+/\x{c1}/i,I,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/[\x{120}\x{121}]/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{120}
+ Ket
+ End
+------------------------------------------------------------------
+
+/[ab\x{120}]+/iB,ucp
+------------------------------------------------------------------
+ Bra
+ [ABab\x{120}-\x{121}]++
+ Ket
+ End
+------------------------------------------------------------------
+ aABb\x{121}\x{120}
+ 0: aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+No match
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+ \x{121}
+No match
+
+/[^\x{120}]/i
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+ \x{121}
+No match
+
+/\x{120}{2}/i,ucp
+ \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+ \x{121}\x{121}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ \x{e1}\x{e1}\x{e1}
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1
+Subject length lower bound = 1
+ \x{e1}xxx
+ 0: \xe1
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{121}Y
+ 1: >\x{120}<
+
+#subject
+
+# ----------------------------------------------------
+
# End of testinput12
Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput12-32 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1611,7 +1611,7 @@
/[Ss]/I
Capture group count = 0
-Starting code units: S s
+First code unit = 'S' (caseless)
Subject length lower bound = 1
/[Ss]/I,utf
@@ -1626,4 +1626,134 @@
Starting code units: \xff
Subject length lower bound = 1
+# ----------------------------------------------------
+# UCP and casing tests
+
+/\x{120}/i,I
+Capture group count = 0
+Options: caseless
+First code unit = \x{120}
+Subject length lower bound = 1
+
+/\x{c1}/i,I,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/[\x{120}\x{121}]/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{120}
+ Ket
+ End
+------------------------------------------------------------------
+
+/[ab\x{120}]+/iB,ucp
+------------------------------------------------------------------
+ Bra
+ [ABab\x{120}-\x{121}]++
+ Ket
+ End
+------------------------------------------------------------------
+ aABb\x{121}\x{120}
+ 0: aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+No match
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+ \x{121}
+No match
+
+/[^\x{120}]/i
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+ \x{121}
+No match
+
+/\x{120}{2}/i,ucp
+ \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+ \x{121}\x{121}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ \x{e1}\x{e1}\x{e1}
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1
+Subject length lower bound = 1
+ \x{e1}xxx
+ 0: \xe1
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+ X\x{121}Y
+ 1: >\x{120}<
+
+#subject
+
+# ----------------------------------------------------
+
# End of testinput12
Modified: code/trunk/testdata/testoutput14-16
===================================================================
--- code/trunk/testdata/testoutput14-16 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput14-16 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.
#subject dfa
+# ----------------------------------------------------
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
/X/utf
XX\x{d800}
Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
@@ -57,5 +60,66 @@
No match
\xf7\x80\=ph
No match
+
+# ----------------------------------------------------
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+ \x{e1}\x{e1}\x{e1}
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+ \x{121}
+No match
+
+/[^\x{120}]/i
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+ \x{121}
+No match
+
+/\x{120}{2}/i,ucp
+ \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+ \x{121}\x{121}
+No match
+
+# ----------------------------------------------------
+
# End of testinput14
Modified: code/trunk/testdata/testoutput14-32
===================================================================
--- code/trunk/testdata/testoutput14-32 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput14-32 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.
#subject dfa
+# ----------------------------------------------------
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
/X/utf
XX\x{d800}
Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
@@ -57,5 +60,66 @@
No match
\xf7\x80\=ph
No match
+
+# ----------------------------------------------------
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+ \x{e1}\x{e1}\x{e1}
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+ \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+ \x{121}
+No match
+
+/[^\x{120}]/i
+ \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+ \x{121}
+No match
+
+/\x{120}{2}/i,ucp
+ \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+ \x{121}\x{121}
+No match
+
+# ----------------------------------------------------
+
# End of testinput14
Modified: code/trunk/testdata/testoutput14-8
===================================================================
--- code/trunk/testdata/testoutput14-8 2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput14-8 2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.
#subject dfa
+# ----------------------------------------------------
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
/X/utf
XX\x{d800}
Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
@@ -57,5 +60,66 @@
Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
\xf7\x80\=ph
Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
+
+# ----------------------------------------------------
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+ \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+ Bra
+ /i \x{c1}+
+ /i \x{e1}
+ Ket
+ End
+------------------------------------------------------------------
+ \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+ \x{e1}\x{e1}\x{e1}
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+ \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+ \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+ \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+ \x{121}
+
+/[^\x{120}]/i
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+ \x{121}
+
+/[^\x{120}]/i,ucp
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+ \x{121}
+
+/\x{120}{2}/i,ucp
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+ \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+ \x{121}\x{121}
+
+# ----------------------------------------------------
+
# End of testinput14