[Pcre-svn] [1224] code/trunk: Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

Author: Subversion repository
Date:
To: pcre-svn
Subject: [Pcre-svn] [1224] code/trunk: Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.

Revision: 1224

          http://www.exim.org/viewvc/pcre2?view=rev&revision=1224
Author:   ph10
Date:     2020-02-23 16:40:05 +0000 (Sun, 23 Feb 2020)
Log Message:
-----------
Unicode upper/lower casing is now used when UCP is set, even if UTF is not set. 
This is not yet documented, and is not yet implemented in JIT.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/maint/ManyConfigTests
    code/trunk/src/pcre2_auto_possess.c
    code/trunk/src/pcre2_compile.c
    code/trunk/src/pcre2_dfa_match.c
    code/trunk/src/pcre2_internal.h
    code/trunk/src/pcre2_match.c
    code/trunk/src/pcre2_study.c
    code/trunk/src/pcre2_substitute.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testinput12
    code/trunk/testdata/testinput14
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput12-16
    code/trunk/testdata/testoutput12-32
    code/trunk/testdata/testoutput14-16
    code/trunk/testdata/testoutput14-32
    code/trunk/testdata/testoutput14-8

Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/ChangeLog    2020-02-23 16:40:05 UTC (rev 1224)
@@ -66,7 +66,12 @@
 17. Fix a crash which occurs when the character type of an invalid UTF
 character is decoded in JIT.

+18. Changes in many areas of the code so that when Unicode is supported and
+PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
+upper/lower case computations on characters whose code points are greater than
+127. Documentation is not yet updated. JIT is not yet updated.

+
Version 10.34 21-November-2019
------------------------------

Modified: code/trunk/maint/ManyConfigTests
===================================================================
--- code/trunk/maint/ManyConfigTests    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/maint/ManyConfigTests    2020-02-23 16:40:05 UTC (rev 1224)
@@ -28,8 +28,6 @@
 # The -v option causes a call to 'pcre2test -C' to happen for each
 # configuration.

-# Currently -fsanitize=undefined is not working (locks machine).
-
useasan=1
useusan=1
usedebug=1

Modified: code/trunk/src/pcre2_auto_possess.c
===================================================================
--- code/trunk/src/pcre2_auto_possess.c    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_auto_possess.c    2020-02-23 16:40:05 UTC (rev 1224)
@@ -7,7 +7,7 @@

                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -292,6 +292,7 @@
 Arguments:
   code        points to start of expression
   utf         TRUE if in UTF mode
+  ucp         TRUE if in UCP mode
   fcc         points to the case-flipping table
   list        points to output list
               list[0] will be filled with the opcode
@@ -304,7 +305,7 @@
 */

 static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
   uint32_t *list)
 {
 PCRE2_UCHAR c = *code;
@@ -316,7 +317,8 @@
 uint32_t *clist_dest;
 const uint32_t *clist_src;
 #else
-(void)utf;    /* Suppress "unused parameter" compiler warning */
+(void)utf;    /* Suppress "unused parameter" compiler warnings */
+(void)ucp;
 #endif

list[0] = c;
@@ -396,7 +398,7 @@
list[2] = chr;

 #ifdef SUPPORT_UNICODE
-  if (chr < 128 || (chr < 256 && !utf))
+  if (chr < 128 || (chr < 256 && !utf && !ucp))
     list[3] = fcc[chr];
   else
     list[3] = UCD_OTHERCASE(chr);
@@ -503,6 +505,7 @@
 Arguments:
   code        points to the byte code
   utf         TRUE in UTF mode
+  ucp         TRUE in UCP mode 
   cb          compile data block
   base_list   the data list of the base opcode
   base_end    the end of the base opcode
@@ -512,7 +515,7 @@
 */

static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{
PCRE2_UCHAR c;
@@ -651,7 +654,7 @@

     while (*next_code == OP_ALT)
       {
-      if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+      if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
         return FALSE;
       code = next_code + 1 + LINK_SIZE;
       next_code += GET(next_code, 1);
@@ -672,7 +675,8 @@
     /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */

     next_code += 1 + LINK_SIZE;
-    if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+    if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end, 
+         rec_limit))
       return FALSE;

     code += PRIV(OP_lengths)[c];
@@ -688,7 +692,7 @@
   /* We now have the next appropriate opcode to compare with the base. Check
   for a supported opcode, and load its properties. */

-  code = get_chr_property_list(code, utf, cb->fcc, list);
+  code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
   if (code == NULL) return FALSE;    /* Unsupported */

/* If either opcode is a small character list, set pointers for comparing
@@ -1100,7 +1104,6 @@

 Arguments:
   code        points to start of the byte code
-  utf         TRUE in UTF mode
   cb          compile data block

 Returns:      0 for success
@@ -1108,7 +1111,7 @@
 */

int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SPTR end;
@@ -1115,6 +1118,8 @@
PCRE2_UCHAR *repeat_opcode;
uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
+BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
+BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;

 for (;;)
   {
@@ -1126,10 +1131,11 @@
     {
     c -= get_repeat_base(c) - OP_STAR;
     end = (c <= OP_MINUPTO) ?
-      get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+      get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;

-    if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+    if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end, 
+        &rec_limit))
       {
       switch(c)
         {
@@ -1181,11 +1187,11 @@
     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
       {
       /* end must not be NULL. */
-      end = get_chr_property_list(code, utf, cb->fcc, list);
+      end = get_chr_property_list(code, utf, ucp, cb->fcc, list);

       list[1] = (c & 1) == 0;

-      if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+      if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
         {
         switch (c)
           {

Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_compile.c    2020-02-23 16:40:05 UTC (rev 1224)
@@ -4904,7 +4904,7 @@
 if ((options & PCRE2_CASELESS) != 0)
   {
 #ifdef SUPPORT_UNICODE
-  if ((options & PCRE2_UTF) != 0)
+  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
     {
     int rc;
     uint32_t oc, od;
@@ -5319,7 +5319,8 @@

#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
-#else /* No UTF support */
+BOOL ucp = (options & PCRE2_UCP) != 0;
+#else /* No Unicode support */
BOOL utf = FALSE;
#endif

@@ -5602,7 +5603,7 @@
         uint32_t d;

 #ifdef SUPPORT_UNICODE
-        if (utf && c > 127) d = UCD_OTHERCASE(c); else
+        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
 #endif
           {
 #if PCRE2_CODE_UNIT_WIDTH != 8
@@ -9632,6 +9633,7 @@
    int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
 {
 BOOL utf;                             /* Set TRUE for UTF mode */
+BOOL ucp;                             /* Set TRUE for UCP mode */
 BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
 BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
 pcre2_real_code *re = NULL;           /* What we will return */
@@ -9919,8 +9921,8 @@

/* Check UCP lockout. */

-if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
-    (PCRE2_UCP|PCRE2_NEVER_UCP))
+ucp = (cb.external_options & PCRE2_UCP) != 0;
+if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
   {
   errorcode = ERR75;
   goto HAD_EARLY_ERROR;
@@ -10296,7 +10298,7 @@
 if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
   {
   PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
-  if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
   }

/* Failed to compile, or error while post-processing. */
@@ -10344,21 +10346,25 @@

     if ((firstcuflags & REQ_CASELESS) != 0)
       {
-      if (firstcu < 128 || (!utf && firstcu < 255))
+      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
         {
         if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
         }

-      /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
-      8-bit UTF mode, codepoints in the range 128-255 are introductory code
-      points and cannot have another case. In 16-bit and 32-bit modes, we can
-      check wide characters when UTF (and therefore UCP) is supported. */
+      /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
+      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
+      points and cannot have another case, but if UCP is set they may do. */

-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      else if (firstcu <= MAX_UTF_CODE_POINT &&
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
+        re->flags |= PCRE2_FIRSTCASELESS;
+#else
+      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
                UCD_OTHERCASE(firstcu) != firstcu)
         re->flags |= PCRE2_FIRSTCASELESS;
 #endif
+#endif  /* SUPPORT_UNICODE */
       }
     }

@@ -10407,14 +10413,20 @@

       if ((reqcuflags & REQ_CASELESS) != 0)
         {
-        if (reqcu < 128 || (!utf && reqcu < 255))
+        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
           {
           if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
           }
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-        else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
-          re->flags |= PCRE2_LASTCASELESS;
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
+#else
+      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
+               UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
 #endif
+#endif  /* SUPPORT_UNICODE */
         }
       }
     }

Modified: code/trunk/src/pcre2_dfa_match.c
===================================================================
--- code/trunk/src/pcre2_dfa_match.c    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_dfa_match.c    2020-02-23 16:40:05 UTC (rev 1224)
@@ -7,7 +7,7 @@

                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2019 University of Cambridge
+          New API code Copyright (c) 2016-2020 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -548,6 +548,7 @@

 #ifdef SUPPORT_UNICODE
 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
 #else
 BOOL utf = FALSE;
 #endif
@@ -2190,7 +2191,7 @@
       if (clen == 0) break;

 #ifdef SUPPORT_UNICODE
-      if (utf)
+      if (utf_or_ucp)
         {
         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
           {
@@ -2204,7 +2205,7 @@
         }
       else
 #endif  /* SUPPORT_UNICODE */
-      /* Not UTF mode */
+      /* Not UTF or UCP mode */
         {
         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
           { ADD_NEW(state_offset + 2, 0); }
@@ -2339,7 +2340,7 @@
         {
         uint32_t otherd;
 #ifdef SUPPORT_UNICODE
-        if (utf && d >= 128)
+        if (utf_or_ucp && d >= 128)
           otherd = UCD_OTHERCASE(d);
         else
 #endif  /* SUPPORT_UNICODE */
@@ -2374,7 +2375,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2417,7 +2418,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2458,7 +2459,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2491,7 +2492,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -2531,7 +2532,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UNICODE
-          if (utf && d >= 128)
+          if (utf_or_ucp && d >= 128)
             otherd = UCD_OTHERCASE(d);
           else
 #endif  /* SUPPORT_UNICODE */
@@ -3526,10 +3527,15 @@
   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
     {
     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && first_cu > 127)
+#ifdef SUPPORT_UNICODE 
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
-#endif
+#else
+    if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+#endif       
+#endif  /* SUPPORT_UNICODE */
     }
   }
 else
@@ -3545,9 +3551,15 @@
   if ((re->flags & PCRE2_LASTCASELESS) != 0)
     {
     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) 
+      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#else
+    if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) 
+      req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
     }
   }

Modified: code/trunk/src/pcre2_internal.h
===================================================================
--- code/trunk/src/pcre2_internal.h    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_internal.h    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1952,7 +1952,7 @@
 #define _pcre2_was_newline           PCRE2_SUFFIX(_pcre2_was_newline_)
 #define _pcre2_xclass                PCRE2_SUFFIX(_pcre2_xclass_)

-extern int          _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int          _pcre2_auto_possessify(PCRE2_UCHAR *,
                       const compile_block *);
 extern int          _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
                       int *, uint32_t, uint32_t, BOOL, compile_block *);

Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_match.c    2020-02-23 16:40:05 UTC (rev 1224)
@@ -7,7 +7,7 @@

                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2015-2019 University of Cambridge
+          New API code Copyright (c) 2015-2020 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -598,12 +598,13 @@
 BOOL cur_is_word;       /* Used in "word" tests */
 BOOL prev_is_word;      /* Used in "word" tests */

-/* UTF flag */
+/* UTF and UCP flags */

#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else
-BOOL utf = FALSE;
+BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif

 /* This is the length of the last part of a backtracking frame that must be
@@ -928,6 +929,7 @@
       }
     else
 #endif
+
     /* Not UTF mode */
       {
       if (mb->end_subject - Feptr < 1)
@@ -987,10 +989,30 @@
         if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
         }
       }
+
+    /* If UCP is set without UTF we must do the same as above, but with one
+    character per code unit. */
+
+    else if (ucp)
+      {
+      uint32_t cc = UCHAR21(Feptr);
+      fc = Fecode[1];
+      if (fc < 128)
+        {
+        if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
+        }
+      else
+        {
+        if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
+        }
+      Feptr++;
+      Fecode += 2;
+      }
+
     else
 #endif   /* SUPPORT_UNICODE */

-    /* Not UTF mode; use the table for characters < 256. */
+    /* Not UTF or UCP mode; use the table for characters < 256. */
       {
       if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
           != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@@ -1010,6 +1032,7 @@
       SCHECK_PARTIAL();
       RRETURN(MATCH_NOMATCH);
       }
+
 #ifdef SUPPORT_UNICODE
     if (utf)
       {
@@ -1026,15 +1049,42 @@
         if (ch > 127)
           ch = UCD_OTHERCASE(ch);
         else
-          ch = TABLE_GET(ch, mb->fcc, ch);
+          ch = (mb->fcc)[ch];
         if (ch == fc) RRETURN(MATCH_NOMATCH);
         }
       }
+
+    /* UCP without UTF is as above, but with one character per code unit. */
+
+    else if (ucp)
+      {
+      uint32_t ch;
+      fc = UCHAR21INC(Feptr);
+      ch = Fecode[1];
+      Fecode += 2;
+
+      if (ch == fc)
+        {
+        RRETURN(MATCH_NOMATCH);  /* Caseful match */
+        }
+      else if (Fop == OP_NOTI)   /* If caseless */
+        {
+        if (ch > 127)
+          ch = UCD_OTHERCASE(ch);
+        else
+          ch = (mb->fcc)[ch];
+        if (ch == fc) RRETURN(MATCH_NOMATCH);
+        }
+      }
+
     else
 #endif  /* SUPPORT_UNICODE */
+
+    /* Neither UTF nor UCP is set */
+
       {
       uint32_t ch = Fecode[1];
-      fc = *Feptr++;
+      fc = UCHAR21INC(Feptr);
       if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
         RRETURN(MATCH_NOMATCH);
       Fecode += 2;
@@ -1244,7 +1294,7 @@
 #endif  /* SUPPORT_UNICODE */

     /* When not in UTF mode, load a single-code-unit character. Then proceed as
-    above. */
+    above, using Unicode casing if either UTF or UCP is set. */

     Lc = *Fecode++;

@@ -1253,11 +1303,15 @@
     if (Fop >= OP_STARI)
       {
 #if PCRE2_CODE_UNIT_WIDTH == 8
-      /* Lc must be < 128 in UTF-8 mode. */
+#ifdef SUPPORT_UNICODE
+      if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      else
+#endif  /* SUPPORT_UNICODE */
+      /* Lc will be < 128 in UTF-8 mode. */
       Loc = mb->fcc[Lc];
 #else /* 16-bit & 32-bit */
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+      if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
       else
 #endif  /* SUPPORT_UNICODE */
       Loc = TABLE_GET(Lc, mb->fcc, Lc);
@@ -1490,7 +1544,7 @@
     if (Fop >= OP_NOTSTARI)     /* Caseless */
       {
 #ifdef SUPPORT_UNICODE
-      if (utf && Lc > 127)
+      if ((utf || ucp) && Lc > 127)
         Loc = UCD_OTHERCASE(Lc);
       else
 #endif /* SUPPORT_UNICODE */
@@ -6045,7 +6099,6 @@
 BOOL has_first_cu = FALSE;
 BOOL has_req_cu = FALSE;
 BOOL startline;
-BOOL utf;

#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
@@ -6069,13 +6122,19 @@
BOOL use_jit;
#endif

+/* This flag is needed even when Unicode is not supported for convenience
+(it is used by the IS_NEWLINE macro). */
+
+BOOL utf = FALSE;
+
#ifdef SUPPORT_UNICODE
+BOOL ucp = FALSE;
BOOL allow_invalid;
uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE;
#endif
-#endif
+#endif /* SUPPORT_UNICODE */

PCRE2_SIZE frame_size;

@@ -6147,12 +6206,13 @@
           (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
 #endif

-/* Initialize UTF parameters. */
+/* Initialize UTF/UCP parameters. */

+#ifdef SUPPORT_UNICODE
utf = (re->overall_options & PCRE2_UTF) != 0;
-#ifdef SUPPORT_UNICODE
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
-#endif
+ucp = (re->overall_options & PCRE2_UCP) != 0;
+#endif /* SUPPORT_UNICODE */

/* Convert the partial matching flags into an integer. */

@@ -6589,9 +6649,13 @@
   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
     {
     first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
+#else
+    if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
     }
   }
 else
@@ -6607,9 +6671,13 @@
   if ((re->flags & PCRE2_LASTCASELESS) != 0)
     {
     req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
+#else
+    if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
 #endif
+#endif  /* SUPPORT_UNICODE */
     }
   }

@@ -6756,15 +6824,16 @@
 #endif
           }

-        /* If we can't find the required code unit, having reached the true end
-        of the subject, break the bumpalong loop, to force a match failure,
-        except when doing partial matching, when we let the next cycle run at
-        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
-        which partially matches "abc", even though the string does not contain
-        the starting character "d". If we have not reached the true end of the
-        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
-        we also let the cycle run, because the matching string is legitimately
-        allowed to start with the first code unit of a newline. */
+        /* If we can't find the required first code unit, having reached the
+        true end of the subject, break the bumpalong loop, to force a match
+        failure, except when doing partial matching, when we let the next cycle
+        run at the end of the subject. To see why, consider the pattern
+        /(?<=abc)def/, which partially matches "abc", even though the string
+        does not contain the starting character "d". If we have not reached the
+        true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
+        temporarily modified) we also let the cycle run, because the matching
+        string is legitimately allowed to start with the first code unit of a
+        newline. */

         if (mb->partial == 0 && start_match >= mb->end_subject)
           {

Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_study.c    2020-02-23 16:40:05 UTC (rev 1224)
@@ -772,16 +772,20 @@
   p             points to the first code unit of the character
   caseless      TRUE if caseless
   utf           TRUE for UTF mode
+  ucp           TRUE for UCP mode

 Returns:        pointer after the character
 */

 static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf, 
+  BOOL ucp)
 {
 uint32_t c = *p++;   /* First code unit */
-(void)utf;           /* Stop compiler warning when UTF not supported */

+(void)utf;           /* Stop compiler warnings when UTF not supported */
+(void)ucp;
+
 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
 0xff. */

@@ -810,22 +814,26 @@
 if (caseless)
   {
 #ifdef SUPPORT_UNICODE
-  if (utf)
+  if (utf || ucp)
     {
+    c = UCD_OTHERCASE(c);
 #if PCRE2_CODE_UNIT_WIDTH == 8
-    PCRE2_UCHAR buff[6];
-    c = UCD_OTHERCASE(c);
-    (void)PRIV(ord2utf)(c, buff);
-    SET_BIT(buff[0]);
+    if (utf)
+      { 
+      PCRE2_UCHAR buff[6];
+      (void)PRIV(ord2utf)(c, buff);
+      SET_BIT(buff[0]);
+      }
+    else SET_BIT(c);    
 #else  /* 16-bit or 32-bit mode */
-    c = UCD_OTHERCASE(c);
     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
 #endif
     }
+ 
   else
 #endif  /* SUPPORT_UNICODE */

- /* Not UTF */
+ /* Not UTF or UCP */

   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
   }
@@ -931,6 +939,7 @@
   re           points to the compiled regex block
   code         points to an expression
   utf          TRUE if in UTF mode
+  ucp          TRUE if in UCP mode 
   depthptr     pointer to recurse depth

 Returns:       SSB_FAIL     => Failed to find any starting code units
@@ -941,7 +950,8 @@
 */

 static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
+  int *depthptr)
 {
 uint32_t c;
 int yield = SSB_DONE;
@@ -1111,7 +1121,7 @@
       case OP_SCRIPT_RUN:
       case OP_ASSERT:
       case OP_ASSERT_NA:
-      rc = set_start_bits(re, tcode, utf, depthptr);
+      rc = set_start_bits(re, tcode, utf, ucp, depthptr);
       if (rc == SSB_DONE)
         {
         try_next = FALSE;
@@ -1167,7 +1177,7 @@
       case OP_BRAZERO:
       case OP_BRAMINZERO:
       case OP_BRAPOSZERO:
-      rc = set_start_bits(re, ++tcode, utf, depthptr);
+      rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
       tcode += 1 + LINK_SIZE;
@@ -1189,7 +1199,7 @@
       case OP_QUERY:
       case OP_MINQUERY:
       case OP_POSQUERY:
-      tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
       break;

       case OP_STARI:
@@ -1198,7 +1208,7 @@
       case OP_QUERYI:
       case OP_MINQUERYI:
       case OP_POSQUERYI:
-      tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
       break;

       /* Single-char upto sets the bit and tries the next */
@@ -1206,13 +1216,13 @@
       case OP_UPTO:
       case OP_MINUPTO:
       case OP_POSUPTO:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
       break;

       case OP_UPTOI:
       case OP_MINUPTOI:
       case OP_POSUPTOI:
-      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
       break;

       /* At least one single char sets the bit and stops */
@@ -1224,7 +1234,7 @@
       case OP_PLUS:
       case OP_MINPLUS:
       case OP_POSPLUS:
-      (void)set_table_bit(re, tcode + 1, FALSE, utf);
+      (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
       try_next = FALSE;
       break;

@@ -1235,7 +1245,7 @@
       case OP_PLUSI:
       case OP_MINPLUSI:
       case OP_POSPLUSI:
-      (void)set_table_bit(re, tcode + 1, TRUE, utf);
+      (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
       try_next = FALSE;
       break;

@@ -1664,6 +1674,7 @@
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;

/* Find start of compiled code */

@@ -1677,7 +1688,7 @@
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int depth = 0;
- int rc = set_start_bits(re, code, utf, &depth);
+ int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1;

   /* If a list of starting code units was set up, scan the list to see if only
@@ -1695,7 +1706,7 @@
     int b = -1;
     uint8_t *p = re->start_bitmap;
     uint32_t flags = PCRE2_FIRSTMAPSET;
-
+    
     for (i = 0; i < 256; p++, i += 8)
       {
       uint8_t x = *p;
@@ -1725,27 +1736,27 @@
           }

         /* c contains the code unit value, in the range 0-255. In 8-bit UTF
-        mode, only values < 128 can be used. */
+        mode, only values < 128 can be used. In all the other cases, c is a 
+        character value. */

 #if PCRE2_CODE_UNIT_WIDTH == 8
-        if (c > 127) goto DONE;
+        if (utf && c > 127) goto DONE;
 #endif
-        if (a < 0) a = c;   /* First one found */
+        if (a < 0) a = c;   /* First one found, save in a */
         else if (b < 0)     /* Second one found */
           {
           int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
-
+          
 #ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
-          if (utf && UCD_CASESET(c) != 0) goto DONE;   /* Multiple case set */
-#else   /* 16-bit or 32-bit */
-          if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
-          if (utf && c > 127) d = UCD_OTHERCASE(c);
-#endif  /* Code width */
+          if (utf || ucp)
+            { 
+            if (UCD_CASESET(c) != 0) goto DONE;     /* Multiple case set */
+            if (c > 127) d = UCD_OTHERCASE(c);
+            }
 #endif  /* SUPPORT_UNICODE */

-          if (d != a) goto DONE;   /* Not other case of a */
-          b = c;
+          if (d != a) goto DONE;   /* Not the other case of a */
+          b = c;                   /* Save second in b */
           }
         else goto DONE;   /* More than two characters found */
         }

Modified: code/trunk/src/pcre2_substitute.c
===================================================================
--- code/trunk/src/pcre2_substitute.c    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/src/pcre2_substitute.c    2020-02-23 16:40:05 UTC (rev 1224)
@@ -236,6 +236,7 @@
 BOOL replacement_only;
 #ifdef SUPPORT_UNICODE
 BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
 #endif
 PCRE2_UCHAR temp[6];
 PCRE2_SPTR ptr;
@@ -758,7 +759,7 @@
           if (forcecase != 0)
             {
 #ifdef SUPPORT_UNICODE
-            if (utf)
+            if (utf || ucp)
               {
               uint32_t type = UCD_CHARTYPE(ch);
               if (PRIV(ucp_gentype)[type] == ucp_L &&
@@ -860,7 +861,7 @@
       if (forcecase != 0)
         {
 #ifdef SUPPORT_UNICODE
-        if (utf)
+        if (utf || ucp)
           {
           uint32_t type = UCD_CHARTYPE(ch);
           if (PRIV(ucp_gentype)[type] == ucp_L &&

Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testinput10    2020-02-23 16:40:05 UTC (rev 1224)
@@ -570,8 +570,10 @@
 /[\xff\x{ffff}]/I,utf

 /[\xff\x{ff}]/I,utf
+    abc\x{ff}def

 /[\xff\x{ff}]/I
+    abc\x{ff}def

/[Ss]/I

@@ -585,4 +587,31 @@
     abc\x80\=startchar
     abc\x80\=startchar,offset=3

+#subject no_jit
+
+/\x{c1}+\x{e1}/iIB,ucp
+    \x{c1}\x{c1}\x{c1}
+    \x{e1}\x{e1}\x{e1} 
+
+/a|\x{c1}/iI,ucp
+    \x{e1}xxx
+
+/a|\x{c1}/iI,utf
+    \x{e1}xxx
+
+/\x{c1}|\x{e1}/iI,ucp
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+
+/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
+    X\x{c1}Y
+
+# Without UTF or UCP characters > 127 have only one case in the default locale.
+
+/X(\x{e1})Y/replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+
+#subject     
+
 # End of testinput10

Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testinput12    2020-02-23 16:40:05 UTC (rev 1224)
@@ -463,4 +463,71 @@

/(?:\x{ff}|\x{3000})/I,utf

+# ---------------------------------------------------- 
+# UCP and casing tests
+
+/\x{120}/i,I
+
+/\x{c1}/i,I,ucp
+
+/[\x{120}\x{121}]/iB,ucp
+
+/[ab\x{120}]+/iB,ucp
+    aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+
+/[^\x{120}]/i
+    \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+
+/\x{c1}+\x{e1}/iB,ucp
+    \x{c1}\x{c1}\x{c1}
+
+/\x{c1}+\x{e1}/iIB,ucp
+    \x{c1}\x{c1}\x{c1}
+    \x{e1}\x{e1}\x{e1} 
+
+/a|\x{c1}/iI,ucp
+    \x{e1}xxx
+
+/\x{c1}|\x{e1}/iI,ucp
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{121}Y
+
+#subject 
+
+# ---------------------------------------------------- 
+
 # End of testinput12

Modified: code/trunk/testdata/testinput14
===================================================================
--- code/trunk/testdata/testinput14    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testinput14    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

#subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
     XX\x{d800}
     XX\x{d800}\=offset=3
@@ -33,5 +36,46 @@
     XX\xef\x80\=ph
     \xf7\=ph
     \xf7\x80\=ph
+    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.

+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+
+/\x{c1}+\x{e1}/iB,ucp
+    \x{c1}\x{c1}\x{c1}
+    \x{e1}\x{e1}\x{e1} 
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+
+/[^\x{120}]/i
+    \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+
+# ---------------------------------------------------- 
+
 # End of testinput14

Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput10    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1780,11 +1780,15 @@
 Options: utf
 Starting code units: \xc3 
 Subject length lower bound = 1
+    abc\x{ff}def
+ 0: \x{ff}

 /[\xff\x{ff}]/I
 Capture group count = 0
-Starting code units: \xff 
+First code unit = \xff
 Subject length lower bound = 1
+    abc\x{ff}def
+ 0: \xff

 /[Ss]/I
 Capture group count = 0
@@ -1813,4 +1817,62 @@
     abc\x80\=startchar,offset=3
 Error -36 (bad UTF-8 offset)

+#subject no_jit
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \xe1
+
+/a|\x{c1}/iI,utf
+Capture group count = 0
+Options: caseless utf
+Starting code units: A a \xc3 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \x{e1}
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
+    X\x{c1}Y
+ 1: >\xe1<
+
+# Without UTF or UCP characters > 127 have only one case in the default locale.
+
+/X(\x{e1})Y/replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xe1<
+
+#subject     
+
 # End of testinput10

Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput12-16    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1613,7 +1613,7 @@

/[Ss]/I
Capture group count = 0
-Starting code units: S s
+First code unit = 'S' (caseless)
Subject length lower bound = 1

/[Ss]/I,utf
@@ -1628,4 +1628,134 @@
Starting code units: \xff
Subject length lower bound = 1

+# ---------------------------------------------------- 
+# UCP and casing tests
+
+/\x{120}/i,I
+Capture group count = 0
+Options: caseless
+First code unit = \x{120}
+Subject length lower bound = 1
+
+/\x{c1}/i,I,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/[\x{120}\x{121}]/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{120}
+        Ket
+        End
+------------------------------------------------------------------
+
+/[ab\x{120}]+/iB,ucp
+------------------------------------------------------------------
+        Bra
+        [ABab\x{120}-\x{121}]++
+        Ket
+        End
+------------------------------------------------------------------
+    aABb\x{121}\x{120}
+ 0: aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \xe1
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{121}Y
+ 1: >\x{120}<
+
+#subject 
+
+# ---------------------------------------------------- 
+
 # End of testinput12

Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput12-32    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1611,7 +1611,7 @@

/[Ss]/I
Capture group count = 0
-Starting code units: S s
+First code unit = 'S' (caseless)
Subject length lower bound = 1

/[Ss]/I,utf
@@ -1626,4 +1626,134 @@
Starting code units: \xff
Subject length lower bound = 1

+# ---------------------------------------------------- 
+# UCP and casing tests
+
+/\x{120}/i,I
+Capture group count = 0
+Options: caseless
+First code unit = \x{120}
+Subject length lower bound = 1
+
+/\x{c1}/i,I,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/[\x{120}\x{121}]/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{120}
+        Ket
+        End
+------------------------------------------------------------------
+
+/[ab\x{120}]+/iB,ucp
+------------------------------------------------------------------
+        Bra
+        [ABab\x{120}-\x{121}]++
+        Ket
+        End
+------------------------------------------------------------------
+    aABb\x{121}\x{120}
+ 0: aABb\x{121}\x{120}
+
+#subject no_jit
+
+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+
+/\x{c1}+\x{e1}/iIB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Last code unit = \xe1 (caseless)
+Subject length lower bound = 2
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+
+/a|\x{c1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+Starting code units: A a \xc1 \xe1 
+Subject length lower bound = 1
+    \x{e1}xxx
+ 0: \xe1
+
+/\x{c1}|\x{e1}/iI,ucp
+Capture group count = 0
+Options: caseless ucp
+First code unit = \xc1 (caseless)
+Subject length lower bound = 1
+
+/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{e1}Y
+ 1: >\xc1<
+
+/X(\x{121})Y/ucp,replace=>\U$1<,substitute_extended
+    X\x{121}Y
+ 1: >\x{120}<
+
+#subject 
+
+# ---------------------------------------------------- 
+
 # End of testinput12

Modified: code/trunk/testdata/testoutput14-16
===================================================================
--- code/trunk/testdata/testoutput14-16    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput14-16    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

#subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
     XX\x{d800}
 Failed: error -24: UTF-16 error: missing low surrogate at end at offset 2
@@ -57,5 +60,66 @@
 No match
     \xf7\x80\=ph
 No match
+    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.

+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14

Modified: code/trunk/testdata/testoutput14-32
===================================================================
--- code/trunk/testdata/testoutput14-32    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput14-32    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

#subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
     XX\x{d800}
 Failed: error -27: UTF-32 error: code points 0xd800-0xdfff are not defined at offset 2
@@ -57,5 +60,66 @@
 No match
     \xf7\x80\=ph
 No match
+    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.

+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/\x{120}\x{c1}/i,ucp
+    \x{121}\x{e1}
+ 0: \x{121}\xe1
+
+/[^\x{120}]/i,no_start_optimize
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+\= Expect no match
+    \x{121}
+No match
+
+/[^\x{120}]/i
+    \x{121}
+ 0: \x{121}
+
+/[^\x{120}]/i,ucp
+\= Expect no match
+    \x{121}
+No match
+    
+/\x{120}{2}/i,ucp
+    \x{121}\x{121}
+ 0: \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+\= Expect no match
+    \x{121}\x{121}
+No match
+
+# ---------------------------------------------------- 
+
 # End of testinput14

Modified: code/trunk/testdata/testoutput14-8
===================================================================
--- code/trunk/testdata/testoutput14-8    2020-02-21 07:44:04 UTC (rev 1223)
+++ code/trunk/testdata/testoutput14-8    2020-02-23 16:40:05 UTC (rev 1224)
@@ -1,9 +1,12 @@
-# These test special (mostly error) UTF features of DFA matching. They are a 
-# selection of the more comprehensive tests that are run for non-DFA matching.
-# The output is different for the different widths.
+# These test special UTF and UCP features of DFA matching. The output is
+# different for the different widths.

#subject dfa

+# ---------------------------------------------------- 
+# These are a selection of the more comprehensive tests that are run for
+# non-DFA matching.
+
 /X/utf
     XX\x{d800}
 Failed: error -16: UTF-8 error: code points 0xd800-0xdfff are not defined at offset 2
@@ -57,5 +60,66 @@
 Failed: error -5: UTF-8 error: 3 bytes missing at end at offset 0
     \xf7\x80\=ph
 Failed: error -4: UTF-8 error: 2 bytes missing at end at offset 0
+    
+# ---------------------------------------------------- 
+# UCP and casing tests - except for the first two, these will all fail in 8-bit
+# mode because they are testing UCP without UTF and use characters > 255.

+/\x{c1}/i,no_start_optimize
+\= Expect no match
+    \x{e1}
+No match
+
+/\x{c1}+\x{e1}/iB,ucp
+------------------------------------------------------------------
+        Bra
+     /i \x{c1}+
+     /i \x{e1}
+        Ket
+        End
+------------------------------------------------------------------
+    \x{c1}\x{c1}\x{c1}
+ 0: \xc1\xc1\xc1
+ 1: \xc1\xc1
+    \x{e1}\x{e1}\x{e1} 
+ 0: \xe1\xe1\xe1
+ 1: \xe1\xe1
+
+/\x{120}\x{c1}/i,ucp,no_start_optimize
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+    \x{121}\x{e1}
+
+/\x{120}\x{c1}/i,ucp
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+    \x{121}\x{e1}
+
+/[^\x{120}]/i,no_start_optimize
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+    \x{121}
+
+/[^\x{120}]/i,ucp,no_start_optimize
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{121}
+
+/[^\x{120}]/i
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+    \x{121}
+
+/[^\x{120}]/i,ucp
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{121}
+    
+/\x{120}{2}/i,ucp
+Failed: error 134 at offset 6: character code point value in \x{} or \o{} is too large
+    \x{121}\x{121}
+
+/[^\x{120}]{2}/i,ucp
+Failed: error 134 at offset 8: character code point value in \x{} or \o{} is too large
+\= Expect no match
+    \x{121}\x{121}
+
+# ---------------------------------------------------- 
+
 # End of testinput14

This message is part of the following thread:
	the complete thread tree sorted by date

[Pcre-svn] [1224] code/trunk: Unicode upper/lower casing is …