[Pcre-svn] [837] code/trunk: Allow anchored patterns to use …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [837] code/trunk: Allow anchored patterns to use "first code unit" optimization.
Revision: 837
          http://www.exim.org/viewvc/pcre2?view=rev&revision=837
Author:   ph10
Date:     2017-06-30 17:00:33 +0100 (Fri, 30 Jun 2017)
Log Message:
-----------
Allow anchored patterns to use "first code unit" optimization.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_compile.c
    code/trunk/src/pcre2_dfa_match.c
    code/trunk/src/pcre2_match.c
    code/trunk/src/pcre2_study.c
    code/trunk/testdata/testinput10
    code/trunk/testdata/testinput12
    code/trunk/testdata/testinput2
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput10
    code/trunk/testdata/testoutput12-16
    code/trunk/testdata/testoutput12-32
    code/trunk/testdata/testoutput17
    code/trunk/testdata/testoutput2
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/ChangeLog    2017-06-30 16:00:33 UTC (rev 837)
@@ -205,7 +205,12 @@
 subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are 
 much faster.


+46. Arrange for anchored patterns to record and use "first code unit" data,
+because this can give a fast "no match" without searching for a "required code
+unit". Previously only non-anchored patterns did this.

+
+
Version 10.23 14-February-2017
------------------------------


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/src/pcre2_compile.c    2017-06-30 16:00:33 UTC (rev 837)
@@ -9632,14 +9632,19 @@
      is_anchored(codestart, 0, &cb, 0, FALSE))
   re->overall_options |= PCRE2_ANCHORED;


-/* If the pattern is still not anchored and we do not have a first code unit,
-see if there is one that is asserted (these are not saved during the compile
-because they can cause conflicts with actual literals that follow). This code
-need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
-create will not be used. */
+/* Set up the first code unit or startline flag, the required code unit, and
+then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
+is set, as the data it would create will not be used. Note that a first code
+unit (but not the startline flag) is useful for anchored patterns because it
+can still give a quick "no match" and also avoid searching for a last code
+unit. */

-if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
+if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
   {
+  /* If we do not have a first code unit, see if there is one that is asserted
+  (these are not saved during the compile because they can cause conflicts with
+  actual literals that follow). */
+
   if (firstcuflags < 0)
     firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);


@@ -9672,52 +9677,50 @@
       }
     }


- /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
- flag. This is helpful for multiline matches when all branches start with ^
- and also when all branches start with non-atomic .* for non-DOTALL matches
- when *PRUNE and SKIP are not present. (There is an option that disables this
- case.) */
+ /* When there is no first code unit, for non-anchored patterns, see if we can
+ set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
+ branches start with ^ and also when all branches start with non-atomic .* for
+ non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
+ that disables this case.) */

-  else if (is_startline(codestart, 0, &cb, 0, FALSE))
+  else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
+           is_startline(codestart, 0, &cb, 0, FALSE))
     re->flags |= PCRE2_STARTLINE;
-  }


-/* Handle the "required code unit", if one is set. In the case of an anchored
-pattern, do this only if it follows a variable length item in the pattern.
-Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
+ /* Handle the "required code unit", if one is set. In the case of an anchored
+ pattern, do this only if it follows a variable length item in the pattern. */

-if (reqcuflags >= 0 &&
-     ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
-      (reqcuflags & REQ_VARY) != 0))
-  {
-  re->last_codeunit = reqcu;
-  re->flags |= PCRE2_LASTSET;
+  if (reqcuflags >= 0 &&
+       ((re->overall_options & PCRE2_ANCHORED) == 0 ||
+        (reqcuflags & REQ_VARY) != 0))
+    {
+    re->last_codeunit = reqcu;
+    re->flags |= PCRE2_LASTSET;


-  /* Handle caseless required code units as for first code units (above). */
+    /* Handle caseless required code units as for first code units (above). */


-  if ((reqcuflags & REQ_CASELESS) != 0)
-    {
-    if (reqcu < 128 || (!utf && reqcu < 255))
+    if ((reqcuflags & REQ_CASELESS) != 0)
       {
-      if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
-      }
+      if (reqcu < 128 || (!utf && reqcu < 255))
+        {
+        if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
+        }
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-    else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
-      re->flags |= PCRE2_LASTCASELESS;
+      else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
+        re->flags |= PCRE2_LASTCASELESS;
 #endif
+      }
     }
-  }


-/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
-to set up information such as a bitmap of starting code units and a minimum
-matching length. */
+ /* Finally, study the compiled pattern to set up information such as a bitmap
+ of starting code units and a minimum matching length. */

-if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
-    PRIV(study)(re) != 0)
-  {
-  errorcode = ERR31;
-  goto HAD_CB_ERROR;
-  }
+  if (PRIV(study)(re) != 0)
+    {
+    errorcode = ERR31;
+    goto HAD_CB_ERROR;
+    }
+  }   /* End of start-of-match optimizations. */


/* Control ends up here in all cases. When running under valgrind, make a
pattern's terminating zero defined again. If memory was obtained for the parsed

Modified: code/trunk/src/pcre2_dfa_match.c
===================================================================
--- code/trunk/src/pcre2_dfa_match.c    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/src/pcre2_dfa_match.c    2017-06-30 16:00:33 UTC (rev 837)
@@ -3341,34 +3341,27 @@
   }
 #endif  /* SUPPORT_UNICODE */


-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
   {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
     {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127)
-        first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127)
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
 #endif
-      }
     }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
   }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;


-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may be a "last known required code unit" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
   {
@@ -3414,8 +3407,8 @@
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
     first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    we stop the optimization scans for a first code unit at a newline. If the
+    match fails at the newline, later code breaks this loop. */


     if (firstline)
       {
@@ -3434,70 +3427,138 @@
       while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
       end_subject = t;
       }
+      
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */


-    /* Advance to a unique first code unit if there is one. */
-
-    if (has_first_cu)
+    if (anchored)
       {
-      PCRE2_UCHAR smc;
-      if (first_cu != first_cu2)
-        while (start_match < end_subject &&
-          (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-      else
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
+      if (has_first_cu || start_bits != NULL)
+        {
+        BOOL ok = start_match < end_subject;
+        if (ok)
+          {
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
+            {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            if (c > 255) c = 255;
+#endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
+            }
+          }
+        if (!ok) break;
+        }
       }


-    /* Or to just after a linebreak for a multiline match */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */


-    else if (startline)
+    else
       {
-      if (start_match > mb->start_subject + start_offset)
+      if (has_first_cu)
         {
-#ifdef SUPPORT_UNICODE
-        if (utf)
+        if (first_cu != first_cu2)  /* Caseless */
           {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
-            {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
             start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
-            }
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
+#endif
           }
+
+        /* The caseful case */
+
         else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
+#else
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
 #endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
+          }


-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
+        /* If we can't find the required code unit, break the bumpalong loop,
+        to force a match failure, except when doing partial matching, when we
+        let the next cycle run at the end of the subject. To see why, consider
+        the pattern /(?<=abc)def/, which partially matches "abc", even though
+        the string does not contain the starting character "d". */


-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
+            start_match >= end_subject)
+          break;
         }
-      }


-    /* Or to a non-unique first code unit if any have been identified. The
-    bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
-    code units greater than 254 set the 255 bit. */
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */


-    else if (start_bits != NULL)
-      {
-      while (start_match < end_subject)
+      else if (startline)
         {
-        uint32_t c = UCHAR21TEST(start_match);
+        if (start_match > mb->start_subject + start_offset)
+          {
+#ifdef SUPPORT_UNICODE
+          if (utf)
+            {
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, *start_match,
+                start_match++);
+              }
+            }
+          else
+#endif
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;
+
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */
+
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
+        }
+
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */
+
+      else if (start_bits != NULL)
+        {
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          if (c > 255) c = 255;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
         }
-      }
+      }  /* End of first code unit handling */


     /* Restore fudged end_subject */



Modified: code/trunk/src/pcre2_match.c
===================================================================
--- code/trunk/src/pcre2_match.c    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/src/pcre2_match.c    2017-06-30 16:00:33 UTC (rev 837)
@@ -270,7 +270,7 @@


*lengthptr = (*Fecode == OP_CALLOUT)?
PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
-
+
if (mb->callout == NULL) return 0; /* No callout function provided */

/* The original matching code (pre 10.30) worked directly with the ovector
@@ -279,11 +279,11 @@
the overall match offsets (which would waste space in the frame). For backward
compatibility, however, we pass capture_top and offset_vector to the callout as
if for the extended ovector, and we ensure that the first two slots are unset
-by preserving and restoring their current contents. Picky compilers complain if
-references such as Fovector[-2] are use directly, so we set up a separate
+by preserving and restoring their current contents. Picky compilers complain if
+references such as Fovector[-2] are use directly, so we set up a separate
pointer. */

-callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
+callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;

 cb.version          = 1;
 cb.capture_top      = (uint32_t)Foffset_top/2 + 1;
@@ -935,8 +935,8 @@


     /* ===================================================================== */
     /* Match a single character, caselessly. If we are at the end of the
-    subject, give up immediately. We get here only when the pattern character 
-    has at most one other case. Characters with more than two cases are coded 
+    subject, give up immediately. We get here only when the pattern character
+    has at most one other case. Characters with more than two cases are coded
     as OP_PROP with the pseudo-property PT_CLIST. */


     case OP_CHARI:
@@ -954,7 +954,7 @@
       GETCHARLEN(fc, Fecode, Flength);


       /* If the pattern character's value is < 128, we know that its other case
-      (if any) is also < 128 (and therefore only one code unit long in all 
+      (if any) is also < 128 (and therefore only one code unit long in all
       code-unit widths), so we can use the fast lookup table. We checked above
       that there is at least one character left in the subject. */


@@ -966,7 +966,7 @@
         Feptr++;
         }


-      /* Otherwise we must pick up the subject character and use Unicode 
+      /* Otherwise we must pick up the subject character and use Unicode
       property support to test its other case. Note that we cannot use the
       value of "Flength" to check for sufficient bytes left, because the other
       case of the character may have more or fewer code units. */
@@ -3056,7 +3056,7 @@
           }
         Feptr += Lmin;
         break;
-        
+
         /* This OP_ANYBYTE case will never be reached because \C gets turned
         into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
         reports don't complain about it's never being used. */
@@ -5352,8 +5352,8 @@
                 (char *)assert_accept_frame + offsetof(heapframe, ovector),
                 assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
           Foffset_top = assert_accept_frame->offset_top;
-           
-          /* Fall through */ 
+
+          /* Fall through */
           /* In the case of a match, the captures have already been put into
           the current frame. */


@@ -5650,7 +5650,7 @@
     if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
     if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;


-    /* Fall through */ 
+    /* Fall through */
     /* Unconditional end of subject assertion (\z) */


     case OP_EOD:
@@ -6280,7 +6280,7 @@
 has to be expanded. We therefore put it into the match block so that it is
 correct when calling match() more than once for non-anchored patterns. */


-frame_size = offsetof(heapframe, ovector) +
+frame_size = offsetof(heapframe, ovector) +
re->top_bracket * 2 * sizeof(PCRE2_SIZE);

/* Limits set in the pattern override the match context only if they are
@@ -6333,33 +6333,26 @@
mb->fcc = re->tables + fcc_offset;
mb->ctypes = re->tables + ctypes_offset;

-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
   {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
     {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
 #endif
-      }
     }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
   }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;


-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may also be a "last known required character" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
   {
@@ -6398,8 +6391,8 @@
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
     first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    we stop the optimization scans for a first code unit at a newline. If the
+    match fails at the newline, later code breaks this loop. */


     if (firstline)
       {
@@ -6419,107 +6412,143 @@
       end_subject = t;
       }


-    /* Advance to a unique first code unit if there is one. In 8-bit mode, the
-    use of memchr() gives a big speed up, even though we have to call it twice
-    in caseless mode, in order to find the first occurrence of the character in
-    either of its cases. */
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */


-    if (has_first_cu)
+    if (anchored)
       {
-      if (first_cu != first_cu2)  /* Caseless */
+      if (has_first_cu || start_bits != NULL)
         {
+        BOOL ok = start_match < end_subject;
+        if (ok)
+          {
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
+            {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        PCRE2_UCHAR smc;
-        while (start_match < end_subject &&
-              (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-#else  /* 8-bit code units */
-        PCRE2_SPTR pp1 = memchr(start_match, first_cu, end_subject-start_match);
-        PCRE2_SPTR pp2 = memchr(start_match, first_cu2, end_subject-start_match);
-        if (pp1 == NULL)
-          start_match = (pp2 == NULL)? end_subject : pp2;
-        else
-          start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
+            if (c > 255) c = 255;
 #endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
+            }
+          }
+        if (!ok)
+          {
+          rc = MATCH_NOMATCH;
+          break;
+          }
         }
+      }


-      /* The caseful case */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */


-      else
+    else
+      {
+      if (has_first_cu)
         {
+        if (first_cu != first_cu2)  /* Caseless */
+          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
+            start_match++;
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
+#endif
+          }
+
+        /* The caseful case */
+
+        else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
 #else
-        start_match = memchr(start_match, first_cu, end_subject - start_match);
-        if (start_match == NULL) start_match = end_subject;
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
 #endif
-        }
+          }


-      /* If we can't find the required code unit, break the bumpalong loop, to
-      force a match failure, except when doing partial matching, when we let
-      the next cycle run at the end of the subject. To see why, consider the
-      pattern /(?<=abc)def/, which partially matches "abc", even though the
-      string does not contain the starting character "d". */
+        /* If we can't find the required code unit, break the bumpalong loop,
+        to force a match failure, except when doing partial matching, when we
+        let the next cycle run at the end of the subject. To see why, consider
+        the pattern /(?<=abc)def/, which partially matches "abc", even though
+        the string does not contain the starting character "d". */


-      if (!mb->partial && start_match >= end_subject)
-        {
-        rc = MATCH_NOMATCH;
-        break;
+        if (!mb->partial && start_match >= end_subject)
+          {
+          rc = MATCH_NOMATCH;
+          break;
+          }
         }
-      }


-    /* If there's no first code unit, advance to just after a linebreak for a
-    multiline match if required. */
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */


-    else if (startline)
-      {
-      if (start_match > mb->start_subject + start_offset)
+      else if (startline)
         {
+        if (start_match > mb->start_subject + start_offset)
+          {
 #ifdef SUPPORT_UNICODE
-        if (utf)
-          {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          if (utf)
             {
-            start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, *start_match,
+                start_match++);
+              }
             }
-          }
-        else
+          else
 #endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;


-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */


-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
         }
-      }


-    /* If there's no first code unit or a requirement for a multiline line
-    start, advance to a non-unique first code unit if any have been identified.
-    The bitmap contains only 256 bits. When code units are 16 or 32 bits wide,
-    all code units greater than 254 set the 255 bit. */
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */


-    else if (start_bits != NULL)
-      {
-      while (start_match < end_subject)
+      else if (start_bits != NULL)
         {
-        uint32_t c = UCHAR21TEST(start_match);
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          if (c > 255) c = 255;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
         }
-      }
+      }   /* End first code unit handling */


     /* Restore fudged end_subject */



Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/src/pcre2_study.c    2017-06-30 16:00:33 UTC (rev 837)
@@ -799,7 +799,7 @@
     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
 #endif
     }
-  else 
+  else
 #endif  /* SUPPORT_UNICODE */


   /* Not UTF */
@@ -953,7 +953,6 @@
       case OP_ALLANY:
       case OP_ANY:
       case OP_ANYBYTE:
-      case OP_CIRC:
       case OP_CIRCM:
       case OP_CLOSE:
       case OP_COMMIT:
@@ -1021,6 +1020,13 @@
       case OP_THEN_ARG:
       return SSB_FAIL;


+      /* OP_CIRC happens only at the start of an anchored branch (multiline ^
+      uses OP_CIRCM). Skip over it. */
+
+      case OP_CIRC:
+      tcode += PRIV(OP_lengths)[OP_CIRC];
+      break;
+
       /* A "real" property test implies no starting bits, but the fake property
       PT_CLIST identifies a list of characters. These lists are short, as they
       are used for characters with more than one "other case", so there is no
@@ -1450,7 +1456,7 @@
 #endif
       /* It seems that the fall through comment must be outside the #ifdef if
       it is to avoid the gcc compiler warning. */
-        
+
       /* Fall through */


       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
@@ -1579,12 +1585,11 @@
 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
   re->name_entry_size * re->name_count;


-/* For an anchored pattern, or an unanchored pattern that has a first code
-unit, or a multiline pattern that matches only at "line start", there is no
-point in seeking a list of starting code units. */
+/* For a pattern that has a first code unit, or a multiline pattern that
+matches only at "line start", there is no point in seeking a list of starting
+code units. */

-if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
-    (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
+if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
   {
   int rc = set_start_bits(re, code, utf);
   if (rc == SSB_UNKNOWN) return 1;


Modified: code/trunk/testdata/testinput10
===================================================================
--- code/trunk/testdata/testinput10    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testinput10    2017-06-30 16:00:33 UTC (rev 837)
@@ -466,5 +466,14 @@


 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
     \x{dfff}\x{df01}\=no_utf_check
+    
+# This has different starting code units in 8-bit mode. 


+/^[^ab]/IB,utf
+    c
+    \x{ff}
+    \x{100}
+\= Expect no match
+    aaa
+
 # End of testinput10


Modified: code/trunk/testdata/testinput12
===================================================================
--- code/trunk/testdata/testinput12    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testinput12    2017-06-30 16:00:33 UTC (rev 837)
@@ -373,4 +373,13 @@
 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
     \x{dfff}\x{df01}\=no_utf_check


+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+    c
+    \x{ff}
+    \x{100}
+\= Expect no match
+    aaa
+
 # End of testinput12


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testinput2    2017-06-30 16:00:33 UTC (rev 837)
@@ -5256,6 +5256,9 @@
     XAB     


 /^(?!A(?C1)B)C/
+    ABC\=callout_error=1,no_jit
+
+/^(?!A(?C1)B)C/no_start_optimize
     ABC\=callout_error=1


 /^(?(?!A(?C1)B)C)/

Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testinput5    2017-06-30 16:00:33 UTC (rev 837)
@@ -120,13 +120,6 @@
     \x{ff}
     \x{100}


-/^[^ab]/IB,utf
-    c
-    \x{ff}
-    \x{100}
-\= Expect no match
-    aaa
-
 /\x{100}*(\d+|"(?1)")/utf
     1234
     "1234"
@@ -190,8 +183,11 @@
 /\w/utf
     \x{100}X


-/^\ሴ/IB,utf
+# Use no_start_optimize because the first code unit is different in 8-bit from
+# the wider modes.

+/^\ሴ/IB,utf,no_start_optimize
+
 /()()()()()()()()()()
 ()()()()()()()()()()
 ()()()()()()()()()()

Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testoutput10    2017-06-30 16:00:33 UTC (rev 837)
@@ -1585,5 +1585,39 @@
 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
     \x{dfff}\x{df01}\=no_utf_check
  0: \x{dfff}\x{df01}
+    
+# This has different starting code units in 8-bit mode. 


+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 
+  \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf 
+  \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
+  \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
+  \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput10


Modified: code/trunk/testdata/testoutput12-16
===================================================================
--- code/trunk/testdata/testoutput12-16    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testoutput12-16    2017-06-30 16:00:33 UTC (rev 837)
@@ -1433,4 +1433,42 @@
 Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
     \x{dfff}\x{df01}\=no_utf_check


+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca 
+  \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 
+  \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 
+  \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 
+  \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput12


Modified: code/trunk/testdata/testoutput12-32
===================================================================
--- code/trunk/testdata/testoutput12-32    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testoutput12-32    2017-06-30 16:00:33 UTC (rev 837)
@@ -1425,4 +1425,42 @@
     \x{dfff}\x{df01}\=no_utf_check
  0: \x{dfff}\x{df01}


+# This has different starting code units in 8-bit mode. 
+
+/^[^ab]/IB,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        [\x00-`c-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Compile options: utf
+Overall options: anchored utf
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca 
+  \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 
+  \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 
+  \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 
+  \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+Subject length lower bound = 1
+    c
+ 0: c
+    \x{ff}
+ 0: \x{ff}
+    \x{100}
+ 0: \x{100}
+\= Expect no match
+    aaa
+No match
+
 # End of testinput12


Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testoutput17    2017-06-30 16:00:33 UTC (rev 837)
@@ -368,6 +368,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 6
 JIT compilation was successful
 #pop jitverify
@@ -379,6 +380,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 6
 JIT compilation was successful
 #save testsaved1


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testoutput2    2017-06-30 16:00:33 UTC (rev 837)
@@ -72,6 +72,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
     abc
  0: abc
@@ -110,6 +111,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
     abc
  0: abc
@@ -339,6 +341,7 @@
 /the quick brown fox/I,anchored
 Capturing subpattern count = 0
 Options: anchored
+First code unit = 't'
 Subject length lower bound = 19
     the quick brown fox
  0: the quick brown fox
@@ -351,6 +354,7 @@


 /^abc|def/I
 Capturing subpattern count = 0
+Starting code units: a d 
 Subject length lower bound = 3
     abcdef
  0: abc
@@ -495,6 +499,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = '1'
 Subject length lower bound = 4


/(^b|(?i)^d)/I
@@ -501,6 +506,7 @@
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
+Starting code units: D b d
Subject length lower bound = 1

/(?s).*/I
@@ -624,6 +630,7 @@
Max lookbehind = 1
Compile options: multiline
Overall options: anchored multiline
+First code unit = 'a'
Subject length lower bound = 3

 /^abc/Im
@@ -637,6 +644,7 @@
 Capturing subpattern count = 5
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
   aaaaabbbbbcccccdef
  0: aaaaabbbbbcccccdef
@@ -808,6 +816,7 @@
 Max back reference = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: a 
 Subject length lower bound = 4
 \= Expect no match
     aaaa
@@ -1004,6 +1013,7 @@
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Subject length lower bound = 4
     adef\=get=1,get=2,get=3,get=4,getall
  0: adef
@@ -1042,6 +1052,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 7
     abc\00def\=copy=0,getall
  0: abc\x00def
@@ -1227,6 +1238,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'i'
 Subject length lower bound = 3
     ississippi
  0: iss
@@ -1286,6 +1298,7 @@
 Contains explicit CR or LF match
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3
     ab\nab\ncd
  0: ab\x0a
@@ -1776,6 +1789,8 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P 
+  Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z 
 Subject length lower bound = 1


/^[[:^alnum:]]/IB
@@ -1789,6 +1804,18 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
+ ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88
+ \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97
+ \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6
+ \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5
+ \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4
+ \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3
+ \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2
+ \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1
+ \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1

/^[[:alpha:]]/IB
@@ -1802,6 +1829,8 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
+ a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1

/^[[:^alpha:]]/IB
@@ -1815,6 +1844,19 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
+ 5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84
+ \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93
+ \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2
+ \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1
+ \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0
+ \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf
+ \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde
+ \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed
+ \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc
+ \xfd \xfe \xff
Subject length lower bound = 1

/[_[:alpha:]]/I
@@ -1834,6 +1876,12 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
+ 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
+ Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
+ \x7f
Subject length lower bound = 1

/^[[:^ascii:]]/IB
@@ -1847,6 +1895,15 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a
+ \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99
+ \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8
+ \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7
+ \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6
+ \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5
+ \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4
+ \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3
+ \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1

/^[[:blank:]]/IB
@@ -1860,6 +1917,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x09 \x20
Subject length lower bound = 1

/^[[:^blank:]]/IB
@@ -1873,6 +1931,20 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b
+ \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a
+ \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9
+ : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
+ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80
+ \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f
+ \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e
+ \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad
+ \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc
+ \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb
+ \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda
+ \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9
+ \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8
+ \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1

/[\n\x0b\x0c\x0d[:blank:]]/I
@@ -1892,6 +1964,9 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x7f
Subject length lower bound = 1

/^[[:digit:]]/IB
@@ -1905,6 +1980,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9
Subject length lower bound = 1

/^[[:graph:]]/IB
@@ -1918,6 +1994,9 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 :
+ ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
+ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
Subject length lower bound = 1

/^[[:lower:]]/IB
@@ -1931,6 +2010,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1

/^[[:print:]]/IB
@@ -1944,6 +2024,9 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8
+ 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ]
+ ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
Subject length lower bound = 1

/^[[:punct:]]/IB
@@ -1957,6 +2040,8 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^
+ _ ` { | } ~
Subject length lower bound = 1

/^[[:space:]]/IB
@@ -1970,6 +2055,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x09 \x0a \x0b \x0c \x0d \x20
Subject length lower bound = 1

/^[[:upper:]]/IB
@@ -1983,6 +2069,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
Subject length lower bound = 1

/^[[:xdigit:]]/IB
@@ -1996,6 +2083,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f
Subject length lower bound = 1

/^[[:word:]]/IB
@@ -2009,6 +2097,8 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
+ Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1

/^[[:^cntrl:]]/IB
@@ -2022,6 +2112,18 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8
+ 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ]
+ ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x80 \x81
+ \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90
+ \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f
+ \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae
+ \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd
+ \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc
+ \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb
+ \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea
+ \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9
+ \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1

/^[12[:^digit:]]/IB
@@ -2035,6 +2137,20 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
+ \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
+ \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 1 2 : ; <
+ = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a
+ b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82
+ \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91
+ \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0
+ \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf
+ \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe
+ \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
+ \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
+ \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
+ \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
+ \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1

/^[[:^blank:]]/IB
@@ -2048,6 +2164,20 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b
+ \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a
+ \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9
+ : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
+ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80
+ \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f
+ \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e
+ \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad
+ \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc
+ \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb
+ \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda
+ \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9
+ \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8
+ \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1

 /[01[:alpha:]%]/IB
@@ -2418,6 +2548,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 1
     aba
  0: aba
@@ -2428,6 +2559,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2438,6 +2570,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2448,6 +2581,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2458,6 +2592,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2467,6 +2602,7 @@
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2478,6 +2614,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2488,6 +2625,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2497,6 +2635,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbbaa
  0: aabbbaa
@@ -2506,6 +2645,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbbaa
  0: aabbbaa
@@ -2515,6 +2655,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbaa
  0: aabbaa
@@ -2524,6 +2665,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbbaa
  0: aabbbaa
@@ -2533,6 +2675,7 @@
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbbaa
  0: aabbbaa
@@ -2544,6 +2687,7 @@
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2
     aabbbbaa
  0: aabbbbaa
@@ -3052,6 +3196,7 @@
 Capturing subpattern count = 5
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3


/^x(?U)a+b/IB
@@ -3067,6 +3212,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+First code unit = 'x'
Last code unit = 'b'
Subject length lower bound = 3

@@ -3085,6 +3231,7 @@
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
+First code unit = 'x'
Last code unit = 'b'
Subject length lower bound = 3

@@ -3725,6 +3872,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+First code unit = 'a'
Subject length lower bound = 3

 /(?C)a|b/I
@@ -3785,6 +3933,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = '>'
 Last code unit = '<'
 Subject length lower bound = 10
    >abc>123<xyz<
@@ -3835,6 +3984,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+Starting code units: ( - 0 1 2 3 4 5 6 7 8 9 
 Subject length lower bound = 1
     12
  0: 12
@@ -3854,6 +4004,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+First code unit = 'x'
 Subject length lower bound = 3
     xyz
  0: xyz
@@ -3913,6 +4064,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 9
     abcdefabc
  0: abcdefabc
@@ -3922,6 +4074,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c 
 Subject length lower bound = 2
     a=a
  0: a=a
@@ -3937,6 +4090,7 @@
 Capturing subpattern count = 2
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b c 
 Subject length lower bound = 2
     a=a
  0: a=a
@@ -5173,6 +5327,7 @@
 Capturing subpattern count = 3
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Last code unit = '/'
 Subject length lower bound = 6
     13/05/04\=ps
@@ -5270,6 +5425,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: 0 1 2 3 4 5 6 7 8 9 
 Last code unit = 'X'
 Subject length lower bound = 4
     1\=ps
@@ -5643,6 +5799,7 @@
   A   3
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
     a1b\=copy=A
  0: a1
@@ -5680,6 +5837,7 @@
   A   2
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
     ab\=copy=A
  0: ab
@@ -5693,6 +5851,7 @@
   A   1
   A   2
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
     ab\=copy=A
  0: ab
@@ -5711,6 +5870,7 @@
   A   3
   A   4
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
     cdefgh\=copy=A
  0: cdefgh
@@ -5727,6 +5887,7 @@
   A   3
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
     a1b\=get=A
  0: a1
@@ -5754,6 +5915,7 @@
   A   2
 Compile options: dupnames
 Overall options: anchored dupnames
+First code unit = 'a'
 Subject length lower bound = 2
     ab\=get=A
  0: ab
@@ -5767,6 +5929,7 @@
   A   1
   A   2
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
     ab\=get=A
  0: ab
@@ -5785,6 +5948,7 @@
   A   3
   A   4
 Options: dupnames
+Starting code units: a c 
 Subject length lower bound = 2
     cdefgh\=get=A
  0: cdefgh
@@ -5802,6 +5966,7 @@
 Compile options: <none>
 Overall options: anchored
 Duplicate name status changes
+First code unit = 'a'
 Subject length lower bound = 2
     a1b\=copy=A
  0: a1
@@ -5832,6 +5997,7 @@
 Compile options: <none>
 Overall options: anchored
 Duplicate name status changes
+First code unit = 'a'
 Subject length lower bound = 6
     a bc d\=copy=A,copy=B,copy=C
  0: a bc d
@@ -6233,6 +6399,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+Starting code units: a b 
 Last code unit = 'b'
 Subject length lower bound = 2


@@ -6249,6 +6416,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: a b
Last code unit = 'b'
Subject length lower bound = 2

@@ -6265,6 +6433,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+Starting code units: a b
Last code unit = 'b'
Subject length lower bound = 2

@@ -6281,6 +6450,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Last code unit = 'A'
 Subject length lower bound = 3
     aaaA5
@@ -6302,6 +6472,7 @@
 Capturing subpattern count = 0
 Compile options: caseless
 Overall options: anchored caseless
+Starting code units: A a 
 Last code unit = 'A' (caseless)
 Subject length lower bound = 2
     aaaA5
@@ -9540,6 +9711,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'F'
 Last code unit = ':'
 Subject length lower bound = 22


@@ -9691,6 +9863,7 @@
   D   1
 Compile options: dupnames extended
 Overall options: anchored dupnames extended
+Starting code units: a e 
 Subject length lower bound = 2
     abcdX
  0: abcdX
@@ -10445,6 +10618,7 @@
 Capturing subpattern count = 1
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 2


/(^ab)++/I
@@ -10451,6 +10625,7 @@
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
+First code unit = 'a'
Subject length lower bound = 2

/(^ab|^)+/I
@@ -10471,6 +10646,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+First code unit = 'a'
Subject length lower bound = 2

/(?:^ab)++/I
@@ -10477,6 +10653,7 @@
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
+First code unit = 'a'
Subject length lower bound = 2

/(?:^ab|^)+/I
@@ -11586,6 +11763,7 @@
Capturing subpattern count = 0
Compile options: dotall
Overall options: anchored dotall
+First code unit = 'a'
Subject length lower bound = 2

/.*?a(*SKIP)b/I
@@ -11608,6 +11786,7 @@
Capturing subpattern count = 0
Compile options: dotall
Overall options: anchored dotall
+First code unit = 'a'
Subject length lower bound = 2

 /(?>.*?)(?<=(abcd)|(wxyz))/I
@@ -13375,7 +13554,6 @@
 /(|ab)*?d/I,no_start_optimize
 Capturing subpattern count = 1
 Options: no_start_optimize
-Last code unit = 'd'
 Subject length lower bound = 0
    abd
  0: abd
@@ -13641,6 +13819,7 @@
 Capturing subpattern count = 0
 Compile options: <none>
 Overall options: anchored
+First code unit = 'a'
 Subject length lower bound = 3


/^abc/info,no_dotstar_anchor
@@ -13647,6 +13826,7 @@
Capturing subpattern count = 0
Compile options: no_dotstar_anchor
Overall options: anchored no_dotstar_anchor
+First code unit = 'a'
Subject length lower bound = 3

/.*\d/info,auto_callout
@@ -14684,6 +14864,7 @@
Max back reference = 1
Compile options: <none>
Overall options: anchored
+First code unit = 'o'
Last code unit = '}'
Subject length lower bound = 65535

@@ -15607,6 +15788,7 @@
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
+First code unit = 'b'
Subject length lower bound = 2

/(a){0}.*bc/sI
@@ -15885,6 +16067,10 @@
No match

 /^(?!A(?C1)B)C/
+    ABC\=callout_error=1,no_jit
+No match
+
+/^(?!A(?C1)B)C/no_start_optimize
     ABC\=callout_error=1
 --->ABC
   1 ^^      B


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2017-06-26 16:25:16 UTC (rev 836)
+++ code/trunk/testdata/testoutput5    2017-06-30 16:00:33 UTC (rev 837)
@@ -194,6 +194,7 @@
 Capturing subpattern count = 0
 Compile options: utf
 Overall options: anchored utf
+Starting code units: a b 
 Subject length lower bound = 1
     bar
  0: b
@@ -205,28 +206,6 @@
     \x{100}
 No match


-/^[^ab]/IB,utf
-------------------------------------------------------------------
-        Bra
-        ^
-        [\x00-`c-\xff] (neg)
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Compile options: utf
-Overall options: anchored utf
-Subject length lower bound = 1
-    c
- 0: c
-    \x{ff}
- 0: \x{ff}
-    \x{100}
- 0: \x{100}
-\= Expect no match
-    aaa
-No match
-
 /\x{100}*(\d+|"(?1)")/utf
     1234
  0: 1234
@@ -479,7 +458,10 @@
     \x{100}X
  0: X


-/^\ሴ/IB,utf
+# Use no_start_optimize because the first code unit is different in 8-bit from
+# the wider modes.
+
+/^\ሴ/IB,utf,no_start_optimize
 ------------------------------------------------------------------
         Bra
         ^
@@ -488,9 +470,9 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Compile options: utf
-Overall options: anchored utf
-Subject length lower bound = 1
+Compile options: no_start_optimize utf
+Overall options: anchored no_start_optimize utf
+Subject length lower bound = 0


/()()()()()()()()()()
()()()()()()()()()()