[Pcre-svn] [810] code/branches/pcre16: A lot more work on pcretest; now runs many (but not all) tests.

Author: Subversion repository
Date:
To: pcre-svn
Subject: [Pcre-svn] [810] code/branches/pcre16: A lot more work on pcretest; now runs many (but not all) tests.

Revision: 810

          http://vcs.pcre.org/viewvc?view=rev&revision=810
Author:   ph10
Date:     2011-12-19 13:34:10 +0000 (Mon, 19 Dec 2011)

Log Message:
-----------
A lot more work on pcretest; now runs many (but not all) tests.

Modified Paths:
--------------
    code/branches/pcre16/RunTest
    code/branches/pcre16/pcre_printint.c
    code/branches/pcre16/pcretest.c
    code/branches/pcre16/testdata/testinput1
    code/branches/pcre16/testdata/testinput2
    code/branches/pcre16/testdata/testinput4
    code/branches/pcre16/testdata/testinput5
    code/branches/pcre16/testdata/testoutput1
    code/branches/pcre16/testdata/testoutput10
    code/branches/pcre16/testdata/testoutput13
    code/branches/pcre16/testdata/testoutput2
    code/branches/pcre16/testdata/testoutput4
    code/branches/pcre16/testdata/testoutput5
    code/branches/pcre16/testdata/testoutput8

Added Paths:
-----------
    code/branches/pcre16/testdata/testinput16
    code/branches/pcre16/testdata/testinput17
    code/branches/pcre16/testdata/testoutput16
    code/branches/pcre16/testdata/testoutput17

Modified: code/branches/pcre16/RunTest
===================================================================
--- code/branches/pcre16/RunTest    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/RunTest    2011-12-19 13:34:10 UTC (rev 810)
@@ -18,7 +18,10 @@
 # two tests for JIT-specific features, one to be run when JIT support is
 # available, and one when it is not.

-# The arguments for this script can be individual test numbers, or the word
+# Whichever of the 8-bit and 16-bit libraries exist are tested. It is also
+# possible to select which to test by the arguments -8 or -16.
+
+# Other arguments for this script can be individual test numbers, or the word
# "valgrind", or "sim" followed by an argument to run cross-compiled
# executables under a simulator, for example:
#
@@ -26,6 +29,8 @@

valgrind=
sim=
+arg8=
+arg16=

# Select which tests to run; for those that are explicitly requested, check
# that the necessary optional facilities are available.
@@ -45,6 +50,8 @@
do13=no
do14=no
do15=no
+do16=no
+do17=no

 while [ $# -gt 0 ] ; do
   case $1 in
@@ -63,6 +70,10 @@
    13) do13=yes;;
    14) do14=yes;;
    15) do15=yes;;
+   16) do16=yes;;
+   17) do17=yes;;  
+   -8) arg8=yes;;
+  -16) arg16=yes;;   
    valgrind) valgrind="valgrind -q --smc-check=all";;
    sim) shift; sim=$1;;
     *) echo "Unknown test number $1"; exit 1;;
@@ -107,12 +118,26 @@
 if [ $? -eq 0 ] ; then
   test8=
   test16=-16
+  if [ "$arg8" = yes -a "$arg16" != yes ] ; then
+    test16=skip
+  fi    
+  if [ "$arg16" = yes -a "$arg8" != yes ] ; then
+    test8=skip
+  fi    
 else
   $sim ./pcretest -C | $sim ./pcregrep '8-bit support' >/dev/null
   if [ $? -eq 0 ] ; then
+    if [ "$arg16" = yes ] ; then
+      echo "Cannot run 16-bit library tests: 16-bit library not compiled"
+      exit 1
+    fi      
     test8=
     test16=skip
   else
+    if [ "$arg8" = yes ] ; then
+      echo "Cannot run 8-bit library tests: 8-bit library not compiled"
+      exit 1
+    fi      
     test8=skip
     test16=-16
   fi
@@ -135,6 +160,20 @@
   jitopt=-s+
 fi

+if [ "$test8" = skip ] ; then
+  if [ $do17 = yes ] ; then
+    echo "Can't run test 17 because the 8-bit library is not built"
+    exit 1
+  fi     
+fi
+
+if [ "$test16" = skip ] ; then
+  if [ $do16 = yes ] ; then
+    echo "Can't run test 16 because the 16-bit library is not built"
+    exit 1
+  fi     
+fi
+
 if [ $utf -eq 0 ] ; then
   if [ $do4 = yes ] ; then
     echo "Can't run test 4 because UTF support is not configured"
@@ -152,6 +191,12 @@
     echo "Can't run test 12 because UTF support is not configured"
     exit 1
   fi
+  if [ $do16 = yes ] ; then
+    echo "Can't run test 16 because UTF support is not configured"
+  fi     
+  if [ $do17 = yes ] ; then
+    echo "Can't run test 17 because UTF support is not configured"
+  fi     
 fi

 if [ $ucp -eq 0 ] ; then
@@ -198,7 +243,8 @@
 if [ $do1  = no -a $do2  = no -a $do3  = no -a $do4  = no -a \
      $do5  = no -a $do6  = no -a $do7  = no -a $do8  = no -a \
      $do9  = no -a $do10 = no -a $do11 = no -a $do12 = no -a \
-     $do13 = no -a $do14 = no -a $do15 = no ] ; then
+     $do13 = no -a $do14 = no -a $do15 = no -a $do16 = no -a \
+     $do17 = no ] ; then
   do1=yes
   do2=yes
   do3=yes
@@ -214,6 +260,8 @@
   do13=yes
   do14=yes
   do15=yes
+  do16=yes
+  do17=yes  
 fi

 # Show which release and which test data
@@ -226,8 +274,8 @@
   case "$bmode" in
     skip) continue;;
     -16)  if [ "$test8" != "skip" ] ; then echo ""; fi
-          echo "---- Testing 16-bit library ----"; echo "";;
-    *)    echo "---- Testing 8-bit library ----"; echo "";;
+          bits=16; echo "---- Testing 16-bit library ----"; echo "";;
+    *)    bits=8; echo "---- Testing 8-bit library ----"; echo "";;
   esac

# Primary test, compatible with JIT and all versions of Perl >= 5.8
@@ -251,7 +299,7 @@
# PCRE tests that are not JIT or Perl-compatible: API, errors, internals

 if [ $do2 = yes ] ; then
-  echo "Test 2: API, errors, internals, and non-Perl stuff (not UTF-8/16)"
+  echo "Test 2: API, errors, internals, and non-Perl stuff (not UTF-$bits)"
   for opt in "" "-s" $jitopt; do
     $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput2 testtry
     if [ $? = 0 ] ; then
@@ -336,9 +384,9 @@
 # Additional tests for UTF support

 if [ $do4 = yes ] ; then
-  echo "Test 4: UTF-8/16 support (Compatible with Perl >= 5.8)"
+  echo "Test 4: UTF-$bits support (Compatible with Perl >= 5.8)"
   if [ $utf -eq 0 ] ; then
-    echo "  Skipped because UTF support is not available"
+    echo "  Skipped because UTF-$bits support is not available"
   else
     for opt in "" "-s" $jitopt; do
       $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput4 testtry
@@ -356,9 +404,9 @@
 fi

 if [ $do5 = yes ] ; then
-  echo "Test 5: API, internals, and non-Perl stuff for UTF-8/16 support"
+  echo "Test 5: API, internals, and non-Perl stuff for UTF-$bits support"
   if [ $utf -eq 0 ] ; then
-    echo "  Skipped because UTF support is not available"
+    echo "  Skipped because UTF-$bits support is not available"
   else
     for opt in "" "-s" $jitopt; do
       $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput5 testtry
@@ -411,9 +459,9 @@
 fi

 if [ $do8 = yes ] ; then
-  echo "Test 8: DFA matching with UTF-8 or UTF-16"
+  echo "Test 8: DFA matching with UTF-$bits"
   if [ $utf -eq 0 ] ; then
-    echo "  Skipped because UTF support is not available"
+    echo "  Skipped because UTF-$bits support is not available"
   else
     for opt in "" "-s"; do
       $sim $valgrind ./pcretest -q $bmode $opt -dfa $testdata/testinput8 testtry
@@ -469,10 +517,10 @@
   fi
 fi

-# Test of Perl >= 5.10 features without UTF8 support
+# Test of Perl >= 5.10 features without UTF support

 if [ $do11 = yes ] ; then
-  echo "Test 11: Features from Perl >= 5.10 without UTF8 support"
+  echo "Test 11: Features from Perl >= 5.10 without UTF-$bits support"
   for opt in "" "-s" $jitopt; do
     $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput11 testtry
     if [ $? = 0 ] ; then
@@ -487,12 +535,12 @@
   done
 fi

-# Test of Perl >= 5.10 features with UTF8 support
+# Test of Perl >= 5.10 features with UTF support

 if [ $do12 = yes ] ; then
-  echo "Test 12: Features from Perl >= 5.10 with UTF-8 or UTF-16 support"
+  echo "Test 12: Features from Perl >= 5.10 with UTF-$bits support"
   if [ $utf -eq 0 ] ; then
-    echo "  Skipped because UTF support is not available"
+    echo "  Skipped because UTF-$bits support is not available"
   else
     for opt in "" "-s" $jitopt; do
       $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput12 testtry
@@ -565,8 +613,55 @@
   fi
 fi

+# Tests for 16-bit-specific features (needs UTF-8 support)
+
+if [ $do16 = yes ] ; then
+  echo "Test 16: specials for the 16-bit library"
+  if [ "$bits" = "8" ] ; then
+    echo "  Skipped when running 8-bit tests"
+  elif [ $utf -eq 0 ] ; then
+    echo "  Skipped because UTF-$bits support is not available"
+  else   
+    for opt in "" "-s" $jitopt; do
+      $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput16 testtry
+      if [ $? = 0 ] ; then
+        $cf $testdata/testoutput16 testtry
+        if [ $? != 0 ] ; then exit 1; fi
+      else exit 1
+      fi
+      if [ "$opt" = "-s" ] ; then echo "  OK with study"
+      elif [ "$opt" = "-s+" ] ; then echo "  OK with JIT study"
+      else echo "  OK"
+      fi
+    done
+  fi
+fi
+
+# Tests for 8-bit-specific features (needs UTF-8 support)
+
+if [ $do17 = yes ] ; then
+  echo "Test 17: specials for the 8-bit library"
+  if [ "$bits" = "16" ] ; then
+    echo "  Skipped when running 16-bit tests"
+  elif [ $utf -eq 0 ] ; then
+    echo "  Skipped because UTF-$bits support is not available"
+  else   
+    for opt in "" "-s" $jitopt; do
+      $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput17 testtry
+      if [ $? = 0 ] ; then
+        $cf $testdata/testoutput17 testtry
+        if [ $? != 0 ] ; then exit 1; fi
+      else exit 1
+      fi
+      if [ "$opt" = "-s" ] ; then echo "  OK with study"
+      elif [ "$opt" = "-s+" ] ; then echo "  OK with JIT study"
+      else echo "  OK"
+      fi
+    done
+  fi
+fi
+
 # End of loop for 8-bit/16-bit tests
-
 done

# End

Modified: code/branches/pcre16/pcre_printint.c
===================================================================
--- code/branches/pcre16/pcre_printint.c    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/pcre_printint.c    2011-12-19 13:34:10 UTC (rev 810)
@@ -473,7 +473,10 @@
     case OP_NOT:
     c = code[1];
     if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c);
-      else fprintf(f, " %s [^\\x%02x]", flag, c);
+    else if (utf || c > 0xff)
+      fprintf(f, " %s [^\\x{%02x}]", flag, c);
+    else   
+      fprintf(f, " %s [^\\x%02x]", flag, c);
     break;

     case OP_NOTSTARI:

Modified: code/branches/pcre16/pcretest.c
===================================================================
--- code/branches/pcre16/pcretest.c    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/pcretest.c    2011-12-19 13:34:10 UTC (rev 810)
@@ -219,12 +219,12 @@
   count = pcre16_exec(re, extra, (PCRE_SPTR16)bptr, len, start_offset, \
     options, offsets, size_offsets)

+#define PCRE_FREE_STUDY16(extra) \
+ pcre16_free_study(extra)
+
#define PCRE_STUDY16(extra, re, options, error) \
extra = pcre16_study(re, options, error)

-#define PCRE_FREE_STUDY16(extra) \
- pcre16_free_study(extra)
-
#endif /* SUPPORT_PCRE16 */

@@ -259,18 +259,18 @@
     PCRE_EXEC8(count, re, extra, bptr, len, start_offset, options, \
       offsets, size_offsets)

+#define PCRE_FREE_STUDY(extra) \
+  if (use_pcre16) \
+    PCRE_FREE_STUDY16(extra); \
+  else \
+    PCRE_FREE_STUDY8(extra)
+
 #define PCRE_STUDY(extra, re, options, error) \
   if (use_pcre16) \
     PCRE_STUDY16(extra, re, options, error); \
   else \
     PCRE_STUDY8(extra, re, options, error)

-#define PCRE_FREE_STUDY(extra) \
-  if (use_pcre16) \
-    PCRE_FREE_STUDY16(extra); \
-  else \
-    PCRE_FREE_STUDY8(extra)
-
 /* ----- Only 8-bit mode is supported ----- */

 #elif defined SUPPORT_PCRE8
@@ -278,8 +278,8 @@
 #define PCHARSV          PCHARSV8
 #define PCRE_COMPILE     PCRE_COMPILE8
 #define PCRE_EXEC        PCRE_EXEC8
+#define PCRE_FREE_STUDY  PCRE_FREE_STUDY8
 #define PCRE_STUDY       PCRE_STUDY8
-#define PCRE_FREE_STUDY  PCRE_FREE_STUDY8

/* ----- Only 16-bit mode is supported ----- */

@@ -288,8 +288,8 @@
 #define PCHARSV          PCHARSV16
 #define PCRE_COMPILE     PCRE_COMPILE16
 #define PCRE_EXEC        PCRE_EXEC16
+#define PCRE_FREE_STUDY  PCRE_FREE_STUDY16
 #define PCRE_STUDY       PCRE_STUDY16
-#define PCRE_FREE_STUDY  PCRE_FREE_STUDY16
 #endif

/* ----- End of mode-specific function call macros ----- */
@@ -321,7 +321,7 @@
static int first_callout;
static int locale_set = 0;
static int show_malloc;
-static int use_utf8;
+static int use_utf;
static size_t gotten_store;
static size_t first_gotten_store = 0;
static const unsigned char *last_callout_mark = NULL;
@@ -848,8 +848,17 @@
8-bit size. For a UTF-8 string, the size needed for UTF-16 is no more than
double, because up to 0xffff uses no more than 3 bytes in UTF-8 but possibly 4
in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in UTF-16. The
-result is always left in buffer16. */
+result is always left in buffer16.

+Arguments:
+  p          points to a byte string
+  utf        true if UTF-8 (to be converted to UTF-16)
+  len        number of bytes in the string (excluding trailing zero)
+  
+Returns:     number of 16-bit data items used (excluding trailing zero)
+             OR -1 if a UTF-8 string is malformed  
+*/
+
 static int
 to16(pcre_uint8 *p, int utf, int len)
 {
@@ -880,6 +889,7 @@
   while (len > 0)
     {
     int chlen = utf82ord(p, &c);
+    if (chlen <= 0) return -1;
     p += chlen;
     len -= chlen; 
     if (c < 0x10000) *pp++ = c; else
@@ -1030,6 +1040,43 @@

+/*************************************************
+*             Print one character                *
+*************************************************/
+
+/* Print a single character either literally, or as a hex escape. */
+
+static int pchar(int c, FILE *f)
+{
+if (PRINTOK(c))
+  {
+  if (f != NULL) fprintf(f, "%c", c);
+  return 1;
+  }
+  
+if (c < 0x100)
+  {
+  if (use_utf)
+    {  
+    if (f != NULL) fprintf(f, "\\x{%02x}", c);
+    return 6;
+    }  
+  else 
+    {
+    if (f != NULL) fprintf(f, "\\x%02x", c);
+    return 4; 
+    } 
+  }
+  
+if (f != NULL) fprintf(f, "\\x{%02x}", c);
+return (c <= 0x000000ff)? 6 :
+       (c <= 0x00000fff)? 7 :
+       (c <= 0x0000ffff)? 8 :
+       (c <= 0x000fffff)? 9 : 10;
+}
+
+
+
 #ifdef SUPPORT_PCRE8
 /*************************************************
 *         Print 8-bit character string           *
@@ -1046,46 +1093,20 @@
 while (length-- > 0)
   {
 #if !defined NOUTF8
-  if (use_utf8)
+  if (use_utf)
     {
     int rc = utf82ord(p, &c);
-
     if (rc > 0 && rc <= length + 1)   /* Mustn't run over the end */
       {
       length -= rc - 1;
       p += rc;
-      if (PRINTOK(c))
-        {
-        if (f != NULL) fprintf(f, "%c", c);
-        yield++;
-        }
-      else
-        {
-        int n = 4;
-        if (f != NULL) fprintf(f, "\\x{%02x}", c);
-        yield += (n <= 0x000000ff)? 2 :
-                 (n <= 0x00000fff)? 3 :
-                 (n <= 0x0000ffff)? 4 :
-                 (n <= 0x000fffff)? 5 : 6;
-        }
-      continue;
+      yield += pchar(c, f);
+      continue;  
       }
     }
 #endif
-
-   /* Not UTF-8, or malformed UTF-8  */
-
   c = *p++;
-  if (PRINTOK(c))
-    {
-    if (f != NULL) fprintf(f, "%c", c);
-    yield++;
-    }
-  else
-    {
-    if (f != NULL) fprintf(f, "\\x%02x", c);
-    yield += 4;
-    }
+  yield += pchar(c, f);
   }

 return yield;
@@ -1109,9 +1130,8 @@
 while (length-- > 0)
   {
   int c = *p++ & 0xffff;
-  
 #if !defined NOUTF8
-  if (use_utf8 && c >= 0xD800 && c < 0xDC00 && length > 0)
+  if (use_utf && c >= 0xD800 && c < 0xDC00 && length > 0)
     {
     int d = *p & 0xffff;
     if (d >= 0xDC00 && d < 0xDFFF)
@@ -1122,28 +1142,7 @@
       }
     }   
 #endif
-
-  if (PRINTOK(c))
-    {
-    if (f != NULL) fprintf(f, "%c", c);
-    yield++;
-    }
-  else
-    {
-    yield += 4;
-    if (c < 0x100)
-      {
-      if (f != NULL) fprintf(f, "\\x%02x", c);
-      }
-    else
-      {
-      if (f != NULL) fprintf(f, "\\x{%02x}", c);
-      yield += (c <= 0x000000ff)? 2 :
-               (c <= 0x00000fff)? 3 :
-               (c <= 0x0000ffff)? 4 :
-               (c <= 0x000fffff)? 5 : 6;
-      }
-    }
+  yield += pchar(c, f);
   }

return yield;
@@ -1795,7 +1794,7 @@
int do_flip = 0;
int erroroffset, len, delimiter, poffset;

- use_utf8 = 0;
+ use_utf = 0;
debug_lengths = 1;

   if (extend_inputline(infile, buffer, "  re> ") == NULL) break;
@@ -1859,7 +1858,7 @@
     /* Need to know if UTF-8 for printing data strings */

     new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
-    use_utf8 = (get_options & PCRE_UTF8) != 0;
+    use_utf = (get_options & PCRE_UTF8) != 0;

     /* Now see if there is any following study data. */

@@ -2004,7 +2003,7 @@
       case 'X': options |= PCRE_EXTRA; break;
       case 'Y': options |= PCRE_NO_START_OPTIMISE; break;
       case 'Z': debug_lengths = 0; break;
-      case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
+      case '8': options |= PCRE_UTF8; use_utf = 1; break;
       case '?': options |= PCRE_NO_UTF8_CHECK; break;

       case 'T':
@@ -2122,7 +2121,12 @@
 #ifdef SUPPORT_PCRE16
     if (use_pcre16) 
       {
-      (void)to16(p, options & PCRE_UTF8, (int)strlen((char *)p));
+      if (to16(p, options & PCRE_UTF8, (int)strlen((char *)p)) < 0)
+        {
+        fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
+          "converted to UTF-16\n"); 
+        goto SKIP_DATA;  
+        }   
       p = (pcre_uint8 *)buffer16; 
       } 
 #endif
@@ -2178,7 +2182,7 @@
     lines. */

     new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
-    if ((get_options & PCRE_UTF8) != 0) use_utf8 = 1;
+    if ((get_options & PCRE_UTF8) != 0) use_utf = 1;

     /* Extract the size for possible writing before possibly flipping it,
     and remember the store that was got. */
@@ -2395,9 +2399,9 @@
           ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
           ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
           ((get_options & PCRE_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "",
-          ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
+          ((get_options & PCRE_UTF8) != 0)? " utf" : "",
           ((get_options & PCRE_UCP) != 0)? " ucp" : "",
-          ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
+          ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf_check" : "",
           ((get_options & PCRE_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "",
           ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");

@@ -2442,11 +2446,15 @@
         const char *caseless =
           ((((real_pcre *)re)->flags & PCRE_FCH_CASELESS) == 0)?
           "" : " (caseless)";
-
+          
         if (PRINTOK(first_char))
           fprintf(outfile, "First char = \'%c\'%s\n", first_char, caseless);
         else
-          fprintf(outfile, "First char = %d%s\n", first_char, caseless);
+          { 
+          fprintf(outfile, "First char = ");
+          pchar(first_char, outfile); 
+          fprintf(outfile, "%s\n", caseless);
+          } 
         }

       if (need_char < 0)
@@ -2690,7 +2698,7 @@
           c = c * 8 + *p++ - '0';

 #if !defined NOUTF8
-        if (use_utf8 && c > 255)
+        if (use_utf && c > 255)
           {
           pcre_uint8 buff8[8];
           int ii, utn;
@@ -2722,7 +2730,7 @@
             {
             pcre_uint8 buff8[8];
             int ii, utn;
-            if (use_utf8)
+            if (use_utf)
               {
               utn = ord2utf8(c, buff8);
               for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
@@ -3055,6 +3063,12 @@
     if (use_pcre16) 
       {
       len = to16(bptr, (((real_pcre *)re)->options) & PCRE_UTF8, len);
+      if (len < 0)
+        {
+        fprintf(outfile, "**Failed: invalid UTF-8 string cannot be "
+          "converted to UTF-16\n"); 
+        goto NEXT_DATA;  
+        }   
       bptr = (pcre_uint8 *)buffer16;
       }  
 #endif
@@ -3369,7 +3383,7 @@
               bptr[start_offset] == '\r' &&
               bptr[start_offset+1] == '\n')
             onechar++;
-          else if (use_utf8)
+          else if (use_utf)
             {
             while (start_offset + onechar < len)
               {

Modified: code/branches/pcre16/testdata/testinput1
===================================================================
--- code/branches/pcre16/testdata/testinput1    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testinput1    2011-12-19 13:34:10 UTC (rev 810)
@@ -1,5 +1,6 @@
 /-- This set of tests is for features that are compatible with all versions of
-    Perl 5, in non-UTF-8 mode. --/
+    Perl 5, in non-UTF-8 mode. It should run clean for both the 8-bit and 
+    16-bit PCRE libraries. --/

 /the quick brown fox/
     the quick brown fox

Added: code/branches/pcre16/testdata/testinput16
===================================================================
--- code/branches/pcre16/testdata/testinput16                            (rev 0)
+++ code/branches/pcre16/testdata/testinput16    2011-12-19 13:34:10 UTC (rev 810)
@@ -0,0 +1,238 @@
+/-- This set of tests is for UTF-16 support, and is relevant only to the 16-bit
+    library. There are some non-UTF 16-bit tests as well (it doesn't seem
+    worth setting up another test file just for this case). --/
+
+/\xC3\xC3\xC3xxx/8?DZSS
+
+/abc/8
+    \xC3]
+
+/X(\C{3})/8
+    X\x{11234}Y
+
+/X(\C{4})/8
+    X\x{11234}YZ
+    
+/X\C*/8
+    XYZabcdce
+    
+/X\C*?/8
+    XYZabcde
+    
+/X\C{3,5}/8
+    Xabcdefg   
+    X\x{11234}Y 
+    X\x{11234}YZ
+    X\x{11234}\x{512}  
+    X\x{11234}\x{512}YZ
+    X\x{11234}\x{512}\x{11234}Z
+
+/X\C{3,5}?/8
+    Xabcdefg   
+    X\x{11234}Y 
+    X\x{11234}YZ
+    X\x{11234}\x{512}YZ  
+    *** Failers
+    X\x{11234}
+
+/a\Cb/
+    aXb
+    a\nb
+  
+/a\Cb/8
+    aXb
+    a\nb
+    
+/a\C\Cb/8 
+    a\x{12257}b
+    ** Failers 
+    a\x{100}b
+
+/ab\Cde/8
+    abXde
+    
+/-- Check maximum non-UTF character size --/
+
+/\x{ffff}/
+
+/\x{10000}/ 
+
+/\x{100}/8DZ
+
+/\x{1000}/8DZ
+
+/\x{10000}/8DZ
+
+/\x{100000}/8DZ
+
+/\x{10ffff}/8DZ
+
+/[\x{ff}]/8DZ
+
+/[\x{100}]/8DZ
+
+/\x80/8DZ
+
+/\xff/8DZ
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
+    \x{D55c}\x{ad6d}\x{C5B4} 
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+    \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+
+/\x{084}/DZ8
+
+/\x{104}/DZ8
+
+/\x{861}/DZ8
+
+/\x{212ab}/DZ8
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+    
+/X(\C)(.*)/8
+    X\x{1234}
+    X\nabc 
+
+/-- This one is here because Perl gives out a grumbly error message (quite 
+correctly, but that messes up comparisons). --/
+    
+/a\Cb/8
+    *** Failers 
+    a\x{100}b 
+    
+/[^ab\xC0-\xF0]/8SDZ
+    \x{f1}
+    \x{bf}
+    \x{100}
+    \x{1000}   
+    *** Failers
+    \x{c0} 
+    \x{f0} 
+
+/Ā{3,4}/8SDZ
+  \x{100}\x{100}\x{100}\x{100\x{100}
+
+/(\x{100}+|x)/8SDZ
+
+/(\x{100}*a|x)/8SDZ
+
+/(\x{100}{0,2}a|x)/8SDZ
+
+/(\x{100}{1,2}a|x)/8SDZ
+
+/\x{100}/8DZ
+
+/a\x{100}\x{101}*/8DZ
+
+/a\x{100}\x{101}+/8DZ
+
+/[^\x{c4}]/DZ
+
+/[\x{100}]/8DZ
+    \x{100}
+    Z\x{100}
+    \x{100}Z
+    *** Failers 
+
+/[\xff]/DZ8
+    >\x{ff}<
+
+/[^\xff]/8DZ
+
+/\x{100}abc(xyz(?1))/8DZ
+
+/\777/8I
+  \x{1ff}
+  \777 
+  
+/\x{100}+\x{200}/8DZ
+
+/\x{100}+X/8DZ
+
+/^[\QĀ\E-\QŐ\E/BZ8
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/ 
+    
+/X/8
+    \x{0}\x{d7ff}\x{e000}\x{10ffff}
+    \x{d800}
+    \x{d800}\?
+    \x{da00}
+    \x{da00}\?
+    \x{dfff}
+    \x{dfff}\?
+    \x{110000}    
+    \x{110000}\?    
+    \x{2000000} 
+    \x{2000000}\? 
+    \x{7fffffff} 
+    \x{7fffffff}\? 
+
+/(*UTF16)\x{11234}/
+  abcd\x{11234}pqr
+
+/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
+
+/\h/SI8
+    ABC\x{09}
+    ABC\x{20}
+    ABC\x{a0}
+    ABC\x{1680}
+    ABC\x{180e}
+    ABC\x{2000}
+    ABC\x{202f} 
+    ABC\x{205f} 
+    ABC\x{3000} 
+
+/\v/SI8
+    ABC\x{0a}
+    ABC\x{0b}
+    ABC\x{0c}
+    ABC\x{0d}
+    ABC\x{85}
+    ABC\x{2028}
+
+/\h*A/SI8
+    CDBABC
+    
+/\v+A/SI8
+
+/\s?xxx\s/8SI
+
+/\sxxx\s/I8ST1
+    AB\x{85}xxx\x{a0}XYZ
+    AB\x{a0}xxx\x{85}XYZ
+
+/\S \S/I8ST1
+    \x{a2} \x{84} 
+    A Z 
+
+/a+/8
+    a\x{123}aa\>1
+    a\x{123}aa\>2
+    a\x{123}aa\>3
+    a\x{123}aa\>4
+    a\x{123}aa\>5
+    a\x{123}aa\>6
+
+/\x{1234}+/iS8I
+
+/\x{1234}+?/iS8I
+
+/\x{1234}++/iS8I
+
+/\x{1234}{2}/iS8I
+
+/[^\x{c4}]/8DZ
+
+/X+\x{200}/8DZ
+
+/\R/SI8
+
+/-- End of testinput16 --/

Added: code/branches/pcre16/testdata/testinput17
===================================================================
--- code/branches/pcre16/testdata/testinput17                            (rev 0)
+++ code/branches/pcre16/testdata/testinput17    2011-12-19 13:34:10 UTC (rev 810)
@@ -0,0 +1,282 @@
+/-- This set of tests is for UTF-8 support, and is relevant only to the 8-bit 
+    library. --/
+
+/X(\C{3})/8
+    X\x{1234}
+
+/X(\C{4})/8
+    X\x{1234}YZ
+    
+/X\C*/8
+    XYZabcdce
+    
+/X\C*?/8
+    XYZabcde
+    
+/X\C{3,5}/8
+    Xabcdefg   
+    X\x{1234} 
+    X\x{1234}YZ
+    X\x{1234}\x{512}  
+    X\x{1234}\x{512}YZ
+
+/X\C{3,5}?/8
+    Xabcdefg   
+    X\x{1234} 
+    X\x{1234}YZ
+    X\x{1234}\x{512}  
+
+/a\Cb/
+    aXb
+    a\nb
+  
+/a\Cb/8
+    aXb
+    a\nb
+    
+/a\C\Cb/8 
+    a\x{100}b 
+
+/ab\Cde/8
+    abXde
+
+/a\C\Cb/8 
+    a\x{100}b
+    ** Failers 
+    a\x{12257}b
+
+/[\xC3]/8
+
+/\xC3/8
+
+/\xC3\xC3\xC3xxx/8
+
+/\xC3\xC3\xC3xxx/8?DZSS
+
+/abc/8
+    \xC3]
+    \xC3
+    \xC3\xC3\xC3
+    \xC3\xC3\xC3\?
+    \xe1\x88 
+    \P\xe1\x88 
+    \P\P\xe1\x88 
+    XX\xea
+    \O0XX\xea
+    \O1XX\xea
+    \O2XX\xea
+    XX\xf1
+    XX\xf8  
+    XX\xfc
+    ZZ\xea\xaf\x20YY
+    ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY  
+    ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY  
+    ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY  
+    ZZ\xffYY
+    ZZ\xfeYY  
+
+/anything/8
+    \xc0\x80
+    \xc1\x8f 
+    \xe0\x9f\x80
+    \xf0\x8f\x80\x80 
+    \xf8\x87\x80\x80\x80  
+    \xfc\x83\x80\x80\x80\x80
+    \xfe\x80\x80\x80\x80\x80  
+    \xff\x80\x80\x80\x80\x80  
+    \xc3\x8f
+    \xe0\xaf\x80
+    \xe1\x80\x80
+    \xf0\x9f\x80\x80 
+    \xf1\x8f\x80\x80 
+    \xf8\x88\x80\x80\x80  
+    \xf9\x87\x80\x80\x80  
+    \xfc\x84\x80\x80\x80\x80
+    \xfd\x83\x80\x80\x80\x80
+    \?\xf8\x88\x80\x80\x80  
+    \?\xf9\x87\x80\x80\x80  
+    \?\xfc\x84\x80\x80\x80\x80
+    \?\xfd\x83\x80\x80\x80\x80
+
+/\x{100}/8DZ
+
+/\x{1000}/8DZ
+
+/\x{10000}/8DZ
+
+/\x{100000}/8DZ
+
+/\x{10ffff}/8DZ
+
+/[\x{ff}]/8DZ
+
+/[\x{100}]/8DZ
+
+/\x80/8DZ
+
+/\xff/8DZ
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
+    \x{D55c}\x{ad6d}\x{C5B4} 
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+    \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+
+/\x{084}/DZ8
+
+/\x{104}/DZ8
+
+/\x{861}/DZ8
+
+/\x{212ab}/DZ8
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+    
+/X(\C)(.*)/8
+    X\x{1234}
+    X\nabc 
+
+/-- This one is here because Perl gives out a grumbly error message (quite 
+correctly, but that messes up comparisons). --/
+    
+/a\Cb/8
+    *** Failers 
+    a\x{100}b 
+    
+/[^ab\xC0-\xF0]/8SDZ
+    \x{f1}
+    \x{bf}
+    \x{100}
+    \x{1000}   
+    *** Failers
+    \x{c0} 
+    \x{f0} 
+
+/Ā{3,4}/8SDZ
+  \x{100}\x{100}\x{100}\x{100\x{100}
+
+/(\x{100}+|x)/8SDZ
+
+/(\x{100}*a|x)/8SDZ
+
+/(\x{100}{0,2}a|x)/8SDZ
+
+/(\x{100}{1,2}a|x)/8SDZ
+
+/\x{100}/8DZ
+
+/a\x{100}\x{101}*/8DZ
+
+/a\x{100}\x{101}+/8DZ
+
+/[^\x{c4}]/DZ
+
+/[\x{100}]/8DZ
+    \x{100}
+    Z\x{100}
+    \x{100}Z
+    *** Failers 
+
+/[\xff]/DZ8
+    >\x{ff}<
+
+/[^\xff]/8DZ
+
+/\x{100}abc(xyz(?1))/8DZ
+
+/a\x{1234}b/P8
+    a\x{1234}b
+
+/\777/8I
+  \x{1ff}
+  \777 
+  
+/\x{100}+\x{200}/8DZ
+
+/\x{100}+X/8DZ
+
+/^[\QĀ\E-\QŐ\E/BZ8
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/ 
+    
+/X/8
+    \x{0}\x{d7ff}\x{e000}\x{10ffff}
+    \x{d800}
+    \x{d800}\?
+    \x{da00}
+    \x{da00}\?
+    \x{dfff}
+    \x{dfff}\?
+    \x{110000}    
+    \x{110000}\?    
+    \x{2000000} 
+    \x{2000000}\? 
+    \x{7fffffff} 
+    \x{7fffffff}\? 
+
+/(*UTF8)\x{1234}/
+  abcd\x{1234}pqr
+
+/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
+
+/\h/SI8
+    ABC\x{09}
+    ABC\x{20}
+    ABC\x{a0}
+    ABC\x{1680}
+    ABC\x{180e}
+    ABC\x{2000}
+    ABC\x{202f} 
+    ABC\x{205f} 
+    ABC\x{3000} 
+
+/\v/SI8
+    ABC\x{0a}
+    ABC\x{0b}
+    ABC\x{0c}
+    ABC\x{0d}
+    ABC\x{85}
+    ABC\x{2028}
+
+/\h*A/SI8
+    CDBABC
+    
+/\v+A/SI8
+
+/\s?xxx\s/8SI
+
+/\sxxx\s/I8ST1
+    AB\x{85}xxx\x{a0}XYZ
+    AB\x{a0}xxx\x{85}XYZ
+
+/\S \S/I8ST1
+    \x{a2} \x{84} 
+    A Z 
+
+/a+/8
+    a\x{123}aa\>1
+    a\x{123}aa\>2
+    a\x{123}aa\>3
+    a\x{123}aa\>4
+    a\x{123}aa\>5
+    a\x{123}aa\>6
+
+/\x{1234}+/iS8I
+
+/\x{1234}+?/iS8I
+
+/\x{1234}++/iS8I
+
+/\x{1234}{2}/iS8I
+
+/[^\x{c4}]/8DZ
+
+/X+\x{200}/8DZ
+
+/\R/SI8
+
+/-- End of testinput17 --/

Modified: code/branches/pcre16/testdata/testinput2
===================================================================
--- code/branches/pcre16/testdata/testinput2    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testinput2    2011-12-19 13:34:10 UTC (rev 810)
@@ -5,8 +5,8 @@
     either because PCRE can't be compatible, or there is a possible Perl 
     bug.

-    NOTE: This is a non-UTF-8 set of tests. When UTF-8 is needed, use test
-    5, and if Unicode Property Support is needed, use test 13. --/  
+    NOTE: This is a non-UTF set of tests. When UTF support is needed, use
+    test 5, and if Unicode Property Support is needed, use test 13. --/

 /-- Originally, the Perl >= 5.10 things were in here too, but now I have 
     separated many (most?) of them out into test 11. However, there may still

Modified: code/branches/pcre16/testdata/testinput4
===================================================================
--- code/branches/pcre16/testdata/testinput4    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testinput4    2011-12-19 13:34:10 UTC (rev 810)
@@ -1,5 +1,6 @@
-/-- This set of tests is for UTF-8 support, excluding Unicode properties. It is
-    compatible with all versions of Perl 5. --/
+/-- This set of tests is for UTF support, excluding Unicode properties. It is
+    compatible with all versions of Perl 5 and both the 8-bit and 16-bit PCRE 
+    libraries. --/

 /a.b/8
     acb
@@ -126,31 +127,6 @@
     *** Failers
     XYZ

-/X(\C{3})/8
-    X\x{1234}
-
-/X(\C{4})/8
-    X\x{1234}YZ
-    
-/X\C*/8
-    XYZabcdce
-    
-/X\C*?/8
-    XYZabcde
-    
-/X\C{3,5}/8
-    Xabcdefg   
-    X\x{1234} 
-    X\x{1234}YZ
-    X\x{1234}\x{512}  
-    X\x{1234}\x{512}YZ
-
-/X\C{3,5}?/8
-    Xabcdefg   
-    X\x{1234} 
-    X\x{1234}YZ
-    X\x{1234}\x{512}  
-
 /[^a]+/8g
     bcd
     \x{100}aY\x{256}Z 
@@ -456,17 +432,6 @@
     \x{150}X
     \x{200}X

-/a\Cb/
-    aXb
-    a\nb
-  
-/a\Cb/8
-    aXb
-    a\nb
-    
-/a\C\Cb/8 
-    a\x{100}b 
-
 /[z-\x{100}]/8i
     z
     Z 
@@ -650,7 +615,4 @@
 /(abc)\1/8
    abc

-/ab\Cde/8
-    abXde
-
 /-- End of testinput4 --/

Modified: code/branches/pcre16/testdata/testinput5
===================================================================
--- code/branches/pcre16/testdata/testinput5    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testinput5    2011-12-19 13:34:10 UTC (rev 810)
@@ -1,22 +1,9 @@
-/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8
-    support, excluding Unicode properties. --/
+/-- This set of tests checks the API, internals, and non-Perl stuff for UTF
+    support, excluding Unicode properties. However, tests that give different
+    results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/

-/\x{100}/8DZ
-
-/\x{1000}/8DZ
-
-/\x{10000}/8DZ
-
-/\x{100000}/8DZ
-
-/\x{10ffff}/8DZ
-
/\x{110000}/8DZ

-/[\x{ff}]/8DZ
-
-/[\x{100}]/8DZ
-
/\x{ffffffff}/8

 /\x{100000000}/8
@@ -32,54 +19,18 @@
 /^\x{100}a\x{1234}/8
     \x{100}a\x{1234}bcd

-/\x80/8DZ
-
-/\xff/8DZ
-
 /\x{0041}\x{2262}\x{0391}\x{002e}/DZ8
     \x{0041}\x{2262}\x{0391}\x{002e}

-/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
-    \x{D55c}\x{ad6d}\x{C5B4} 
-
-/\x{65e5}\x{672c}\x{8a9e}/DZ8
-    \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/DZ8
-
-/\x{084}/DZ8
-
-/\x{104}/DZ8
-
-/\x{861}/DZ8
-
-/\x{212ab}/DZ8
-
 /.{3,5}X/DZ8
     \x{212ab}\x{212ab}\x{212ab}\x{861}X

-
 /.{3,5}?/DZ8
     \x{212ab}\x{212ab}\x{212ab}\x{861}

 /(?<=\C)X/8
     Should produce an error diagnostic

-/-- This one is here not because it's different to Perl, but because the way
-the captured single-byte is displayed. (In Perl it becomes a character, and you
-can't tell the difference.) --/
-    
-/X(\C)(.*)/8
-    X\x{1234}
-    X\nabc 
-
-/-- This one is here because Perl gives out a grumbly error message (quite 
-correctly, but that messes up comparisons). --/
-    
-/a\Cb/8
-    *** Failers 
-    a\x{100}b 
-    
 /^[ab]/8DZ
     bar
     *** Failers
@@ -94,26 +45,6 @@
     *** Failers 
     aaa

-/[^ab\xC0-\xF0]/8SDZ
-    \x{f1}
-    \x{bf}
-    \x{100}
-    \x{1000}   
-    *** Failers
-    \x{c0} 
-    \x{f0} 
-
-/Ā{3,4}/8SDZ
-  \x{100}\x{100}\x{100}\x{100\x{100}
-
-/(\x{100}+|x)/8SDZ
-
-/(\x{100}*a|x)/8SDZ
-
-/(\x{100}{0,2}a|x)/8SDZ
-
-/(\x{100}{1,2}a|x)/8SDZ
-
 /\x{100}*(\d+|"(?1)")/8
     1234
     "1234" 
@@ -124,33 +55,17 @@
     *** Failers 
     \x{100}\x{100}abcd

-/\x{100}/8DZ
-
/\x{100}*/8DZ

/a\x{100}*/8DZ

/ab\x{100}*/8DZ

-/a\x{100}\x{101}*/8DZ
-
-/a\x{100}\x{101}+/8DZ
-
 /\x{100}*A/8DZ
     A

/\x{100}*\d(?R)/8DZ

-/[^\x{c4}]/DZ
-
-/[^\x{c4}]/8DZ
-
-/[\x{100}]/8DZ
-    \x{100}
-    Z\x{100}
-    \x{100}Z
-    *** Failers 
-
 /[Z\x{100}]/8DZ
     Z\x{100}
     \x{100}
@@ -175,13 +90,8 @@
 /[\xFF]/DZ
     >\xff<

-/[\xff]/DZ8
-    >\x{ff}<
-
 /[^\xFF]/DZ

-/[^\xff]/8DZ
-
 /[Ä-Ü]/8
     Ö # Matches without Study
     \x{d6}
@@ -198,61 +108,6 @@
     Ö <-- Same with Study
     \x{d6}

-/[\xC3]/8
-
-/\xC3/8
-
-/\xC3\xC3\xC3xxx/8
-
-/\xC3\xC3\xC3xxx/8?DZSS
-
-/abc/8
-    \xC3]
-    \xC3
-    \xC3\xC3\xC3
-    \xC3\xC3\xC3\?
-    \xe1\x88 
-    \P\xe1\x88 
-    \P\P\xe1\x88 
-    XX\xea
-    \O0XX\xea
-    \O1XX\xea
-    \O2XX\xea
-    XX\xf1
-    XX\xf8  
-    XX\xfc
-    ZZ\xea\xaf\x20YY
-    ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY  
-    ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY  
-    ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY  
-    ZZ\xffYY
-    ZZ\xfeYY  
-
-/anything/8
-    \xc0\x80
-    \xc1\x8f 
-    \xe0\x9f\x80
-    \xf0\x8f\x80\x80 
-    \xf8\x87\x80\x80\x80  
-    \xfc\x83\x80\x80\x80\x80
-    \xfe\x80\x80\x80\x80\x80  
-    \xff\x80\x80\x80\x80\x80  
-    \xc3\x8f
-    \xe0\xaf\x80
-    \xe1\x80\x80
-    \xf0\x9f\x80\x80 
-    \xf1\x8f\x80\x80 
-    \xf8\x88\x80\x80\x80  
-    \xf9\x87\x80\x80\x80  
-    \xfc\x84\x80\x80\x80\x80
-    \xfd\x83\x80\x80\x80\x80
-    \?\xf8\x88\x80\x80\x80  
-    \?\xf9\x87\x80\x80\x80  
-    \?\xfc\x84\x80\x80\x80\x80
-    \?\xfd\x83\x80\x80\x80\x80
-
-/\x{100}abc(xyz(?1))/8DZ
-
 /[^\x{100}]abc(xyz(?1))/8DZ

 /[ab\x{100}]abc(xyz(?1))/8DZ
@@ -272,17 +127,10 @@
 /\w/8
     \x{100}X

-/a\x{1234}b/P8
-    a\x{1234}b
-
 /^\ሴ/8DZ

/\777/I

-/\777/8I
- \x{1ff}
- \777
-
/\x{100}*\d/8DZ

/\x{100}*\s/8DZ
@@ -295,12 +143,6 @@

/\x{100}*\W/8DZ

-/\x{100}+\x{200}/8DZ
-
-/\x{100}+X/8DZ
-
-/X+\x{200}/8DZ
-
/()()()()()()()()()()
()()()()()()()()()()
()()()()()()()()()()
@@ -312,8 +154,6 @@

/^[\QĀ\E-\QŐ\E]/BZ8

-/^[\QĀ\E-\QŐ\E/BZ8
-
 /^abc./mgx8<any>
     abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK

@@ -408,23 +248,6 @@
 /.*$/8<any>
     \x{1ec5}

-/-- This tests the stricter UTF-8 check according to RFC 3629. --/ 
-    
-/X/8
-    \x{0}\x{d7ff}\x{e000}\x{10ffff}
-    \x{d800}
-    \x{d800}\?
-    \x{da00}
-    \x{da00}\?
-    \x{dfff}
-    \x{dfff}\?
-    \x{110000}    
-    \x{110000}\?    
-    \x{2000000} 
-    \x{2000000}\? 
-    \x{7fffffff} 
-    \x{7fffffff}\? 
-
 /a\Rb/I8<bsr_anycrlf>
     a\rb
     a\nb
@@ -488,11 +311,6 @@
 /X/8f<any> 
     A\x{1ec5}ABCXYZ

-/(*UTF8)\x{1234}/
-  abcd\x{1234}pqr
-
-/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
-
 /Xa{2,4}b/8
     X\P
     Xa\P
@@ -776,53 +594,17 @@

/\h/SI

-/\h/SI8
-    ABC\x{09}
-    ABC\x{20}
-    ABC\x{a0}
-    ABC\x{1680}
-    ABC\x{180e}
-    ABC\x{2000}
-    ABC\x{202f} 
-    ABC\x{205f} 
-    ABC\x{3000} 
-
 /\v/SI

-/\v/SI8
-    ABC\x{0a}
-    ABC\x{0b}
-    ABC\x{0c}
-    ABC\x{0d}
-    ABC\x{85}
-    ABC\x{2028}
-
 /\R/SI

-/\R/SI8
-
-/\h*A/SI8
-    CDBABC
-    
-/\v+A/SI8
-
-/\s?xxx\s/8SI
-
 /\sxxx\s/8T1
     AB\x{85}xxx\x{a0}XYZ
     AB\x{a0}xxx\x{85}XYZ

-/\sxxx\s/I8ST1
-    AB\x{85}xxx\x{a0}XYZ
-    AB\x{a0}xxx\x{85}XYZ
-
 /\S \S/8T1
     \x{a2} \x{84}

-/\S \S/I8ST1
-    \x{a2} \x{84} 
-    A Z 
-
 'A#хц'8x<any>BZ

'A#хц
@@ -840,14 +622,6 @@
/\g{A}xxx#bх(?'A'123)
(?'A'456)/8x<any>BZ

-/a+/8
-    a\x{123}aa\>1
-    a\x{123}aa\>2
-    a\x{123}aa\>3
-    a\x{123}aa\>4
-    a\x{123}aa\>5
-    a\x{123}aa\>6
-
 /^\cģ/8

 /(\R*)(.)/s8
@@ -860,14 +634,6 @@
     \r\r\n\n\r 
     \r\r\n\n\r\n

-/\x{1234}+/iS8I
-
-/\x{1234}+?/iS8I
-
-/\x{1234}++/iS8I
-
-/\x{1234}{2}/iS8I
-
/[^\x{1234}]+/iS8I

/[^\x{1234}]+?/iS8I
@@ -889,5 +655,11 @@

 /f.*/8s
     \P\Pfor
+    
+/\x{d7ff}\x{e000}/8

+/\x{d800}/8
+
+/\x{dfff}/8
+
/-- End of testinput5 --/

Modified: code/branches/pcre16/testdata/testoutput1
===================================================================
--- code/branches/pcre16/testdata/testoutput1    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput1    2011-12-19 13:34:10 UTC (rev 810)
@@ -1,5 +1,6 @@
 /-- This set of tests is for features that are compatible with all versions of
-    Perl 5, in non-UTF-8 mode. --/
+    Perl 5, in non-UTF-8 mode. It should run clean for both the 8-bit and 
+    16-bit PCRE libraries. --/

 /the quick brown fox/
     the quick brown fox

Modified: code/branches/pcre16/testdata/testoutput10
===================================================================
--- code/branches/pcre16/testdata/testoutput10    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput10    2011-12-19 13:34:10 UTC (rev 810)
@@ -374,7 +374,7 @@
  17     End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 First char = 'A'
 Need char = '.'

@@ -387,8 +387,8 @@
  18     End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
-First char = 237
+Options: utf
+First char = \x{ed}
 Need char = 180

 /\x{65e5}\x{672c}\x{8a9e}/D8M
@@ -400,8 +400,8 @@
  18     End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
-First char = 230
+Options: utf
+First char = \x{e6}
 Need char = 158

/[\x{100}]/8BM

Modified: code/branches/pcre16/testdata/testoutput13
===================================================================
--- code/branches/pcre16/testdata/testoutput13    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput13    2011-12-19 13:34:10 UTC (rev 810)
@@ -57,7 +57,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -69,7 +69,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char
     1234
@@ -83,7 +83,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char
     1234
@@ -105,7 +105,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
 First char = 'A' (caseless)
 No need char

@@ -117,7 +117,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 First char = 'A'
 Need char = 176

@@ -129,7 +129,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 First char = 'A'
 Need char = 176

@@ -141,7 +141,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
 First char = 'A' (caseless)
 Need char = 'B' (caseless)

@@ -153,7 +153,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
 No first char
 No need char
     \x{104}
@@ -177,7 +177,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
 No first char
 No need char
     Z
@@ -215,7 +215,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
 No first char
 No need char

@@ -1049,7 +1049,7 @@

/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 17

Added: code/branches/pcre16/testdata/testoutput16
===================================================================
--- code/branches/pcre16/testdata/testoutput16                            (rev 0)
+++ code/branches/pcre16/testdata/testoutput16    2011-12-19 13:34:10 UTC (rev 810)
@@ -0,0 +1,819 @@
+/-- This set of tests is for UTF-16 support, and is relevant only to the 16-bit
+    library. There are some non-UTF 16-bit tests as well (it doesn't seem
+    worth setting up another test file just for this case). --/
+
+/\xC3\xC3\xC3xxx/8?DZSS
+**Failed: invalid UTF-8 string cannot be converted to UTF-16
+
+/abc/8
+    \xC3]
+**Failed: invalid UTF-8 string cannot be converted to UTF-16
+
+/X(\C{3})/8
+    X\x{11234}Y
+ 0: X\x{11234}Y
+ 1: \x{11234}Y
+
+/X(\C{4})/8
+    X\x{11234}YZ
+ 0: X\x{11234}YZ
+ 1: \x{11234}YZ
+    
+/X\C*/8
+    XYZabcdce
+ 0: XYZabcdce
+    
+/X\C*?/8
+    XYZabcde
+ 0: X
+    
+/X\C{3,5}/8
+    Xabcdefg   
+ 0: Xabcde
+    X\x{11234}Y 
+ 0: X\x{11234}Y
+    X\x{11234}YZ
+ 0: X\x{11234}YZ
+    X\x{11234}\x{512}  
+ 0: X\x{11234}\x{512}
+    X\x{11234}\x{512}YZ
+ 0: X\x{11234}\x{512}YZ
+    X\x{11234}\x{512}\x{11234}Z
+ 0: X\x{11234}\x{512}\x{11234}
+
+/X\C{3,5}?/8
+    Xabcdefg   
+ 0: Xabc
+    X\x{11234}Y 
+ 0: X\x{11234}Y
+    X\x{11234}YZ
+ 0: X\x{11234}Y
+    X\x{11234}\x{512}YZ  
+ 0: X\x{11234}\x{512}
+    *** Failers
+No match
+    X\x{11234}
+No match
+
+/a\Cb/
+    aXb
+ 0: aXb
+    a\nb
+ 0: a\x0ab
+  
+/a\Cb/8
+    aXb
+ 0: aXb
+    a\nb
+ 0: a\x{0a}b
+    
+/a\C\Cb/8 
+    a\x{12257}b
+ 0: a\x{12257}b
+    ** Failers 
+No match
+    a\x{100}b
+No match
+
+/ab\Cde/8
+    abXde
+ 0: abXde
+    
+/-- Check maximum non-UTF character size --/
+
+/\x{ffff}/
+
+/\x{10000}/ 
+Failed: character value in \x{...} sequence is too large at offset 8
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+
+/\x{1000}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{1000}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{1000}
+No need char
+
+/\x{10000}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{10000}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{d800}
+Need char = 56320
+
+/\x{100000}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100000}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{dbc0}
+Need char = 56320
+
+/\x{10ffff}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{10ffff}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{dbff}
+Need char = 57343
+
+/[\x{ff}]/8DZ
+------------------------------------------------------------------
+        Bra
+        \xff
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ff}
+No need char
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+
+/\x80/8DZ
+------------------------------------------------------------------
+        Bra
+        \x80
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{80}
+No need char
+
+/\xff/8DZ
+------------------------------------------------------------------
+        Bra
+        \xff
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ff}
+No need char
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
+------------------------------------------------------------------
+        Bra
+        \x{d55c}\x{ad6d}\x{c5b4}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{d55c}
+Need char = 50612
+    \x{D55c}\x{ad6d}\x{C5B4} 
+ 0: \x{d55c}\x{ad6d}\x{c5b4}
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{65e5}\x{672c}\x{8a9e}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{65e5}
+Need char = 35486
+    \x{65e5}\x{672c}\x{8a9e}
+ 0: \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x80
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{80}
+No need char
+
+/\x{084}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x84
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{84}
+No need char
+
+/\x{104}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{104}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{104}
+No need char
+
+/\x{861}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{861}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{861}
+No need char
+
+/\x{212ab}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{212ab}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{d844}
+Need char = 57003
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+    
+/X(\C)(.*)/8
+    X\x{1234}
+ 0: X\x{1234}
+ 1: \x{1234}
+ 2: 
+    X\nabc 
+ 0: X\x{0a}abc
+ 1: \x{0a}
+ 2: abc
+
+/-- This one is here because Perl gives out a grumbly error message (quite 
+correctly, but that messes up comparisons). --/
+    
+/a\Cb/8
+    *** Failers 
+No match
+    a\x{100}b 
+ 0: a\x{100}b
+    
+/[^ab\xC0-\xF0]/8SDZ
+------------------------------------------------------------------
+        Bra
+        [\x00-`c-\xbf\xf1-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e 
+  \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d 
+  \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac 
+  \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb 
+  \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb 
+  \xfc \xfd \xfe \xff 
+    \x{f1}
+ 0: \x{f1}
+    \x{bf}
+ 0: \x{bf}
+    \x{100}
+ 0: \x{100}
+    \x{1000}   
+ 0: \x{1000}
+    *** Failers
+ 0: *
+    \x{c0} 
+No match
+    \x{f0} 
+No match
+
+/Ā{3,4}/8SDZ
+------------------------------------------------------------------
+        Bra
+        \x{100}{3}
+        \x{100}?
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+Need char = 256
+Subject length lower bound = 3
+No set of starting bytes
+  \x{100}\x{100}\x{100}\x{100\x{100}
+ 0: \x{100}\x{100}\x{100}
+
+/(\x{100}+|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}+
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xff 
+
+/(\x{100}*a|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}*+
+        a
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xff 
+
+/(\x{100}{0,2}a|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}{0,2}
+        a
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xff 
+
+/(\x{100}{1,2}a|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}
+        \x{100}{0,1}
+        a
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xff 
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+
+/a\x{100}\x{101}*/8DZ
+------------------------------------------------------------------
+        Bra
+        a\x{100}
+        \x{101}*
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 256
+
+/a\x{100}\x{101}+/8DZ
+------------------------------------------------------------------
+        Bra
+        a\x{100}
+        \x{101}+
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 257
+
+/[^\x{c4}]/DZ
+------------------------------------------------------------------
+        Bra
+        [^\xc4]
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+No need char
+    \x{100}
+ 0: \x{100}
+    Z\x{100}
+ 0: \x{100}
+    \x{100}Z
+ 0: \x{100}
+    *** Failers 
+No match
+
+/[\xff]/DZ8
+------------------------------------------------------------------
+        Bra
+        \xff
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ff}
+No need char
+    >\x{ff}<
+ 0: \x{ff}
+
+/[^\xff]/8DZ
+------------------------------------------------------------------
+        Bra
+        [^\x{ff}]
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/\x{100}abc(xyz(?1))/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}abc
+        CBra 1
+        xyz
+        Recurse
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+First char = \x{100}
+Need char = 'z'
+
+/\777/8I
+Capturing subpattern count = 0
+Options: utf
+First char = \x{1ff}
+No need char
+  \x{1ff}
+ 0: \x{1ff}
+  \777 
+ 0: \x{1ff}
+  
+/\x{100}+\x{200}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}++
+        \x{200}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+Need char = 512
+
+/\x{100}+X/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}++
+        X
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{100}
+Need char = 'X'
+
+/^[\QĀ\E-\QŐ\E/BZ8
+Failed: missing terminating ] for character class at offset 13
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/ 
+    
+/X/8
+    \x{0}\x{d7ff}\x{e000}\x{10ffff}
+No match
+    \x{d800}
+Error -10 (bad UTF-8 string) offset=0 reason=1
+    \x{d800}\?
+No match
+    \x{da00}
+Error -10 (bad UTF-8 string) offset=0 reason=1
+    \x{da00}\?
+No match
+    \x{dfff}
+Error -10 (bad UTF-8 string) offset=0 reason=3
+    \x{dfff}\?
+No match
+    \x{110000}    
+Error -10 (bad UTF-8 string) offset=0 reason=3
+    \x{110000}\?    
+No match
+    \x{2000000} 
+Error -10 (bad UTF-8 string) offset=1 reason=3
+    \x{2000000}\? 
+No match
+    \x{7fffffff} 
+Error -10 (bad UTF-8 string) offset=1 reason=3
+    \x{7fffffff}\? 
+No match
+
+/(*UTF16)\x{11234}/
+  abcd\x{11234}pqr
+ 0: \x{11234}
+
+/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
+Capturing subpattern count = 0
+Options: bsr_unicode utf
+Forced newline sequence: CRLF
+First char = 'a'
+Need char = 'b'
+
+/\h/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0 \xff 
+    ABC\x{09}
+ 0: \x{09}
+    ABC\x{20}
+ 0:  
+    ABC\x{a0}
+ 0: \x{a0}
+    ABC\x{1680}
+ 0: \x{1680}
+    ABC\x{180e}
+ 0: \x{180e}
+    ABC\x{2000}
+ 0: \x{2000}
+    ABC\x{202f} 
+ 0: \x{202f}
+    ABC\x{205f} 
+ 0: \x{205f}
+    ABC\x{3000} 
+ 0: \x{3000}
+
+/\v/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff 
+    ABC\x{0a}
+ 0: \x{0a}
+    ABC\x{0b}
+ 0: \x{0b}
+    ABC\x{0c}
+ 0: \x{0c}
+    ABC\x{0d}
+ 0: \x{0d}
+    ABC\x{85}
+ 0: \x{85}
+    ABC\x{2028}
+ 0: \x{2028}
+
+/\h*A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 A \xa0 
+    CDBABC
+ 0: A
+    
+/\v+A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff 
+
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x 
+
+/\sxxx\s/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 5
+Starting byte set: \x09 \x0a \x0c \x0d \x20 \x85 \xa0 
+    AB\x{85}xxx\x{a0}XYZ
+ 0: \x{85}xxx\x{a0}
+    AB\x{a0}xxx\x{85}XYZ
+ 0: \x{a0}xxx\x{85}
+
+/\S \S/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = ' '
+Subject length lower bound = 3
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e 
+  \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d 
+  \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ 
+  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e 
+  f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 
+  \x84 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 
+  \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3 
+  \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 
+  \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 
+  \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 
+  \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf 
+  \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
+  \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
+  \xfe \xff 
+    \x{a2} \x{84} 
+ 0: \x{a2} \x{84}
+    A Z 
+ 0: A Z
+
+/a+/8
+    a\x{123}aa\>1
+ 0: aa
+    a\x{123}aa\>2
+ 0: aa
+    a\x{123}aa\>3
+ 0: a
+    a\x{123}aa\>4
+No match
+    a\x{123}aa\>5
+Error -24 (bad offset value)
+    a\x{123}aa\>6
+Error -24 (bad offset value)
+
+/\x{1234}+/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+No need char
+Subject length lower bound = 1
+No set of starting bytes
+
+/\x{1234}+?/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+No need char
+Subject length lower bound = 1
+No set of starting bytes
+
+/\x{1234}++/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+No need char
+Subject length lower bound = 1
+No set of starting bytes
+
+/\x{1234}{2}/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+First char = \x{1234}
+Need char = 4660
+Subject length lower bound = 2
+No set of starting bytes
+
+/[^\x{c4}]/8DZ
+------------------------------------------------------------------
+        Bra
+        [^\x{c4}]
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/X+\x{200}/8DZ
+------------------------------------------------------------------
+        Bra
+        X++
+        \x{200}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'X'
+Need char = 512
+
+/\R/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff 
+
+/-- End of testinput16 --/

Added: code/branches/pcre16/testdata/testoutput17
===================================================================
--- code/branches/pcre16/testdata/testoutput17                            (rev 0)
+++ code/branches/pcre16/testdata/testoutput17    2011-12-19 13:34:10 UTC (rev 810)
@@ -0,0 +1,907 @@
+/-- This set of tests is for UTF-8 support, and is relevant only to the 8-bit 
+    library. --/
+
+/X(\C{3})/8
+    X\x{1234}
+ 0: X\x{1234}
+ 1: \x{1234}
+
+/X(\C{4})/8
+    X\x{1234}YZ
+ 0: X\x{1234}Y
+ 1: \x{1234}Y
+    
+/X\C*/8
+    XYZabcdce
+ 0: XYZabcdce
+    
+/X\C*?/8
+    XYZabcde
+ 0: X
+    
+/X\C{3,5}/8
+    Xabcdefg   
+ 0: Xabcde
+    X\x{1234} 
+ 0: X\x{1234}
+    X\x{1234}YZ
+ 0: X\x{1234}YZ
+    X\x{1234}\x{512}  
+ 0: X\x{1234}\x{512}
+    X\x{1234}\x{512}YZ
+ 0: X\x{1234}\x{512}
+
+/X\C{3,5}?/8
+    Xabcdefg   
+ 0: Xabc
+    X\x{1234} 
+ 0: X\x{1234}
+    X\x{1234}YZ
+ 0: X\x{1234}
+    X\x{1234}\x{512}  
+ 0: X\x{1234}
+
+/a\Cb/
+    aXb
+ 0: aXb
+    a\nb
+ 0: a\x0ab
+  
+/a\Cb/8
+    aXb
+ 0: aXb
+    a\nb
+ 0: a\x{0a}b
+    
+/a\C\Cb/8 
+    a\x{100}b 
+ 0: a\x{100}b
+
+/ab\Cde/8
+    abXde
+ 0: abXde
+
+/a\C\Cb/8 
+    a\x{100}b
+ 0: a\x{100}b
+    ** Failers 
+No match
+    a\x{12257}b
+No match
+
+/[\xC3]/8
+Failed: invalid UTF-8 string at offset 1
+
+/\xC3/8
+Failed: invalid UTF-8 string at offset 0
+
+/\xC3\xC3\xC3xxx/8
+Failed: invalid UTF-8 string at offset 0
+
+/\xC3\xC3\xC3xxx/8?DZSS
+------------------------------------------------------------------
+        Bra
+        \X{c0}\X{c0}\X{c0}xxx
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf no_utf_check
+First char = \x{c3}
+Need char = 'x'
+
+/abc/8
+    \xC3]
+Error -10 (bad UTF-8 string) offset=0 reason=6
+    \xC3
+Error -10 (bad UTF-8 string) offset=0 reason=1
+    \xC3\xC3\xC3
+Error -10 (bad UTF-8 string) offset=0 reason=6
+    \xC3\xC3\xC3\?
+No match
+    \xe1\x88 
+Error -10 (bad UTF-8 string) offset=0 reason=1
+    \P\xe1\x88 
+Error -10 (bad UTF-8 string) offset=0 reason=1
+    \P\P\xe1\x88 
+Error -25 (short UTF-8 string) offset=0 reason=1
+    XX\xea
+Error -10 (bad UTF-8 string) offset=2 reason=2
+    \O0XX\xea
+Error -10 (bad UTF-8 string)
+    \O1XX\xea
+Error -10 (bad UTF-8 string)
+    \O2XX\xea
+Error -10 (bad UTF-8 string) offset=2 reason=2
+    XX\xf1
+Error -10 (bad UTF-8 string) offset=2 reason=3
+    XX\xf8  
+Error -10 (bad UTF-8 string) offset=2 reason=4
+    XX\xfc
+Error -10 (bad UTF-8 string) offset=2 reason=5
+    ZZ\xea\xaf\x20YY
+Error -10 (bad UTF-8 string) offset=2 reason=7
+    ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY  
+Error -10 (bad UTF-8 string) offset=2 reason=8
+    ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY  
+Error -10 (bad UTF-8 string) offset=2 reason=9
+    ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY  
+Error -10 (bad UTF-8 string) offset=2 reason=10
+    ZZ\xffYY
+Error -10 (bad UTF-8 string) offset=2 reason=21
+    ZZ\xfeYY  
+Error -10 (bad UTF-8 string) offset=2 reason=21
+
+/anything/8
+    \xc0\x80
+Error -10 (bad UTF-8 string) offset=0 reason=15
+    \xc1\x8f 
+Error -10 (bad UTF-8 string) offset=0 reason=15
+    \xe0\x9f\x80
+Error -10 (bad UTF-8 string) offset=0 reason=16
+    \xf0\x8f\x80\x80 
+Error -10 (bad UTF-8 string) offset=0 reason=17
+    \xf8\x87\x80\x80\x80  
+Error -10 (bad UTF-8 string) offset=0 reason=18
+    \xfc\x83\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=19
+    \xfe\x80\x80\x80\x80\x80  
+Error -10 (bad UTF-8 string) offset=0 reason=21
+    \xff\x80\x80\x80\x80\x80  
+Error -10 (bad UTF-8 string) offset=0 reason=21
+    \xc3\x8f
+No match
+    \xe0\xaf\x80
+No match
+    \xe1\x80\x80
+No match
+    \xf0\x9f\x80\x80 
+No match
+    \xf1\x8f\x80\x80 
+No match
+    \xf8\x88\x80\x80\x80  
+Error -10 (bad UTF-8 string) offset=0 reason=11
+    \xf9\x87\x80\x80\x80  
+Error -10 (bad UTF-8 string) offset=0 reason=11
+    \xfc\x84\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=12
+    \xfd\x83\x80\x80\x80\x80
+Error -10 (bad UTF-8 string) offset=0 reason=12
+    \?\xf8\x88\x80\x80\x80  
+No match
+    \?\xf9\x87\x80\x80\x80  
+No match
+    \?\xfc\x84\x80\x80\x80\x80
+No match
+    \?\xfd\x83\x80\x80\x80\x80
+No match
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/\x{1000}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{1000}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{e1}
+Need char = 128
+
+/\x{10000}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{10000}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f0}
+Need char = 128
+
+/\x{100000}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100000}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f4}
+Need char = 128
+
+/\x{10ffff}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{10ffff}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f4}
+Need char = 191
+
+/[\x{ff}]/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{ff}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c3}
+Need char = 191
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/\x80/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{80}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c2}
+Need char = 128
+
+/\xff/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{ff}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c3}
+Need char = 191
+
+/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
+------------------------------------------------------------------
+        Bra
+        \x{d55c}\x{ad6d}\x{c5b4}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{ed}
+Need char = 180
+    \x{D55c}\x{ad6d}\x{C5B4} 
+ 0: \x{d55c}\x{ad6d}\x{c5b4}
+
+/\x{65e5}\x{672c}\x{8a9e}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{65e5}\x{672c}\x{8a9e}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{e6}
+Need char = 158
+    \x{65e5}\x{672c}\x{8a9e}
+ 0: \x{65e5}\x{672c}\x{8a9e}
+
+/\x{80}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{80}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c2}
+Need char = 128
+
+/\x{084}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{84}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c2}
+Need char = 132
+
+/\x{104}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{104}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 132
+
+/\x{861}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{861}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{e0}
+Need char = 161
+
+/\x{212ab}/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{212ab}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{f0}
+Need char = 171
+
+/-- This one is here not because it's different to Perl, but because the way
+the captured single-byte is displayed. (In Perl it becomes a character, and you
+can't tell the difference.) --/
+    
+/X(\C)(.*)/8
+    X\x{1234}
+ 0: X\x{1234}
+ 1: \x{e1}
+ 2: \x{88}\x{b4}
+    X\nabc 
+ 0: X\x{0a}abc
+ 1: \x{0a}
+ 2: abc
+
+/-- This one is here because Perl gives out a grumbly error message (quite 
+correctly, but that messes up comparisons). --/
+    
+/a\Cb/8
+    *** Failers 
+No match
+    a\x{100}b 
+No match
+    
+/[^ab\xC0-\xF0]/8SDZ
+------------------------------------------------------------------
+        Bra
+        [\x00-`c-\xbf\xf1-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
+  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
+  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
+  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
+  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
+  \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 
+  \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf 
+  \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
+  \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
+  \xfe \xff 
+    \x{f1}
+ 0: \x{f1}
+    \x{bf}
+ 0: \x{bf}
+    \x{100}
+ 0: \x{100}
+    \x{1000}   
+ 0: \x{1000}
+    *** Failers
+ 0: *
+    \x{c0} 
+No match
+    \x{f0} 
+No match
+
+/Ā{3,4}/8SDZ
+------------------------------------------------------------------
+        Bra
+        \x{100}{3}
+        \x{100}?
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+Subject length lower bound = 3
+No set of starting bytes
+  \x{100}\x{100}\x{100}\x{100\x{100}
+ 0: \x{100}\x{100}\x{100}
+
+/(\x{100}+|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}+
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xc4 
+
+/(\x{100}*a|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}*+
+        a
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xc4 
+
+/(\x{100}{0,2}a|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}{0,2}
+        a
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: a x \xc4 
+
+/(\x{100}{1,2}a|x)/8SDZ
+------------------------------------------------------------------
+        Bra
+        CBra 1
+        \x{100}
+        \x{100}{0,1}
+        a
+        Alt
+        x
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: x \xc4 
+
+/\x{100}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/a\x{100}\x{101}*/8DZ
+------------------------------------------------------------------
+        Bra
+        a\x{100}
+        \x{101}*
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 128
+
+/a\x{100}\x{101}+/8DZ
+------------------------------------------------------------------
+        Bra
+        a\x{100}
+        \x{101}+
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'a'
+Need char = 129
+
+/[^\x{c4}]/DZ
+------------------------------------------------------------------
+        Bra
+        [^\xc4]
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+
+/[\x{100}]/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+    \x{100}
+ 0: \x{100}
+    Z\x{100}
+ 0: \x{100}
+    \x{100}Z
+ 0: \x{100}
+    *** Failers 
+No match
+
+/[\xff]/DZ8
+------------------------------------------------------------------
+        Bra
+        \x{ff}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c3}
+Need char = 191
+    >\x{ff}<
+ 0: \x{ff}
+
+/[^\xff]/8DZ
+------------------------------------------------------------------
+        Bra
+        [\x00-\xfe] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/\x{100}abc(xyz(?1))/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}abc
+        CBra 1
+        xyz
+        Recurse
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 1
+Options: utf
+First char = \x{c4}
+Need char = 'z'
+
+/a\x{1234}b/P8
+    a\x{1234}b
+ 0: a\x{1234}b
+
+/\777/8I
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c7}
+Need char = 191
+  \x{1ff}
+ 0: \x{1ff}
+  \777 
+ 0: \x{1ff}
+  
+/\x{100}+\x{200}/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}++
+        \x{200}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 128
+
+/\x{100}+X/8DZ
+------------------------------------------------------------------
+        Bra
+        \x{100}++
+        X
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = \x{c4}
+Need char = 'X'
+
+/^[\QĀ\E-\QŐ\E/BZ8
+Failed: missing terminating ] for character class at offset 15
+
+/-- This tests the stricter UTF-8 check according to RFC 3629. --/ 
+    
+/X/8
+    \x{0}\x{d7ff}\x{e000}\x{10ffff}
+No match
+    \x{d800}
+Error -10 (bad UTF-8 string) offset=0 reason=14
+    \x{d800}\?
+No match
+    \x{da00}
+Error -10 (bad UTF-8 string) offset=0 reason=14
+    \x{da00}\?
+No match
+    \x{dfff}
+Error -10 (bad UTF-8 string) offset=0 reason=14
+    \x{dfff}\?
+No match
+    \x{110000}    
+Error -10 (bad UTF-8 string) offset=0 reason=13
+    \x{110000}\?    
+No match
+    \x{2000000} 
+Error -10 (bad UTF-8 string) offset=0 reason=11
+    \x{2000000}\? 
+No match
+    \x{7fffffff} 
+Error -10 (bad UTF-8 string) offset=0 reason=12
+    \x{7fffffff}\? 
+No match
+
+/(*UTF8)\x{1234}/
+  abcd\x{1234}pqr
+ 0: \x{1234}
+
+/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
+Capturing subpattern count = 0
+Options: bsr_unicode utf
+Forced newline sequence: CRLF
+First char = 'a'
+Need char = 'b'
+
+/\h/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 
+    ABC\x{09}
+ 0: \x{09}
+    ABC\x{20}
+ 0:  
+    ABC\x{a0}
+ 0: \x{a0}
+    ABC\x{1680}
+ 0: \x{1680}
+    ABC\x{180e}
+ 0: \x{180e}
+    ABC\x{2000}
+ 0: \x{2000}
+    ABC\x{202f} 
+ 0: \x{202f}
+    ABC\x{205f} 
+ 0: \x{205f}
+    ABC\x{3000} 
+ 0: \x{3000}
+
+/\v/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
+    ABC\x{0a}
+ 0: \x{0a}
+    ABC\x{0b}
+ 0: \x{0b}
+    ABC\x{0c}
+ 0: \x{0c}
+    ABC\x{0d}
+ 0: \x{0d}
+    ABC\x{85}
+ 0: \x{85}
+    ABC\x{2028}
+ 0: \x{2028}
+
+/\h*A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 
+    CDBABC
+ 0: A
+    
+/\v+A/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'A'
+Subject length lower bound = 2
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
+
+/\s?xxx\s/8SI
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 4
+Starting byte set: \x09 \x0a \x0c \x0d \x20 x 
+
+/\sxxx\s/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = 'x'
+Subject length lower bound = 5
+Starting byte set: \x09 \x0a \x0c \x0d \x20 \xc2 
+    AB\x{85}xxx\x{a0}XYZ
+ 0: \x{85}xxx\x{a0}
+    AB\x{a0}xxx\x{85}XYZ
+ 0: \x{a0}xxx\x{85}
+
+/\S \S/I8ST1
+Capturing subpattern count = 0
+Options: utf
+No first char
+Need char = ' '
+Subject length lower bound = 3
+Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e 
+  \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d 
+  \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ 
+  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e 
+  f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 
+  \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 
+  \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 
+  \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 
+  \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
+    \x{a2} \x{84} 
+ 0: \x{a2} \x{84}
+    A Z 
+ 0: A Z
+
+/a+/8
+    a\x{123}aa\>1
+ 0: aa
+    a\x{123}aa\>2
+Error -11 (bad UTF-8 offset)
+    a\x{123}aa\>3
+ 0: aa
+    a\x{123}aa\>4
+ 0: a
+    a\x{123}aa\>5
+No match
+    a\x{123}aa\>6
+Error -24 (bad offset value)
+
+/\x{1234}+/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \xe1 
+
+/\x{1234}+?/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \xe1 
+
+/\x{1234}++/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \xe1 
+
+/\x{1234}{2}/iS8I
+Capturing subpattern count = 0
+Options: caseless utf
+No first char
+No need char
+Subject length lower bound = 2
+Starting byte set: \xe1 
+
+/[^\x{c4}]/8DZ
+------------------------------------------------------------------
+        Bra
+        [\x00-\xc3\xc5-\xff] (neg)
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+
+/X+\x{200}/8DZ
+------------------------------------------------------------------
+        Bra
+        X++
+        \x{200}
+        Ket
+        End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: utf
+First char = 'X'
+Need char = 128
+
+/\R/SI8
+Capturing subpattern count = 0
+Options: utf
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
+
+/-- End of testinput17 --/

Modified: code/branches/pcre16/testdata/testoutput2
===================================================================
--- code/branches/pcre16/testdata/testoutput2    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput2    2011-12-19 13:34:10 UTC (rev 810)
@@ -5,8 +5,8 @@
     either because PCRE can't be compatible, or there is a possible Perl 
     bug.

-    NOTE: This is a non-UTF-8 set of tests. When UTF-8 is needed, use test
-    5, and if Unicode Property Support is needed, use test 13. --/  
+    NOTE: This is a non-UTF set of tests. When UTF support is needed, use
+    test 5, and if Unicode Property Support is needed, use test 13. --/

 /-- Originally, the Perl >= 5.10 things were in here too, but now I have 
     separated many (most?) of them out into test 11. However, there may still 
@@ -6178,7 +6178,7 @@
 /\x{0000ff}/I
 Capturing subpattern count = 0
 No options
-First char = 255
+First char = \xff
 No need char

/^((?P<A>a1)|(?P<A>a2)b)/I

Modified: code/branches/pcre16/testdata/testoutput4
===================================================================
--- code/branches/pcre16/testdata/testoutput4    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput4    2011-12-19 13:34:10 UTC (rev 810)
@@ -1,5 +1,6 @@
-/-- This set of tests is for UTF-8 support, excluding Unicode properties. It is
-    compatible with all versions of Perl 5. --/
+/-- This set of tests is for UTF support, excluding Unicode properties. It is
+    compatible with all versions of Perl 5 and both the 8-bit and 16-bit PCRE 
+    libraries. --/

 /a.b/8
     acb
@@ -255,46 +256,6 @@
     XYZ 
 No match

-/X(\C{3})/8
-    X\x{1234}
- 0: X\x{1234}
- 1: \x{1234}
-
-/X(\C{4})/8
-    X\x{1234}YZ
- 0: X\x{1234}Y
- 1: \x{1234}Y
-    
-/X\C*/8
-    XYZabcdce
- 0: XYZabcdce
-    
-/X\C*?/8
-    XYZabcde
- 0: X
-    
-/X\C{3,5}/8
-    Xabcdefg   
- 0: Xabcde
-    X\x{1234} 
- 0: X\x{1234}
-    X\x{1234}YZ
- 0: X\x{1234}YZ
-    X\x{1234}\x{512}  
- 0: X\x{1234}\x{512}
-    X\x{1234}\x{512}YZ
- 0: X\x{1234}\x{512}
-
-/X\C{3,5}?/8
-    Xabcdefg   
- 0: Xabc
-    X\x{1234} 
- 0: X\x{1234}
-    X\x{1234}YZ
- 0: X\x{1234}
-    X\x{1234}\x{512}  
- 0: X\x{1234}
-
 /[^a]+/8g
     bcd
  0: bcd
@@ -791,22 +752,6 @@
     \x{200}X   
 No match

-/a\Cb/
-    aXb
- 0: aXb
-    a\nb
- 0: a\x0ab
-  
-/a\Cb/8
-    aXb
- 0: aXb
-    a\nb
- 0: a\x{0a}b
-    
-/a\C\Cb/8 
-    a\x{100}b 
- 0: a\x{100}b
-
 /[z-\x{100}]/8i
     z
  0: z
@@ -1136,8 +1081,4 @@
    abc
 No match

-/ab\Cde/8
-    abXde
- 0: abXde
-
 /-- End of testinput4 --/

Modified: code/branches/pcre16/testdata/testoutput5
===================================================================
--- code/branches/pcre16/testdata/testoutput5    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput5    2011-12-19 13:34:10 UTC (rev 810)
@@ -1,93 +1,10 @@
-/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8
-    support, excluding Unicode properties. --/
+/-- This set of tests checks the API, internals, and non-Perl stuff for UTF
+    support, excluding Unicode properties. However, tests that give different
+    results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/

-/\x{100}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
-/\x{1000}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{1000}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 225
-Need char = 128
-
-/\x{10000}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{10000}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 128
-
-/\x{100000}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100000}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 244
-Need char = 128
-
-/\x{10ffff}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{10ffff}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 244
-Need char = 191
-
 /\x{110000}/8DZ
 Failed: character value in \x{...} sequence is too large at offset 9

-/[\x{ff}]/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{ff}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-
-/[\x{100}]/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
 /\x{ffffffff}/8
 Failed: character value in \x{...} sequence is too large at offset 11

@@ -108,30 +25,6 @@
     \x{100}a\x{1234}bcd
  0: \x{100}a\x{1234}

-/\x80/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{80}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\xff/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{ff}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-
 /\x{0041}\x{2262}\x{0391}\x{002e}/DZ8
 ------------------------------------------------------------------
         Bra
@@ -140,100 +33,12 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 First char = 'A'
 Need char = '.'
     \x{0041}\x{2262}\x{0391}\x{002e}
  0: A\x{2262}\x{391}.

-/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 
-------------------------------------------------------------------
-        Bra
-        \x{d55c}\x{ad6d}\x{c5b4}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 237
-Need char = 180
-    \x{D55c}\x{ad6d}\x{C5B4} 
- 0: \x{d55c}\x{ad6d}\x{c5b4}
-
-/\x{65e5}\x{672c}\x{8a9e}/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{65e5}\x{672c}\x{8a9e}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 230
-Need char = 158
-    \x{65e5}\x{672c}\x{8a9e}
- 0: \x{65e5}\x{672c}\x{8a9e}
-
-/\x{80}/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{80}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 128
-
-/\x{084}/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{84}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 194
-Need char = 132
-
-/\x{104}/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{104}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 132
-
-/\x{861}/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{861}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 224
-Need char = 161
-
-/\x{212ab}/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{212ab}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 240
-Need char = 171
-
 /.{3,5}X/DZ8
 ------------------------------------------------------------------
         Bra
@@ -244,13 +49,12 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 Need char = 'X'
     \x{212ab}\x{212ab}\x{212ab}\x{861}X
  0: \x{212ab}\x{212ab}\x{212ab}\x{861}X

-
 /.{3,5}?/DZ8
 ------------------------------------------------------------------
         Bra
@@ -260,7 +64,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char
     \x{212ab}\x{212ab}\x{212ab}\x{861}
@@ -269,29 +73,6 @@
 /(?<=\C)X/8
 Failed: \C not allowed in lookbehind assertion at offset 6

-/-- This one is here not because it's different to Perl, but because the way
-the captured single-byte is displayed. (In Perl it becomes a character, and you
-can't tell the difference.) --/
-    
-/X(\C)(.*)/8
-    X\x{1234}
- 0: X\x{1234}
- 1: \xe1
- 2: \x88\xb4
-    X\nabc 
- 0: X\x{0a}abc
- 1: \x{0a}
- 2: abc
-
-/-- This one is here because Perl gives out a grumbly error message (quite 
-correctly, but that messes up comparisons). --/
-    
-/a\Cb/8
-    *** Failers 
-No match
-    a\x{100}b 
-No match
-    
 /^[ab]/8DZ
 ------------------------------------------------------------------
         Bra
@@ -301,7 +82,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: anchored utf8
+Options: anchored utf
 No first char
 No need char
     bar
@@ -324,7 +105,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: anchored utf8
+Options: anchored utf
 No first char
 No need char
     c
@@ -338,136 +119,6 @@
     aaa
 No match

-/[^ab\xC0-\xF0]/8SDZ
-------------------------------------------------------------------
-        Bra
-        [\x00-`c-\xbf\xf1-\xff] (neg)
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a 
-  \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 
-  \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 
-  5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y 
-  Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f 
-  \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 
-  \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf 
-  \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee 
-  \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd 
-  \xfe \xff 
-    \x{f1}
- 0: \x{f1}
-    \x{bf}
- 0: \x{bf}
-    \x{100}
- 0: \x{100}
-    \x{1000}   
- 0: \x{1000}
-    *** Failers
- 0: *
-    \x{c0} 
-No match
-    \x{f0} 
-No match
-
-/Ā{3,4}/8SDZ
-------------------------------------------------------------------
-        Bra
-        \x{100}{3}
-        \x{100}?
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-Subject length lower bound = 3
-No set of starting bytes
-  \x{100}\x{100}\x{100}\x{100\x{100}
- 0: \x{100}\x{100}\x{100}
-
-/(\x{100}+|x)/8SDZ
-------------------------------------------------------------------
-        Bra
-        CBra 1
-        \x{100}+
-        Alt
-        x
-        Ket
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: x \xc4 
-
-/(\x{100}*a|x)/8SDZ
-------------------------------------------------------------------
-        Bra
-        CBra 1
-        \x{100}*+
-        a
-        Alt
-        x
-        Ket
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: a x \xc4 
-
-/(\x{100}{0,2}a|x)/8SDZ
-------------------------------------------------------------------
-        Bra
-        CBra 1
-        \x{100}{0,2}
-        a
-        Alt
-        x
-        Ket
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: a x \xc4 
-
-/(\x{100}{1,2}a|x)/8SDZ
-------------------------------------------------------------------
-        Bra
-        CBra 1
-        \x{100}
-        \x{100}{0,1}
-        a
-        Alt
-        x
-        Ket
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: x \xc4 
-
 /\x{100}*(\d+|"(?1)")/8
     1234
  0: 1234
@@ -492,18 +143,6 @@
     \x{100}\x{100}abcd
 No match

-/\x{100}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
 /\x{100}*/8DZ
 ------------------------------------------------------------------
         Bra
@@ -512,7 +151,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -525,7 +164,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 First char = 'a'
 No need char

@@ -538,36 +177,10 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 First char = 'a'
 Need char = 'b'

-/a\x{100}\x{101}*/8DZ
-------------------------------------------------------------------
-        Bra
-        a\x{100}
-        \x{101}*
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'a'
-Need char = 128
-
-/a\x{100}\x{101}+/8DZ
-------------------------------------------------------------------
-        Bra
-        a\x{100}
-        \x{101}+
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'a'
-Need char = 129
-
 /\x{100}*A/8DZ
 ------------------------------------------------------------------
         Bra
@@ -577,7 +190,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 Need char = 'A'
     A
@@ -593,54 +206,10 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

-/[^\x{c4}]/DZ
-------------------------------------------------------------------
-        Bra
-        [^\xc4]
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-
-/[^\x{c4}]/8DZ
-------------------------------------------------------------------
-        Bra
-        [\x00-\xc3\xc5-\xff] (neg)
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-
-/[\x{100}]/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-    \x{100}
- 0: \x{100}
-    Z\x{100}
- 0: \x{100}
-    \x{100}Z
- 0: \x{100}
-    *** Failers 
-No match
-
 /[Z\x{100}]/8DZ
 ------------------------------------------------------------------
         Bra
@@ -649,7 +218,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char
     Z\x{100}
@@ -684,7 +253,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -696,7 +265,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char
     \x{100}
@@ -713,25 +282,11 @@
 ------------------------------------------------------------------
 Capturing subpattern count = 0
 No options
-First char = 255
+First char = \xff
 No need char
     >\xff<
  0: \xff

-/[\xff]/DZ8
-------------------------------------------------------------------
-        Bra
-        \x{ff}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 195
-Need char = 191
-    >\x{ff}<
- 0: \x{ff}
-
 /[^\xFF]/DZ
 ------------------------------------------------------------------
         Bra
@@ -744,18 +299,6 @@
 No first char
 No need char

-/[^\xff]/8DZ
-------------------------------------------------------------------
-        Bra
-        [\x00-\xfe] (neg)
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-
 /[Ä-Ü]/8
     Ö # Matches without Study
  0: \x{d6}
@@ -780,129 +323,6 @@
     \x{d6} 
  0: \x{d6}

-/[\xC3]/8
-Failed: invalid UTF-8 string at offset 1
-
-/\xC3/8
-Failed: invalid UTF-8 string at offset 0
-
-/\xC3\xC3\xC3xxx/8
-Failed: invalid UTF-8 string at offset 0
-
-/\xC3\xC3\xC3xxx/8?DZSS
-------------------------------------------------------------------
-        Bra
-        \X{c0}\X{c0}\X{c0}xxx
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8 no_utf8_check
-First char = 195
-Need char = 'x'
-
-/abc/8
-    \xC3]
-Error -10 (bad UTF-8 string) offset=0 reason=6
-    \xC3
-Error -10 (bad UTF-8 string) offset=0 reason=1
-    \xC3\xC3\xC3
-Error -10 (bad UTF-8 string) offset=0 reason=6
-    \xC3\xC3\xC3\?
-No match
-    \xe1\x88 
-Error -10 (bad UTF-8 string) offset=0 reason=1
-    \P\xe1\x88 
-Error -10 (bad UTF-8 string) offset=0 reason=1
-    \P\P\xe1\x88 
-Error -25 (short UTF-8 string) offset=0 reason=1
-    XX\xea
-Error -10 (bad UTF-8 string) offset=2 reason=2
-    \O0XX\xea
-Error -10 (bad UTF-8 string)
-    \O1XX\xea
-Error -10 (bad UTF-8 string)
-    \O2XX\xea
-Error -10 (bad UTF-8 string) offset=2 reason=2
-    XX\xf1
-Error -10 (bad UTF-8 string) offset=2 reason=3
-    XX\xf8  
-Error -10 (bad UTF-8 string) offset=2 reason=4
-    XX\xfc
-Error -10 (bad UTF-8 string) offset=2 reason=5
-    ZZ\xea\xaf\x20YY
-Error -10 (bad UTF-8 string) offset=2 reason=7
-    ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY  
-Error -10 (bad UTF-8 string) offset=2 reason=8
-    ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY  
-Error -10 (bad UTF-8 string) offset=2 reason=9
-    ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY  
-Error -10 (bad UTF-8 string) offset=2 reason=10
-    ZZ\xffYY
-Error -10 (bad UTF-8 string) offset=2 reason=21
-    ZZ\xfeYY  
-Error -10 (bad UTF-8 string) offset=2 reason=21
-
-/anything/8
-    \xc0\x80
-Error -10 (bad UTF-8 string) offset=0 reason=15
-    \xc1\x8f 
-Error -10 (bad UTF-8 string) offset=0 reason=15
-    \xe0\x9f\x80
-Error -10 (bad UTF-8 string) offset=0 reason=16
-    \xf0\x8f\x80\x80 
-Error -10 (bad UTF-8 string) offset=0 reason=17
-    \xf8\x87\x80\x80\x80  
-Error -10 (bad UTF-8 string) offset=0 reason=18
-    \xfc\x83\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=19
-    \xfe\x80\x80\x80\x80\x80  
-Error -10 (bad UTF-8 string) offset=0 reason=21
-    \xff\x80\x80\x80\x80\x80  
-Error -10 (bad UTF-8 string) offset=0 reason=21
-    \xc3\x8f
-No match
-    \xe0\xaf\x80
-No match
-    \xe1\x80\x80
-No match
-    \xf0\x9f\x80\x80 
-No match
-    \xf1\x8f\x80\x80 
-No match
-    \xf8\x88\x80\x80\x80  
-Error -10 (bad UTF-8 string) offset=0 reason=11
-    \xf9\x87\x80\x80\x80  
-Error -10 (bad UTF-8 string) offset=0 reason=11
-    \xfc\x84\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=12
-    \xfd\x83\x80\x80\x80\x80
-Error -10 (bad UTF-8 string) offset=0 reason=12
-    \?\xf8\x88\x80\x80\x80  
-No match
-    \?\xf9\x87\x80\x80\x80  
-No match
-    \?\xfc\x84\x80\x80\x80\x80
-No match
-    \?\xfd\x83\x80\x80\x80\x80
-No match
-
-/\x{100}abc(xyz(?1))/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}abc
-        CBra 1
-        xyz
-        Recurse
-        Ket
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 1
-Options: utf8
-First char = 196
-Need char = 'z'
-
 /[^\x{100}]abc(xyz(?1))/8DZ
 ------------------------------------------------------------------
         Bra
@@ -916,7 +336,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 1
-Options: utf8
+Options: utf
 No first char
 Need char = 'z'

@@ -933,7 +353,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 1
-Options: utf8
+Options: utf
 No first char
 Need char = 'z'

@@ -953,7 +373,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 2
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -984,7 +404,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 2
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1004,7 +424,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 2
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1035,7 +455,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 2
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1049,10 +469,6 @@
     \x{100}X   
  0: X

-/a\x{1234}b/P8
-    a\x{1234}b
- 0: a\x{1234}b
-
 /^\ሴ/8DZ
 ------------------------------------------------------------------
         Bra
@@ -1062,23 +478,13 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: anchored utf8
+Options: anchored utf
 No first char
 No need char

/\777/I
Failed: octal value is greater than \377 (not in UTF-8 mode) at offset 3

-/\777/8I
-Capturing subpattern count = 0
-Options: utf8
-First char = 199
-Need char = 191
-  \x{1ff}
- 0: \x{1ff}
-  \777 
- 0: \x{1ff}
-  
 /\x{100}*\d/8DZ
 ------------------------------------------------------------------
         Bra
@@ -1088,7 +494,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1101,7 +507,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1114,7 +520,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1127,7 +533,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1140,7 +546,7 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

@@ -1153,49 +559,10 @@
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
-Options: utf8
+Options: utf
 No first char
 No need char

-/\x{100}+\x{200}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}++
-        \x{200}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 128
-
-/\x{100}+X/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{100}++
-        X
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 196
-Need char = 'X'
-
-/X+\x{200}/8DZ
-------------------------------------------------------------------
-        Bra
-        X++
-        \x{200}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 'X'
-Need char = 128
-
 /()()()()()()()()()()
  ()()()()()()()()()()
  ()()()()()()()()()()
@@ -1237,9 +604,6 @@
         End
 ------------------------------------------------------------------

-/^[\QĀ\E-\QŐ\E/BZ8
-Failed: missing terminating ] for character class at offset 15
-
 /^abc./mgx8<any>
     abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK
  0: abc1
@@ -1442,39 +806,9 @@
     \x{1ec5} 
  0: \x{1ec5}

-/-- This tests the stricter UTF-8 check according to RFC 3629. --/ 
-    
-/X/8
-    \x{0}\x{d7ff}\x{e000}\x{10ffff}
-No match
-    \x{d800}
-Error -10 (bad UTF-8 string) offset=0 reason=14
-    \x{d800}\?
-No match
-    \x{da00}
-Error -10 (bad UTF-8 string) offset=0 reason=14
-    \x{da00}\?
-No match
-    \x{dfff}
-Error -10 (bad UTF-8 string) offset=0 reason=14
-    \x{dfff}\?
-No match
-    \x{110000}    
-Error -10 (bad UTF-8 string) offset=0 reason=13
-    \x{110000}\?    
-No match
-    \x{2000000} 
-Error -10 (bad UTF-8 string) offset=0 reason=11
-    \x{2000000}\? 
-No match
-    \x{7fffffff} 
-Error -10 (bad UTF-8 string) offset=0 reason=12
-    \x{7fffffff}\? 
-No match
-
 /a\Rb/I8<bsr_anycrlf>
 Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1492,7 +826,7 @@

 /a\Rb/I8<bsr_unicode>
 Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1514,7 +848,7 @@

 /a\R?b/I8<bsr_anycrlf>
 Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1532,7 +866,7 @@

 /a\R?b/I8<bsr_unicode>
 Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1598,17 +932,6 @@
     A\x{1ec5}ABCXYZ
  0: X

-/(*UTF8)\x{1234}/
-  abcd\x{1234}pqr
- 0: \x{1234}
-
-/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I
-Capturing subpattern count = 0
-Options: bsr_unicode utf8
-Forced newline sequence: CRLF
-First char = 'a'
-Need char = 'b'
-
 /Xa{2,4}b/8
     X\P
 Partial match: X
@@ -2094,32 +1417,6 @@
 Subject length lower bound = 1
 Starting byte set: \x09 \x20 \xa0

-/\h/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 
-    ABC\x{09}
- 0: \x{09}
-    ABC\x{20}
- 0:  
-    ABC\x{a0}
- 0: \x{a0}
-    ABC\x{1680}
- 0: \x{1680}
-    ABC\x{180e}
- 0: \x{180e}
-    ABC\x{2000}
- 0: \x{2000}
-    ABC\x{202f} 
- 0: \x{202f}
-    ABC\x{205f} 
- 0: \x{205f}
-    ABC\x{3000} 
- 0: \x{3000}
-
 /\v/SI
 Capturing subpattern count = 0
 No options
@@ -2128,26 +1425,6 @@
 Subject length lower bound = 1
 Starting byte set: \x0a \x0b \x0c \x0d \x85

-/\v/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
-    ABC\x{0a}
- 0: \x{0a}
-    ABC\x{0b}
- 0: \x{0b}
-    ABC\x{0c}
- 0: \x{0c}
-    ABC\x{0d}
- 0: \x{0d}
-    ABC\x{85}
- 0: \x{85}
-    ABC\x{2028}
- 0: \x{2028}
-
 /\R/SI
 Capturing subpattern count = 0
 No options
@@ -2156,82 +1433,16 @@
 Subject length lower bound = 1
 Starting byte set: \x0a \x0b \x0c \x0d \x85

-/\R/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
-
-/\h*A/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'A'
-Subject length lower bound = 1
-Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 
-    CDBABC
- 0: A
-    
-/\v+A/SI8
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'A'
-Subject length lower bound = 2
-Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 
-
-/\s?xxx\s/8SI
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'x'
-Subject length lower bound = 4
-Starting byte set: \x09 \x0a \x0c \x0d \x20 x 
-
 /\sxxx\s/8T1
     AB\x{85}xxx\x{a0}XYZ
  0: \x{85}xxx\x{a0}
     AB\x{a0}xxx\x{85}XYZ
  0: \x{a0}xxx\x{85}

-/\sxxx\s/I8ST1
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = 'x'
-Subject length lower bound = 5
-Starting byte set: \x09 \x0a \x0c \x0d \x20 \xc2 
-    AB\x{85}xxx\x{a0}XYZ
- 0: \x{85}xxx\x{a0}
-    AB\x{a0}xxx\x{85}XYZ
- 0: \x{a0}xxx\x{85}
-
 /\S \S/8T1
     \x{a2} \x{84} 
  0: \x{a2} \x{84}

-/\S \S/I8ST1
-Capturing subpattern count = 0
-Options: utf8
-No first char
-Need char = ' '
-Subject length lower bound = 3
-Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e 
-  \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d 
-  \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ 
-  A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e 
-  f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 
-  \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 
-  \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 
-  \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 
-  \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff 
-    \x{a2} \x{84} 
- 0: \x{a2} \x{84}
-    A Z 
- 0: A Z
-
 'A#хц'8x<any>BZ
 ------------------------------------------------------------------
         Bra
@@ -2295,20 +1506,6 @@
         End
 ------------------------------------------------------------------

-/a+/8
-    a\x{123}aa\>1
- 0: aa
-    a\x{123}aa\>2
-Error -11 (bad UTF-8 offset)
-    a\x{123}aa\>3
- 0: aa
-    a\x{123}aa\>4
- 0: a
-    a\x{123}aa\>5
-No match
-    a\x{123}aa\>6
-Error -24 (bad offset value)
-
 /^\cģ/8
 Failed: \c must be followed by an ASCII character at offset 3

@@ -2340,41 +1537,9 @@
1: \x{0a}
2: \x{0d}

-/\x{1234}+/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \xe1
-
-/\x{1234}+?/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \xe1
-
-/\x{1234}++/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \xe1
-
-/\x{1234}{2}/iS8I
-Capturing subpattern count = 0
-Options: caseless utf8
-No first char
-No need char
-Subject length lower bound = 2
-Starting byte set: \xe1
-
/[^\x{1234}]+/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 1
@@ -2382,7 +1547,7 @@

/[^\x{1234}]+?/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 1
@@ -2390,7 +1555,7 @@

/[^\x{1234}]++/iS8I
Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
No first char
No need char
Subject length lower bound = 1
@@ -2398,7 +1563,7 @@

 /[^\x{1234}]{2}/iS8I
 Capturing subpattern count = 0
-Options: caseless utf8
+Options: caseless utf
 No first char
 No need char
 Subject length lower bound = 2
@@ -2422,5 +1587,13 @@
 /f.*/8s
     \P\Pfor
 Partial match: for
+    
+/\x{d7ff}\x{e000}/8

+/\x{d800}/8
+Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{dfff}/8
+Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
/-- End of testinput5 --/

Modified: code/branches/pcre16/testdata/testoutput8
===================================================================
--- code/branches/pcre16/testdata/testoutput8    2011-12-19 11:04:45 UTC (rev 809)
+++ code/branches/pcre16/testdata/testoutput8    2011-12-19 13:34:10 UTC (rev 810)
@@ -1210,7 +1210,7 @@

 /a\Rb/I8<bsr_anycrlf>
 Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1228,7 +1228,7 @@

 /a\Rb/I8<bsr_unicode>
 Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1250,7 +1250,7 @@

 /a\R?b/I8<bsr_anycrlf>
 Capturing subpattern count = 0
-Options: bsr_anycrlf utf8
+Options: bsr_anycrlf utf
 First char = 'a'
 Need char = 'b'
     a\rb
@@ -1268,7 +1268,7 @@

 /a\R?b/I8<bsr_unicode>
 Capturing subpattern count = 0
-Options: bsr_unicode utf8
+Options: bsr_unicode utf
 First char = 'a'
 Need char = 'b'
     a\rb

Diese Nachricht ist Teil des folgenden Threads:
	Der komplette Thread sortiert nach Datum

[Pcre-svn] [810] code/branches/pcre16: A lot more work on pc…