[Pcre-svn] [781] code/branches/pcre16: renaming utf8 to utf,…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [781] code/branches/pcre16: renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars
Revision: 781
          http://vcs.pcre.org/viewvc?view=rev&revision=781
Author:   zherczeg
Date:     2011-12-03 07:58:30 +0000 (Sat, 03 Dec 2011)


Log Message:
-----------
renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars

Modified Paths:
--------------
    code/branches/pcre16/Makefile.am
    code/branches/pcre16/pcre16_utf16_utils.c
    code/branches/pcre16/pcre16_valid_utf16.c
    code/branches/pcre16/pcre_compile.c
    code/branches/pcre16/pcre_dfa_exec.c
    code/branches/pcre16/pcre_exec.c
    code/branches/pcre16/pcre_internal.h
    code/branches/pcre16/pcre_jit_compile.c
    code/branches/pcre16/pcre_newline.c
    code/branches/pcre16/pcre_ord2utf8.c
    code/branches/pcre16/pcre_study.c
    code/branches/pcre16/pcre_valid_utf8.c
    code/branches/pcre16/pcreposix.c
    code/branches/pcre16/sljit/sljitConfigInternal.h
    code/branches/pcre16/sljit/sljitExecAllocator.c
    code/branches/pcre16/sljit/sljitLir.h
    code/branches/pcre16/sljit/sljitNativeARM_Thumb2.c
    code/branches/pcre16/sljit/sljitNativeARM_v5.c
    code/branches/pcre16/sljit/sljitNativeMIPS_common.c
    code/branches/pcre16/sljit/sljitNativePPC_common.c
    code/branches/pcre16/sljit/sljitNativeX86_common.c
    code/branches/pcre16/testdata/testinput10
    code/branches/pcre16/testdata/testinput5
    code/branches/pcre16/testdata/testoutput10
    code/branches/pcre16/testdata/testoutput5


Added Paths:
-----------
    code/branches/pcre16/pcre16_ord2utf16.c


Modified: code/branches/pcre16/Makefile.am
===================================================================
--- code/branches/pcre16/Makefile.am    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/Makefile.am    2011-12-03 07:58:30 UTC (rev 781)
@@ -214,6 +214,7 @@
   pcre16_exec.c \
   pcre16_jit_compile.c \
   pcre16_newline.c \
+  pcre16_ord2utf16.c \
   pcre16_string_utils.c \
   pcre16_study.c \
   pcre16_tables.c \


Added: code/branches/pcre16/pcre16_ord2utf16.c
===================================================================
--- code/branches/pcre16/pcre16_ord2utf16.c                            (rev 0)
+++ code/branches/pcre16/pcre16_ord2utf16.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -0,0 +1,95 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2008 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This file contains a private PCRE function that converts an ordinal
+character value into a UTF-16 string. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+*       Convert character value to UTF-16         *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x10ffff
+and encodes it as a UTF-16 character in 1 to 2 pcre_uchars. A surrogate
+value (0xd800 - 0xdfff) or a value above 0x10ffff is encoded as 0xfffe.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 2 pcre_uchars long
+
+Returns:     number of pcre_uchars placed in the buffer
+*/
+
+int
+PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
+{
+#ifdef SUPPORT_UTF16
+
+/* Compare the whole value: (cvalue & 0xf800) also matches e.g. 0x1d800. */
+if ((cvalue >= 0xd800 && cvalue <= 0xdfff) || cvalue >= 0x110000)
+  cvalue = 0xfffe;
+
+if (cvalue <= 0xffff)
+  {
+  *buffer = (pcre_uchar)cvalue;
+  return 1;
+  }
+
+cvalue -= 0x10000;  /* Leaves 20 bits: high 10 and low 10 go in a pair */
+*buffer++ = (pcre_uchar)(0xd800 | (cvalue >> 10));
+*buffer = (pcre_uchar)(0xdc00 | (cvalue & 0x3ff));
+return 2;
+
+#else
+
+(void)(cvalue);  /* Keep compiler happy; this function won't ever be */
+(void)(buffer);  /* called when SUPPORT_UTF16 is not defined. */
+return 0;
+
+#endif
+}
+
+/* End of pcre16_ord2utf16.c */


Modified: code/branches/pcre16/pcre16_utf16_utils.c
===================================================================
--- code/branches/pcre16/pcre16_utf16_utils.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre16_utf16_utils.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -57,7 +57,7 @@
 BOOL same_bo = TRUE;
 PCRE_SPTR16 end = input + length;
 /* The c variable must be unsigned. */
-register uschar c;
+register pcre_uchar c;


while (input < end)
{

Modified: code/branches/pcre16/pcre16_valid_utf16.c
===================================================================
--- code/branches/pcre16/pcre16_valid_utf16.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre16_valid_utf16.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -78,11 +78,11 @@
 */


int
-PRIV(valid_utf16)(PCRE_PUCHAR string, int length, int *erroroffset)
+PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF16
register PCRE_PUCHAR p;
-register uschar c;
+register pcre_uchar c;

if (length < 0)
{

Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_compile.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -470,6 +470,7 @@
   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
   /* 70 */
   "internal error: unknown opcode in find_fixedlength()\0"
+  "Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0"
   ;


/* Table to identify digits and hex digits. This is used when compiling
@@ -538,7 +539,7 @@

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */

-static const pcre_unit8 digitab[] =
+static const pcre_uint8 digitab[] =
   {
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
@@ -706,9 +707,11 @@
 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
   int options, BOOL isclass)
 {
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
 const pcre_uchar *ptr = *ptrptr + 1;
-int c, i;
+pcre_int32 c;
+int i;


 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 ptr--;                            /* Set pointer back to the last byte */
@@ -940,12 +943,12 @@
     c -= CHAR_0;
     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
         c = c * 8 + *(++ptr) - CHAR_0;
-    if (!utf8 && c > 0xff) *errorcodeptr = ERR51;
+    if (!utf && c > 0xff) *errorcodeptr = ERR51;
     break;


     /* \x is complicated. \x{ddd} is a character number which can be greater
-    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
-    treated as a data character. */
+    than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
+    If not, { is treated as a data character. */


     case CHAR_x:
     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
@@ -974,14 +977,12 @@
     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
       {
       const pcre_uchar *pt = ptr + 2;
-      int count = 0;


       c = 0;
       while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
         {
         register int cc = *pt++;
         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
-        count++;


 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
@@ -990,17 +991,25 @@
         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 #endif
-        }


-      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
-        {
 #ifdef COMPILE_PCRE8
-        if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34;
+        if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
 #else
 #ifdef COMPILE_PCRE16
-        if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34;
+        if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
 #endif
 #endif
+        }
+
+      if (c < 0)
+        {
+        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
+        *errorcodeptr = ERR34;
+        }
+
+      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
+        {
+        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71;
         ptr = pt;
         break;
         }
@@ -1281,7 +1290,7 @@
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
-  utf8         TRUE if we are in UTF-8 mode
+  utf          TRUE if we are in UTF-8 / UTF-16 mode
   count        pointer to the current capturing subpattern number (updated)


 Returns:       the number of the named subpattern, or -1 if not found
@@ -1289,7 +1298,7 @@


 static int
 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
-  BOOL xmode, BOOL utf8, int *count)
+  BOOL xmode, BOOL utf, int *count)
 {
 pcre_uchar *ptr = *ptrptr;
 int start_count = *count;
@@ -1458,7 +1467,7 @@
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
       ptr++;
 #ifdef SUPPORT_UTF8
-      if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+      if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
       }
     if (*ptr == 0) goto FAIL_EXIT;
@@ -1469,7 +1478,7 @@


   if (*ptr == CHAR_LEFT_PARENTHESIS)
     {
-    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
+    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
     if (rc > 0) return rc;
     if (*ptr == 0) goto FAIL_EXIT;
     }
@@ -1515,14 +1524,14 @@
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
-  utf8         TRUE if we are in UTF-8 mode
+  utf          TRUE if we are in UTF-8 / UTF-16 mode


 Returns:       the number of the found subpattern, or -1 if not found
 */


static int
find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
- BOOL utf8)
+ BOOL utf)
{
pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
int count = 0;
@@ -1535,7 +1544,7 @@

for (;;)
{
- rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
+ rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
if (rc > 0 || *ptr++ == 0) break;
}

@@ -1618,7 +1627,7 @@

 Arguments:
   code     points to the start of the pattern (the bracket)
-  utf8     TRUE in UTF-8 mode
+  utf      TRUE in UTF-8 / UTF-16 mode
   atend    TRUE if called when the pattern is complete
   cd       the "compile data" structure


@@ -1630,7 +1639,7 @@
*/

static int
-find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
+find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
{
int length = -1;

@@ -1657,7 +1666,7 @@
     case OP_ONCE:
     case OP_ONCE_NC:
     case OP_COND:
-    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd);
+    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
     if (d < 0) return d;
     branchlength += d;
     do cc += GET(cc, 1); while (*cc == OP_ALT);
@@ -1691,7 +1700,7 @@
     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
     if (cc > cs && cc < ce) return -1;                    /* Recursion */
-    d = find_fixedlength(cs + 2, utf8, atend, cd);
+    d = find_fixedlength(cs + 2, utf, atend, cd);
     if (d < 0) return d;
     branchlength += d;
     cc += 1 + LINK_SIZE;
@@ -1751,7 +1760,7 @@
     branchlength++;
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;


@@ -1765,7 +1774,7 @@
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;


@@ -1945,14 +1954,14 @@

 Arguments:
   code        points to start of expression
-  utf8        TRUE in UTF-8 mode
+  utf         TRUE in UTF-8 / UTF-16 mode
   number      the required bracket number or negative to find a lookbehind


 Returns:      pointer to the opcode for the bracket, or NULL if not found
 */


const pcre_uchar *
-PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number)
+PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
{
for (;;)
{
@@ -2033,7 +2042,7 @@
arrange to skip the extra bytes. */

 #ifdef SUPPORT_UTF8
-    if (utf8) switch(c)
+    if (utf) switch(c)
       {
       case OP_CHAR:
       case OP_CHARI:
@@ -2067,7 +2076,7 @@
       break;
       }
 #else
-    (void)(utf8);  /* Keep compiler happy by referencing function argument */
+    (void)(utf);  /* Keep compiler happy by referencing function argument */
 #endif
     }
   }
@@ -2084,13 +2093,13 @@


 Arguments:
   code        points to start of expression
-  utf8        TRUE in UTF-8 mode
+  utf         TRUE in UTF-8 / UTF-16 mode


 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
 */


 static const pcre_uchar *
-find_recurse(const pcre_uchar *code, BOOL utf8)
+find_recurse(const pcre_uchar *code, BOOL utf)
 {
 for (;;)
   {
@@ -2153,7 +2162,7 @@
     to arrange to skip the extra bytes. */


 #ifdef SUPPORT_UTF8
-    if (utf8) switch(c)
+    if (utf) switch(c)
       {
       case OP_CHAR:
       case OP_CHARI:
@@ -2187,7 +2196,7 @@
       break;
       }
 #else
-    (void)(utf8);  /* Keep compiler happy by referencing function argument */
+    (void)(utf);  /* Keep compiler happy by referencing function argument */
 #endif
     }
   }
@@ -2210,7 +2219,7 @@
 Arguments:
   code        points to start of search
   endcode     points to where to stop
-  utf8        TRUE if in UTF8 mode
+  utf         TRUE if in UTF-8 / UTF-16 mode
   cd          contains pointers to tables etc.


 Returns:      TRUE if what is matched could be empty
@@ -2218,7 +2227,7 @@


static BOOL
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
- BOOL utf8, compile_data *cd)
+ BOOL utf, compile_data *cd)
{
register int c;
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
@@ -2266,7 +2275,7 @@

     do
       {
-      if (could_be_empty_branch(scode, endcode, utf8, cd))
+      if (could_be_empty_branch(scode, endcode, utf, cd))
         {
         empty_branch = TRUE;
         break;
@@ -2322,7 +2331,7 @@
       empty_branch = FALSE;
       do
         {
-        if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
+        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
           empty_branch = TRUE;
         code += GET(code, 1);
         }
@@ -2456,7 +2465,7 @@
     case OP_MINQUERYI:
     case OP_POSQUERY:
     case OP_POSQUERYI:
-    if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
+    if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
     break;


     case OP_UPTO:
@@ -2465,7 +2474,7 @@
     case OP_MINUPTOI:
     case OP_POSUPTO:
     case OP_POSUPTOI:
-    if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
+    if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
     break;
 #endif


@@ -2509,7 +2518,7 @@
   code        points to start of the recursion
   endcode     points to where to stop (current RECURSE item)
   bcptr       points to the chain of current (unclosed) branch starts
-  utf8        TRUE if in UTF-8 mode
+  utf         TRUE if in UTF-8 / UTF-16 mode
   cd          pointers to tables etc


 Returns:      TRUE if what is matched could be empty
@@ -2517,11 +2526,11 @@


 static BOOL
 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
-  branch_chain *bcptr, BOOL utf8, compile_data *cd)
+  branch_chain *bcptr, BOOL utf, compile_data *cd)
 {
 while (bcptr != NULL && bcptr->current_branch >= code)
   {
-  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
+  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
     return FALSE;
   bcptr = bcptr->outer;
   }
@@ -2656,7 +2665,7 @@
 Arguments:
   group      points to the start of the group
   adjust     the amount by which the group is to be moved
-  utf8       TRUE in UTF-8 mode
+  utf        TRUE in UTF-8 / UTF-16 mode
   cd         contains pointers to tables etc.
   save_hwm   the hwm forward reference pointer at the start of the group


@@ -2664,12 +2673,12 @@
*/

static void
-adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
+adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
pcre_uchar *save_hwm)
{
pcre_uchar *ptr = group;

-while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
+while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
{
int offset;
pcre_uchar *hc;
@@ -2875,7 +2884,7 @@

 Arguments:
   previous      pointer to the repeated opcode
-  utf8          TRUE in UTF-8 mode
+  utf           TRUE in UTF-8 / UTF-16 mode
   ptr           next character in pattern
   options       options bits
   cd            contains pointers to tables etc.
@@ -2884,7 +2893,7 @@
 */


 static BOOL
-check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
+check_auto_possessive(const pcre_uchar *previous, BOOL utf,
   const pcre_uchar *ptr, int options, compile_data *cd)
 {
 int c, next;
@@ -2905,7 +2914,7 @@
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
 #ifdef SUPPORT_UTF8
-        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
         }
       }
@@ -2927,7 +2936,7 @@
 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
   {
 #ifdef SUPPORT_UTF8
-  if (utf8) { GETCHARINC(next, ptr); } else
+  if (utf) { GETCHARINC(next, ptr); } else
 #endif
   next = *ptr++;
   }
@@ -2949,7 +2958,7 @@
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
 #ifdef SUPPORT_UTF8
-        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
         }
       }
@@ -2988,7 +2997,7 @@
 #endif
   if (c == next) return FALSE;
 #ifdef SUPPORT_UTF8
-  if (utf8)
+  if (utf)
     {
     unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else
@@ -3013,7 +3022,7 @@
   case OP_NOTI:
   if ((c = *previous) == next) return TRUE;
 #ifdef SUPPORT_UTF8
-  if (utf8)
+  if (utf)
     {
     unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else
@@ -3348,10 +3357,11 @@
 dynamically as we process the pattern. */


#ifdef SUPPORT_UTF8
-BOOL utf8 = (options & PCRE_UTF8) != 0;
-pcre_uint8 utf8_char[6];
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
+pcre_uchar utf_chars[6];
#else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
#endif

 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
@@ -3459,8 +3469,8 @@
       }


     *lengthptr += (int)(code - last_code);
-    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
-      c));
+    DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
+      (int)(code - last_code), c, c));


     /* If "previous" is set and it is not at the start of the work space, move
     it back to there, in order to avoid filling up the work space. Otherwise,
@@ -3547,7 +3557,7 @@
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         ptr++;
 #ifdef SUPPORT_UTF8
-        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
+        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
 #endif
         }
       if (*ptr != 0) continue;
@@ -3727,7 +3737,7 @@
       const pcre_uchar *oldptr;


 #ifdef SUPPORT_UTF8
-      if (utf8 && c > 127)
+      if (utf && c > 127)
         {                           /* Braces are required because the */
         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
         }
@@ -3945,22 +3955,22 @@
             SETBIT(classbits, 0x20); /* SPACE */
             SETBIT(classbits, 0xa0); /* NSBP */
 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x200A, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
               *class_uchardata++ = XCL_SINGLE;
-              class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
               }
 #endif
             continue;
@@ -3980,30 +3990,30 @@
               }


 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x200B, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
               }
 #endif
             continue;
@@ -4015,12 +4025,12 @@
             SETBIT(classbits, 0x0d); /* CR */
             SETBIT(classbits, 0x85); /* NEL */
 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
               }
 #endif
             continue;
@@ -4043,15 +4053,15 @@
               }


 #ifdef SUPPORT_UTF
-            if (utf8)
+            if (utf)
               {
               xclass = TRUE;
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
               *class_uchardata++ = XCL_RANGE;
-              class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata);
-              class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
               }
 #endif
             continue;
@@ -4139,7 +4149,7 @@
           }


 #ifdef SUPPORT_UTF8
-        if (utf8)
+        if (utf)
           {                           /* Braces are required because the */
           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
           }
@@ -4189,7 +4199,7 @@
         available. */


 #ifdef SUPPORT_UTF
-        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
+        if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
 #endif
 #ifndef COMPILE_PCRE8
         if (d > 255)
@@ -4234,9 +4244,9 @@
               else
                 {
                 *class_uchardata++ = XCL_RANGE;
-                class_uchardata += PRIV(ord2utf8)(occ, class_uchardata);
+                class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
                 }
-              class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata);
+              class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
               }
             }
 #endif  /* SUPPORT_UCP */
@@ -4246,8 +4256,8 @@


           *class_uchardata++ = XCL_RANGE;
 #ifdef SUPPORT_UTF
-          class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
-          class_uchardata += PRIV(ord2utf8)(d, class_uchardata);
+          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
+          class_uchardata += PRIV(ord2utf)(d, class_uchardata);
 #else
           *class_uchardata++ = c;
           *class_uchardata++ = d;
@@ -4304,7 +4314,7 @@
       /* Handle a character that cannot go in the bit map */


 #ifdef SUPPORT_UTF
-      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
+      if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
 #endif
 #ifndef COMPILE_PCRE8
       if (c > 255)
@@ -4314,7 +4324,7 @@
         xclass = TRUE;
         *class_uchardata++ = XCL_SINGLE;
 #ifdef SUPPORT_UTF
-        class_uchardata += PRIV(ord2utf8)(c, class_uchardata);
+        class_uchardata += PRIV(ord2utf)(c, class_uchardata);
 #else
         *class_uchardata++ = c;
 #endif
@@ -4326,7 +4336,7 @@
           if ((othercase = UCD_OTHERCASE(c)) != c)
             {
             *class_uchardata++ = XCL_SINGLE;
-            class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata);
+            class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
             }
           }
 #endif  /* SUPPORT_UCP */
@@ -4384,11 +4394,9 @@


 #ifdef SUPPORT_UTF
     if (class_charcount == 1 && !xclass &&
-      (!utf8 || !negate_class || class_lastchar < 128))
-#elif defined COMPILE_PCRE8
+      (!utf || !negate_class || class_lastchar < 128))
+#else
     if (class_charcount == 1)
-#else
-    if (class_charcount == 1 && !xclass)
 #endif
       {
       zeroreqchar = reqchar;
@@ -4408,8 +4416,8 @@
       then we can handle this with the normal one-character code. */


 #ifdef SUPPORT_UTF8
-      if (utf8 && class_lastchar > 127)
-        mclength = PRIV(ord2utf8)(class_lastchar, mcbuffer);
+      if (utf && class_lastchar > 127)
+        mclength = PRIV(ord2utf)(class_lastchar, mcbuffer);
       else
 #endif
         {
@@ -4599,12 +4607,12 @@
       length rather than a small character. */


 #ifdef SUPPORT_UTF8
-      if (utf8 && (code[-1] & 0x80) != 0)
+      if (utf && (code[-1] & 0x80) != 0)
         {
         pcre_uchar *lastchar = code - 1;
         while((*lastchar & 0xc0) == 0x80) lastchar--;
         c = code - lastchar;            /* Length of UTF-8 character */
-        memcpy(utf8_char, lastchar, c); /* Save the char */
+        memcpy(utf_chars, lastchar, c); /* Save the char */
         c |= 0x80;                      /* Flag c as a length */
         }
       else
@@ -4625,7 +4633,7 @@


       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4646,7 +4654,7 @@
       c = previous[1];
       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4670,7 +4678,7 @@


       if (!possessive_quantifier &&
           repeat_max < 0 &&
-          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
+          check_auto_possessive(previous, utf, ptr + 1, options, cd))
         {
         repeat_type = 0;    /* Force greedy */
         possessive_quantifier = TRUE;
@@ -4755,9 +4763,9 @@
         if (repeat_max < 0)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && c >= 128)
+          if (utf && c >= 128)
             {
-            memcpy(code, utf8_char, c & 7);
+            memcpy(code, utf_chars, c & 7);
             code += c & 7;
             }
           else
@@ -4780,9 +4788,9 @@
         else if (repeat_max != repeat_min)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && c >= 128)
+          if (utf && c >= 128)
             {
-            memcpy(code, utf8_char, c & 7);
+            memcpy(code, utf_chars, c & 7);
             code += c & 7;
             }
           else
@@ -4810,9 +4818,9 @@
       /* The character or character type itself comes last in all cases. */


 #ifdef SUPPORT_UTF8
-      if (utf8 && c >= 128)
+      if (utf && c >= 128)
         {
-        memcpy(code, utf8_char, c & 7);
+        memcpy(code, utf_chars, c & 7);
         code += c & 7;
         }
       else
@@ -4939,7 +4947,7 @@
         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
           {
           *code = OP_END;
-          adjust_recurse(previous, 1, utf8, cd, save_hwm);
+          adjust_recurse(previous, 1, utf, cd, save_hwm);
           memmove(previous + 1, previous, IN_UCHARS(len));
           code++;
           if (repeat_max == 0)
@@ -4963,7 +4971,7 @@
           {
           int offset;
           *code = OP_END;
-          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
+          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
           code += 2 + LINK_SIZE;
           *previous++ = OP_BRAZERO + repeat_type;
@@ -5165,7 +5173,7 @@
             pcre_uchar *scode = bracode;
             do
               {
-              if (could_be_empty_branch(scode, ketcode, utf8, cd))
+              if (could_be_empty_branch(scode, ketcode, utf, cd))
                 {
                 *bracode += OP_SBRA - OP_BRA;
                 break;
@@ -5188,7 +5196,7 @@
               {
               int nlen = (int)(code - bracode);
               *code = OP_END;
-              adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm);
+              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
               code += 1 + LINK_SIZE;
               nlen += 1 + LINK_SIZE;
@@ -5266,7 +5274,7 @@
         {
         tempcode += PRIV(OP_lengths)[*tempcode];
 #ifdef SUPPORT_UTF8
-        if (utf8 && tempcode[-1] >= 0xc0)
+        if (utf && tempcode[-1] >= 0xc0)
           tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
 #endif
         }
@@ -5304,7 +5312,7 @@


         default:
         *code = OP_END;
-        adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
+        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
         code += 1 + LINK_SIZE;
         len += 1 + LINK_SIZE;
@@ -5613,7 +5621,7 @@
         /* Search the pattern for a forward reference */


         else if ((i = find_parens(cd, name, namelen,
-                        (options & PCRE_EXTENDED) != 0, utf8)) > 0)
+                        (options & PCRE_EXTENDED) != 0, utf)) > 0)
           {
           PUT2(code, 2+LINK_SIZE, i);
           code[1+LINK_SIZE]++;
@@ -5958,7 +5966,7 @@
           temp = cd->end_pattern;
           cd->end_pattern = ptr;
           recno = find_parens(cd, name, namelen,
-            (options & PCRE_EXTENDED) != 0, utf8);
+            (options & PCRE_EXTENDED) != 0, utf);
           cd->end_pattern = temp;
           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
           }
@@ -5985,7 +5993,7 @@
             }
           else if ((recno =                /* Forward back reference */
                     find_parens(cd, name, namelen,
-                      (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
+                      (options & PCRE_EXTENDED) != 0, utf)) <= 0)
             {
             *errorcodeptr = ERR15;
             goto FAILED;
@@ -6089,14 +6097,14 @@
             {
             *code = OP_END;
             if (recno != 0)
-              called = PRIV(find_bracket)(cd->start_code, utf8, recno);
+              called = PRIV(find_bracket)(cd->start_code, utf, recno);


             /* Forward reference */


             if (called == NULL)
               {
               if (find_parens(cd, NULL, recno,
-                    (options & PCRE_EXTENDED) != 0, utf8) < 0)
+                    (options & PCRE_EXTENDED) != 0, utf) < 0)
                 {
                 *errorcodeptr = ERR15;
                 goto FAILED;
@@ -6120,7 +6128,7 @@
             conditional subpatterns will be picked up then. */


             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
-                     could_be_empty(called, code, bcptr, utf8, cd))
+                     could_be_empty(called, code, bcptr, utf, cd))
               {
               *errorcodeptr = ERR40;
               goto FAILED;
@@ -6618,7 +6626,7 @@


           {  
           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
-          *code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c;
+          *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
           }
         }
       continue;
@@ -6629,8 +6637,8 @@
     handle it as a data character. */


 #ifdef SUPPORT_UTF8
-    if (utf8 && c > 127)
-      mclength = PRIV(ord2utf8)(c, mcbuffer);
+    if (utf && c > 127)
+      mclength = PRIV(ord2utf)(c, mcbuffer);
     else
 #endif


@@ -6652,7 +6660,7 @@
     mcbuffer[0] = c;


 #ifdef SUPPORT_UTF8
-    if (utf8 && c >= 0xc0)
+    if (utf && c >= 0xc0)
       {
       while ((ptr[1] & 0xc0) == 0x80)
         mcbuffer[mclength++] = *(++ptr);
@@ -7360,7 +7368,7 @@
 int newline;
 int errorcode = 0;
 int skipatstart = 0;
-BOOL utf8;
+BOOL utf;
 size_t size;
 pcre_uchar *code;
 const pcre_uchar *codestart;
@@ -7458,22 +7466,23 @@
   else break;
   }


-utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (options & PCRE_UTF8) != 0;

/* Can't support UTF8 unless PCRE has been compiled to include the code. The
-return of an error code from PRIV(valid_utf8)() is a new feature, introduced in
+return of an error code from PRIV(valid_utf)() is a new feature, introduced in
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
not used here. */

 #ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
-     (errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+     (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
   {
   errorcode = ERR44;
   goto PCRE_EARLY_ERROR_RETURN2;
   }
 #else
-if (utf8)
+if (utf)
   {
   errorcode = ERR32;
   goto PCRE_EARLY_ERROR_RETURN;
@@ -7688,7 +7697,7 @@
   cd->hwm -= LINK_SIZE;
   offset = GET(cd->hwm, 0);
   recno = GET(codestart, offset);
-  groupptr = PRIV(find_bracket)(codestart, utf8, recno);
+  groupptr = PRIV(find_bracket)(codestart, utf, recno);
   if (groupptr == NULL) errorcode = ERR53;
     else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
   }
@@ -7715,9 +7724,9 @@
   of zero, but that is a pathological case, and it does no harm.) When we find
   one, we temporarily terminate the branch it is in while we scan it. */


-  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf8, -1);
+  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
        cc != NULL;
-       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf8, -1))
+       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
     {
     if (GET(cc, 1) == 0)
       {


Modified: code/branches/pcre16/pcre_dfa_exec.c
===================================================================
--- code/branches/pcre16/pcre_dfa_exec.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_dfa_exec.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -414,9 +414,9 @@
 const pcre_uchar *start_code = md->start_code;


#ifdef SUPPORT_UTF8
-BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
+BOOL utf = (md->poptions & PCRE_UTF8) != 0;
#else
-BOOL utf8 = FALSE;
+BOOL utf = FALSE;
#endif

rlevel++;
@@ -474,7 +474,7 @@
#ifdef SUPPORT_UTF8
/* In character mode we have to step back character by character */

-  if (utf8)
+  if (utf)
     {
     for (gone_back = 0; gone_back < max_back; gone_back++)
       {
@@ -606,7 +606,7 @@
     {
     clen = 1;        /* Number of bytes in the character */
 #ifdef SUPPORT_UTF8
-    if (utf8) { GETCHARLEN(c, ptr, clen); } else
+    if (utf) { GETCHARLEN(c, ptr, clen); } else
 #endif  /* SUPPORT_UTF8 */
     c = *ptr;
     }
@@ -695,7 +695,7 @@
       {
       dlen = 1;
 #ifdef SUPPORT_UTF8
-      if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
+      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 #endif  /* SUPPORT_UTF8 */
       d = code[coptable[codevalue]];
       if (codevalue >= OP_TYPESTAR)
@@ -960,7 +960,7 @@
           const pcre_uchar *temp = ptr - 1;
           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
 #ifdef SUPPORT_UTF8
-          if (utf8) BACKCHAR(temp);
+          if (utf) BACKCHAR(temp);
 #endif
           GETCHARTEST(d, temp);
 #ifdef SUPPORT_UCP
@@ -1986,7 +1986,7 @@
       if (clen == 0) break;


 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
           {
@@ -2007,8 +2007,7 @@
         }
       else
 #endif  /* SUPPORT_UTF8 */
-
-      /* Non-UTF-8 mode */
+      /* Not UTF mode */
         {
         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
         }
@@ -2211,7 +2210,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2258,7 +2257,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2303,7 +2302,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2340,7 +2339,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -2384,7 +2383,7 @@
         if (caseless)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8 && d >= 128)
+          if (utf && d >= 128)
             {
 #ifdef SUPPORT_UCP
             otherd = UCD_OTHERCASE(d);
@@ -3005,7 +3004,7 @@
 real_pcre *re = (real_pcre *)argument_re;
 dfa_match_data match_block;
 dfa_match_data *md = &match_block;
-BOOL utf8, anchored, startline, firstline;
+BOOL utf, anchored, startline, firstline;
 const pcre_uchar *current_subject, *end_subject;
 const pcre_uint8 *lcc;


@@ -3073,9 +3072,10 @@
req_char_ptr = current_subject - 1;

#ifdef SUPPORT_UTF8
-utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (re->options & PCRE_UTF8) != 0;
#else
-utf8 = FALSE;
+utf = FALSE;
#endif

anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
@@ -3147,10 +3147,10 @@
back the character offset. */

 #ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
   int erroroffset;
-  int errorcode = PRIV(valid_utf8)((pcre_uchar *)subject, length, &erroroffset);
+  int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
   if (errorcode != 0)
     {
     if (offsetcount >= 2)
@@ -3235,7 +3235,7 @@
       {
       PCRE_PUCHAR t = current_subject;
 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         while (t < md->end_subject && !IS_NEWLINE(t))
           {
@@ -3278,7 +3278,7 @@
         if (current_subject > md->start_subject + start_offset)
           {
 #ifdef SUPPORT_UTF8
-          if (utf8)
+          if (utf)
             {
             while (current_subject < end_subject &&
                    !WAS_NEWLINE(current_subject))
@@ -3317,7 +3317,7 @@
             {
             current_subject++;
 #ifdef SUPPORT_UTF8
-            if (utf8)
+            if (utf)
               while(current_subject < end_subject &&
                     (*current_subject & 0xc0) == 0x80) current_subject++;
 #endif
@@ -3426,7 +3426,7 @@


   if (firstline && IS_NEWLINE(current_subject)) break;
   current_subject++;
-  if (utf8)
+  if (utf)
     {
     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
       current_subject++;


Modified: code/branches/pcre16/pcre_exec.c
===================================================================
--- code/branches/pcre16/pcre_exec.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_exec.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -183,7 +183,7 @@
   {
 #ifdef SUPPORT_UTF8
 #ifdef SUPPORT_UCP
-  if (md->utf8)
+  if (md->utf)
     {
     /* Match characters up to the end of the reference. NOTE: the number of
     bytes matched may differ, because there are some characters whose upper and
@@ -385,7 +385,7 @@
   int Xprop_value;
   int Xprop_fail_result;
   int Xoclength;
-  pcre_uint8 Xocchars[8];
+  pcre_uchar Xocchars[6];
 #endif


int Xcodelink;
@@ -450,7 +450,7 @@


/* Performance note: It might be tempting to extract commonly used fields from
-the md structure (e.g. utf8, end_subject) into individual variables to improve
+the md structure (e.g. utf, end_subject) into individual variables to improve
performance. Tests using gcc on a SPARC disproved this; in the first case, it
made performance worse.

@@ -485,7 +485,7 @@
 register int  rrc;         /* Returns from recursive calls */
 register int  i;           /* Used for loops not involving calls to RMATCH() */
 register unsigned int c;   /* Character values not kept over RMATCH() calls */
-register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
+register BOOL utf;         /* Local copy of UTF flag for speed */


BOOL minimize, possessive; /* Quantifier options */
BOOL caseless;
@@ -606,7 +606,7 @@
int prop_value;
int prop_fail_result;
int oclength;
-pcre_uint8 occhars[8];
+pcre_uchar occhars[6];
#endif

int codelink;
@@ -660,9 +660,9 @@
however, impact performance when true recursion is being used. */

 #ifdef SUPPORT_UTF8
-utf8 = md->utf8;       /* Local copy of the flag */
+utf = md->utf;       /* Local copy of the flag */
 #else
-utf8 = FALSE;
+utf = FALSE;
 #endif


/* First check that we haven't called match() too many times, or that we
@@ -1597,7 +1597,7 @@

     case OP_REVERSE:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       i = GET(ecode, 1);
       while (i-- > 0)
@@ -2070,7 +2070,7 @@
       partial matching. */


 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         /* Get status of previous character */


@@ -2189,7 +2189,7 @@
       MRRETURN(MATCH_NOMATCH);
       }
     eptr++;
-    if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+    if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
     ecode++;
     break;


@@ -2546,7 +2546,7 @@
     while (eptr < md->end_subject)
       {
       int len = 1;
-      if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+      if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
       if (UCD_CATEGORY(c) != ucp_M) break;
       eptr += len;
       }
@@ -2744,8 +2744,7 @@
       /* First, ensure the minimum number of matches are present. */


 #ifdef SUPPORT_UTF
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         for (i = 1; i <= min; i++)
           {
@@ -2765,7 +2764,7 @@
         }
       else
 #endif
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
           {
@@ -2797,8 +2796,7 @@
       if (minimize)
         {
 #ifdef SUPPORT_UTF
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           for (fi = min;; fi++)
             {
@@ -2821,7 +2819,7 @@
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (fi = min;; fi++)
             {
@@ -2854,8 +2852,7 @@
         pp = eptr;


 #ifdef SUPPORT_UTF
-        /* UTF mode */
-        if (utf8)
+        if (utf)
           {
           for (i = min; i < max; i++)
             {
@@ -3024,7 +3021,7 @@
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
 #ifdef SUPPORT_UTF
-          if (utf8) BACKCHAR(eptr);
+          if (utf) BACKCHAR(eptr);
 #endif
           }
         MRRETURN(MATCH_NOMATCH);
@@ -3038,7 +3035,7 @@


     case OP_CHAR:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       length = 1;
       ecode++;
@@ -3052,8 +3049,7 @@
       }
     else
 #endif
-
-    /* Non-UTF-8 mode */
+    /* Not UTF mode */
       {
       if (md->end_subject - eptr < 1)
         {
@@ -3069,7 +3065,7 @@


     case OP_CHARI:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       length = 1;
       ecode++;
@@ -3112,7 +3108,7 @@
     else
 #endif   /* SUPPORT_UTF8 */


-    /* Non-UTF-8 mode */
+    /* Not UTF mode */
       {
       if (md->end_subject - eptr < 1)
         {
@@ -3193,7 +3189,7 @@


     REPEATCHAR:
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       length = 1;
       charptr = ecode;
@@ -3209,7 +3205,7 @@
         unsigned int othercase;
         if (op >= OP_STARI &&     /* Caseless */
             (othercase = UCD_OTHERCASE(fc)) != fc)
-          oclength = PRIV(ord2utf8)(othercase, occhars);
+          oclength = PRIV(ord2utf)(othercase, occhars);
         else oclength = 0;
 #endif  /* SUPPORT_UCP */


@@ -3220,7 +3216,7 @@
 #ifdef SUPPORT_UCP
           else if (oclength > 0 &&
                    eptr <= md->end_subject - oclength &&
-                   memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+                   memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
 #endif  /* SUPPORT_UCP */
           else
             {
@@ -3243,7 +3239,7 @@
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
-                     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+                     memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
 #endif  /* SUPPORT_UCP */
             else
               {
@@ -3264,7 +3260,7 @@
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
-                     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
+                     memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
 #endif  /* SUPPORT_UCP */
             else
               {
@@ -3548,8 +3544,7 @@
       fc = md->lcc[fc];


 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         register unsigned int d;
         for (i = 1; i <= min; i++)
@@ -3566,8 +3561,7 @@
         }
       else
 #endif
-
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
           {
@@ -3585,8 +3579,7 @@
       if (minimize)
         {
 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (fi = min;; fi++)
@@ -3606,7 +3599,7 @@
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (fi = min;; fi++)
             {
@@ -3631,8 +3624,7 @@
         pp = eptr;


 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (i = min; i < max; i++)
@@ -3659,7 +3651,7 @@
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (i = min; i < max; i++)
             {
@@ -3690,8 +3682,7 @@
     else
       {
 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         register unsigned int d;
         for (i = 1; i <= min; i++)
@@ -3707,7 +3698,7 @@
         }
       else
 #endif
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (i = 1; i <= min; i++)
           {
@@ -3725,8 +3716,7 @@
       if (minimize)
         {
 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (fi = min;; fi++)
@@ -3745,7 +3735,7 @@
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (fi = min;; fi++)
             {
@@ -3770,8 +3760,7 @@
         pp = eptr;


 #ifdef SUPPORT_UTF8
-        /* UTF-8 mode */
-        if (utf8)
+        if (utf)
           {
           register unsigned int d;
           for (i = min; i < max; i++)
@@ -3797,7 +3786,7 @@
           }
         else
 #endif
-        /* Not UTF-8 mode */
+        /* Not UTF mode */
           {
           for (i = min; i < max; i++)
             {
@@ -4073,7 +4062,7 @@
           while (eptr < md->end_subject)
             {
             int len = 1;
-            if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+            if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
@@ -4086,7 +4075,7 @@
 /* Handle all other cases when the coding is UTF-8 */


 #ifdef SUPPORT_UTF8
-      if (utf8) switch(ctype)
+      if (utf) switch(ctype)
         {
         case OP_ANY:
         for (i = 1; i <= min; i++)
@@ -4794,7 +4783,7 @@
           while (eptr < md->end_subject)
             {
             int len = 1;
-            if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+            if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
@@ -4804,8 +4793,7 @@
 #endif     /* SUPPORT_UCP */


 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-      if (utf8)
+      if (utf)
         {
         for (fi = min;; fi++)
           {
@@ -4968,7 +4956,7 @@
         }
       else
 #endif
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         for (fi = min;; fi++)
           {
@@ -5267,7 +5255,7 @@
           RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
-          if (utf8) BACKCHAR(eptr);
+          if (utf) BACKCHAR(eptr);
           }
         }


@@ -5284,13 +5272,13 @@
             SCHECK_PARTIAL();
             break;
             }
-          if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+          if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
           if (UCD_CATEGORY(c) == ucp_M) break;
           eptr += len;
           while (eptr < md->end_subject)
             {
             len = 1;
-            if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
+            if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
             if (UCD_CATEGORY(c) != ucp_M) break;
             eptr += len;
             }
@@ -5307,7 +5295,7 @@
           if (eptr-- == pp) break;        /* Stop if tried at original pos */
           for (;;)                        /* Move back over one extended */
             {
-            if (!utf8) c = *eptr; else
+            if (!utf) c = *eptr; else
               {
               BACKCHAR(eptr);
               GETCHAR(c, eptr);
@@ -5322,9 +5310,7 @@
 #endif   /* SUPPORT_UCP */


 #ifdef SUPPORT_UTF8
-      /* UTF-8 mode */
-
-      if (utf8)
+      if (utf)
         {
         switch(ctype)
           {
@@ -5607,8 +5593,7 @@
         }
       else
 #endif  /* SUPPORT_UTF8 */
-
-      /* Not UTF-8 mode */
+      /* Not UTF mode */
         {
         switch(ctype)
           {
@@ -5969,7 +5954,7 @@
 BOOL anchored;
 BOOL startline;
 BOOL firstline;
-BOOL utf8;
+BOOL utf;
 BOOL has_first_char = FALSE;
 BOOL has_req_char = FALSE;
 pcre_uchar first_char = 0;
@@ -6005,7 +5990,8 @@
 during "normal" pcre_exec() processing, not when the JIT support is in use,
 so they are set up later. */


-utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = md->utf = (re->options & PCRE_UTF8) != 0;
 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
               ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;


@@ -6013,10 +5999,10 @@
code for an invalid string if a results vector is available. */

 #ifdef SUPPORT_UTF8
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
   int erroroffset;
-  int errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)subject, length, &erroroffset);
+  int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
   if (errorcode != 0)
     {
     if (offsetcount >= 2)
@@ -6306,7 +6292,7 @@
     {
     PCRE_PUCHAR t = start_match;
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       {
       while (t < md->end_subject && !IS_NEWLINE(t))
         {
@@ -6348,7 +6334,7 @@
       if (start_match > md->start_subject + start_offset)
         {
 #ifdef SUPPORT_UTF8
-        if (utf8)
+        if (utf)
           {
           while (start_match < end_subject && !WAS_NEWLINE(start_match))
             {
@@ -6389,7 +6375,7 @@
           {
           start_match++;
 #ifdef SUPPORT_UTF8
-          if (utf8)
+          if (utf)
             while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
               start_match++;
 #endif
@@ -6521,7 +6507,7 @@
     case MATCH_THEN:
     new_start_match = start_match + 1;
 #ifdef SUPPORT_UTF8
-    if (utf8)
+    if (utf)
       while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
         new_start_match++;
 #endif


Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_internal.h    2011-12-03 07:58:30 UTC (rev 781)
@@ -292,8 +292,8 @@
 #define IS_NEWLINE(p) \
   ((NLBLOCK->nltype != NLTYPE_FIXED)? \
     ((p) < NLBLOCK->PSEND && \
-     PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, &(NLBLOCK->nllen),\
-       utf8)) \
+     PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
+       &(NLBLOCK->nllen), utf)) \
     : \
     ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
      (p)[0] == NLBLOCK->nl[0] && \
@@ -307,7 +307,7 @@
   ((NLBLOCK->nltype != NLTYPE_FIXED)? \
     ((p) > NLBLOCK->PSSTART && \
      PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
-       &(NLBLOCK->nllen), utf8)) \
+       &(NLBLOCK->nllen), utf)) \
     : \
     ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
      (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
@@ -581,7 +581,7 @@


#define GETCHARTEST(c, eptr) \
c = *eptr; \
- if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+ if (utf && c >= 0xc0) GETUTF8(c, eptr);

/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */
@@ -629,7 +629,7 @@

#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
- if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+ if (utf && c >= 0xc0) GETUTF8INC(c, eptr);

/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. */
@@ -681,7 +681,7 @@

#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
- if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
+ if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
@@ -1366,7 +1366,7 @@
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
-used for [^] in JavaScript compatibility mode, and for \C in non-utf8 mode. In
+used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In
non-DOTALL mode, "." behaves like \N.

 The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
@@ -1784,7 +1784,7 @@
        ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
        ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
        ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
-       ERR70, ERRCOUNT };
+       ERR70, ERR71, ERRCOUNT };


 /* The real format of the start of the pcre block; the index of names and the
 code vector run on as long as necessary after the end. We store an explicit
@@ -1934,7 +1934,7 @@
   BOOL   offset_overflow;         /* Set if too many extractions */
   BOOL   notbol;                  /* NOTBOL flag */
   BOOL   noteol;                  /* NOTEOL flag */
-  BOOL   utf8;                    /* UTF8 flag */
+  BOOL   utf;                     /* UTF-8 / UTF-16 flag */
   BOOL   jscript_compat;          /* JAVASCRIPT_COMPAT flag */
   BOOL   use_ucp;                 /* PCRE_UCP flag */
   BOOL   endonly;                 /* Dollar not before final \n */
@@ -2103,14 +2103,10 @@
 extern const pcre_uchar *PRIV(find_bracket)(const pcre_uchar *, BOOL, int);
 extern BOOL              PRIV(is_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
                            int *, BOOL);
-extern int               PRIV(ord2utf8)(int, pcre_uint8 *);
+extern int               PRIV(ord2utf)(pcre_uint32, pcre_uchar *);
 extern real_pcre        *PRIV(try_flipped)(const real_pcre *, real_pcre *,
                            const pcre_study_data *, pcre_study_data *);
-#ifndef COMPILE_PCRE16
-extern int               PRIV(valid_utf8)(PCRE_PUCHAR, int, int *);
-#else
-extern int               PRIV(valid_utf16)(PCRE_PUCHAR, int, int *);
-#endif
+extern int               PRIV(valid_utf)(PCRE_PUCHAR, int, int *);
 extern BOOL              PRIV(was_newline)(PCRE_PUCHAR, int, PCRE_PUCHAR,
                            int *, BOOL);
 extern BOOL              PRIV(xclass)(int, const pcre_uchar *);


Modified: code/branches/pcre16/pcre_jit_compile.c
===================================================================
--- code/branches/pcre16/pcre_jit_compile.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_jit_compile.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -298,7 +298,7 @@
   jump_list *caselesscmp;
   BOOL jscript_compat;
 #ifdef SUPPORT_UTF8
-  BOOL utf8;
+  BOOL utf;
 #ifdef SUPPORT_UCP
   BOOL useucp;
 #endif
@@ -497,7 +497,7 @@


case OP_ANYBYTE:
#ifdef SUPPORT_UTF8
- if (common->utf8) return NULL;
+ if (common->utf) return NULL;
#endif
return cc + 1;

@@ -544,7 +544,7 @@
case OP_NOTPOSQUERYI:
cc += 2;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
return cc;

@@ -566,7 +566,7 @@
case OP_NOTPOSUPTOI:
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF8
- if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+ if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
#endif
return cc;

@@ -1264,7 +1264,7 @@
unsigned int c;

#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
GETCHAR(c, cc);
if (c > 127)
@@ -1286,7 +1286,7 @@
{
/* Returns with the othercase. */
#ifdef SUPPORT_UTF8
-if (common->utf8 && c > 127)
+if (common->utf && c > 127)
{
#ifdef SUPPORT_UCP
return UCD_OTHERCASE(c);
@@ -1307,7 +1307,7 @@
#endif

#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
GETCHAR(c, cc);
if (c <= 127)
@@ -1343,7 +1343,7 @@
return 0;

#ifdef SUPPORT_UTF8
-if (common->utf8 && c > 127)
+if (common->utf && c > 127)
{
n = PRIV(utf8_table4)[*cc & 0x3f];
while ((bit & 0x3f) == 0)
@@ -1374,7 +1374,7 @@

OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
@@ -1395,7 +1395,7 @@

OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
@@ -1414,7 +1414,7 @@
#endif

#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -1439,7 +1439,7 @@
#ifdef SUPPORT_UTF8
struct sljit_label *label;

-if (common->utf8)
+if (common->utf)
{
label = LABEL();
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -1697,7 +1697,7 @@

/* Increasing the STR_PTR here requires one less jump in the most common case. */
#ifdef SUPPORT_UTF8
-if (common->utf8) readuchar = TRUE;
+if (common->utf) readuchar = TRUE;
#endif
if (newlinecheck) readuchar = TRUE;

@@ -1709,7 +1709,7 @@

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -1771,7 +1771,7 @@

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -1882,7 +1882,7 @@
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF
-if (common->utf8)
+if (common->utf)
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
#ifndef COMPILE_PCRE8
@@ -1896,12 +1896,12 @@
found = JUMP(SLJIT_C_NOT_ZERO);

 #ifdef SUPPORT_UTF
-if (common->utf8)
+if (common->utf)
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
 #endif
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
@@ -2051,7 +2051,7 @@
 #ifdef SUPPORT_UTF8
   /* Here LOCALS1 has already been zeroed. */
   jump = NULL;
-  if (common->utf8)
+  if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #endif
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
@@ -2090,7 +2090,7 @@
 #ifdef SUPPORT_UTF8
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
   jump = NULL;
-  if (common->utf8)
+  if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #endif
   OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
@@ -2119,7 +2119,7 @@
 COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
 OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
   OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
@@ -2143,7 +2143,7 @@
 COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
 OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
   OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680);
@@ -2177,7 +2177,7 @@
 COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
 OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
 #ifdef SUPPORT_UTF8
-if (common->utf8)
+if (common->utf)
   {
   COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
   OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
@@ -2289,7 +2289,7 @@
 unsigned int othercasebit = 0;
 pcre_uchar *othercasechar = NULL;
 #ifdef SUPPORT_UTF8
-int utf8length;
+int utflength;
 #endif


if (caseless && char_has_othercase(common, cc))
@@ -2336,9 +2336,9 @@
}

#ifdef SUPPORT_UTF8
-utf8length = 1;
-if (common->utf8 && *cc >= 0xc0)
- utf8length += PRIV(utf8_table4)[*cc & 0x3f];
+utflength = 1;
+if (common->utf && *cc >= 0xc0)
+ utflength += PRIV(utf8_table4)[*cc & 0x3f];

do
{
@@ -2432,9 +2432,9 @@

cc++;
#ifdef SUPPORT_UTF8
- utf8length--;
+ utflength--;
}
-while (utf8length > 0);
+while (utflength > 0);
#endif

return cc;
@@ -2480,7 +2480,7 @@
int invertcmp, numberofcmps;
unsigned int charoffset;

-/* Although SUPPORT_UTF8 must be defined, we are not necessary in utf8 mode. */
+/* Although SUPPORT_UTF must be defined, we are not necessarily in utf mode. */
check_input_end(common, fallbacks);
read_char(common);

@@ -2490,7 +2490,7 @@
 #ifndef COMPILE_PCRE8
   jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #elif defined SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
 #endif


@@ -2504,7 +2504,7 @@
 #ifndef COMPILE_PCRE8
   JUMPHERE(jump);
 #elif defined SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     JUMPHERE(jump);
 #endif
   OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
@@ -2524,7 +2524,7 @@
     {
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2534,11 +2534,11 @@
     {
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     cc++;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2639,7 +2639,7 @@
     {
     cc ++;
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       GETCHARINC(c, cc);
       }
@@ -2670,7 +2670,7 @@
     {
     cc ++;
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       GETCHARINC(c, cc);
       }
@@ -2679,7 +2679,7 @@
       c = *cc++;
     SET_CHAR_OFFSET(c);
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       GETCHARINC(c, cc);
       }
@@ -2876,7 +2876,7 @@
   case OP_ALLANY:
   check_input_end(common, fallbacks);
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
@@ -3096,7 +3096,7 @@
   case OP_CHARI:
   length = 1;
 #ifdef SUPPORT_UTF8
-  if (common->utf8 && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+  if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
 #endif
   if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
     {
@@ -3113,7 +3113,7 @@
   check_input_end(common, fallbacks);
   read_char(common);
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     GETCHAR(c, cc);
     }
@@ -3130,7 +3130,7 @@
   case OP_NOT:
   case OP_NOTI:
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     length = 1;
     if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
@@ -3196,7 +3196,7 @@
   jump[0] = NULL;
 #ifdef SUPPORT_UTF8
   /* This check can only be skipped in pure 8 bit mode. */
-  if (common->utf8)
+  if (common->utf)
 #endif
     {
     jump[0] = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
@@ -3231,7 +3231,7 @@
   OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
 #ifdef SUPPORT_UTF8
-  if (common->utf8)
+  if (common->utf)
     {
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, length);
     label = LABEL();
@@ -3269,7 +3269,7 @@
     {
     size = 1;
 #ifdef SUPPORT_UTF8
-    if (common->utf8 && cc[1] >= 0xc0)
+    if (common->utf && cc[1] >= 0xc0)
       size += PRIV(utf8_table4)[cc[1] & 0x3f];
 #endif
     }
@@ -3277,7 +3277,7 @@
     {
     size = 1;
 #ifdef SUPPORT_UTF8
-    if (common->utf8)
+    if (common->utf)
       {
       if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
         size = 0;
@@ -3381,7 +3381,7 @@


#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UCP
-if (common->utf8 && *cc == OP_REFI)
+if (common->utf && *cc == OP_REFI)
{
SLJIT_ASSERT(TMP1 == SLJIT_TEMPORARY_REG1 && STACK_TOP == SLJIT_TEMPORARY_REG2 && TMP2 == SLJIT_TEMPORARY_REG3);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), OVECTOR(offset + 1));
@@ -4787,7 +4787,7 @@
{
*end = cc + 1;
#ifdef SUPPORT_UTF8
- if (common->utf8 && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
+ if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
#endif
}
return cc;
@@ -6254,7 +6254,8 @@
common->caselesscmp = NULL;
common->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
#ifdef SUPPORT_UTF8
-common->utf8 = (re->options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+common->utf = (re->options & PCRE_UTF8) != 0;
#ifdef SUPPORT_UCP
common->useucp = (re->options & PCRE_UCP) != 0;
#endif

Modified: code/branches/pcre16/pcre_newline.c
===================================================================
--- code/branches/pcre16/pcre_newline.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_newline.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -67,17 +67,17 @@
   type         the newline type
   endptr       pointer to the end of the string
   lenptr       where to return the length
-  utf8         TRUE if in utf8 mode
+  utf          TRUE if in utf mode


 Returns:       TRUE or FALSE
 */


BOOL
PRIV(is_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR endptr, int *lenptr,
- BOOL utf8)
+ BOOL utf)
{
int c;
-if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
+if (utf) { GETCHAR(c, ptr); } else c = *ptr;

 if (type == NLTYPE_ANYCRLF) switch(c)
   {
@@ -96,7 +96,7 @@
   case 0x000c: *lenptr = 1; return TRUE;             /* FF */
   case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
                return TRUE;                          /* CR */
-  case 0x0085: *lenptr = utf8? 2 : 1; return TRUE;   /* NEL */
+  case 0x0085: *lenptr = utf? 2 : 1; return TRUE;    /* NEL */
   case 0x2028:                                       /* LS */
   case 0x2029: *lenptr = 3; return TRUE;             /* PS */
   default: return FALSE;
@@ -117,19 +117,19 @@
   type         the newline type
   startptr     pointer to the start of the string
   lenptr       where to return the length
-  utf8         TRUE if in utf8 mode
+  utf          TRUE if in utf mode


 Returns:       TRUE or FALSE
 */


 BOOL
 PRIV(was_newline)(PCRE_PUCHAR ptr, int type, PCRE_PUCHAR startptr, int *lenptr,
-  BOOL utf8)
+  BOOL utf)
 {
 int c;
 ptr--;
 #ifdef SUPPORT_UTF8
-if (utf8)
+if (utf)
   {
   BACKCHAR(ptr);
   GETCHAR(c, ptr);
@@ -154,7 +154,7 @@
   case 0x000b:                                      /* VT */
   case 0x000c:                                      /* FF */
   case 0x000d: *lenptr = 1; return TRUE;            /* CR */
-  case 0x0085: *lenptr = utf8? 2 : 1; return TRUE;  /* NEL */
+  case 0x0085: *lenptr = utf? 2 : 1; return TRUE;   /* NEL */
   case 0x2028:                                      /* LS */
   case 0x2029: *lenptr = 3; return TRUE;            /* PS */
   default: return FALSE;


Modified: code/branches/pcre16/pcre_ord2utf8.c
===================================================================
--- code/branches/pcre16/pcre_ord2utf8.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_ord2utf8.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -52,21 +52,28 @@
 *       Convert character value to UTF-8         *
 *************************************************/


-/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 0 to 6 bytes.
+/* This function takes an integer value in the range 0 - 0x10ffff
+and encodes it as a UTF-8 character in 1 to 6 pcre_uchars.

 Arguments:
   cvalue     the character value
-  buffer     pointer to buffer for result - at least 6 bytes long
+  buffer     pointer to buffer for result - at least 6 pcre_uchars long


 Returns:     number of characters placed in the buffer
 */


int
-PRIV(ord2utf8)(int cvalue, pcre_uint8 *buffer)
+PRIV(ord2utf)(pcre_uint32 cvalue, pcre_uchar *buffer)
{
#ifdef SUPPORT_UTF8
+
register int i, j;
+
+/* An invalid cvalue (surrogate bit pattern 0xd800-0xdfff, or >= 0x110000) is
+replaced by 0xfffe. Should never happen in practice. */
+if ((cvalue & 0xf800) == 0xd800 || cvalue >= 0x110000)
+ cvalue = 0xfffe;
+
for (i = 0; i < PRIV(utf8_table1_size); i++)
if (cvalue <= PRIV(utf8_table1)[i]) break;
buffer += i;
@@ -77,10 +84,13 @@
}
*buffer = PRIV(utf8_table2)[i] | cvalue;
return i + 1;
+
#else
+
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
return 0;
+
#endif
}


Modified: code/branches/pcre16/pcre_study.c
===================================================================
--- code/branches/pcre16/pcre_study.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_study.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -82,7 +82,8 @@
   int recurse_depth)
 {
 int length = -1;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
 BOOL had_recurse = FALSE;
 register int branchlength = 0;
 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
@@ -224,7 +225,7 @@
     branchlength++;
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;


@@ -245,7 +246,7 @@
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;


@@ -293,7 +294,7 @@

     case OP_ANYBYTE:
 #ifdef SUPPORT_UTF8
-    if (utf8) return -1;
+    if (utf) return -1;
 #endif
     branchlength++;
     cc++;
@@ -374,7 +375,7 @@
     case OP_REFI:
     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
       {
-      ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf8, GET2(cc, 1));
+      ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
       if (cs == NULL) return -2;
       do ce += GET(ce, 1); while (*ce == OP_ALT);
       if (cc > cs && cc < ce)
@@ -486,7 +487,7 @@


     cc += PRIV(OP_lengths)[op];
 #ifdef SUPPORT_UTF8
-    if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
 #endif
     break;


@@ -537,29 +538,29 @@
   p             points to the character
   caseless      the caseless flag
   cd            the block with char table pointers
-  utf8          TRUE for UTF-8 mode
+  utf           TRUE for UTF-8 / UTF-16 mode


 Returns:        pointer after the character
 */


static const pcre_uchar *
set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
- compile_data *cd, BOOL utf8)
+ compile_data *cd, BOOL utf)
{
unsigned int c = *p;

SET_BIT(c);

 #ifdef SUPPORT_UTF8
-if (utf8 && c > 127)
+if (utf && c > 127)
   {
   GETCHARINC(c, p);
 #ifdef SUPPORT_UCP
   if (caseless)
     {
-    pcre_uint8 buff[8];
+    pcre_uchar buff[6];
     c = UCD_OTHERCASE(c);
-    (void)PRIV(ord2utf8)(c, buff);
+    (void)PRIV(ord2utf)(c, buff);
     SET_BIT(buff[0]);
     }
 #endif
@@ -607,8 +608,8 @@
   {
   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
     {
-    pcre_uint8 buff[8];
-    (void)PRIV(ord2utf8)(c, buff);
+    pcre_uchar buff[6];
+    (void)PRIV(ord2utf)(c, buff);
     SET_BIT(buff[0]);
     }
   }
@@ -663,7 +664,7 @@
 Arguments:
   code         points to an expression
   start_bits   points to a 32-byte table, initialized to 0
-  utf8         TRUE if in UTF-8 mode
+  utf          TRUE if in UTF-8 / UTF-16 mode
   cd           the block with char table pointers


 Returns:       SSB_FAIL     => Failed to find any starting bytes
@@ -673,12 +674,12 @@
 */


static int
-set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf8,
+set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
compile_data *cd)
{
register int c;
int yield = SSB_DONE;
-int table_limit = utf8? 16:32;
+int table_limit = utf? 16:32;

 #if 0
 /* ========================================================================= */
@@ -817,7 +818,7 @@
       case OP_ONCE:
       case OP_ONCE_NC:
       case OP_ASSERT:
-      rc = set_start_bits(tcode, start_bits, utf8, cd);
+      rc = set_start_bits(tcode, start_bits, utf, cd);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
       if (rc == SSB_DONE) try_next = FALSE; else
         {
@@ -864,7 +865,7 @@
       case OP_BRAZERO:
       case OP_BRAMINZERO:
       case OP_BRAPOSZERO:
-      rc = set_start_bits(++tcode, start_bits, utf8, cd);
+      rc = set_start_bits(++tcode, start_bits, utf, cd);
       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 /* =========================================================================
       See the comment at the head of this function concerning the next line,
@@ -891,7 +892,7 @@
       case OP_QUERY:
       case OP_MINQUERY:
       case OP_POSQUERY:
-      tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
       break;


       case OP_STARI:
@@ -900,7 +901,7 @@
       case OP_QUERYI:
       case OP_MINQUERYI:
       case OP_POSQUERYI:
-      tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
       break;


       /* Single-char upto sets the bit and tries the next */
@@ -908,13 +909,13 @@
       case OP_UPTO:
       case OP_MINUPTO:
       case OP_POSUPTO:
-      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
       break;


       case OP_UPTOI:
       case OP_MINUPTOI:
       case OP_POSUPTOI:
-      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf8);
+      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
       break;


       /* At least one single char sets the bit and stops */
@@ -926,7 +927,7 @@
       case OP_PLUS:
       case OP_MINPLUS:
       case OP_POSPLUS:
-      (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+      (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
       try_next = FALSE;
       break;


@@ -937,7 +938,7 @@
       case OP_PLUSI:
       case OP_MINPLUSI:
       case OP_POSPLUSI:
-      (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+      (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
       try_next = FALSE;
       break;


@@ -950,7 +951,7 @@
       case OP_HSPACE:
       SET_BIT(0x09);
       SET_BIT(0x20);
-      if (utf8)
+      if (utf)
         {
         SET_BIT(0xC2);  /* For U+00A0 */
         SET_BIT(0xE1);  /* For U+1680, U+180E */
@@ -967,7 +968,7 @@
       SET_BIT(0x0B);
       SET_BIT(0x0C);
       SET_BIT(0x0D);
-      if (utf8)
+      if (utf)
         {
         SET_BIT(0xC2);  /* For U+0085 */
         SET_BIT(0xE2);  /* For U+2028, U+2029 */
@@ -1057,7 +1058,7 @@
         case OP_HSPACE:
         SET_BIT(0x09);
         SET_BIT(0x20);
-        if (utf8)
+        if (utf)
           {
           SET_BIT(0xC2);  /* For U+00A0 */
           SET_BIT(0xE1);  /* For U+1680, U+180E */
@@ -1073,7 +1074,7 @@
         SET_BIT(0x0B);
         SET_BIT(0x0C);
         SET_BIT(0x0D);
-        if (utf8)
+        if (utf)
           {
           SET_BIT(0xC2);  /* For U+0085 */
           SET_BIT(0xE2);  /* For U+2028, U+2029 */
@@ -1126,7 +1127,7 @@


       case OP_NCLASS:
 #ifdef SUPPORT_UTF8
-      if (utf8)
+      if (utf)
         {
         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
@@ -1147,7 +1148,7 @@
         characters in the range 128 - 255. */


 #ifdef SUPPORT_UTF8
-        if (utf8)
+        if (utf)
           {
           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
           for (c = 128; c < 256; c++)


Modified: code/branches/pcre16/pcre_valid_utf8.c
===================================================================
--- code/branches/pcre16/pcre_valid_utf8.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcre_valid_utf8.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -103,7 +103,7 @@
 */


int
-PRIV(valid_utf8)(PCRE_PUCHAR string, int length, int *erroroffset)
+PRIV(valid_utf)(PCRE_PUCHAR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF8
register PCRE_PUCHAR p;

Modified: code/branches/pcre16/pcreposix.c
===================================================================
--- code/branches/pcre16/pcreposix.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/pcreposix.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -155,6 +155,7 @@
   REG_BADPAT,  /* \k is not followed by a braced, angle-bracketed, or quoted name */
   /* 70 */
   REG_BADPAT,  /* internal error: unknown opcode in find_fixedlength() */ 
+  REG_BADPAT,  /* Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) */
 };


/* Table of texts corresponding to POSIX error codes */

Modified: code/branches/pcre16/sljit/sljitConfigInternal.h
===================================================================
--- code/branches/pcre16/sljit/sljitConfigInternal.h    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitConfigInternal.h    2011-12-03 07:58:30 UTC (rev 781)
@@ -354,8 +354,8 @@
 #endif /* !SLJIT_UNALIGNED */


#if (defined SLJIT_EXECUTABLE_ALLOCATOR && SLJIT_EXECUTABLE_ALLOCATOR)
-static void* sljit_malloc_exec(sljit_uw size);
-static void sljit_free_exec(void* ptr);
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size);
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr);
#define SLJIT_MALLOC_EXEC(size) sljit_malloc_exec(size)
#define SLJIT_FREE_EXEC(ptr) sljit_free_exec(ptr)
#endif

Modified: code/branches/pcre16/sljit/sljitExecAllocator.c
===================================================================
--- code/branches/pcre16/sljit/sljitExecAllocator.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitExecAllocator.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -163,7 +163,7 @@
     }
 }


-static void* sljit_malloc_exec(sljit_uw size)
+SLJIT_API_FUNC_ATTRIBUTE void* sljit_malloc_exec(sljit_uw size)
 {
     struct block_header *header;
     struct block_header *next_header;
@@ -231,7 +231,7 @@
     return MEM_START(header);
 }


-static void sljit_free_exec(void* ptr)
+SLJIT_API_FUNC_ATTRIBUTE void sljit_free_exec(void* ptr)
 {
     struct block_header *header;
     struct free_block* free_block;


Modified: code/branches/pcre16/sljit/sljitLir.h
===================================================================
--- code/branches/pcre16/sljit/sljitLir.h    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitLir.h    2011-12-03 07:58:30 UTC (rev 781)
@@ -195,6 +195,8 @@
     int local_size;
     /* Code size. */
     sljit_uw size;
+    /* For statistical purposes. */
+    sljit_uw executable_size;


 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     int args;
@@ -291,6 +293,15 @@
 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler);
 SLJIT_API_FUNC_ATTRIBUTE void sljit_free_code(void* code);


+/*
+ After the code generation we can retrieve the allocated executable memory size,
+ although this area may not be fully filled with instructions depending on some
+ optimizations. This function is useful only for statistical purposes.
+
+ Before a successful code generation, this function returns with 0.
+*/
+static SLJIT_INLINE sljit_uw sljit_get_generated_code_size(struct sljit_compiler *compiler) { return compiler->executable_size; }
+
/* Instruction generation. Returns with error code. */

/*

Modified: code/branches/pcre16/sljit/sljitNativeARM_Thumb2.c
===================================================================
--- code/branches/pcre16/sljit/sljitNativeARM_Thumb2.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitNativeARM_Thumb2.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -416,6 +416,7 @@


     SLJIT_CACHE_FLUSH(code, code_ptr);
     compiler->error = SLJIT_ERR_COMPILED;
+    compiler->executable_size = compiler->size * sizeof(sljit_uh);
     /* Set thumb mode flag. */
     return (void*)((sljit_uw)code | 0x1);
 }


Modified: code/branches/pcre16/sljit/sljitNativeARM_v5.c
===================================================================
--- code/branches/pcre16/sljit/sljitNativeARM_v5.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitNativeARM_v5.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -788,6 +788,7 @@


     SLJIT_CACHE_FLUSH(code, code_ptr);
     compiler->error = SLJIT_ERR_COMPILED;
+    compiler->executable_size = size * sizeof(sljit_uw);
     return code;
 }



Modified: code/branches/pcre16/sljit/sljitNativeMIPS_common.c
===================================================================
--- code/branches/pcre16/sljit/sljitNativeMIPS_common.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitNativeMIPS_common.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -397,6 +397,7 @@
     }


     compiler->error = SLJIT_ERR_COMPILED;
+    compiler->executable_size = compiler->size * sizeof(sljit_ins);
 #ifndef __GNUC__
     SLJIT_CACHE_FLUSH(code, code_ptr);
 #else


Modified: code/branches/pcre16/sljit/sljitNativePPC_common.c
===================================================================
--- code/branches/pcre16/sljit/sljitNativePPC_common.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitNativePPC_common.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -354,6 +354,7 @@


     SLJIT_CACHE_FLUSH(code, code_ptr);
     compiler->error = SLJIT_ERR_COMPILED;
+    compiler->executable_size = compiler->size * sizeof(sljit_ins);


 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64)
     if (((sljit_w)code_ptr) & 0x4)


Modified: code/branches/pcre16/sljit/sljitNativeX86_common.c
===================================================================
--- code/branches/pcre16/sljit/sljitNativeX86_common.c    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/sljit/sljitNativeX86_common.c    2011-12-03 07:58:30 UTC (rev 781)
@@ -357,22 +357,22 @@
     while (jump) {
         if (jump->flags & PATCH_MB) {
             SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) >= -128 && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_b))) <= 127);
-            *(sljit_ub*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_b));
+            *(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_b)));
         } else if (jump->flags & PATCH_MW) {
             if (jump->flags & JUMP_LABEL) {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-                *(sljit_w*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_w));
+                *(sljit_w*)jump->addr = (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_w)));
 #else
                 SLJIT_ASSERT((sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll);
-                *(sljit_hw*)jump->addr = jump->u.label->addr - (jump->addr + sizeof(sljit_hw));
+                *(sljit_hw*)jump->addr = (sljit_hw)(jump->u.label->addr - (jump->addr + sizeof(sljit_hw)));
 #endif
             }
             else {
 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-                *(sljit_w*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_w));
+                *(sljit_w*)jump->addr = (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_w)));
 #else
                 SLJIT_ASSERT((sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) >= -0x80000000ll && (sljit_w)(jump->u.target - (jump->addr + sizeof(sljit_hw))) <= 0x7fffffffll);
-                *(sljit_hw*)jump->addr = jump->u.target - (jump->addr + sizeof(sljit_hw));
+                *(sljit_hw*)jump->addr = (sljit_hw)(jump->u.target - (jump->addr + sizeof(sljit_hw)));
 #endif
             }
         }
@@ -387,6 +387,7 @@
     /* Maybe we waste some space because of short jumps. */
     SLJIT_ASSERT(code_ptr <= code + compiler->size);
     compiler->error = SLJIT_ERR_COMPILED;
+    compiler->executable_size = compiler->size;
     return (void*)code;
 }


@@ -1360,7 +1361,7 @@
             code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
             FAIL_IF(!code);
             INC_CSIZE(4);
-            *(sljit_hw*)code = src1w;
+            *(sljit_hw*)code = (sljit_hw)src1w;
         }
         else {
             EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
@@ -1403,7 +1404,7 @@
             code = (sljit_ub*)ensure_buf(compiler, 1 + 4);
             FAIL_IF(!code);
             INC_CSIZE(4);
-            *(sljit_hw*)code = src2w;
+            *(sljit_hw*)code = (sljit_hw)src2w;
         }
         else {
             EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);


Modified: code/branches/pcre16/testdata/testinput10
===================================================================
--- code/branches/pcre16/testdata/testinput10    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/testdata/testinput10    2011-12-03 07:58:30 UTC (rev 781)
@@ -52,12 +52,10 @@


/\x{100000}/8BM

-/\x{1000000}/8BM
+/\x{10ffff}/8BM

-/\x{4000000}/8BM
+/\x{110000}/8BM

-/\x{7fffFFFF}/8BM
-
/[\x{ff}]/8BM

/[\x{100}]/8BM

Modified: code/branches/pcre16/testdata/testinput5
===================================================================
--- code/branches/pcre16/testdata/testinput5    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/testdata/testinput5    2011-12-03 07:58:30 UTC (rev 781)
@@ -9,12 +9,10 @@


/\x{100000}/8DZ

-/\x{1000000}/8DZ
+/\x{10ffff}/8DZ

-/\x{4000000}/8DZ
+/\x{110000}/8DZ

-/\x{7fffFFFF}/8DZ
-
/[\x{ff}]/8DZ

/[\x{100}]/8DZ
@@ -23,6 +21,14 @@

/\x{100000000}/8

+/\x{d800}/8
+
+/\x{dfff}/8
+
+/\x{d7ff}/8
+
+/\x{e000}/8
+
 /^\x{100}a\x{1234}/8
     \x{100}a\x{1234}bcd



Modified: code/branches/pcre16/testdata/testoutput10
===================================================================
--- code/branches/pcre16/testdata/testoutput10    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/testdata/testoutput10    2011-12-03 07:58:30 UTC (rev 781)
@@ -317,33 +317,18 @@
  11     End
 ------------------------------------------------------------------


-/\x{1000000}/8BM
-Memory allocation (code space): 13
+/\x{10ffff}/8BM
+Memory allocation (code space): 12
 ------------------------------------------------------------------
-  0   9 Bra
-  3     \x{1000000}
-  9   9 Ket
- 12     End
+  0   8 Bra
+  3     \x{10ffff}
+  8   8 Ket
+ 11     End
 ------------------------------------------------------------------


-/\x{4000000}/8BM
-Memory allocation (code space): 14
-------------------------------------------------------------------
-  0  10 Bra
-  3     \x{4000000}
- 10  10 Ket
- 13     End
-------------------------------------------------------------------
+/\x{110000}/8BM
+Failed: character value in \x{...} sequence is too large at offset 9


-/\x{7fffFFFF}/8BM
-Memory allocation (code space): 14
-------------------------------------------------------------------
-  0  10 Bra
-  3     \x{7fffffff}
- 10  10 Ket
- 13     End
-------------------------------------------------------------------
-
 /[\x{ff}]/8BM
 Memory allocation (code space): 10
 ------------------------------------------------------------------


Modified: code/branches/pcre16/testdata/testoutput5
===================================================================
--- code/branches/pcre16/testdata/testoutput5    2011-12-02 11:39:21 UTC (rev 780)
+++ code/branches/pcre16/testdata/testoutput5    2011-12-03 07:58:30 UTC (rev 781)
@@ -49,42 +49,21 @@
 First char = 244
 Need char = 128


-/\x{1000000}/8DZ
+/\x{10ffff}/8DZ
 ------------------------------------------------------------------
         Bra
-        \x{1000000}
+        \x{10ffff}
         Ket
         End
 ------------------------------------------------------------------
 Capturing subpattern count = 0
 Options: utf8
-First char = 249
-Need char = 128
-
-/\x{4000000}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{4000000}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 252
-Need char = 128
-
-/\x{7fffFFFF}/8DZ
-------------------------------------------------------------------
-        Bra
-        \x{7fffffff}
-        Ket
-        End
-------------------------------------------------------------------
-Capturing subpattern count = 0
-Options: utf8
-First char = 253
+First char = 244
 Need char = 191


+/\x{110000}/8DZ
+Failed: character value in \x{...} sequence is too large at offset 9
+
 /[\x{ff}]/8DZ
 ------------------------------------------------------------------
         Bra
@@ -115,6 +94,16 @@
 /\x{100000000}/8
 Failed: character value in \x{...} sequence is too large at offset 12


+/\x{d800}/8
+Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{dfff}/8
+Failed: Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff) at offset 7
+
+/\x{d7ff}/8
+
+/\x{e000}/8
+
 /^\x{100}a\x{1234}/8
     \x{100}a\x{1234}bcd
  0: \x{100}a\x{1234}
@@ -1436,7 +1425,7 @@
 /[\H]/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{7fffffff}]
+        [\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
         Ket
         End
 ------------------------------------------------------------------
@@ -1444,7 +1433,7 @@
 /[\V]/8BZ
 ------------------------------------------------------------------
         Bra
-        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{7fffffff}]
+        [\x00-\x09\x0e-\x84\x86-\xff\x{100}-\x{2027}\x{2029}-\x{10ffff}]
         Ket
         End
 ------------------------------------------------------------------