[Pcre-svn] [782] code/branches/pcre16: Start working on UTF-16.

Author: Subversion repository
Date:
To: pcre-svn
Subject: [Pcre-svn] [782] code/branches/pcre16: Start working on UTF-16.

Revision: 782

          http://vcs.pcre.org/viewvc?view=rev&revision=782
Author:   zherczeg
Date:     2011-12-03 23:58:37 +0000 (Sat, 03 Dec 2011)

Log Message:
-----------
Start working on UTF-16. Updating macros and adding new ones.

Modified Paths:
--------------
    code/branches/pcre16/Makefile.am
    code/branches/pcre16/pcre16_ord2utf16.c
    code/branches/pcre16/pcre16_utf16_utils.c
    code/branches/pcre16/pcre16_valid_utf16.c
    code/branches/pcre16/pcre_compile.c
    code/branches/pcre16/pcre_dfa_exec.c
    code/branches/pcre16/pcre_exec.c
    code/branches/pcre16/pcre_internal.h
    code/branches/pcre16/pcre_jit_compile.c
    code/branches/pcre16/pcre_printint.src
    code/branches/pcre16/pcre_study.c
    code/branches/pcre16/pcre_tables.c

Added Paths:
-----------
    code/branches/pcre16/pcre16_ucd.c

Modified: code/branches/pcre16/Makefile.am
===================================================================
--- code/branches/pcre16/Makefile.am    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/Makefile.am    2011-12-03 23:58:37 UTC (rev 782)
@@ -219,6 +219,7 @@
   pcre16_study.c \
   pcre16_tables.c \
   pcre16_try_flipped.c \
+  pcre16_ucd.c \
   pcre16_utf16_utils.c \
   pcre16_valid_utf16.c \
   pcre16_xclass.c

Modified: code/branches/pcre16/pcre16_ord2utf16.c
===================================================================
--- code/branches/pcre16/pcre16_ord2utf16.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre16_ord2utf16.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -45,9 +45,11 @@
 #include "config.h"
 #endif

+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
#include "pcre_internal.h"

-
 /*************************************************
 *       Convert character value to UTF-16         *
 *************************************************/

Added: code/branches/pcre16/pcre16_ucd.c
===================================================================
--- code/branches/pcre16/pcre16_ucd.c                            (rev 0)
+++ code/branches/pcre16/pcre16_ucd.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_ucd.c"
+
+/* End of pcre16_ucd.c */

Modified: code/branches/pcre16/pcre16_utf16_utils.c
===================================================================
--- code/branches/pcre16/pcre16_utf16_utils.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre16_utf16_utils.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -46,6 +46,9 @@
 #include "config.h"
 #endif

+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
#include "pcre_internal.h"

int

Modified: code/branches/pcre16/pcre16_valid_utf16.c
===================================================================
--- code/branches/pcre16/pcre16_valid_utf16.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre16_valid_utf16.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -46,6 +46,9 @@
 #include "config.h"
 #endif

+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
#include "pcre_internal.h"

Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_compile.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -1466,8 +1466,8 @@
       {
       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
       ptr++;
-#ifdef SUPPORT_UTF8
-      if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+      if (utf) FORWARDCHAR(ptr);
 #endif
       }
     if (*ptr == 0) goto FAIL_EXIT;
@@ -1759,8 +1759,8 @@
     case OP_NOTI:
     branchlength++;
     cc += 2;
-#ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;

@@ -1773,8 +1773,8 @@
     case OP_NOTEXACTI:
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;

@@ -2041,7 +2041,7 @@
a multi-byte character. The length in the table is a minimum, so we have to
arrange to skip the extra bytes. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf) switch(c)
       {
       case OP_CHAR:
@@ -2072,7 +2072,7 @@
       case OP_MINQUERYI:
       case OP_POSQUERY:
       case OP_POSQUERYI:
-      if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
+      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;
       }
 #else
@@ -2161,7 +2161,7 @@
     by a multi-byte character. The length in the table is a minimum, so we have
     to arrange to skip the extra bytes. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf) switch(c)
       {
       case OP_CHAR:
@@ -2192,7 +2192,7 @@
       case OP_MINQUERYI:
       case OP_POSQUERY:
       case OP_POSQUERYI:
-      if (code[-1] >= 0xc0) code += PRIV(utf8_table4)[code[-1] & 0x3f];
+      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;
       }
 #else
@@ -2452,7 +2452,7 @@
     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
     MINUPTO, and POSUPTO may be followed by a multibyte character */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     case OP_STAR:
     case OP_STARI:
     case OP_MINSTAR:
@@ -2465,7 +2465,7 @@
     case OP_MINQUERYI:
     case OP_POSQUERY:
     case OP_POSQUERYI:
-    if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f];
+    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
     break;

     case OP_UPTO:
@@ -2474,7 +2474,7 @@
     case OP_MINUPTOI:
     case OP_POSUPTO:
     case OP_POSUPTOI:
-    if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f];
+    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
     break;
 #endif

@@ -2913,8 +2913,8 @@
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
-#ifdef SUPPORT_UTF8
-        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+        if (utf) FORWARDCHAR(ptr);
 #endif
         }
       }
@@ -2957,8 +2957,8 @@
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
         ptr++;
-#ifdef SUPPORT_UTF8
-        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+        if (utf) FORWARDCHAR(ptr);
 #endif
         }
       }
@@ -3424,7 +3424,7 @@
   int tempbracount;
   pcre_uchar mcbuffer[8];

- /* Get next byte in the pattern */
+ /* Get next character in the pattern */

c = *ptr;

@@ -3556,8 +3556,8 @@
         {
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
         ptr++;
-#ifdef SUPPORT_UTF8
-        if (utf) while ((*ptr & 0xc0) == 0x80) ptr++;
+#ifdef SUPPORT_UTF
+        if (utf) FORWARDCHAR(ptr);
 #endif
         }
       if (*ptr != 0) continue;
@@ -4601,7 +4601,7 @@
       {
       op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;

-      /* Deal with UTF-8 characters that take up more than one byte. It's
+      /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to
       hold the length of the character in bytes, plus 0x80 to flag that it's a
       length rather than a small character. */
@@ -4610,16 +4610,16 @@
       if (utf && (code[-1] & 0x80) != 0)
         {
         pcre_uchar *lastchar = code - 1;
-        while((*lastchar & 0xc0) == 0x80) lastchar--;
+        BACKCHAR(lastchar);
         c = code - lastchar;            /* Length of UTF-8 character */
-        memcpy(utf_chars, lastchar, c); /* Save the char */
+        memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
         c |= 0x80;                      /* Flag c as a length */
         }
       else
 #endif

-      /* Handle the case of a single byte - either with no UTF8 support, or
-      with UTF-8 disabled, or for a UTF-8 character < 128. */
+      /* Handle the case of a single charater - either with no UTF support, or
+      with UTF disabled, or for a single character UTF character. */

         {
         c = code[-1];
@@ -5273,9 +5273,9 @@
       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
         {
         tempcode += PRIV(OP_lengths)[*tempcode];
-#ifdef SUPPORT_UTF8
-        if (utf && tempcode[-1] >= 0xc0)
-          tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+        if (utf && HAS_EXTRALEN(tempcode[-1]))
+          tempcode += GET_EXTRALEN(tempcode[-1]);
 #endif
         }

@@ -6659,11 +6659,10 @@
     mclength = 1;
     mcbuffer[0] = c;

-#ifdef SUPPORT_UTF8
-    if (utf && c >= 0xc0)
+#ifdef SUPPORT_UTF
+    if (utf && HAS_EXTRALEN(c))
       {
-      while ((ptr[1] & 0xc0) == 0x80)
-        mcbuffer[mclength++] = *(++ptr);
+      INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
       }
 #endif

Modified: code/branches/pcre16/pcre_dfa_exec.c
===================================================================
--- code/branches/pcre16/pcre_dfa_exec.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_dfa_exec.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -480,9 +480,7 @@
       {
       if (current_subject <= start_subject) break;
       current_subject--;
-      while (current_subject > start_subject &&
-             (*current_subject & 0xc0) == 0x80)
-        current_subject--;
+      INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
       }
     }
   else
@@ -3161,9 +3159,17 @@
     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
     }
+#ifdef COMPILE_PCRE8
   if (start_offset > 0 && start_offset < length &&
         (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
     return PCRE_ERROR_BADUTF8_OFFSET;
+#else
+#ifdef COMPILE_PCRE16
+  if (start_offset > 0 && start_offset < length &&
+        (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
+    return PCRE_ERROR_BADUTF8_OFFSET;
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
   }
 #endif

@@ -3234,13 +3240,13 @@
     if (firstline)
       {
       PCRE_PUCHAR t = current_subject;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         while (t < md->end_subject && !IS_NEWLINE(t))
           {
           t++;
-          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+          INTERNALCHAR(t < end_subject, *t, t++);
           }
         }
       else
@@ -3277,16 +3283,15 @@
         {
         if (current_subject > md->start_subject + start_offset)
           {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
           if (utf)
             {
             while (current_subject < end_subject &&
                    !WAS_NEWLINE(current_subject))
               {
               current_subject++;
-              while(current_subject < end_subject &&
-                    (*current_subject & 0xc0) == 0x80)
-                current_subject++;
+              INTERNALCHAR(current_subject < end_subject, *current_subject,
+                current_subject++);
               }
             }
           else
@@ -3316,10 +3321,10 @@
           if ((start_bits[c/8] & (1 << (c&7))) == 0)
             {
             current_subject++;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
             if (utf)
-              while(current_subject < end_subject &&
-                    (*current_subject & 0xc0) == 0x80) current_subject++;
+              INTERNALCHAR(current_subject < end_subject, *current_subject,
+                current_subject++);
 #endif
             }
           else break;
@@ -3426,11 +3431,13 @@

   if (firstline && IS_NEWLINE(current_subject)) break;
   current_subject++;
+#ifdef SUPPORT_UTF
   if (utf)
     {
-    while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
-      current_subject++;
+    INTERNALCHAR(current_subject < end_subject, *current_subject,
+      current_subject++);
     }
+#endif
   if (current_subject > end_subject) break;

/* If we have just passed a CR and we are now at a LF, and the pattern does

Modified: code/branches/pcre16/pcre_exec.c
===================================================================
--- code/branches/pcre16/pcre_exec.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_exec.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -2077,7 +2077,7 @@
         if (eptr == md->start_subject) prev_is_word = FALSE; else
           {
           PCRE_PUCHAR lastptr = eptr - 1;
-          while((*lastptr & 0xc0) == 0x80) lastptr--;
+          BACKCHAR(lastptr);
           if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
           GETCHAR(c, lastptr);
 #ifdef SUPPORT_UCP
@@ -2189,7 +2189,9 @@
       MRRETURN(MATCH_NOMATCH);
       }
     eptr++;
-    if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+#ifdef SUPPORT_UTF
+    if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+#endif
     ecode++;
     break;

@@ -4074,7 +4076,7 @@

/* Handle all other cases when the coding is UTF-8 */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf) switch(ctype)
         {
         case OP_ANY:
@@ -4087,7 +4089,7 @@
             }
           if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
           eptr++;
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -4100,7 +4102,7 @@
             MRRETURN(MATCH_NOMATCH);
             }
           eptr++;
-          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -4298,7 +4300,8 @@
             }
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
             MRRETURN(MATCH_NOMATCH);
-          while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+          eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -4326,7 +4329,8 @@
             }
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
             MRRETURN(MATCH_NOMATCH);
-          while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
+          eptr++;
+          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -5309,7 +5313,7 @@
       else
 #endif   /* SUPPORT_UCP */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         switch(ctype)
@@ -5326,7 +5330,7 @@
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }

@@ -5343,7 +5347,7 @@
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           break;
@@ -5359,7 +5363,7 @@
                 break;
                 }
               eptr++;
-              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           else
@@ -6014,10 +6018,18 @@
       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
     }

-  /* Check that a start_offset points to the start of a UTF-8 character. */
+  /* Check that a start_offset points to the start of a UTF character. */
+#ifdef COMPILE_PCRE8
   if (start_offset > 0 && start_offset < length &&
       (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
     return PCRE_ERROR_BADUTF8_OFFSET;
+#else
+#ifdef COMPILE_PCRE16
+  if (start_offset > 0 && start_offset < length &&
+      (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
+    return PCRE_ERROR_BADUTF8_OFFSET;
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
   }
 #endif

@@ -6291,13 +6303,13 @@
   if (firstline)
     {
     PCRE_PUCHAR t = start_match;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       while (t < md->end_subject && !IS_NEWLINE(t))
         {
         t++;
-        while (t < end_subject && (*t & 0xc0) == 0x80) t++;
+        INTERNALCHAR(t < end_subject, *t, t++);
         }
       }
     else
@@ -6333,14 +6345,14 @@
       {
       if (start_match > md->start_subject + start_offset)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           while (start_match < end_subject && !WAS_NEWLINE(start_match))
             {
             start_match++;
-            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-              start_match++;
+            INTERNALCHAR(start_match < end_subject, *start_match,
+              start_match++);
             }
           }
         else
@@ -6366,7 +6378,7 @@
       {
       while (start_match < end_subject)
         {
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE
         register unsigned int c = *start_match;
 #else
         register unsigned int c = *start_match & 0xff;
@@ -6374,10 +6386,10 @@
         if ((start_bits[c/8] & (1 << (c&7))) == 0)
           {
           start_match++;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
           if (utf)
-            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
-              start_match++;
+            INTERNALCHAR(start_match < end_subject, *start_match,
+              start_match++);
 #endif
           }
         else break;
@@ -6506,10 +6518,10 @@
     case MATCH_PRUNE:
     case MATCH_THEN:
     new_start_match = start_match + 1;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
-      while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
-        new_start_match++;
+      INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+        new_start_match++);
 #endif
     break;

Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_internal.h    2011-12-03 23:58:37 UTC (rev 782)
@@ -531,7 +531,9 @@
 never be called in byte mode. To make sure they can never even appear when
 UTF-8 support is omitted, we don't even define them. */

-#ifndef SUPPORT_UTF8
+/* #define HAS_EXTRALEN(c) */
+/* #define GET_EXTRALEN(c) */
+#ifndef SUPPORT_UTF
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
@@ -539,14 +541,27 @@
#define GETCHARLEN(c, eptr, len) c = *eptr;
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
+/* #define FORWARDCHAR(eptr) */
+/* #define INTERNALCHAR(condition, eptr, action) */

-#else /* SUPPORT_UTF8 */
+#else /* SUPPORT_UTF */

+#ifdef COMPILE_PCRE8
+
/* These macros were originally written in the form of loops that used data
from the tables whose names start with PRIV(utf8_table). They were rewritten by
a user so as not to use loops, because in some environments this gives a
significant performance advantage, and it seems never to do any harm. */

+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) ((c) >= 0xc0)
+
+/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
+Otherwise it has an undefined behaviour. */
+
+#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
+
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. */

@@ -689,9 +704,109 @@

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--

-#endif /* SUPPORT_UTF8 */
+/* Same as above, just in the other direction. */
+#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++

+/* Same as above, but it allows a fully customizable form. */
+#define INTERNALCHAR(condition, eptr, action) \
+ while((condition) && ((eptr) & 0xc0) == 0x80) action

+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+
+/* Tests whether the code point needs extra characters to decode. */
+
+#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800)
+
+/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
+Otherwise it has an undefined behaviour. */
+
+#define GET_EXTRALEN(c) 1
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, not
+advancing the pointer. */
+
+#define GETUTF16(c, eptr) \
+ { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; }
+
+/* Get the next UTF-16 character, not advancing the pointer. This is called when
+we know we are in UTF-16 mode. */
+
+#define GETCHAR(c, eptr) \
+ c = *eptr; \
+ if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
+
+/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
+pointer. */
+
+#define GETCHARTEST(c, eptr) \
+ c = *eptr; \
+ if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
+the pointer. */
+
+#define GETUTF16INC(c, eptr) \
+ { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; }
+
+/* Get the next UTF-16 character, advancing the pointer. This is called when we
+know we are in UTF-16 mode. */
+
+#define GETCHARINC(c, eptr) \
+ c = *eptr++; \
+ if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
+
+/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-16 mode. */
+
+#define GETCHARINCTEST(c, eptr) \
+ c = *eptr++; \
+ if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
+
+/* Base macro to pick up the low surrogate of a UTF-16 character, not
+advancing the pointer, incrementing the length. */
+
+#define GETUTF16LEN(c, eptr, len) \
+ { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; }
+
+/* Get the next UTF-16 character, not advancing the pointer, incrementing
+length if there is a low surrogate. This is called when we know we are in
+UTF-16 mode. */
+
+#define GETCHARLEN(c, eptr, len) \
+ c = *eptr; \
+ if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
+
+/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
+pointer, incrementing length if there is a low surrogate. This is called when
+we do not know if we are in UTF-16 mode. */
+
+#define GETCHARLENTEST(c, eptr, len) \
+ c = *eptr; \
+ if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
+
+/* If the pointer is not at the start of a character, move it back until
+it is. This is called only in UTF-16 mode - we don't put a test within the
+macro because almost all calls are already within a block of UTF-16 only
+code. */
+
+#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr--
+
+/* Same as above, just in the other direction. */
+#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
+
+/* Same as above, but it allows a fully customizable form. */
+#define INTERNALCHAR(condition, eptr, action) \
+ if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
+
+#endif
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
+
+
/* In case there is no definition of offsetof() provided - though any proper
Standard C system should have one. */

@@ -2043,12 +2158,15 @@
but are not part of the PCRE public API. The data for these tables is in the
pcre_tables.c module. */

+#ifdef COMPILE_PCRE8
+
 extern const int            PRIV(utf8_table1)[];
+extern const int            PRIV(utf8_table1_size);
 extern const int            PRIV(utf8_table2)[];
 extern const int            PRIV(utf8_table3)[];
 extern const pcre_uint8     PRIV(utf8_table4)[];

-extern const int            PRIV(utf8_table1_size);
+#endif /* COMPILE_PCRE8 */

 extern const char           PRIV(utt_names)[];
 extern const ucp_type_table PRIV(utt)[];

Modified: code/branches/pcre16/pcre_jit_compile.c
===================================================================
--- code/branches/pcre16/pcre_jit_compile.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_jit_compile.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -302,9 +302,11 @@
 #ifdef SUPPORT_UCP
   BOOL useucp;
 #endif
-  jump_list *utf8readchar;
-  jump_list *utf8readtype8;
+  jump_list *utfreadchar;
+#ifdef COMPILE_PCRE8
+  jump_list *utfreadtype8;
 #endif
+#endif /* SUPPORT_UTF8 */
 #ifdef SUPPORT_UCP
   jump_list *getucd;
 #endif
@@ -543,8 +545,8 @@
   case OP_NOTPOSPLUSI:
   case OP_NOTPOSQUERYI:
   cc += 2;
-#ifdef SUPPORT_UTF8
-  if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
   return cc;

@@ -565,8 +567,8 @@
case OP_NOTEXACTI:
case OP_NOTPOSUPTOI:
cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
- if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
return cc;

@@ -1285,7 +1287,7 @@
static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c)
{
/* Returns with the othercase. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf && c > 127)
{
#ifdef SUPPORT_UCP
@@ -1302,11 +1304,11 @@
{
/* Detects if the character and its othercase has only 1 bit difference. */
unsigned int c, oc, bit;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF8 && defined COMPILE_PCRE8
int n;
#endif

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
GETCHAR(c, cc);
@@ -1324,11 +1326,11 @@
else
{
c = *cc;
- oc = common->fcc[c];
+ oc = TABLE_GET(c, common->fcc, c);
}
#else
c = *cc;
-oc = common->fcc[c];
+oc = TABLE_GET(c, common->fcc, c);
#endif

SLJIT_ASSERT(c != oc);
@@ -1342,10 +1344,12 @@
if (!ispowerof2(bit))
return 0;

+#ifdef COMPILE_PCRE8
+
 #ifdef SUPPORT_UTF8
 if (common->utf && c > 127)
   {
-  n = PRIV(utf8_table4)[*cc & 0x3f];
+  n = GET_EXTRALEN(*cc);
   while ((bit & 0x3f) == 0)
     {
     n--;
@@ -1353,8 +1357,25 @@
     }
   return (n << 8) | bit;
   }
-#endif
+#endif /* SUPPORT_UTF8 */
 return (0 << 8) | bit;
+
+#else /* COMPILE_PCRE8 */
+
+#ifdef COMPILE_PCRE16
+#ifdef SUPPORT_UTF16
+if (common->utf && c > 65535)
+  {
+  if (bit >= (1 << 10))
+    bit >>= 10;
+  else
+    return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+  }
+#endif /* SUPPORT_UTF16 */
+return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
 }

static SLJIT_INLINE void check_input_end(compiler_common *common, jump_list **fallbacks)
@@ -1368,16 +1389,22 @@
/* Reads the character into TMP1, updates STR_PTR.
Does not check STR_END. TMP2 Destroyed. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
struct sljit_jump *jump;
#endif

OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
+#ifdef COMPILE_PCRE8
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
- add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
+#else
+#ifdef COMPILE_PCRE16
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+#endif
+#endif /* COMPILE_PCRE8 */
+ add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
}
#endif
@@ -1389,16 +1416,22 @@
/* Reads the character into TMP1, keeps STR_PTR.
Does not check STR_END. TMP2 Destroyed. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
struct sljit_jump *jump;
#endif

OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
+#ifdef COMPILE_PCRE8
jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
- add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
+#else
+#ifdef COMPILE_PCRE16
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+#endif
+#endif /* COMPILE_PCRE8 */
+ add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
JUMPHERE(jump);
}
@@ -1409,46 +1442,83 @@
{
/* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
struct sljit_jump *jump;
#endif

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (common->utf)
{
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#ifdef COMPILE_PCRE8
/* This can be an extra read in some situations, but hopefully
- it is a clever early read in most cases. */
+ it is needed in most cases. */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
- add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL));
+ add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
+#else
+#ifdef COMPILE_PCRE16
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+ JUMPHERE(jump);
+ /* Skip low surrogate if necessary. */
+ OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+#endif
+#endif /* COMPILE_PCRE8 */
return;
}
#endif
-OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
+#ifdef COMPILE_PCRE16
+/* The ctypes array contains only 255 values. */
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+#endif
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+#ifdef COMPILE_PCRE16
+JUMPHERE(jump);
+#endif
}

static void skip_char_back(compiler_common *common)
{
/* Goes one character back. Only affects STR_PTR. Does not check begin. */
DEFINE_COMPILER;
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
struct sljit_label *label;

if (common->utf)
{
label = LABEL();
- OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label);
return;
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ /* Skip low surrogate if necessary. */
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ return;
+ }
+#endif
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
}

@@ -1477,10 +1547,12 @@
}
}

-#ifdef SUPPORT_UTF8
-static void do_utf8readchar(compiler_common *common)
+#ifdef SUPPORT_UTF
+
+#ifdef COMPILE_PCRE8
+static void do_utfreadchar(compiler_common *common)
{
-/* Fast decoding an utf8 character. TMP1 contains the first byte
+/* Fast decoding a UTF-8 character. TMP1 contains the first byte
of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */
DEFINE_COMPILER;
struct sljit_jump *jump;
@@ -1489,82 +1561,57 @@
/* Searching for the first zero. */
OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 2 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1f);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);

OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10);
jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 3 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+/* Three byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0f);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 12);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 2);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 2);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);

-OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x08);
-jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 4 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+/* Four byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x07);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 18);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 3);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(3));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 3);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
-JUMPHERE(jump);
-
-/* 5 byte sequence */
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x03);
-OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 24);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 4);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 4);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 4);
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}

-static void do_utf8readtype8(compiler_common *common)
+static void do_utfreadtype8(compiler_common *common)
{
-/* Fast decoding an utf8 character type. TMP2 contains the first byte
-of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */
+/* Fast decoding a UTF-8 character type. TMP2 contains the first byte
+of the character (>= 0xc0). Return value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_jump *compare;
@@ -1573,9 +1620,9 @@

OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20);
jump = JUMP(SLJIT_C_NOT_ZERO);
-/* 2 byte sequence */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
@@ -1596,8 +1643,39 @@
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}

-#endif
+#else /* COMPILE_PCRE8 */

+#ifdef COMPILE_PCRE16
+static void do_utfreadchar(compiler_common *common)
+{
+/* Fast decoding a UTF-16 character. TMP1 contains the first 16 bit char
+of the character (>= 0xd800). Return char value in TMP1, length - 1 in TMP2. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xdc00);
+/* Do nothing, only return. */
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+/* Combine two 16 bit characters. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3ff);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
+
#ifdef SUPPORT_UCP

/* UCD_BLOCK_SIZE must be 128 (see the assert below). */
@@ -1634,8 +1712,8 @@
struct sljit_jump *start;
struct sljit_jump *end = NULL;
struct sljit_jump *nl = NULL;
-#ifdef SUPPORT_UTF8
-struct sljit_jump *singlebyte;
+#ifdef SUPPORT_UTF
+struct sljit_jump *singlechar;
#endif
jump_list *newline = NULL;
BOOL newlinecheck = FALSE;
@@ -1708,15 +1786,27 @@
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (common->utf)
{
- singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
- JUMPHERE(singlebyte);
+ JUMPHERE(singlechar);
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(singlechar);
+ }
+#endif
JUMPHERE(start);

if (newlinecheck)
@@ -1770,7 +1860,7 @@
}

OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
@@ -1778,6 +1868,17 @@
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ }
+#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
JUMPHERE(leave);
@@ -1900,7 +2001,7 @@
OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (common->utf)
{
CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
@@ -1908,6 +2009,17 @@
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
#endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE16
+if (common->utf)
+ {
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start);
+ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ }
+#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
JUMPHERE(leave);
@@ -2335,10 +2447,10 @@
context->sourcereg = TMP2;
}

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
utflength = 1;
-if (common->utf && *cc >= 0xc0)
- utflength += PRIV(utf8_table4)[*cc & 0x3f];
+if (common->utf && HAS_EXTRALEN(*cc))
+ utflength += GET_EXTRALEN(*cc);

 do
   {
@@ -2523,8 +2635,8 @@
   if (*cc == XCL_SINGLE)
     {
     cc += 2;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2533,12 +2645,12 @@
   else if (*cc == XCL_RANGE)
     {
     cc += 2;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     cc++;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
 #ifdef SUPPORT_UCP
     needschar = TRUE;
@@ -2875,24 +2987,35 @@

   case OP_ALLANY:
   check_input_end(common, fallbacks);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   if (common->utf)
     {
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#ifdef COMPILE_PCRE8
     jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
     OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+#else /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+    jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+    OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+    COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+    OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+#endif /* COMPILE_PCRE16 */
+#endif /* COMPILE_PCRE8 */
     JUMPHERE(jump[0]);
     return cc;
     }
 #endif
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   return cc;

case OP_ANYBYTE:
check_input_end(common, fallbacks);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
return cc;

 #ifdef SUPPORT_UTF8
@@ -3095,8 +3218,8 @@
   case OP_CHAR:
   case OP_CHARI:
   length = 1;
-#ifdef SUPPORT_UTF8
-  if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
 #endif
   if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
     {
@@ -3129,11 +3252,11 @@

   case OP_NOT:
   case OP_NOTI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   if (common->utf)
     {
     length = 1;
-    if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f];
+    if (HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);

     check_input_end(common, fallbacks);
     GETCHAR(c, cc);
@@ -3152,7 +3275,9 @@
       /* Skip the variable-length character. */
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
       jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+#ifdef COMPILE_PCRE8
       OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0);
+#endif
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
       JUMPHERE(jump[0]);
       return cc + length;
@@ -3268,21 +3393,21 @@
   if (*cc == OP_CHAR)
     {
     size = 1;
-#ifdef SUPPORT_UTF8
-    if (common->utf && cc[1] >= 0xc0)
-      size += PRIV(utf8_table4)[cc[1] & 0x3f];
+#ifdef SUPPORT_UTF
+    if (common->utf && HAS_EXTRALEN(cc[1]))
+      size += GET_EXTRALEN(cc[1]);
 #endif
     }
   else if (*cc == OP_CHARI)
     {
     size = 1;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (common->utf)
       {
       if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
         size = 0;
-      else if (cc[1] >= 0xc0)
-        size += PRIV(utf8_table4)[cc[1] & 0x3f];
+      else if (HAS_EXTRALEN(cc[1]))
+        size += GET_EXTRALEN(cc[1]);
       }
     else
 #endif
@@ -4786,8 +4911,8 @@
 if (end != NULL)
   {
   *end = cc + 1;
-#ifdef SUPPORT_UTF8
-  if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f];
+#ifdef SUPPORT_UTF
+  if (common->utf && HAS_EXTRALEN(*cc)) *end += GET_EXTRALEN(*cc);
 #endif
   }
 return cc;
@@ -6259,9 +6384,11 @@
 #ifdef SUPPORT_UCP
 common->useucp = (re->options & PCRE_UCP) != 0;
 #endif
-common->utf8readchar = NULL;
-common->utf8readtype8 = NULL;
+common->utfreadchar = NULL;
+#ifdef COMPILE_PCRE8
+common->utfreadtype8 = NULL;
 #endif
+#endif /* SUPPORT_UTF8 */
 #ifdef SUPPORT_UCP
 common->getucd = NULL;
 #endif
@@ -6487,18 +6614,20 @@
   set_jumps(common->caselesscmp, LABEL());
   do_caselesscmp(common);
   }
-#ifdef SUPPORT_UTF8
-if (common->utf8readchar != NULL)
+#ifdef SUPPORT_UTF
+if (common->utfreadchar != NULL)
   {
-  set_jumps(common->utf8readchar, LABEL());
-  do_utf8readchar(common);
+  set_jumps(common->utfreadchar, LABEL());
+  do_utfreadchar(common);
   }
-if (common->utf8readtype8 != NULL)
+#ifdef COMPILE_PCRE8
+if (common->utfreadtype8 != NULL)
   {
-  set_jumps(common->utf8readtype8, LABEL());
-  do_utf8readtype8(common);
+  set_jumps(common->utfreadtype8, LABEL());
+  do_utfreadtype8(common);
   }
 #endif
+#endif /* COMPILE_PCRE8 */
 #ifdef SUPPORT_UCP
 if (common->getucd != NULL)
   {

Modified: code/branches/pcre16/pcre_printint.src
===================================================================
--- code/branches/pcre16/pcre_printint.src    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_printint.src    2011-12-03 23:58:37 UTC (rev 782)
@@ -72,17 +72,20 @@
 *************************************************/

static int
-print_char(FILE *f, pcre_uchar *ptr, BOOL utf8)
+print_char(FILE *f, pcre_uchar *ptr, BOOL utf)
{
int c = *ptr;

-#ifndef SUPPORT_UTF8
-(void)utf8; /* Avoid compiler warning */
+#ifndef SUPPORT_UTF
+(void)utf; /* Avoid compiler warning */
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
return 0;

 #else
-if (!utf8 || (c & 0xc0) != 0xc0)
+
+#ifdef COMPILE_PCRE8
+
+if (!utf || (c & 0xc0) != 0xc0)
   {
   if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
   return 0;
@@ -110,14 +113,45 @@
     s -= 6;
     c |= (ptr[i] & 0x3f) << s;
     }
-  if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
+  fprintf(f, "\\x{%x}", c);
   return a;
   }
-#endif
+
+#else
+
+#ifdef COMPILE_PCRE16
+
+if (!utf || (c & 0xfc00) != 0xd800)
+  {
+  if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+  return 0;
+  }
+else
+  {
+  /* This is a check for malformed UTF-16; it should only occur if the sanity
+  check has been turned off. Rather than swallow a low surrogate, just stop if
+  we hit a bad one. Print it with \X instead of \x as an indication. */
+
+  if ((ptr[1] & 0xfc00) != 0xdc00)
+    {
+    fprintf(f, "\\X{%x}", c);
+    return 0;
+    }
+
+  c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
+  fprintf(f, "\\x{%x}", c);
+  return 1;
+  }
+
+#endif /* COMPILE_PCRE16 */
+
+#endif /* COMPILE_PCRE8 */
+
+#endif /* SUPPORT_UTF */
 }

 /*************************************************
-*  Print uchar string (regardless of utf8)       *
+*  Print uchar string (regardless of utf)        *
 *************************************************/

static void
@@ -168,7 +202,7 @@
{
real_pcre *re = (real_pcre *)external_re;
pcre_uchar *codestart, *code;
-BOOL utf8;
+BOOL utf;

unsigned int options = re->options;
int offset = re->name_table_offset;
@@ -187,7 +221,8 @@
}

code = codestart = (pcre_uchar *)re + offset + count * size;
-utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+utf = (options & PCRE_UTF8) != 0;

 for(;;)
   {
@@ -232,7 +267,7 @@
     do
       {
       code++;
-      code += 1 + print_char(f, code, utf8);
+      code += 1 + print_char(f, code, utf);
       }
     while (*code == OP_CHAR);
     fprintf(f, "\n");
@@ -243,7 +278,7 @@
     do
       {
       code++;
-      code += 1 + print_char(f, code, utf8);
+      code += 1 + print_char(f, code, utf);
       }
     while (*code == OP_CHARI);
     fprintf(f, "\n");
@@ -349,7 +384,7 @@
         extra = 2;
         }
       }
-    else extra = print_char(f, code+1, utf8);
+    else extra = print_char(f, code+1, utf);
     fprintf(f, "%s", OP_names[*code]);
     break;

@@ -364,7 +399,7 @@
     case OP_MINUPTO:
     case OP_POSUPTO:
     fprintf(f, " %s ", flag);
-    extra = print_char(f, code + 1 + IMM2_SIZE, utf8);
+    extra = print_char(f, code + 1 + IMM2_SIZE, utf);
     fprintf(f, "{");
     if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
     fprintf(f, "%d}", GET2(code,1));
@@ -557,7 +592,7 @@
           }
         }

-      /* Indicate a non-UTF8 class which was created by negation */
+      /* Indicate a non-UTF class which was created by negation */

       fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");

Modified: code/branches/pcre16/pcre_study.c
===================================================================
--- code/branches/pcre16/pcre_study.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_study.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -225,7 +225,7 @@
     branchlength++;
     cc += 2;
 #ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;

@@ -246,7 +246,7 @@
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
 #ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;

@@ -487,7 +487,7 @@

     cc += PRIV(OP_lengths)[op];
 #ifdef SUPPORT_UTF8
-    if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f];
+    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;

Modified: code/branches/pcre16/pcre_tables.c
===================================================================
--- code/branches/pcre16/pcre_tables.c    2011-12-03 07:58:30 UTC (rev 781)
+++ code/branches/pcre16/pcre_tables.c    2011-12-03 23:58:37 UTC (rev 782)
@@ -65,8 +65,10 @@
 /* These are the breakpoints for different numbers of bytes in a UTF-8
 character. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF

+#ifdef COMPILE_PCRE8
+
const int PRIV(utf8_table1)[] =
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

@@ -87,6 +89,8 @@
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };

+#endif /* COMPILE_PCRE8 */
+
/* Table to translate from particular type value to the general value. */

const int PRIV(ucp_gentype)[] = {
@@ -554,6 +558,6 @@

const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */

/* End of pcre_tables.c */

This message is part of the following thread:
	the complete thread tree sorted by date

[Pcre-svn] [782] code/branches/pcre16: Start working on UTF-…