[Pcre-svn] [785] code/branches/pcre16: Improving UTF-16 support by fixing a lot of issues.

Author: Subversion repository
Date:
To: pcre-svn
Subject: [Pcre-svn] [785] code/branches/pcre16: Improving UTF-16 support by fixing a lot of issues.

Revision: 785

          http://vcs.pcre.org/viewvc?view=rev&revision=785
Author:   zherczeg
Date:     2011-12-05 20:12:24 +0000 (Mon, 05 Dec 2011)

Log Message:
-----------
Improving UTF-16 support by fixing a lot of issues.

Modified Paths:
--------------
    code/branches/pcre16/Makefile.am
    code/branches/pcre16/pcre.h.in
    code/branches/pcre16/pcre_compile.c
    code/branches/pcre16/pcre_dfa_exec.c
    code/branches/pcre16/pcre_exec.c
    code/branches/pcre16/pcre_fullinfo.c
    code/branches/pcre16/pcre_info.c
    code/branches/pcre16/pcre_internal.h
    code/branches/pcre16/pcre_jit_compile.c
    code/branches/pcre16/pcre_newline.c
    code/branches/pcre16/pcre_printint.src
    code/branches/pcre16/pcre_study.c
    code/branches/pcre16/pcre_version.c

Added Paths:
-----------
    code/branches/pcre16/pcre16_fullinfo.c
    code/branches/pcre16/pcre16_info.c
    code/branches/pcre16/pcre16_version.c

Modified: code/branches/pcre16/Makefile.am
===================================================================
--- code/branches/pcre16/Makefile.am    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/Makefile.am    2011-12-05 20:12:24 UTC (rev 785)
@@ -212,6 +212,8 @@
   pcre16_chartables.c \
   pcre16_compile.c \
   pcre16_exec.c \
+  pcre16_fullinfo.c \
+  pcre16_info.c \
   pcre16_jit_compile.c \
   pcre16_newline.c \
   pcre16_ord2utf16.c \
@@ -222,6 +224,7 @@
   pcre16_ucd.c \
   pcre16_utf16_utils.c \
   pcre16_valid_utf16.c \
+  pcre16_version.c \
   pcre16_xclass.c

## This file is generated as part of the building process, so don't distribute.

Modified: code/branches/pcre16/pcre.h.in
===================================================================
--- code/branches/pcre16/pcre.h.in    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre.h.in    2011-12-05 20:12:24 UTC (rev 785)
@@ -367,6 +367,8 @@
 PCRE_EXP_DECL void pcre_free_substring_list(const char **);
 PCRE_EXP_DECL int  pcre_fullinfo(const pcre *, const pcre_extra *, int,
                   void *);
+PCRE_EXP_DECL int  pcre16_fullinfo(const pcre *, const pcre_extra *, int,
+                  void *);
 PCRE_EXP_DECL int  pcre_get_named_substring(const pcre *, const char *,
                   int *, int, const char *, const char **);
 PCRE_EXP_DECL int  pcre_get_stringnumber(const pcre *, const char *);
@@ -377,16 +379,20 @@
 PCRE_EXP_DECL int  pcre_get_substring_list(const char *, int *, int,
                   const char ***);
 PCRE_EXP_DECL int  pcre_info(const pcre *, int *, int *);
+PCRE_EXP_DECL int  pcre16_info(const pcre *, int *, int *);
 PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
 PCRE_EXP_DECL int  pcre_refcount(pcre *, int);
-PCRE_EXP_DECL int  pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
-                  PCRE_SPTR16, int, int);
 PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
 PCRE_EXP_DECL pcre_extra *pcre16_study(const pcre *, int, const char **);
 PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
 PCRE_EXP_DECL void pcre16_free_study(pcre_extra *);
 PCRE_EXP_DECL const char *pcre_version(void);
+PCRE_EXP_DECL const char *pcre16_version(void);

+/* Utility functions. */
+PCRE_EXP_DECL int  pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
+                  PCRE_SPTR16, int, int);
+
 /* JIT compiler related functions. */

PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int);

Added: code/branches/pcre16/pcre16_fullinfo.c
===================================================================
--- code/branches/pcre16/pcre16_fullinfo.c                            (rev 0)
+++ code/branches/pcre16/pcre16_fullinfo.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_fullinfo.c"
+
+/* End of pcre16_fullinfo.c */

Added: code/branches/pcre16/pcre16_info.c
===================================================================
--- code/branches/pcre16/pcre16_info.c                            (rev 0)
+++ code/branches/pcre16/pcre16_info.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_info.c"
+
+/* End of pcre16_info.c */

Added: code/branches/pcre16/pcre16_version.c
===================================================================
--- code/branches/pcre16/pcre16_version.c                            (rev 0)
+++ code/branches/pcre16/pcre16_version.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_version.c"
+
+/* End of pcre16_version.c */

Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_compile.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -102,6 +102,10 @@
 #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
 #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */

+/* Repeated character flags. */
+
+#define UTF_LENGTH     0x10000000l      /* The char contains its length. */
+
 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 are simple data values; negative values are for special things like \d and so
 on. Zero means further processing is needed (for things like \x), or the escape
@@ -2896,7 +2900,7 @@
 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
   const pcre_uchar *ptr, int options, compile_data *cd)
 {
-int c, next;
+pcre_int32 c, next;
 int op_code = *previous++;

 /* Skip whitespace and comments in extended mode */
@@ -2932,15 +2936,13 @@
   if (temperrorcode != 0) return FALSE;
   ptr++;    /* Point after the escape sequence */
   }
-
-else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
+else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
   {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   if (utf) { GETCHARINC(next, ptr); } else
 #endif
   next = *ptr++;
   }
-
 else return FALSE;

/* Skip whitespace and comments in extended mode */
@@ -4603,20 +4605,25 @@

       /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to
-      hold the length of the character in bytes, plus 0x80 to flag that it's a
-      length rather than a small character. */
+      hold the length of the character in bytes, plus UTF_LENGTH to flag that
+      it's a length rather than a small character. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
+#ifdef COMPILE_PCRE8
       if (utf && (code[-1] & 0x80) != 0)
+#endif /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+      if (utf && (code[-1] & 0xfc00) == 0xdc00)
+#endif /* COMPILE_PCRE8 */
         {
         pcre_uchar *lastchar = code - 1;
         BACKCHAR(lastchar);
         c = code - lastchar;            /* Length of UTF-8 character */
         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
-        c |= 0x80;                      /* Flag c as a length */
+        c |= UTF_LENGTH;                /* Flag c as a length */
         }
       else
-#endif
+#endif /* SUPPORT_UTF */

       /* Handle the case of a single charater - either with no UTF support, or
       with UTF disabled, or for a single character UTF character. */
@@ -4758,14 +4765,14 @@
         we have to insert the character for the previous code. For a repeated
         Unicode property match, there are two extra bytes that define the
         required property. In UTF-8 mode, long characters have their length in
-        c, with the 0x80 bit as a flag. */
+        c, with the UTF_LENGTH bit as a flag. */

         if (repeat_max < 0)
           {
-#ifdef SUPPORT_UTF8
-          if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+          if (utf && (c & UTF_LENGTH) != 0)
             {
-            memcpy(code, utf_chars, c & 7);
+            memcpy(code, utf_chars, IN_UCHARS(c & 7));
             code += c & 7;
             }
           else
@@ -4787,10 +4794,10 @@

         else if (repeat_max != repeat_min)
           {
-#ifdef SUPPORT_UTF8
-          if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+          if (utf && (c & UTF_LENGTH) != 0)
             {
-            memcpy(code, utf_chars, c & 7);
+            memcpy(code, utf_chars, IN_UCHARS(c & 7));
             code += c & 7;
             }
           else
@@ -4817,10 +4824,10 @@

       /* The character or character type itself comes last in all cases. */

-#ifdef SUPPORT_UTF8
-      if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+      if (utf && (c & UTF_LENGTH) != 0)
         {
-        memcpy(code, utf_chars, c & 7);
+        memcpy(code, utf_chars, IN_UCHARS(c & 7));
         code += c & 7;
         }
       else
@@ -6661,9 +6668,7 @@

 #ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(c))
-      {
-      INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
-      }
+      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
 #endif

     /* At this point we have the character's bytes in mcbuffer, and the length
@@ -7789,9 +7794,27 @@
       re->first_char = firstchar & 0xffff;
 #endif
 #endif
-      if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)
-        && cd->fcc[re->first_char] != re->first_char)
-        re->flags |= PCRE_FCH_CASELESS;
+      if ((firstchar & REQ_CASELESS) != 0)
+        {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+        /* We ignore non-ASCII first chars in 8 bit mode. */
+        if (utf)
+          {
+          if (re->first_char < 128)
+            {
+            if (cd->fcc[re->first_char] != re->first_char)
+              re->flags |= PCRE_FCH_CASELESS;
+            }
+          else if ((options & PCRE_UCP) != 0
+              && UCD_OTHERCASE(re->first_char) != re->first_char)
+            re->flags |= PCRE_FCH_CASELESS;
+          }
+        else
+#endif
+        if (MAX_255(re->first_char)
+            && cd->fcc[re->first_char] != re->first_char)
+          re->flags |= PCRE_FCH_CASELESS;
+        }

       re->flags |= PCRE_FIRSTSET;
       }
@@ -7814,9 +7837,26 @@
   re->req_char = reqchar & 0xffff;
 #endif
 #endif
-  if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)
-    && cd->fcc[re->req_char] != re->req_char)
-    re->flags |= PCRE_RCH_CASELESS;
+  if ((reqchar & REQ_CASELESS) != 0)
+    {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+    /* We ignore non-ASCII first chars in 8 bit mode. */
+    if (utf)
+      {
+      if (re->first_char < 128)
+        {
+        if (cd->fcc[re->first_char] != re->first_char)
+          re->flags |= PCRE_RCH_CASELESS;
+        }
+      else if ((options & PCRE_UCP) != 0
+          && UCD_OTHERCASE(re->first_char) != re->first_char)
+        re->flags |= PCRE_RCH_CASELESS;
+      }
+    else
+#endif
+    if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
+      re->flags |= PCRE_RCH_CASELESS;
+    }

re->flags |= PCRE_REQCHSET;
}

Modified: code/branches/pcre16/pcre_dfa_exec.c
===================================================================
--- code/branches/pcre16/pcre_dfa_exec.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_dfa_exec.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -480,7 +480,7 @@
       {
       if (current_subject <= start_subject) break;
       current_subject--;
-      INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
+      ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
       }
     }
   else
@@ -3199,7 +3199,13 @@
     has_first_char = TRUE;
     first_char = first_char2 = re->first_char;
     if ((re->flags & PCRE_FCH_CASELESS) != 0)
+      {
       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+      if (first_char > 127 && utf && md->use_ucp)
+        first_char2 = UCD_OTHERCASE(first_char);
+#endif
+      }
     }
   else
     {
@@ -3217,7 +3223,13 @@
   has_req_char = TRUE;
   req_char = req_char2 = re->req_char;
   if ((re->flags & PCRE_RCH_CASELESS) != 0)
+    {
     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+    if (req_char > 127 && utf && md->use_ucp)
+      req_char2 = UCD_OTHERCASE(req_char);
+#endif
+    }
   }

 /* Call the main matching function, looping for a non-anchored regex after a
@@ -3246,7 +3258,7 @@
         while (t < md->end_subject && !IS_NEWLINE(t))
           {
           t++;
-          INTERNALCHAR(t < end_subject, *t, t++);
+          ACROSSCHAR(t < end_subject, *t, t++);
           }
         }
       else
@@ -3290,7 +3302,7 @@
                    !WAS_NEWLINE(current_subject))
               {
               current_subject++;
-              INTERNALCHAR(current_subject < end_subject, *current_subject,
+              ACROSSCHAR(current_subject < end_subject, *current_subject,
                 current_subject++);
               }
             }
@@ -3318,12 +3330,17 @@
         while (current_subject < end_subject)
           {
           register unsigned int c = *current_subject;
+#ifndef COMPILE_PCRE8
+          if (c > 255) c = 255;
+#endif
           if ((start_bits[c/8] & (1 << (c&7))) == 0)
             {
             current_subject++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+            /* In non 8-bit mode, the iteration will stop for
+            characters > 255 at the beginning or not stop at all. */
             if (utf)
-              INTERNALCHAR(current_subject < end_subject, *current_subject,
+              ACROSSCHAR(current_subject < end_subject, *current_subject,
                 current_subject++);
 #endif
             }
@@ -3434,7 +3451,7 @@
 #ifdef SUPPORT_UTF
   if (utf)
     {
-    INTERNALCHAR(current_subject < end_subject, *current_subject,
+    ACROSSCHAR(current_subject < end_subject, *current_subject,
       current_subject++);
     }
 #endif

Modified: code/branches/pcre16/pcre_exec.c
===================================================================
--- code/branches/pcre16/pcre_exec.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_exec.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -2069,7 +2069,7 @@
       be "non-word" characters. Remember the earliest consulted character for
       partial matching. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         /* Get status of previous character */
@@ -2190,7 +2190,7 @@
       }
     eptr++;
 #ifdef SUPPORT_UTF
-    if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+    if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
 #endif
     ecode++;
     break;
@@ -3066,7 +3066,7 @@
     /* Match a single character, caselessly */

     case OP_CHARI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       length = 1;
@@ -4089,7 +4089,7 @@
             }
           if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -4102,7 +4102,7 @@
             MRRETURN(MATCH_NOMATCH);
             }
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -4301,7 +4301,7 @@
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
             MRRETURN(MATCH_NOMATCH);
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -4330,7 +4330,7 @@
           if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
             MRRETURN(MATCH_NOMATCH);
           eptr++;
-          INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+          ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
           }
         break;

@@ -5330,7 +5330,7 @@
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+              ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }

@@ -5347,7 +5347,7 @@
                 }
               if (IS_NEWLINE(eptr)) break;
               eptr++;
-              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+              ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           break;
@@ -5363,7 +5363,7 @@
                 break;
                 }
               eptr++;
-              INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+              ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
               }
             }
           else
@@ -6264,7 +6264,13 @@
     has_first_char = TRUE;
     first_char = first_char2 = re->first_char;
     if ((re->flags & PCRE_FCH_CASELESS) != 0)
+      {
       first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+      if (first_char > 127 && utf && md->use_ucp)
+        first_char2 = UCD_OTHERCASE(first_char);
+#endif
+      }
     }
   else
     if (!startline && study != NULL &&
@@ -6280,7 +6286,13 @@
   has_req_char = TRUE;
   req_char = req_char2 = re->req_char;
   if ((re->flags & PCRE_RCH_CASELESS) != 0)
+    {
     req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+    if (req_char > 127 && utf && md->use_ucp)
+      req_char2 = UCD_OTHERCASE(req_char);
+#endif
+    }
   }

@@ -6309,7 +6321,7 @@
       while (t < md->end_subject && !IS_NEWLINE(t))
         {
         t++;
-        INTERNALCHAR(t < end_subject, *t, t++);
+        ACROSSCHAR(t < end_subject, *t, t++);
         }
       }
     else
@@ -6351,7 +6363,7 @@
           while (start_match < end_subject && !WAS_NEWLINE(start_match))
             {
             start_match++;
-            INTERNALCHAR(start_match < end_subject, *start_match,
+            ACROSSCHAR(start_match < end_subject, *start_match,
               start_match++);
             }
           }
@@ -6378,17 +6390,18 @@
       {
       while (start_match < end_subject)
         {
-#ifdef COMPILE_PCRE
         register unsigned int c = *start_match;
-#else
-        register unsigned int c = *start_match & 0xff;
+#ifndef COMPILE_PCRE8
+        if (c > 255) c = 255;
 #endif
         if ((start_bits[c/8] & (1 << (c&7))) == 0)
           {
           start_match++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+          /* In non 8-bit mode, the iteration will stop for
+          characters > 255 at the beginning or not stop at all. */
           if (utf)
-            INTERNALCHAR(start_match < end_subject, *start_match,
+            ACROSSCHAR(start_match < end_subject, *start_match,
               start_match++);
 #endif
           }
@@ -6520,7 +6533,7 @@
     new_start_match = start_match + 1;
 #ifdef SUPPORT_UTF
     if (utf)
-      INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+      ACROSSCHAR(new_start_match < end_subject, *new_start_match,
         new_start_match++);
 #endif
     break;

Modified: code/branches/pcre16/pcre_fullinfo.c
===================================================================
--- code/branches/pcre16/pcre_fullinfo.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_fullinfo.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -65,9 +65,15 @@
 Returns:           0 if data returned, negative on error
 */

+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
void *where)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
+ void *where)
+#endif
{
real_pcre internal_re;
pcre_study_data internal_study;

Modified: code/branches/pcre16/pcre_info.c
===================================================================
--- code/branches/pcre16/pcre_info.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_info.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -72,8 +72,13 @@
                 or negative values on error
 */

+#ifdef COMPILE_PCRE8
 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
+pcre_info(const pcre *argument_re, int *optptr, int *first_char)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_info(const pcre *argument_re, int *optptr, int *first_char)
+#endif
 {
 real_pcre internal_re;
 const real_pcre *re = (const real_pcre *)argument_re;
@@ -84,8 +89,8 @@
   if (re == NULL) return PCRE_ERROR_BADMAGIC;
   }
 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
-if (first_byte != NULL)
-  *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
+if (first_char != NULL)
+  *first_char = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
      ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
 return re->top_bracket;
 }

Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_internal.h    2011-12-05 20:12:24 UTC (rev 785)
@@ -542,7 +542,7 @@
 /* #define GETCHARLENTEST(c, eptr, len) */
 /* #define BACKCHAR(eptr) */
 /* #define FORWARDCHAR(eptr) */
-/* #define INTERNALCHAR(condition, eptr, action) */
+/* #define ACROSSCHAR(condition, eptr, action) */

#else /* SUPPORT_UTF */

@@ -708,7 +708,7 @@
#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++

/* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
while((condition) && ((eptr) & 0xc0) == 0x80) action

#else /* COMPILE_PCRE8 */
@@ -748,7 +748,7 @@
the pointer. */

#define GETUTF16INC(c, eptr) \
- { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; }
+ { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */
@@ -797,7 +797,7 @@
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++

/* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action

#endif

Modified: code/branches/pcre16/pcre_jit_compile.c
===================================================================
--- code/branches/pcre16/pcre_jit_compile.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_jit_compile.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -300,7 +300,7 @@
 #ifdef SUPPORT_UTF8
   BOOL utf;
 #ifdef SUPPORT_UCP
-  BOOL useucp;
+  BOOL use_ucp;
 #endif
   jump_list *utfreadchar;
 #ifdef COMPILE_PCRE8
@@ -390,10 +390,12 @@
 #define PRIV_DATA(cc)    (common->localptrs[(cc) - common->start])

 #ifdef COMPILE_PCRE8
-#define MOV_UCHAR SLJIT_MOV_UB
+#define MOV_UCHAR  SLJIT_MOV_UB
+#define MOVU_UCHAR SLJIT_MOVU_UB
 #else
 #ifdef COMPILE_PCRE16
-#define MOV_UCHAR SLJIT_MOV_UH
+#define MOV_UCHAR  SLJIT_MOV_UH
+#define MOVU_UCHAR SLJIT_MOVU_UH
 #else
 #error Unsupported compiling mode
 #endif
@@ -1369,10 +1371,10 @@
   if (bit >= (1 << 10))
     bit >>= 10;
   else
-    return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+    return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
   }
 #endif /* SUPPORT_UTF16 */
-return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+return (bit < 256) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
 #endif /* COMPILE_PCRE16 */

#endif /* COMPILE_PCRE8 */
@@ -1420,7 +1422,7 @@
struct sljit_jump *jump;
#endif

-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF
if (common->utf)
{
@@ -1461,7 +1463,7 @@
#else
#ifdef COMPILE_PCRE16
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
- jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
JUMPHERE(jump);
/* Skip low surrogate if necessary. */
@@ -1478,9 +1480,9 @@
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef COMPILE_PCRE16
-/* The ctypes array contains only 255 values. */
+/* The ctypes array contains only 256 values. */
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
-jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
#endif
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
#ifdef COMPILE_PCRE16
@@ -1542,7 +1544,7 @@
}
else
{
- SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline <= 255);
+ SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
add_jump(compiler, fallbacks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
}
@@ -1660,7 +1662,7 @@

JUMPHERE(jump);
/* Combine two 16 bit characters. */
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
@@ -1818,7 +1820,7 @@
return mainloop;
}

-static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar firstchar, BOOL caseless, BOOL firstline)
+static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline)
{
DEFINE_COMPILER;
struct sljit_label *start;
@@ -1836,22 +1838,28 @@
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);

-oc = firstchar;
+oc = first_char;
 if (caseless)
-  oc = TABLE_GET(firstchar, common->fcc, firstchar);
-if (firstchar == oc)
-  found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, firstchar);
+  {
+  oc = TABLE_GET(first_char, common->fcc, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+  if (first_char > 127 && common->utf && common->use_ucp)
+    oc = UCD_OTHERCASE(first_char);
+#endif
+  }
+if (first_char == oc)
+  found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, first_char);
 else
   {
-  bit = firstchar ^ oc;
+  bit = first_char ^ oc;
   if (ispowerof2(bit))
     {
     OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit);
-    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, firstchar | bit);
+    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit);
     }
   else
     {
-    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, firstchar);
+    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char);
     COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
     OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
     COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
@@ -1912,16 +1920,19 @@
   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
   firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0);

- OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_GREATER_EQUAL);
+#ifdef COMPILE_PCRE16
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+#endif
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);

loop = LABEL();
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop);
CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop);

@@ -1952,9 +1963,12 @@
leave = JUMP(SLJIT_JUMP);
JUMPHERE(foundcr);
notfoundnl = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+#ifdef COMPILE_PCRE16
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
JUMPHERE(notfoundnl);
JUMPHERE(leave);
@@ -1972,6 +1986,9 @@
struct sljit_label *start;
struct sljit_jump *leave;
struct sljit_jump *found;
+#ifndef COMPILE_PCRE8
+struct sljit_jump *jump;
+#endif

if (firstline)
{
@@ -1987,7 +2004,9 @@
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
#ifndef COMPILE_PCRE8
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 255);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 255);
+JUMPHERE(jump);
#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -2028,7 +2047,7 @@
OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0);
}

-static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar reqchar, BOOL caseless, BOOL has_firstchar)
+static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar req_char, BOOL caseless, BOOL has_firstchar)
{
DEFINE_COMPILER;
struct sljit_label *loop;
@@ -2045,34 +2064,40 @@
alreadyfound = CMP(SLJIT_C_LESS, STR_PTR, 0, TMP2, 0);

if (has_firstchar)
- OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
else
OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0);

loop = LABEL();
notfound = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, STR_END, 0);

-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), 0);
-oc = reqchar;
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(TMP1), 0);
+oc = req_char;
 if (caseless)
-  oc = TABLE_GET(reqchar, common->fcc, reqchar);
-if (reqchar == oc)
-  found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+  {
+  oc = TABLE_GET(req_char, common->fcc, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+  if (req_char > 127 && common->utf && common->use_ucp)
+    oc = UCD_OTHERCASE(req_char);
+#endif
+  }
+if (req_char == oc)
+  found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
 else
   {
-  bit = reqchar ^ oc;
+  bit = req_char ^ oc;
   if (ispowerof2(bit))
     {
     OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit);
-    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar | bit);
+    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit);
     }
   else
     {
-    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+    found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
     foundoc = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, oc);
     }
   }
-OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
 JUMPTO(SLJIT_JUMP, loop);

JUMPHERE(found);
@@ -2126,7 +2151,7 @@
{
DEFINE_COMPILER;
struct sljit_jump *beginend;
-#ifdef SUPPORT_UTF8
+#if !(defined COMPILE_PCRE8) || defined SUPPORT_UTF
struct sljit_jump *jump;
#endif

@@ -2143,7 +2168,7 @@

 /* Testing char type. */
 #ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
   {
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
   jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2160,20 +2185,24 @@
 else
 #endif
   {
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
   /* Here LOCALS1 has already been zeroed. */
   jump = NULL;
   if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
-#endif
+#endif /* COMPILE_PCRE8 */
   OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
   OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */);
   OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
   OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, TMP1, 0);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  JUMPHERE(jump);
+#elif defined SUPPORT_UTF
   if (jump != NULL)
     JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
   }
 JUMPHERE(beginend);

@@ -2183,7 +2212,7 @@

 /* Testing char type. This is a code duplication. */
 #ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
   {
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
   jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2199,8 +2228,12 @@
 else
 #endif
   {
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  /* TMP2 may be destroyed by peek_char. */
   OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
+  jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
+  OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
   jump = NULL;
   if (common->utf)
     jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
@@ -2208,10 +2241,12 @@
   OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
   OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */);
   OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+  JUMPHERE(jump);
+#elif defined SUPPORT_UTF
   if (jump != NULL)
     JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
   }
 JUMPHERE(beginend);

@@ -2314,18 +2349,18 @@
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP1(SLJIT_MOV, TMP3, 0, CHAR1, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR2, 0);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_C_NOT_ZERO, label);

JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP1(SLJIT_MOV, CHAR1, 0, TMP3, 0);
OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -2346,20 +2381,30 @@
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR1, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, CHAR2, 0);
OP1(SLJIT_MOV, LCC_TABLE, 0, SLJIT_IMM, common->lcc);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+#ifndef COMPILE_PCRE8
+jump = CMP(SLJIT_C_GREATER, CHAR1, 0, SLJIT_IMM, 255);
+#endif
OP1(SLJIT_MOV_UB, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+jump = CMP(SLJIT_C_GREATER, CHAR2, 0, SLJIT_IMM, 255);
+#endif
OP1(SLJIT_MOV_UB, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+#endif
jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_C_NOT_ZERO, label);

JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP1(SLJIT_MOV, LCC_TABLE, 0, TMP3, 0);
OP1(SLJIT_MOV, CHAR1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1);
@@ -2378,7 +2423,7 @@
/* This function would be ineffective to do in JIT level. */
int c1, c2;
const pcre_uchar *src2 = args->ptr;
-const pcre_uchar *end2 = (pcre_uchar *)args->end;
+const pcre_uchar *end2 = args->end;

 while (src1 < end1)
   {
@@ -2976,7 +3021,7 @@
     {
     jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
     jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
     add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
     JUMPHERE(jump[1]);
     JUMPHERE(jump[0]);
@@ -3037,9 +3082,9 @@
   read_char(common);
   jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
   jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
   jump[2] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   jump[3] = JUMP(SLJIT_JUMP);
   JUMPHERE(jump[0]);
   check_newlinechar(common, common->bsr_nltype, fallbacks, FALSE);
@@ -3089,36 +3134,37 @@
   jump[0] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
-    OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
     }
   else if (common->nltype == NLTYPE_FIXED)
     {
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
     }
   else
     {
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     jump[1] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0);
     jump[2] = JUMP(SLJIT_C_GREATER);
     add_jump(compiler, fallbacks, JUMP(SLJIT_C_LESS));
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 1);
+    /* Equal. */
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
     jump[3] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
     add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));

     JUMPHERE(jump[1]);
     if (common->nltype == NLTYPE_ANYCRLF)
       {
-      OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
+      OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
       add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, STR_END, 0));
       add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
       }
@@ -3158,15 +3204,13 @@
   jump[0] = JUMP(SLJIT_JUMP);
   JUMPHERE(jump[1]);

-  OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, end));
-  add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, STR_PTR, 0));
-
+  add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, STR_PTR, 0, STR_END, 0));
   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
-    OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+    OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, TMP1, 0));
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
-    OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
     }
@@ -3200,10 +3244,10 @@

   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
-    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+    OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, TMP2, 0, STR_END, 0));
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-    OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
     add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
     }
@@ -6382,7 +6426,7 @@
 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
 common->utf = (re->options & PCRE_UTF8) != 0;
 #ifdef SUPPORT_UCP
-common->useucp = (re->options & PCRE_UCP) != 0;
+common->use_ucp = (re->options & PCRE_UCP) != 0;
 #endif
 common->utfreadchar = NULL;
 #ifdef COMPILE_PCRE8

Modified: code/branches/pcre16/pcre_newline.c
===================================================================
--- code/branches/pcre16/pcre_newline.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_newline.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -77,7 +77,15 @@
   BOOL utf)
 {
 int c;
-if (utf) { GETCHAR(c, ptr); } else c = *ptr;
+(void)utf;
+#ifdef SUPPORT_UTF
+if (utf)
+  {
+  GETCHAR(c, ptr);
+  }
+else
+#endif  /* SUPPORT_UTF8 */
+  c = *ptr;

 if (type == NLTYPE_ANYCRLF) switch(c)
   {
@@ -96,9 +104,15 @@
   case 0x000c: *lenptr = 1; return TRUE;             /* FF */
   case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
                return TRUE;                          /* CR */
+#ifdef COMPILE_PCRE8
   case 0x0085: *lenptr = utf? 2 : 1; return TRUE;    /* NEL */
   case 0x2028:                                       /* LS */
   case 0x2029: *lenptr = 3; return TRUE;             /* PS */
+#else
+  case 0x0085:                                       /* NEL */
+  case 0x2028:                                       /* LS */
+  case 0x2029: *lenptr = 1; return TRUE;             /* PS */
+#endif /* COMPILE_PCRE8 */
   default: return FALSE;
   }
 }
@@ -127,17 +141,17 @@
   BOOL utf)
 {
 int c;
+(void)utf;
 ptr--;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 if (utf)
   {
   BACKCHAR(ptr);
   GETCHAR(c, ptr);
   }
-else c = *ptr;
-#else   /* no UTF-8 support */
-c = *ptr;
+else
 #endif  /* SUPPORT_UTF8 */
+  c = *ptr;

 if (type == NLTYPE_ANYCRLF) switch(c)
   {
@@ -154,9 +168,15 @@
   case 0x000b:                                      /* VT */
   case 0x000c:                                      /* FF */
   case 0x000d: *lenptr = 1; return TRUE;            /* CR */
+#ifdef COMPILE_PCRE8
   case 0x0085: *lenptr = utf? 2 : 1; return TRUE;   /* NEL */
   case 0x2028:                                      /* LS */
   case 0x2029: *lenptr = 3; return TRUE;            /* PS */
+#else
+  case 0x0085:                                       /* NEL */
+  case 0x2028:                                       /* LS */
+  case 0x2029: *lenptr = 1; return TRUE;             /* PS */
+#endif /* COMPILE_PCRE8 */
   default: return FALSE;
   }
 }

Modified: code/branches/pcre16/pcre_printint.src
===================================================================
--- code/branches/pcre16/pcre_printint.src    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_printint.src    2011-12-05 20:12:24 UTC (rev 785)
@@ -123,7 +123,9 @@

if (!utf || (c & 0xfc00) != 0xd800)
{
- if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ if (PRINTABLE(c)) fprintf(f, "%c", c);
+ else if (c <= 0xff) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%x}", c);
return 0;
}
else

Modified: code/branches/pcre16/pcre_study.c
===================================================================
--- code/branches/pcre16/pcre_study.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_study.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -224,7 +224,7 @@
     case OP_NOTPOSPLUSI:
     branchlength++;
     cc += 2;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
@@ -245,7 +245,7 @@
     case OP_NOTEXACTI:
     branchlength += GET2(cc,1);
     cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
@@ -293,7 +293,7 @@
     appear, but leave the code, just in case.) */

     case OP_ANYBYTE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf) return -1;
 #endif
     branchlength++;
@@ -486,7 +486,7 @@
     case OP_NOTPOSQUERYI:

     cc += PRIV(OP_lengths)[op];
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 #endif
     break;
@@ -549,9 +549,10 @@
 {
 unsigned int c = *p;

+#ifdef COMPILE_PCRE8
SET_BIT(c);

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && c > 127)
{
GETCHARINC(c, p);
@@ -572,6 +573,33 @@

 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 return p + 1;
+#endif
+
+#ifdef COMPILE_PCRE16
+if (c > 0xff)
+  c = 0xff;
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
+  {
+  GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+  if (caseless)
+    {
+    c = UCD_OTHERCASE(c);
+    if (c > 0xff)
+      c = 0xff;
+    SET_BIT(c);
+    }
+#endif
+  return p;
+  }
+#endif
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+#endif
 }

@@ -602,7 +630,7 @@
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
@@ -644,7 +672,9 @@
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+#endif
}

@@ -679,7 +709,11 @@
{
register int c;
int yield = SSB_DONE;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
int table_limit = utf? 16:32;
+#else
+int table_limit = 32;
+#endif

 #if 0
 /* ========================================================================= */
@@ -951,14 +985,23 @@
       case OP_HSPACE:
       SET_BIT(0x09);
       SET_BIT(0x20);
+#ifdef SUPPORT_UTF
       if (utf)
         {
+#ifdef COMPILE_PCRE8
         SET_BIT(0xC2);  /* For U+00A0 */
         SET_BIT(0xE1);  /* For U+1680, U+180E */
         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
         SET_BIT(0xE3);  /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+        SET_BIT(0xA0);
+        SET_BIT(0xFF);  /* For characters > 255 */
+#endif
         }
-      else SET_BIT(0xA0);
+      else
+#endif /* SUPPORT_UTF */
+        SET_BIT(0xA0);
       try_next = FALSE;
       break;

@@ -968,12 +1011,21 @@
       SET_BIT(0x0B);
       SET_BIT(0x0C);
       SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
       if (utf)
         {
+#ifdef COMPILE_PCRE8
         SET_BIT(0xC2);  /* For U+0085 */
         SET_BIT(0xE2);  /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+        SET_BIT(0x85);
+        SET_BIT(0xFF);  /* For characters > 255 */
+#endif
         }
-      else SET_BIT(0x85);
+      else
+#endif /* SUPPORT_UTF */
+        SET_BIT(0x85);
       try_next = FALSE;
       break;

@@ -1058,14 +1110,23 @@
         case OP_HSPACE:
         SET_BIT(0x09);
         SET_BIT(0x20);
+#ifdef COMPILE_PCRE8
         if (utf)
           {
+#ifdef COMPILE_PCRE8
           SET_BIT(0xC2);  /* For U+00A0 */
           SET_BIT(0xE1);  /* For U+1680, U+180E */
           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
           SET_BIT(0xE3);  /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+          SET_BIT(0xA0);
+          SET_BIT(0xFF);  /* For characters > 255 */
+#endif
           }
-        else SET_BIT(0xA0);
+        else
+#endif /* SUPPORT_UTF */
+          SET_BIT(0xA0);
         break;

         case OP_ANYNL:
@@ -1074,12 +1135,21 @@
         SET_BIT(0x0B);
         SET_BIT(0x0C);
         SET_BIT(0x0D);
+#ifdef COMPILE_PCRE8
         if (utf)
           {
+#ifdef COMPILE_PCRE8
           SET_BIT(0xC2);  /* For U+0085 */
           SET_BIT(0xE2);  /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+          SET_BIT(0x85);
+          SET_BIT(0xFF);  /* For characters > 255 */
+#endif
           }
-        else SET_BIT(0x85);
+        else
+#endif /* SUPPORT_UTF */
+          SET_BIT(0x85);
         break;

         case OP_NOT_DIGIT:
@@ -1126,13 +1196,16 @@
       character with a value > 255. */

       case OP_NCLASS:
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
       if (utf)
         {
         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
         }
 #endif
+#ifdef COMPILE_PCRE16
+      SET_BIT(0xFF);                         /* For characters > 255 */
+#endif
       /* Fall through */

       case OP_CLASS:
@@ -1147,7 +1220,7 @@
         value is > 127. In fact, there are only two possible starting bytes for
         characters in the range 128 - 255. */

-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
         if (utf)
           {
           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
@@ -1161,12 +1234,10 @@
               }
             }
           }
-
-        /* In non-UTF-8 mode, the two bit maps are completely compatible. */
-
         else
 #endif
           {
+          /* In non-UTF-8 mode, the two bit maps are completely compatible. */
           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
           }

@@ -1342,6 +1413,18 @@
     memcpy(study->start_bits, start_bits, sizeof(start_bits));
     }

+#ifdef PCRE_DEBUG
+  if (bits_set)
+    {
+    pcre_uint8 *ptr = (pcre_uint32 *)start_bits;
+    int i;
+
+    printf("Start bits:\n");
+    for (i = 0; i < 32; i++)
+      printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
+    }
+#endif
+
   /* Always set the minlength value in the block, because the JIT compiler
   makes use of it. However, don't set the bit unless the length is greater than
   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time

Modified: code/branches/pcre16/pcre_version.c
===================================================================
--- code/branches/pcre16/pcre_version.c    2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_version.c    2011-12-05 20:12:24 UTC (rev 785)
@@ -79,8 +79,13 @@
 pre-processor time. This hack uses a standard trick for avoiding calling
 the STRING macro with an empty argument when doing the test. */

+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
pcre_version(void)
+#else
+PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
+pcre16_version(void)
+#endif
{
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :

This message is part of the following thread:
	the complete thread tree sorted by date

[Pcre-svn] [785] code/branches/pcre16: Improving UTF-16 supp…