[Pcre-svn] [789] code/branches/pcre16: UTF16 fixes: iterated character parsing, named references

Autor: Subversion repository
Datum:
To: pcre-svn
Betreff: [Pcre-svn] [789] code/branches/pcre16: UTF16 fixes: iterated character parsing, named references

Revision: 789

          http://vcs.pcre.org/viewvc?view=rev&revision=789
Author:   zherczeg
Date:     2011-12-07 14:36:26 +0000 (Wed, 07 Dec 2011)

Log Message:
-----------
UTF16 fixes: iterated character parsing, named references

Modified Paths:
--------------
    code/branches/pcre16/pcre16_ord2utf16.c
    code/branches/pcre16/pcre16_utf16_utils.c
    code/branches/pcre16/pcre_compile.c
    code/branches/pcre16/pcre_exec.c
    code/branches/pcre16/pcre_internal.h
    code/branches/pcre16/pcre_jit_test.c
    code/branches/pcre16/testdata/testoutput10

Modified: code/branches/pcre16/pcre16_ord2utf16.c
===================================================================
--- code/branches/pcre16/pcre16_ord2utf16.c    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre16_ord2utf16.c    2011-12-07 14:36:26 UTC (rev 789)
@@ -86,11 +86,9 @@
 return 2;

#else
-
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
return 0;
-
#endif
}

Modified: code/branches/pcre16/pcre16_utf16_utils.c
===================================================================
--- code/branches/pcre16/pcre16_utf16_utils.c    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre16_utf16_utils.c    2011-12-07 14:36:26 UTC (rev 789)
@@ -51,6 +51,29 @@

#include "pcre_internal.h"

+/*************************************************
+*  Convert any UTF-16 string to host byte order  *
+*************************************************/
+
+/* This function takes a UTF-16 string and converts
+it to host byte order. The length can be explicitly set,
+or automatically detected for zero-terminated strings.
+BOMs can be kept or discarded during the conversion.
+Conversion can be done in place (output == input).
+
+Arguments:
+  output     the output buffer; its size must be greater than
+             or equal to that of the input string
+  input      any UTF-16 string
+  length     the number of characters in the input string;
+             can be less than zero for zero-terminated strings
+  keep_boms  for a non-zero value, the BOM (0xfeff) characters
+             are copied as well
+
+Returns:     the number of characters placed into the output buffer,
+             including the zero-terminator
+*/
+
 int
 pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms)
 {
@@ -58,25 +81,31 @@
 /* This function converts any UTF-16 string to host byte order and optionally removes
 any Byte Order Marks (BOMS). Returns with the remainig length. */
 BOOL same_bo = TRUE;
-PCRE_SPTR16 end = input + length;
+pcre_uchar *optr = (pcre_uchar *)output;
+const pcre_uchar *iptr = (const pcre_uchar *)input;
+const pcre_uchar *end;
 /* The c variable must be unsigned. */
 register pcre_uchar c;

-while (input < end)
+if (length < 0)
+  length = STRLEN_UC(iptr) + 1;
+end = iptr + length;
+
+while (iptr < end)
   {
-  c = *input++;
+  c = *iptr++;
   if (c == 0xfeff || c == 0xfffe)
     {
     /* Detecting the byte order of the machine is unnecessary, it is
     enough to know that the UTF-16 string has the same byte order or not. */
     same_bo = c == 0xfeff;
     if (keep_boms != 0)
-      *output++ = 0xfeff;
+      *optr++ = 0xfeff;
     else
       length--;
     }
   else
-    *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
+    *optr++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
   }

#else

Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_compile.c    2011-12-07 14:36:26 UTC (rev 789)
@@ -4202,11 +4202,10 @@

 #ifdef SUPPORT_UTF
         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
-#endif
-#ifndef COMPILE_PCRE8
+#elif !(defined COMPILE_PCRE8)
         if (d > 255)
 #endif
-#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
           {
           xclass = TRUE;

@@ -5817,9 +5816,9 @@
               *errorcodeptr = ERR49;
               goto FAILED;
               }
-            if (namelen + 3 > cd->name_entry_size)
+            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
               {
-              cd->name_entry_size = namelen + 3;
+              cd->name_entry_size = namelen + IMM2_SIZE + 1;
               if (namelen > MAX_NAME_SIZE)
                 {
                 *errorcodeptr = ERR48;
@@ -5848,10 +5847,10 @@

             for (i = 0; i < cd->names_found; i++)
               {
-              int crc = memcmp(name, slot+2, IN_UCHARS(namelen));
+              int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
               if (crc == 0)
                 {
-                if (slot[2+namelen] == 0)
+                if (slot[IMM2_SIZE+namelen] == 0)
                   {
                   if (GET2(slot, 0) != cd->bracount + 1 &&
                       (options & PCRE_DUPNAMES) == 0)
@@ -5903,8 +5902,8 @@
               }

             PUT2(slot, 0, cd->bracount + 1);
-            memcpy(slot + 2, name, IN_UCHARS(namelen));
-            slot[2 + namelen] = 0;
+            memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
+            slot[IMM2_SIZE + namelen] = 0;
             }
           }

@@ -5988,7 +5987,7 @@
           for (i = 0; i < cd->names_found; i++)
             {
             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
-                slot[2+namelen] == 0)
+                slot[IMM2_SIZE+namelen] == 0)
               break;
             slot += cd->name_entry_size;
             }
@@ -7614,7 +7613,7 @@
 because nowadays we limit the maximum value of cd->names_found and
 cd->name_entry_size. */

-size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);
+size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
re = (real_pcre *)(pcre_malloc)(size);

if (re == NULL)

Modified: code/branches/pcre16/pcre_exec.c
===================================================================
--- code/branches/pcre16/pcre_exec.c    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_exec.c    2011-12-07 14:36:26 UTC (rev 789)
@@ -181,7 +181,7 @@

 if (caseless)
   {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 #ifdef SUPPORT_UCP
   if (md->utf)
     {
@@ -365,7 +365,7 @@
   /* Function local variables */

PCRE_PUCHAR Xcallpat;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
PCRE_PUCHAR Xcharptr;
#endif
PCRE_PUCHAR Xdata;
@@ -527,7 +527,7 @@

/* Ditto for the local variables */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 #define charptr            frame->Xcharptr
 #endif
 #define callpat            frame->Xcallpat
@@ -585,7 +585,7 @@
 below are for variables that do not have to be preserved over a recursive call
 to RMATCH(). */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 const pcre_uchar *charptr;
 #endif
 const pcre_uchar *callpat;
@@ -634,6 +634,7 @@
 #define code_offset   codelink
 #define condassert    condition
 #define matched_once  prev_is_word
+#define foc           number

/* These statements are here to stop the compiler complaining about unitialized
variables. */
@@ -659,7 +660,7 @@
complicated macro. It has to be used in one particular way. This shouldn't,
however, impact performance when true recursion is being used. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
 utf = md->utf;       /* Local copy of the flag */
 #else
 utf = FALSE;
@@ -1596,7 +1597,7 @@
     back a number of characters, not bytes. */

     case OP_REVERSE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       i = GET(ecode, 1);
@@ -2216,7 +2217,7 @@
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
        c < 256 &&
 #endif
        (md->ctypes[c] & ctype_digit) != 0
@@ -2233,8 +2234,8 @@
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
-       c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+       c > 255 ||
 #endif
        (md->ctypes[c] & ctype_digit) == 0
        )
@@ -2250,7 +2251,7 @@
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
        c < 256 &&
 #endif
        (md->ctypes[c] & ctype_space) != 0
@@ -2267,8 +2268,8 @@
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
-       c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+       c > 255 ||
 #endif
        (md->ctypes[c] & ctype_space) == 0
        )
@@ -2284,7 +2285,7 @@
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
        c < 256 &&
 #endif
        (md->ctypes[c] & ctype_word) != 0
@@ -2301,8 +2302,8 @@
       }
     GETCHARINCTEST(c, eptr);
     if (
-#ifdef SUPPORT_UTF8
-       c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+       c > 255 ||
 #endif
        (md->ctypes[c] & ctype_word) == 0
        )
@@ -3036,7 +3037,7 @@
     /* Match a single character, casefully */

     case OP_CHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       length = 1;
@@ -3108,7 +3109,7 @@
         }
       }
     else
-#endif   /* SUPPORT_UTF8 */
+#endif   /* SUPPORT_UTF */

     /* Not UTF mode */
       {
@@ -3117,7 +3118,9 @@
         SCHECK_PARTIAL();            /* This one can use SCHECK_PARTIAL() */
         MRRETURN(MATCH_NOMATCH);
         }
-      if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+      if (TABLE_GET(ecode[1], md->lcc, ecode[1])
+          != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
+      eptr++;
       ecode += 2;
       }
     break;
@@ -3190,7 +3193,7 @@
     /* Common code for all repeated single-character matches. */

     REPEATCHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     if (utf)
       {
       length = 1;
@@ -3214,7 +3217,7 @@
         for (i = 1; i <= min; i++)
           {
           if (eptr <= md->end_subject - length &&
-            memcmp(eptr, charptr, length) == 0) eptr += length;
+            memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
 #ifdef SUPPORT_UCP
           else if (oclength > 0 &&
                    eptr <= md->end_subject - oclength &&
@@ -3237,7 +3240,7 @@
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
             if (fi >= max) MRRETURN(MATCH_NOMATCH);
             if (eptr <= md->end_subject - length &&
-              memcmp(eptr, charptr, length) == 0) eptr += length;
+              memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
@@ -3258,7 +3261,7 @@
           for (i = min; i < max; i++)
             {
             if (eptr <= md->end_subject - length &&
-                memcmp(eptr, charptr, length) == 0) eptr += length;
+                memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
 #ifdef SUPPORT_UCP
             else if (oclength > 0 &&
                      eptr <= md->end_subject - oclength &&
@@ -3294,14 +3297,12 @@
       value of fc will always be < 128. */
       }
     else
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF */
+      /* When not in UTF-8 mode, load a single-byte character. */
+      fc = *ecode++;

-    /* When not in UTF-8 mode, load a single-byte character. */
-
-    fc = *ecode++;
-
-    /* The value of fc at this point is always less than 256, though we may or
-    may not be in UTF-8 mode. The code is duplicated for the caseless and
+    /* The value of fc at this point is always one character, though we may
+    or may not be in UTF mode. The code is duplicated for the caseless and
     caseful cases, for speed, since matching characters is likely to be quite
     common. First, ensure the minimum number of matches are present. If min =
     max, continue at the same level without recursing. Otherwise, if
@@ -3314,7 +3315,23 @@

     if (op >= OP_STARI)  /* Caseless */
       {
-      fc = md->lcc[fc];
+#ifdef COMPILE_PCRE8
+      /* fc must be < 128 */
+      foc = md->fcc[fc];
+#else
+#ifdef SUPPORT_UTF
+#ifdef SUPPORT_UCP
+      if (utf && fc > 127)
+        foc = UCD_OTHERCASE(fc);
+#else
+      if (utf && fc > 127)
+        foc = fc;
+#endif /* SUPPORT_UCP */
+      else
+#endif /* SUPPORT_UTF */
+        foc = TABLE_GET(fc, md->fcc, fc);
+#endif /* COMPILE_PCRE8 */
+
       for (i = 1; i <= min; i++)
         {
         if (eptr >= md->end_subject)
@@ -3322,7 +3339,8 @@
           SCHECK_PARTIAL();
           MRRETURN(MATCH_NOMATCH);
           }
-        if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+        if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+        eptr++;
         }
       if (min == max) continue;
       if (minimize)
@@ -3337,7 +3355,8 @@
             SCHECK_PARTIAL();
             MRRETURN(MATCH_NOMATCH);
             }
-          if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+          if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+          eptr++;
           }
         /* Control never gets here */
         }
@@ -3351,7 +3370,7 @@
             SCHECK_PARTIAL();
             break;
             }
-          if (fc != md->lcc[*eptr]) break;
+          if (fc != *eptr && foc != *eptr) break;
           eptr++;
           }

@@ -3440,10 +3459,10 @@
     GETCHARINCTEST(c, eptr);
     if (op == OP_NOTI)         /* The caseless case */
       {
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
       if (c < 256)
 #endif
-      c = md->lcc[c];
+        c = md->lcc[c];
       if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
       }
     else    /* Caseful */
@@ -3543,9 +3562,9 @@

     if (op >= OP_NOTSTARI)     /* Caseless */
       {
-      fc = md->lcc[fc];
+      fc = TABLE_GET(fc, md->lcc, fc);

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         register unsigned int d;
@@ -3580,7 +3599,7 @@

       if (minimize)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -3625,7 +3644,7 @@
         {
         pp = eptr;

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -3683,7 +3702,7 @@

     else
       {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         register unsigned int d;
@@ -3717,7 +3736,7 @@

       if (minimize)
         {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -3761,7 +3780,7 @@
         {
         pp = eptr;

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (utf)
           {
           register unsigned int d;
@@ -4353,7 +4372,7 @@
         }  /* End switch(ctype) */

       else
-#endif     /* SUPPORT_UTF8 */
+#endif     /* SUPPORT_UTF */

       /* Code for the non-UTF-8 case for minimum matching of operators other
       than OP_PROP and OP_NOTPROP. */
@@ -4796,7 +4815,7 @@
       else
 #endif     /* SUPPORT_UCP */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
       if (utf)
         {
         for (fi = min;; fi++)
@@ -5596,7 +5615,7 @@
           }
         }
       else
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF */
       /* Not UTF mode */
         {
         switch(ctype)
@@ -5844,14 +5863,14 @@
   LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
   LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
   LBL(65) LBL(66)
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
   LBL(32) LBL(34) LBL(42) LBL(46)
 #ifdef SUPPORT_UCP
   LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
   LBL(59) LBL(60) LBL(61) LBL(62)
 #endif  /* SUPPORT_UCP */
-#endif  /* SUPPORT_UTF8 */
+#endif  /* SUPPORT_UTF */
   default:
   DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
   return PCRE_ERROR_INTERNAL;
@@ -6002,7 +6021,7 @@
 /* Check a UTF-8 string if required. Pass back the character offset and error
 code for an invalid string if a results vector is available. */

-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int erroroffset;
@@ -6138,6 +6157,7 @@
md->hasthen = (re->flags & PCRE_HASTHEN) != 0;

md->lcc = tables + lcc_offset;
+md->fcc = tables + fcc_offset;
md->ctypes = tables + ctypes_offset;

 /* Handle different \R options. */
@@ -6265,7 +6285,7 @@
     first_char = first_char2 = re->first_char;
     if ((re->flags & PCRE_FCH_CASELESS) != 0)
       {
-      first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+      first_char2 = TABLE_GET(first_char, md->fcc, first_char);
 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
       if (utf && first_char > 127)
         first_char2 = UCD_OTHERCASE(first_char);
@@ -6287,7 +6307,7 @@
   req_char = req_char2 = re->req_char;
   if ((re->flags & PCRE_RCH_CASELESS) != 0)
     {
-    req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+    req_char2 = TABLE_GET(req_char, md->fcc, req_char);
 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
     if (utf && req_char > 127)
       req_char2 = UCD_OTHERCASE(req_char);

Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_internal.h    2011-12-07 14:36:26 UTC (rev 789)
@@ -2055,6 +2055,7 @@
   pcre_uchar *name_table;         /* Table of names */
   pcre_uchar nl[4];               /* Newline string when fixed */
   const  pcre_uint8 *lcc;         /* Points to lower casing table */
+  const  pcre_uint8 *fcc;         /* Points to case-flipping table */
   const  pcre_uint8 *ctypes;      /* Points to table of type maps */
   BOOL   offset_overflow;         /* Set if too many extractions */
   BOOL   notbol;                  /* NOTBOL flag */
@@ -2262,6 +2263,7 @@
 extern const int         PRIV(ucp_typerange)[];
 #endif

+#ifdef SUPPORT_UCP
/* UCD access macros */

#define UCD_BLOCK_SIZE 128
@@ -2274,6 +2276,8 @@
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)

+#endif /* SUPPORT_UCP */
+
#endif

/* End of pcre_internal.h */

Modified: code/branches/pcre16/pcre_jit_test.c
===================================================================
--- code/branches/pcre16/pcre_jit_test.c    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_jit_test.c    2011-12-07 14:36:26 UTC (rev 789)
@@ -621,11 +621,11 @@
     return (pcre_jit_stack *)arg;
 }

-static void setstack(pcre_extra *extra, int realloc)
+static void setstack(pcre_extra *extra, int alloc_again)
 {
     static pcre_jit_stack *stack;

-    if (realloc) {
+    if (alloc_again) {
         if (stack)
             pcre_jit_stack_free(stack);
         stack = pcre_jit_stack_alloc(1, 1024 * 1024);
@@ -638,29 +638,29 @@

 static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *offsetmap, int max_length)
 {
-    unsigned char *ptr = (unsigned char*)input;
-    PCRE_SCHAR16 *optr = output;
+    unsigned char *iptr = (unsigned char*)input;
+    unsigned short *optr = (unsigned short *)output;
     unsigned int c;

     if (max_length == 0)
         return 0;

-    while (*ptr && max_length > 1) {
+    while (*iptr && max_length > 1) {
         c = 0;
         if (offsetmap)
-            *offsetmap++ = (int)(ptr - (unsigned char*)input);
+            *offsetmap++ = (int)(iptr - (unsigned char*)input);

-        if (!(*ptr & 0x80))
-            c = *ptr++;
-        else if (!(*ptr & 0x20)) {
-            c = ((ptr[0] & 0x1f) << 6) | (ptr[1] & 0x3f);
-            ptr += 2;
-        } else if (!(*ptr & 0x10)) {
-            c = ((ptr[0] & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
-            ptr += 3;
-        } else if (!(*ptr & 0x08)) {
-            c = ((ptr[0] & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
-            ptr += 4;
+        if (!(*iptr & 0x80))
+            c = *iptr++;
+        else if (!(*iptr & 0x20)) {
+            c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
+            iptr += 2;
+        } else if (!(*iptr & 0x10)) {
+            c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
+            iptr += 3;
+        } else if (!(*iptr & 0x08)) {
+            c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
+            iptr += 4;
         }

         if (c < 65536) {
@@ -668,7 +668,7 @@
             max_length--;
         } else if (max_length <= 2) {
             *optr = '\0';
-            return optr - output;
+            return (int)(optr - (unsigned short *)output);
         } else {
             c -= 0x10000;
             *optr++ = 0xd800 | ((c >> 10) & 0x3ff);
@@ -679,24 +679,25 @@
         }
     }
     if (offsetmap)
-        *offsetmap = (int)(ptr - (unsigned char*)input);
+        *offsetmap = (int)(iptr - (unsigned char*)input);
     *optr = '\0';
-    return optr - output;
+    return (int)(optr - (unsigned short *)output);
 }

 static int copy_char8_to_char16(const char *input, PCRE_SCHAR16 *output, int max_length)
 {
-    PCRE_SCHAR16 *optr = output;
+    unsigned char *iptr = (unsigned char*)input;
+    unsigned short *optr = (unsigned short *)output;

     if (max_length == 0)
         return 0;

-    while (*input && max_length > 1) {
-        *optr++ = *input++;
+    while (*iptr && max_length > 1) {
+        *optr++ = *iptr++;
         max_length--;
     }
     *optr = '\0';
-    return optr - output;
+    return (int)(optr - (unsigned short *)output);
 }

 #define REGTEST_MAX_LENGTH 4096
@@ -768,6 +769,7 @@
             current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags8),
             &error, &err_offs, NULL);

+        extra8 = NULL;
         if (re8) {
             error = NULL;
             extra8 = pcre_study(re8, PCRE_STUDY_JIT_COMPILE, &error);
@@ -786,10 +788,15 @@
             printf("\n8 bit: Cannot compile pattern: %s\n", current->pattern);
 #endif
 #ifdef SUPPORT_PCRE16
-        convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+        if (current->flags & PCRE_UTF8)
+            convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+        else
+            copy_char8_to_char16(current->pattern, regtest_buf, REGTEST_MAX_LENGTH);
         re16 = pcre16_compile(regtest_buf,
             current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags16),
             &error, &err_offs, NULL);
+
+        extra16 = NULL;
         if (re16) {
             error = NULL;
             extra16 = pcre16_study(re16, PCRE_STUDY_JIT_COMPILE, &error);
@@ -813,6 +820,8 @@
             setstack(NULL, 1);

 #ifdef SUPPORT_PCRE8
+        return_value8_1 = -1000;
+        return_value8_2 = -1000;
         if (re8) {
             setstack(extra8, 0);
             for (i = 0; i < 32; ++i)
@@ -828,6 +837,8 @@
 #endif

 #ifdef SUPPORT_PCRE16
+        return_value16_1 = -1000;
+        return_value16_2 = -1000;
         if (re16) {
             setstack(extra16, 0);
             if (current->flags & PCRE_UTF8)
@@ -853,7 +864,7 @@
         is_succesful = 1;
         if (!(current->flags & PCRE_BUG)) {
 #if defined SUPPORT_PCRE8 && defined SUPPORT_PCRE16
-            if ((current->flags & PCRE_UTF8) && utf8 && utf16) {
+            if (utf8 == utf16) {
                 /* All results must be the same. */
                 if (return_value8_1 != return_value8_2 || return_value8_1 != return_value16_1 || return_value8_1 != return_value16_2) {
                     printf("\n8 and 16 bit: Return value differs(%d:%d:%d:%d): [%d] '%s' @ '%s'\n",
@@ -863,11 +874,13 @@
                 } else if (return_value8_1 >= 0) {
                     return_value8_1 *= 2;
                     /* Transform back the results. */
-                    for (i = 0; i < return_value8_1; ++i) {
-                        if (ovector16_1[i] >= 0)
-                            ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
-                        if (ovector16_2[i] >= 0)
-                            ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+                    if (current->flags & PCRE_UTF8) {
+                        for (i = 0; i < return_value8_1; ++i) {
+                            if (ovector16_1[i] >= 0)
+                                ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
+                            if (ovector16_2[i] >= 0)
+                                ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+                        }
                     }

                     for (i = 0; i < return_value8_1; ++i)

Modified: code/branches/pcre16/testdata/testoutput10
===================================================================
--- code/branches/pcre16/testdata/testoutput10    2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/testdata/testoutput10    2011-12-07 14:36:26 UTC (rev 789)
@@ -194,7 +194,7 @@
 ------------------------------------------------------------------

 /a(?P<name1>b|c)d(?P<longername2>e)/BM
-Memory allocation (code space): 42
+Memory allocation (code space): 36
 ------------------------------------------------------------------
   0  32 Bra
   3     a
@@ -212,7 +212,7 @@
 ------------------------------------------------------------------

/(?:a(?P<c>c(?P<d>d)))(?P<a>a)/BM
-Memory allocation (code space): 54
+Memory allocation (code space): 45
------------------------------------------------------------------
0 41 Bra
3 25 Bra
@@ -232,7 +232,7 @@
------------------------------------------------------------------

/(?P<a>a)...(?P=a)bbb(?P>a)d/BM
-Memory allocation (code space): 37
+Memory allocation (code space): 34
------------------------------------------------------------------
0 30 Bra
3 7 CBra 1

Diese Nachricht ist Teil des folgenden Threads:
	Der komplette Thread sortiert nach Datum

[Pcre-svn] [789] code/branches/pcre16: UTF16 fixes: iterated…