Revision: 789
http://vcs.pcre.org/viewvc?view=rev&revision=789
Author: zherczeg
Date: 2011-12-07 14:36:26 +0000 (Wed, 07 Dec 2011)
Log Message:
-----------
UTF16 fixes: iterated character parsing, named references
Modified Paths:
--------------
code/branches/pcre16/pcre16_ord2utf16.c
code/branches/pcre16/pcre16_utf16_utils.c
code/branches/pcre16/pcre_compile.c
code/branches/pcre16/pcre_exec.c
code/branches/pcre16/pcre_internal.h
code/branches/pcre16/pcre_jit_test.c
code/branches/pcre16/testdata/testoutput10
Modified: code/branches/pcre16/pcre16_ord2utf16.c
===================================================================
--- code/branches/pcre16/pcre16_ord2utf16.c 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre16_ord2utf16.c 2011-12-07 14:36:26 UTC (rev 789)
@@ -86,11 +86,9 @@
return 2;
#else
-
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
return 0;
-
#endif
}
Modified: code/branches/pcre16/pcre16_utf16_utils.c
===================================================================
--- code/branches/pcre16/pcre16_utf16_utils.c 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre16_utf16_utils.c 2011-12-07 14:36:26 UTC (rev 789)
@@ -51,6 +51,29 @@
#include "pcre_internal.h"
+/*************************************************
+* Convert any UTF-16 string to host byte order *
+*************************************************/
+
+/* This function takes a UTF-16 string and converts
+it to host byte order. The length can be explicitly set,
+or automatically detected for zero-terminated strings.
+BOMs can be kept or discarded during the conversion.
+Conversion can be done in place (output == input).
+
+Arguments:
+ output the output buffer, its size must be greater than
+ or equal to that of the input string
+ input any UTF-16 string
+ length the number of characters in the input string
+ can be less than zero for zero terminated strings
+ keep_boms for a non-zero value, the BOM (0xfeff) characters
+ are copied as well
+
+Returns: the number of characters placed into the output buffer,
+ including the zero-terminator
+*/
+
int
pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms)
{
@@ -58,25 +81,31 @@
/* This function converts any UTF-16 string to host byte order and optionally removes
any Byte Order Marks (BOMS). Returns with the remainig length. */
BOOL same_bo = TRUE;
-PCRE_SPTR16 end = input + length;
+pcre_uchar *optr = (pcre_uchar *)output;
+const pcre_uchar *iptr = (const pcre_uchar *)input;
+const pcre_uchar *end;
/* The c variable must be unsigned. */
register pcre_uchar c;
-while (input < end)
+if (length < 0)
+ length = STRLEN_UC(iptr) + 1;
+end = iptr + length;
+
+while (iptr < end)
{
- c = *input++;
+ c = *iptr++;
if (c == 0xfeff || c == 0xfffe)
{
/* Detecting the byte order of the machine is unnecessary, it is
enough to know that the UTF-16 string has the same byte order or not. */
same_bo = c == 0xfeff;
if (keep_boms != 0)
- *output++ = 0xfeff;
+ *optr++ = 0xfeff;
else
length--;
}
else
- *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
+ *optr++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
}
#else
Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_compile.c 2011-12-07 14:36:26 UTC (rev 789)
@@ -4202,11 +4202,10 @@
#ifdef SUPPORT_UTF
if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
-#endif
-#ifndef COMPILE_PCRE8
+#elif !(defined COMPILE_PCRE8)
if (d > 255)
#endif
-#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
{
xclass = TRUE;
@@ -5817,9 +5816,9 @@
*errorcodeptr = ERR49;
goto FAILED;
}
- if (namelen + 3 > cd->name_entry_size)
+ if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
{
- cd->name_entry_size = namelen + 3;
+ cd->name_entry_size = namelen + IMM2_SIZE + 1;
if (namelen > MAX_NAME_SIZE)
{
*errorcodeptr = ERR48;
@@ -5848,10 +5847,10 @@
for (i = 0; i < cd->names_found; i++)
{
- int crc = memcmp(name, slot+2, IN_UCHARS(namelen));
+ int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
if (crc == 0)
{
- if (slot[2+namelen] == 0)
+ if (slot[IMM2_SIZE+namelen] == 0)
{
if (GET2(slot, 0) != cd->bracount + 1 &&
(options & PCRE_DUPNAMES) == 0)
@@ -5903,8 +5902,8 @@
}
PUT2(slot, 0, cd->bracount + 1);
- memcpy(slot + 2, name, IN_UCHARS(namelen));
- slot[2 + namelen] = 0;
+ memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
+ slot[IMM2_SIZE + namelen] = 0;
}
}
@@ -5988,7 +5987,7 @@
for (i = 0; i < cd->names_found; i++)
{
if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
- slot[2+namelen] == 0)
+ slot[IMM2_SIZE+namelen] == 0)
break;
slot += cd->name_entry_size;
}
@@ -7614,7 +7613,7 @@
because nowadays we limit the maximum value of cd->names_found and
cd->name_entry_size. */
-size = sizeof(real_pcre) + (length + cd->names_found * (cd->name_entry_size + 3)) * sizeof(pcre_uchar);
+size = sizeof(real_pcre) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
re = (real_pcre *)(pcre_malloc)(size);
if (re == NULL)
Modified: code/branches/pcre16/pcre_exec.c
===================================================================
--- code/branches/pcre16/pcre_exec.c 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_exec.c 2011-12-07 14:36:26 UTC (rev 789)
@@ -181,7 +181,7 @@
if (caseless)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
#ifdef SUPPORT_UCP
if (md->utf)
{
@@ -365,7 +365,7 @@
/* Function local variables */
PCRE_PUCHAR Xcallpat;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
PCRE_PUCHAR Xcharptr;
#endif
PCRE_PUCHAR Xdata;
@@ -527,7 +527,7 @@
/* Ditto for the local variables */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
#define charptr frame->Xcharptr
#endif
#define callpat frame->Xcallpat
@@ -585,7 +585,7 @@
below are for variables that do not have to be preserved over a recursive call
to RMATCH(). */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
const pcre_uchar *charptr;
#endif
const pcre_uchar *callpat;
@@ -634,6 +634,7 @@
#define code_offset codelink
#define condassert condition
#define matched_once prev_is_word
+#define foc number
/* These statements are here to stop the compiler complaining about unitialized
variables. */
@@ -659,7 +660,7 @@
complicated macro. It has to be used in one particular way. This shouldn't,
however, impact performance when true recursion is being used. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
utf = md->utf; /* Local copy of the flag */
#else
utf = FALSE;
@@ -1596,7 +1597,7 @@
back a number of characters, not bytes. */
case OP_REVERSE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
i = GET(ecode, 1);
@@ -2216,7 +2217,7 @@
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
c < 256 &&
#endif
(md->ctypes[c] & ctype_digit) != 0
@@ -2233,8 +2234,8 @@
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+ c > 255 ||
#endif
(md->ctypes[c] & ctype_digit) == 0
)
@@ -2250,7 +2251,7 @@
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
c < 256 &&
#endif
(md->ctypes[c] & ctype_space) != 0
@@ -2267,8 +2268,8 @@
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+ c > 255 ||
#endif
(md->ctypes[c] & ctype_space) == 0
)
@@ -2284,7 +2285,7 @@
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
c < 256 &&
#endif
(md->ctypes[c] & ctype_word) != 0
@@ -2301,8 +2302,8 @@
}
GETCHARINCTEST(c, eptr);
if (
-#ifdef SUPPORT_UTF8
- c >= 256 ||
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
+ c > 255 ||
#endif
(md->ctypes[c] & ctype_word) == 0
)
@@ -3036,7 +3037,7 @@
/* Match a single character, casefully */
case OP_CHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
length = 1;
@@ -3108,7 +3109,7 @@
}
}
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
@@ -3117,7 +3118,9 @@
SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
MRRETURN(MATCH_NOMATCH);
}
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+ if (TABLE_GET(ecode[1], md->lcc, ecode[1])
+ != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
+ eptr++;
ecode += 2;
}
break;
@@ -3190,7 +3193,7 @@
/* Common code for all repeated single-character matches. */
REPEATCHAR:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
length = 1;
@@ -3214,7 +3217,7 @@
for (i = 1; i <= min; i++)
{
if (eptr <= md->end_subject - length &&
- memcmp(eptr, charptr, length) == 0) eptr += length;
+ memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
@@ -3237,7 +3240,7 @@
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max) MRRETURN(MATCH_NOMATCH);
if (eptr <= md->end_subject - length &&
- memcmp(eptr, charptr, length) == 0) eptr += length;
+ memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
@@ -3258,7 +3261,7 @@
for (i = min; i < max; i++)
{
if (eptr <= md->end_subject - length &&
- memcmp(eptr, charptr, length) == 0) eptr += length;
+ memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
#ifdef SUPPORT_UCP
else if (oclength > 0 &&
eptr <= md->end_subject - oclength &&
@@ -3294,14 +3297,12 @@
value of fc will always be < 128. */
}
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
+ /* When not in UTF-8 mode, load a single-byte character. */
+ fc = *ecode++;
- /* When not in UTF-8 mode, load a single-byte character. */
-
- fc = *ecode++;
-
- /* The value of fc at this point is always less than 256, though we may or
- may not be in UTF-8 mode. The code is duplicated for the caseless and
+ /* The value of fc at this point is always one character, though we may
+ or may not be in UTF mode. The code is duplicated for the caseless and
caseful cases, for speed, since matching characters is likely to be quite
common. First, ensure the minimum number of matches are present. If min =
max, continue at the same level without recursing. Otherwise, if
@@ -3314,7 +3315,23 @@
if (op >= OP_STARI) /* Caseless */
{
- fc = md->lcc[fc];
+#ifdef COMPILE_PCRE8
+ /* fc must be < 128 */
+ foc = md->fcc[fc];
+#else
+#ifdef SUPPORT_UTF
+#ifdef SUPPORT_UCP
+ if (utf && fc > 127)
+ foc = UCD_OTHERCASE(fc);
+#else
+ if (utf && fc > 127)
+ foc = fc;
+#endif /* SUPPORT_UCP */
+ else
+#endif /* SUPPORT_UTF */
+ foc = TABLE_GET(fc, md->fcc, fc);
+#endif /* COMPILE_PCRE8 */
+
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject)
@@ -3322,7 +3339,8 @@
SCHECK_PARTIAL();
MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+ if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+ eptr++;
}
if (min == max) continue;
if (minimize)
@@ -3337,7 +3355,8 @@
SCHECK_PARTIAL();
MRRETURN(MATCH_NOMATCH);
}
- if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
+ if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
+ eptr++;
}
/* Control never gets here */
}
@@ -3351,7 +3370,7 @@
SCHECK_PARTIAL();
break;
}
- if (fc != md->lcc[*eptr]) break;
+ if (fc != *eptr && foc != *eptr) break;
eptr++;
}
@@ -3440,10 +3459,10 @@
GETCHARINCTEST(c, eptr);
if (op == OP_NOTI) /* The caseless case */
{
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
if (c < 256)
#endif
- c = md->lcc[c];
+ c = md->lcc[c];
if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
}
else /* Caseful */
@@ -3543,9 +3562,9 @@
if (op >= OP_NOTSTARI) /* Caseless */
{
- fc = md->lcc[fc];
+ fc = TABLE_GET(fc, md->lcc, fc);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3580,7 +3599,7 @@
if (minimize)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3625,7 +3644,7 @@
{
pp = eptr;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3683,7 +3702,7 @@
else
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3717,7 +3736,7 @@
if (minimize)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -3761,7 +3780,7 @@
{
pp = eptr;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
register unsigned int d;
@@ -4353,7 +4372,7 @@
} /* End switch(ctype) */
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* Code for the non-UTF-8 case for minimum matching of operators other
than OP_PROP and OP_NOTPROP. */
@@ -4796,7 +4815,7 @@
else
#endif /* SUPPORT_UCP */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
for (fi = min;; fi++)
@@ -5596,7 +5615,7 @@
}
}
else
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
switch(ctype)
@@ -5844,14 +5863,14 @@
LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
LBL(65) LBL(66)
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
#ifdef SUPPORT_UCP
LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
LBL(59) LBL(60) LBL(61) LBL(62)
#endif /* SUPPORT_UCP */
-#endif /* SUPPORT_UTF8 */
+#endif /* SUPPORT_UTF */
default:
DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
return PCRE_ERROR_INTERNAL;
@@ -6002,7 +6021,7 @@
/* Check a UTF-8 string if required. Pass back the character offset and error
code for an invalid string if a results vector is available. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
{
int erroroffset;
@@ -6138,6 +6157,7 @@
md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
md->lcc = tables + lcc_offset;
+md->fcc = tables + fcc_offset;
md->ctypes = tables + ctypes_offset;
/* Handle different \R options. */
@@ -6265,7 +6285,7 @@
first_char = first_char2 = re->first_char;
if ((re->flags & PCRE_FCH_CASELESS) != 0)
{
- first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+ first_char2 = TABLE_GET(first_char, md->fcc, first_char);
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
if (utf && first_char > 127)
first_char2 = UCD_OTHERCASE(first_char);
@@ -6287,7 +6307,7 @@
req_char = req_char2 = re->req_char;
if ((re->flags & PCRE_RCH_CASELESS) != 0)
{
- req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+ req_char2 = TABLE_GET(req_char, md->fcc, req_char);
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
if (utf && req_char > 127)
req_char2 = UCD_OTHERCASE(req_char);
Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_internal.h 2011-12-07 14:36:26 UTC (rev 789)
@@ -2055,6 +2055,7 @@
pcre_uchar *name_table; /* Table of names */
pcre_uchar nl[4]; /* Newline string when fixed */
const pcre_uint8 *lcc; /* Points to lower casing table */
+ const pcre_uint8 *fcc; /* Points to case-flipping table */
const pcre_uint8 *ctypes; /* Points to table of type maps */
BOOL offset_overflow; /* Set if too many extractions */
BOOL notbol; /* NOTBOL flag */
@@ -2262,6 +2263,7 @@
extern const int PRIV(ucp_typerange)[];
#endif
+#ifdef SUPPORT_UCP
/* UCD access macros */
#define UCD_BLOCK_SIZE 128
@@ -2274,6 +2276,8 @@
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
+#endif /* SUPPORT_UCP */
+
#endif
/* End of pcre_internal.h */
Modified: code/branches/pcre16/pcre_jit_test.c
===================================================================
--- code/branches/pcre16/pcre_jit_test.c 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/pcre_jit_test.c 2011-12-07 14:36:26 UTC (rev 789)
@@ -621,11 +621,11 @@
return (pcre_jit_stack *)arg;
}
-static void setstack(pcre_extra *extra, int realloc)
+static void setstack(pcre_extra *extra, int alloc_again)
{
static pcre_jit_stack *stack;
- if (realloc) {
+ if (alloc_again) {
if (stack)
pcre_jit_stack_free(stack);
stack = pcre_jit_stack_alloc(1, 1024 * 1024);
@@ -638,29 +638,29 @@
static int convert_utf8_to_utf16(const char *input, PCRE_SCHAR16 *output, int *offsetmap, int max_length)
{
- unsigned char *ptr = (unsigned char*)input;
- PCRE_SCHAR16 *optr = output;
+ unsigned char *iptr = (unsigned char*)input;
+ unsigned short *optr = (unsigned short *)output;
unsigned int c;
if (max_length == 0)
return 0;
- while (*ptr && max_length > 1) {
+ while (*iptr && max_length > 1) {
c = 0;
if (offsetmap)
- *offsetmap++ = (int)(ptr - (unsigned char*)input);
+ *offsetmap++ = (int)(iptr - (unsigned char*)input);
- if (!(*ptr & 0x80))
- c = *ptr++;
- else if (!(*ptr & 0x20)) {
- c = ((ptr[0] & 0x1f) << 6) | (ptr[1] & 0x3f);
- ptr += 2;
- } else if (!(*ptr & 0x10)) {
- c = ((ptr[0] & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
- ptr += 3;
- } else if (!(*ptr & 0x08)) {
- c = ((ptr[0] & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
- ptr += 4;
+ if (!(*iptr & 0x80))
+ c = *iptr++;
+ else if (!(*iptr & 0x20)) {
+ c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
+ iptr += 2;
+ } else if (!(*iptr & 0x10)) {
+ c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
+ iptr += 3;
+ } else if (!(*iptr & 0x08)) {
+ c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
+ iptr += 4;
}
if (c < 65536) {
@@ -668,7 +668,7 @@
max_length--;
} else if (max_length <= 2) {
*optr = '\0';
- return optr - output;
+ return (int)(optr - (unsigned short *)output);
} else {
c -= 0x10000;
*optr++ = 0xd800 | ((c >> 10) & 0x3ff);
@@ -679,24 +679,25 @@
}
}
if (offsetmap)
- *offsetmap = (int)(ptr - (unsigned char*)input);
+ *offsetmap = (int)(iptr - (unsigned char*)input);
*optr = '\0';
- return optr - output;
+ return (int)(optr - (unsigned short *)output);
}
static int copy_char8_to_char16(const char *input, PCRE_SCHAR16 *output, int max_length)
{
- PCRE_SCHAR16 *optr = output;
+ unsigned char *iptr = (unsigned char*)input;
+ unsigned short *optr = (unsigned short *)output;
if (max_length == 0)
return 0;
- while (*input && max_length > 1) {
- *optr++ = *input++;
+ while (*iptr && max_length > 1) {
+ *optr++ = *iptr++;
max_length--;
}
*optr = '\0';
- return optr - output;
+ return (int)(optr - (unsigned short *)output);
}
#define REGTEST_MAX_LENGTH 4096
@@ -768,6 +769,7 @@
current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags8),
&error, &err_offs, NULL);
+ extra8 = NULL;
if (re8) {
error = NULL;
extra8 = pcre_study(re8, PCRE_STUDY_JIT_COMPILE, &error);
@@ -786,10 +788,15 @@
printf("\n8 bit: Cannot compile pattern: %s\n", current->pattern);
#endif
#ifdef SUPPORT_PCRE16
- convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+ if (current->flags & PCRE_UTF8)
+ convert_utf8_to_utf16(current->pattern, regtest_buf, NULL, REGTEST_MAX_LENGTH);
+ else
+ copy_char8_to_char16(current->pattern, regtest_buf, REGTEST_MAX_LENGTH);
re16 = pcre16_compile(regtest_buf,
current->flags & ~(PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART | disabled_flags16),
&error, &err_offs, NULL);
+
+ extra16 = NULL;
if (re16) {
error = NULL;
extra16 = pcre16_study(re16, PCRE_STUDY_JIT_COMPILE, &error);
@@ -813,6 +820,8 @@
setstack(NULL, 1);
#ifdef SUPPORT_PCRE8
+ return_value8_1 = -1000;
+ return_value8_2 = -1000;
if (re8) {
setstack(extra8, 0);
for (i = 0; i < 32; ++i)
@@ -828,6 +837,8 @@
#endif
#ifdef SUPPORT_PCRE16
+ return_value16_1 = -1000;
+ return_value16_2 = -1000;
if (re16) {
setstack(extra16, 0);
if (current->flags & PCRE_UTF8)
@@ -853,7 +864,7 @@
is_succesful = 1;
if (!(current->flags & PCRE_BUG)) {
#if defined SUPPORT_PCRE8 && defined SUPPORT_PCRE16
- if ((current->flags & PCRE_UTF8) && utf8 && utf16) {
+ if (utf8 == utf16) {
/* All results must be the same. */
if (return_value8_1 != return_value8_2 || return_value8_1 != return_value16_1 || return_value8_1 != return_value16_2) {
printf("\n8 and 16 bit: Return value differs(%d:%d:%d:%d): [%d] '%s' @ '%s'\n",
@@ -863,11 +874,13 @@
} else if (return_value8_1 >= 0) {
return_value8_1 *= 2;
/* Transform back the results. */
- for (i = 0; i < return_value8_1; ++i) {
- if (ovector16_1[i] >= 0)
- ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
- if (ovector16_2[i] >= 0)
- ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+ if (current->flags & PCRE_UTF8) {
+ for (i = 0; i < return_value8_1; ++i) {
+ if (ovector16_1[i] >= 0)
+ ovector16_1[i] = regtest_offsetmap[ovector16_1[i]];
+ if (ovector16_2[i] >= 0)
+ ovector16_2[i] = regtest_offsetmap[ovector16_2[i]];
+ }
}
for (i = 0; i < return_value8_1; ++i)
Modified: code/branches/pcre16/testdata/testoutput10
===================================================================
--- code/branches/pcre16/testdata/testoutput10 2011-12-06 15:38:01 UTC (rev 788)
+++ code/branches/pcre16/testdata/testoutput10 2011-12-07 14:36:26 UTC (rev 789)
@@ -194,7 +194,7 @@
------------------------------------------------------------------
/a(?P<name1>b|c)d(?P<longername2>e)/BM
-Memory allocation (code space): 42
+Memory allocation (code space): 36
------------------------------------------------------------------
0 32 Bra
3 a
@@ -212,7 +212,7 @@
------------------------------------------------------------------
/(?:a(?P<c>c(?P<d>d)))(?P<a>a)/BM
-Memory allocation (code space): 54
+Memory allocation (code space): 45
------------------------------------------------------------------
0 41 Bra
3 25 Bra
@@ -232,7 +232,7 @@
------------------------------------------------------------------
/(?P<a>a)...(?P=a)bbb(?P>a)d/BM
-Memory allocation (code space): 37
+Memory allocation (code space): 34
------------------------------------------------------------------
0 30 Bra
3 7 CBra 1