Revision: 785
http://vcs.pcre.org/viewvc?view=rev&revision=785
Author: zherczeg
Date: 2011-12-05 20:12:24 +0000 (Mon, 05 Dec 2011)
Log Message:
-----------
Improving UTF-16 support by fixing a lot of issues.
Modified Paths:
--------------
code/branches/pcre16/Makefile.am
code/branches/pcre16/pcre.h.in
code/branches/pcre16/pcre_compile.c
code/branches/pcre16/pcre_dfa_exec.c
code/branches/pcre16/pcre_exec.c
code/branches/pcre16/pcre_fullinfo.c
code/branches/pcre16/pcre_info.c
code/branches/pcre16/pcre_internal.h
code/branches/pcre16/pcre_jit_compile.c
code/branches/pcre16/pcre_newline.c
code/branches/pcre16/pcre_printint.src
code/branches/pcre16/pcre_study.c
code/branches/pcre16/pcre_version.c
Added Paths:
-----------
code/branches/pcre16/pcre16_fullinfo.c
code/branches/pcre16/pcre16_info.c
code/branches/pcre16/pcre16_version.c
Modified: code/branches/pcre16/Makefile.am
===================================================================
--- code/branches/pcre16/Makefile.am 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/Makefile.am 2011-12-05 20:12:24 UTC (rev 785)
@@ -212,6 +212,8 @@
pcre16_chartables.c \
pcre16_compile.c \
pcre16_exec.c \
+ pcre16_fullinfo.c \
+ pcre16_info.c \
pcre16_jit_compile.c \
pcre16_newline.c \
pcre16_ord2utf16.c \
@@ -222,6 +224,7 @@
pcre16_ucd.c \
pcre16_utf16_utils.c \
pcre16_valid_utf16.c \
+ pcre16_version.c \
pcre16_xclass.c
## This file is generated as part of the building process, so don't distribute.
Modified: code/branches/pcre16/pcre.h.in
===================================================================
--- code/branches/pcre16/pcre.h.in 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre.h.in 2011-12-05 20:12:24 UTC (rev 785)
@@ -367,6 +367,8 @@
PCRE_EXP_DECL void pcre_free_substring_list(const char **);
PCRE_EXP_DECL int pcre_fullinfo(const pcre *, const pcre_extra *, int,
void *);
+PCRE_EXP_DECL int pcre16_fullinfo(const pcre *, const pcre_extra *, int,
+ void *);
PCRE_EXP_DECL int pcre_get_named_substring(const pcre *, const char *,
int *, int, const char *, const char **);
PCRE_EXP_DECL int pcre_get_stringnumber(const pcre *, const char *);
@@ -377,16 +379,20 @@
PCRE_EXP_DECL int pcre_get_substring_list(const char *, int *, int,
const char ***);
PCRE_EXP_DECL int pcre_info(const pcre *, int *, int *);
+PCRE_EXP_DECL int pcre16_info(const pcre *, int *, int *);
PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
PCRE_EXP_DECL int pcre_refcount(pcre *, int);
-PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
- PCRE_SPTR16, int, int);
PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
PCRE_EXP_DECL pcre_extra *pcre16_study(const pcre *, int, const char **);
PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
PCRE_EXP_DECL void pcre16_free_study(pcre_extra *);
PCRE_EXP_DECL const char *pcre_version(void);
+PCRE_EXP_DECL const char *pcre16_version(void);
+/* Utility functions. */
+PCRE_EXP_DECL int pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
+ PCRE_SPTR16, int, int);
+
/* JIT compiler related functions. */
PCRE_EXP_DECL pcre_jit_stack *pcre_jit_stack_alloc(int, int);
Added: code/branches/pcre16/pcre16_fullinfo.c
===================================================================
--- code/branches/pcre16/pcre16_fullinfo.c (rev 0)
+++ code/branches/pcre16/pcre16_fullinfo.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_fullinfo.c"
+
+/* End of pcre16_fullinfo.c */
Added: code/branches/pcre16/pcre16_info.c
===================================================================
--- code/branches/pcre16/pcre16_info.c (rev 0)
+++ code/branches/pcre16/pcre16_info.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_info.c"
+
+/* End of pcre16_info.c */
Added: code/branches/pcre16/pcre16_version.c
===================================================================
--- code/branches/pcre16/pcre16_version.c (rev 0)
+++ code/branches/pcre16/pcre16_version.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -0,0 +1,45 @@
+/*************************************************
+* Perl-Compatible Regular Expressions *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+ Written by Philip Hazel
+ Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the University of Cambridge nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_version.c"
+
+/* End of pcre16_version.c */
Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_compile.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -102,6 +102,10 @@
#define REQ_CASELESS 0x10000000l /* Indicates caselessness */
#define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */
+/* Repeated character flags. */
+
+#define UTF_LENGTH 0x10000000l /* The char contains its length. */
+
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
@@ -2896,7 +2900,7 @@
check_auto_possessive(const pcre_uchar *previous, BOOL utf,
const pcre_uchar *ptr, int options, compile_data *cd)
{
-int c, next;
+pcre_int32 c, next;
int op_code = *previous++;
/* Skip whitespace and comments in extended mode */
@@ -2932,15 +2936,13 @@
if (temperrorcode != 0) return FALSE;
ptr++; /* Point after the escape sequence */
}
-
-else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
+else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
{
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) { GETCHARINC(next, ptr); } else
#endif
next = *ptr++;
}
-
else return FALSE;
/* Skip whitespace and comments in extended mode */
@@ -4603,20 +4605,25 @@
/* Deal with UTF characters that take up more than one character. It's
easier to write this out separately than try to macrify it. Use c to
- hold the length of the character in bytes, plus 0x80 to flag that it's a
- length rather than a small character. */
+ hold the length of the character in bytes, plus UTF_LENGTH to flag that
+ it's a length rather than a small character. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
+#ifdef COMPILE_PCRE8
if (utf && (code[-1] & 0x80) != 0)
+#endif /* COMPILE_PCRE8 */
+#ifdef COMPILE_PCRE16
+ if (utf && (code[-1] & 0xfc00) == 0xdc00)
+#endif /* COMPILE_PCRE16 */
{
pcre_uchar *lastchar = code - 1;
BACKCHAR(lastchar);
c = code - lastchar; /* Length of UTF-8 character */
memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
- c |= 0x80; /* Flag c as a length */
+ c |= UTF_LENGTH; /* Flag c as a length */
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Handle the case of a single charater - either with no UTF support, or
with UTF disabled, or for a single character UTF character. */
@@ -4758,14 +4765,14 @@
we have to insert the character for the previous code. For a repeated
Unicode property match, there are two extra bytes that define the
required property. In UTF-8 mode, long characters have their length in
- c, with the 0x80 bit as a flag. */
+ c, with the UTF_LENGTH bit as a flag. */
if (repeat_max < 0)
{
-#ifdef SUPPORT_UTF8
- if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && (c & UTF_LENGTH) != 0)
{
- memcpy(code, utf_chars, c & 7);
+ memcpy(code, utf_chars, IN_UCHARS(c & 7));
code += c & 7;
}
else
@@ -4787,10 +4794,10 @@
else if (repeat_max != repeat_min)
{
-#ifdef SUPPORT_UTF8
- if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && (c & UTF_LENGTH) != 0)
{
- memcpy(code, utf_chars, c & 7);
+ memcpy(code, utf_chars, IN_UCHARS(c & 7));
code += c & 7;
}
else
@@ -4817,10 +4824,10 @@
/* The character or character type itself comes last in all cases. */
-#ifdef SUPPORT_UTF8
- if (utf && c >= 128)
+#ifdef SUPPORT_UTF
+ if (utf && (c & UTF_LENGTH) != 0)
{
- memcpy(code, utf_chars, c & 7);
+ memcpy(code, utf_chars, IN_UCHARS(c & 7));
code += c & 7;
}
else
@@ -6661,9 +6668,7 @@
#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(c))
- {
- INTERNALCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
- }
+ ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
#endif
/* At this point we have the character's bytes in mcbuffer, and the length
@@ -7789,9 +7794,27 @@
re->first_char = firstchar & 0xffff;
#endif
#endif
- if ((firstchar & REQ_CASELESS) != 0 && MAX_255(re->first_char)
- && cd->fcc[re->first_char] != re->first_char)
- re->flags |= PCRE_FCH_CASELESS;
+ if ((firstchar & REQ_CASELESS) != 0)
+ {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ /* We ignore non-ASCII first chars in 8 bit mode. */
+ if (utf)
+ {
+ if (re->first_char < 128)
+ {
+ if (cd->fcc[re->first_char] != re->first_char)
+ re->flags |= PCRE_FCH_CASELESS;
+ }
+ else if ((options & PCRE_UCP) != 0
+ && UCD_OTHERCASE(re->first_char) != re->first_char)
+ re->flags |= PCRE_FCH_CASELESS;
+ }
+ else
+#endif
+ if (MAX_255(re->first_char)
+ && cd->fcc[re->first_char] != re->first_char)
+ re->flags |= PCRE_FCH_CASELESS;
+ }
re->flags |= PCRE_FIRSTSET;
}
@@ -7814,9 +7837,26 @@
re->req_char = reqchar & 0xffff;
#endif
#endif
- if ((reqchar & REQ_CASELESS) != 0 && MAX_255(re->req_char)
- && cd->fcc[re->req_char] != re->req_char)
- re->flags |= PCRE_RCH_CASELESS;
+ if ((reqchar & REQ_CASELESS) != 0)
+ {
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ /* We ignore non-ASCII required chars in 8 bit mode. */
+ if (utf)
+ {
+ if (re->req_char < 128)
+ {
+ if (cd->fcc[re->req_char] != re->req_char)
+ re->flags |= PCRE_RCH_CASELESS;
+ }
+ else if ((options & PCRE_UCP) != 0
+ && UCD_OTHERCASE(re->req_char) != re->req_char)
+ re->flags |= PCRE_RCH_CASELESS;
+ }
+ else
+#endif
+ if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
+ re->flags |= PCRE_RCH_CASELESS;
+ }
re->flags |= PCRE_REQCHSET;
}
Modified: code/branches/pcre16/pcre_dfa_exec.c
===================================================================
--- code/branches/pcre16/pcre_dfa_exec.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_dfa_exec.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -480,7 +480,7 @@
{
if (current_subject <= start_subject) break;
current_subject--;
- INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
+ ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
}
}
else
@@ -3199,7 +3199,13 @@
has_first_char = TRUE;
first_char = first_char2 = re->first_char;
if ((re->flags & PCRE_FCH_CASELESS) != 0)
+ {
first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (first_char > 127 && utf && md->use_ucp)
+ first_char2 = UCD_OTHERCASE(first_char);
+#endif
+ }
}
else
{
@@ -3217,7 +3223,13 @@
has_req_char = TRUE;
req_char = req_char2 = re->req_char;
if ((re->flags & PCRE_RCH_CASELESS) != 0)
+ {
req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (req_char > 127 && utf && md->use_ucp)
+ req_char2 = UCD_OTHERCASE(req_char);
+#endif
+ }
}
/* Call the main matching function, looping for a non-anchored regex after a
@@ -3246,7 +3258,7 @@
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- INTERNALCHAR(t < end_subject, *t, t++);
+ ACROSSCHAR(t < end_subject, *t, t++);
}
}
else
@@ -3290,7 +3302,7 @@
!WAS_NEWLINE(current_subject))
{
current_subject++;
- INTERNALCHAR(current_subject < end_subject, *current_subject,
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
current_subject++);
}
}
@@ -3318,12 +3330,17 @@
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
+#ifndef COMPILE_PCRE8
+ if (c > 255) c = 255;
+#endif
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
current_subject++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ /* In non 8-bit mode, the iteration will stop for
+ characters > 255 at the beginning or not stop at all. */
if (utf)
- INTERNALCHAR(current_subject < end_subject, *current_subject,
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
current_subject++);
#endif
}
@@ -3434,7 +3451,7 @@
#ifdef SUPPORT_UTF
if (utf)
{
- INTERNALCHAR(current_subject < end_subject, *current_subject,
+ ACROSSCHAR(current_subject < end_subject, *current_subject,
current_subject++);
}
#endif
Modified: code/branches/pcre16/pcre_exec.c
===================================================================
--- code/branches/pcre16/pcre_exec.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_exec.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -2069,7 +2069,7 @@
be "non-word" characters. Remember the earliest consulted character for
partial matching. */
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
/* Get status of previous character */
@@ -2190,7 +2190,7 @@
}
eptr++;
#ifdef SUPPORT_UTF
- if (utf) INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
#endif
ecode++;
break;
@@ -3066,7 +3066,7 @@
/* Match a single character, caselessly */
case OP_CHARI:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
length = 1;
@@ -4089,7 +4089,7 @@
}
if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4102,7 +4102,7 @@
MRRETURN(MATCH_NOMATCH);
}
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4301,7 +4301,7 @@
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
MRRETURN(MATCH_NOMATCH);
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -4330,7 +4330,7 @@
if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
MRRETURN(MATCH_NOMATCH);
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@@ -5330,7 +5330,7 @@
}
if (IS_NEWLINE(eptr)) break;
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
@@ -5347,7 +5347,7 @@
}
if (IS_NEWLINE(eptr)) break;
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
break;
@@ -5363,7 +5363,7 @@
break;
}
eptr++;
- INTERNALCHAR(eptr < md->end_subject, *eptr, eptr++);
+ ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
}
else
@@ -6264,7 +6264,13 @@
has_first_char = TRUE;
first_char = first_char2 = re->first_char;
if ((re->flags & PCRE_FCH_CASELESS) != 0)
+ {
first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (first_char > 127 && utf && md->use_ucp)
+ first_char2 = UCD_OTHERCASE(first_char);
+#endif
+ }
}
else
if (!startline && study != NULL &&
@@ -6280,7 +6286,13 @@
has_req_char = TRUE;
req_char = req_char2 = re->req_char;
if ((re->flags & PCRE_RCH_CASELESS) != 0)
+ {
req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (req_char > 127 && utf && md->use_ucp)
+ req_char2 = UCD_OTHERCASE(req_char);
+#endif
+ }
}
@@ -6309,7 +6321,7 @@
while (t < md->end_subject && !IS_NEWLINE(t))
{
t++;
- INTERNALCHAR(t < end_subject, *t, t++);
+ ACROSSCHAR(t < end_subject, *t, t++);
}
}
else
@@ -6351,7 +6363,7 @@
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
start_match++;
- INTERNALCHAR(start_match < end_subject, *start_match,
+ ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
}
}
@@ -6378,17 +6390,18 @@
{
while (start_match < end_subject)
{
-#ifdef COMPILE_PCRE
register unsigned int c = *start_match;
-#else
- register unsigned int c = *start_match & 0xff;
+#ifndef COMPILE_PCRE8
+ if (c > 255) c = 255;
#endif
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
start_match++;
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ /* In non 8-bit mode, the iteration will stop for
+ characters > 255 at the beginning or not stop at all. */
if (utf)
- INTERNALCHAR(start_match < end_subject, *start_match,
+ ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
#endif
}
@@ -6520,7 +6533,7 @@
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF
if (utf)
- INTERNALCHAR(new_start_match < end_subject, *new_start_match,
+ ACROSSCHAR(new_start_match < end_subject, *new_start_match,
new_start_match++);
#endif
break;
Modified: code/branches/pcre16/pcre_fullinfo.c
===================================================================
--- code/branches/pcre16/pcre_fullinfo.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_fullinfo.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -65,9 +65,15 @@
Returns: 0 if data returned, negative on error
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
void *where)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
+ void *where)
+#endif
{
real_pcre internal_re;
pcre_study_data internal_study;
Modified: code/branches/pcre16/pcre_info.c
===================================================================
--- code/branches/pcre16/pcre_info.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_info.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -72,8 +72,13 @@
or negative values on error
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
-pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
+pcre_info(const pcre *argument_re, int *optptr, int *first_char)
+#else
+PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
+pcre16_info(const pcre *argument_re, int *optptr, int *first_char)
+#endif
{
real_pcre internal_re;
const real_pcre *re = (const real_pcre *)argument_re;
@@ -84,8 +89,8 @@
if (re == NULL) return PCRE_ERROR_BADMAGIC;
}
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
-if (first_byte != NULL)
- *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
+if (first_char != NULL)
+ *first_char = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_char :
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
return re->top_bracket;
}
Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_internal.h 2011-12-05 20:12:24 UTC (rev 785)
@@ -542,7 +542,7 @@
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
-/* #define INTERNALCHAR(condition, eptr, action) */
+/* #define ACROSSCHAR(condition, eptr, action) */
#else /* SUPPORT_UTF */
@@ -708,7 +708,7 @@
#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++
/* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
while((condition) && ((eptr) & 0xc0) == 0x80) action
#else /* COMPILE_PCRE8 */
@@ -748,7 +748,7 @@
the pointer. */
#define GETUTF16INC(c, eptr) \
- { c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; eptr++; }
+ { c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; }
/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */
@@ -797,7 +797,7 @@
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
/* Same as above, but it allows a fully customizable form. */
-#define INTERNALCHAR(condition, eptr, action) \
+#define ACROSSCHAR(condition, eptr, action) \
if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
#endif
Modified: code/branches/pcre16/pcre_jit_compile.c
===================================================================
--- code/branches/pcre16/pcre_jit_compile.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_jit_compile.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -300,7 +300,7 @@
#ifdef SUPPORT_UTF8
BOOL utf;
#ifdef SUPPORT_UCP
- BOOL useucp;
+ BOOL use_ucp;
#endif
jump_list *utfreadchar;
#ifdef COMPILE_PCRE8
@@ -390,10 +390,12 @@
#define PRIV_DATA(cc) (common->localptrs[(cc) - common->start])
#ifdef COMPILE_PCRE8
-#define MOV_UCHAR SLJIT_MOV_UB
+#define MOV_UCHAR SLJIT_MOV_UB
+#define MOVU_UCHAR SLJIT_MOVU_UB
#else
#ifdef COMPILE_PCRE16
-#define MOV_UCHAR SLJIT_MOV_UH
+#define MOV_UCHAR SLJIT_MOV_UH
+#define MOVU_UCHAR SLJIT_MOVU_UH
#else
#error Unsupported compiling mode
#endif
@@ -1369,10 +1371,10 @@
if (bit >= (1 << 10))
bit >>= 10;
else
- return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
+ return (bit < 256) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8));
}
#endif /* SUPPORT_UTF16 */
-return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
+return (bit < 256) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8));
#endif /* COMPILE_PCRE16 */
#endif /* COMPILE_PCRE8 */
@@ -1420,7 +1422,7 @@
struct sljit_jump *jump;
#endif
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF
if (common->utf)
{
@@ -1461,7 +1463,7 @@
#else
#ifdef COMPILE_PCRE16
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
- jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
JUMPHERE(jump);
/* Skip low surrogate if necessary. */
@@ -1478,9 +1480,9 @@
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#ifdef COMPILE_PCRE16
-/* The ctypes array contains only 255 values. */
+/* The ctypes array contains only 256 values. */
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
-jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
#endif
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
#ifdef COMPILE_PCRE16
@@ -1542,7 +1544,7 @@
}
else
{
- SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline <= 255);
+ SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
add_jump(compiler, fallbacks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
}
@@ -1660,7 +1662,7 @@
JUMPHERE(jump);
/* Combine two 16 bit characters. */
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
@@ -1818,7 +1820,7 @@
return mainloop;
}
-static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar firstchar, BOOL caseless, BOOL firstline)
+static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline)
{
DEFINE_COMPILER;
struct sljit_label *start;
@@ -1836,22 +1838,28 @@
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-oc = firstchar;
+oc = first_char;
if (caseless)
- oc = TABLE_GET(firstchar, common->fcc, firstchar);
-if (firstchar == oc)
- found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, firstchar);
+ {
+ oc = TABLE_GET(first_char, common->fcc, first_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (first_char > 127 && common->utf && common->use_ucp)
+ oc = UCD_OTHERCASE(first_char);
+#endif
+ }
+if (first_char == oc)
+ found = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, first_char);
else
{
- bit = firstchar ^ oc;
+ bit = first_char ^ oc;
if (ispowerof2(bit))
{
OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit);
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, firstchar | bit);
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit);
}
else
{
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, firstchar);
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
@@ -1912,16 +1920,19 @@
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
- OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, STR_PTR, 0, TMP1, 0);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_GREATER_EQUAL);
+#ifdef COMPILE_PCRE16
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+#endif
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
loop = LABEL();
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, loop);
CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff, loop);
@@ -1952,9 +1963,12 @@
leave = JUMP(SLJIT_JUMP);
JUMPHERE(foundcr);
notfoundnl = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL);
+#ifdef COMPILE_PCRE16
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
JUMPHERE(notfoundnl);
JUMPHERE(leave);
@@ -1972,6 +1986,9 @@
struct sljit_label *start;
struct sljit_jump *leave;
struct sljit_jump *found;
+#ifndef COMPILE_PCRE8
+struct sljit_jump *jump;
+#endif
if (firstline)
{
@@ -1987,7 +2004,9 @@
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
#ifndef COMPILE_PCRE8
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xff);
+jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 255);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 255);
+JUMPHERE(jump);
#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -2028,7 +2047,7 @@
OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0);
}
-static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar reqchar, BOOL caseless, BOOL has_firstchar)
+static SLJIT_INLINE struct sljit_jump *search_requested_char(compiler_common *common, pcre_uchar req_char, BOOL caseless, BOOL has_firstchar)
{
DEFINE_COMPILER;
struct sljit_label *loop;
@@ -2045,34 +2064,40 @@
alreadyfound = CMP(SLJIT_C_LESS, STR_PTR, 0, TMP2, 0);
if (has_firstchar)
- OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
else
OP1(SLJIT_MOV, TMP1, 0, STR_PTR, 0);
loop = LABEL();
notfound = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, STR_END, 0);
-OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), 0);
-oc = reqchar;
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(TMP1), 0);
+oc = req_char;
if (caseless)
- oc = TABLE_GET(reqchar, common->fcc, reqchar);
-if (reqchar == oc)
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+ {
+ oc = TABLE_GET(req_char, common->fcc, req_char);
+#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
+ if (req_char > 127 && common->utf && common->use_ucp)
+ oc = UCD_OTHERCASE(req_char);
+#endif
+ }
+if (req_char == oc)
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
else
{
- bit = reqchar ^ oc;
+ bit = req_char ^ oc;
if (ispowerof2(bit))
{
OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, bit);
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar | bit);
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char | bit);
}
else
{
- found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, reqchar);
+ found = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, req_char);
foundoc = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, oc);
}
}
-OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_JUMP, loop);
JUMPHERE(found);
@@ -2126,7 +2151,7 @@
{
DEFINE_COMPILER;
struct sljit_jump *beginend;
-#ifdef SUPPORT_UTF8
+#if !(defined COMPILE_PCRE8) || defined SUPPORT_UTF
struct sljit_jump *jump;
#endif
@@ -2143,7 +2168,7 @@
/* Testing char type. */
#ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
{
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2160,20 +2185,24 @@
else
#endif
{
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
/* Here LOCALS1 has already been zeroed. */
jump = NULL;
if (common->utf)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
-#endif
+#endif /* COMPILE_PCRE8 */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */);
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, TMP1, 0);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ JUMPHERE(jump);
+#elif defined SUPPORT_UTF
if (jump != NULL)
JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
}
JUMPHERE(beginend);
@@ -2183,7 +2212,7 @@
/* Testing char type. This is a code duplication. */
#ifdef SUPPORT_UCP
-if (common->useucp)
+if (common->use_ucp)
{
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1);
jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_UNDERSCORE);
@@ -2199,8 +2228,12 @@
else
#endif
{
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ /* TMP2 may be destroyed by peek_char. */
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
+ jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
+#elif defined SUPPORT_UTF
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
jump = NULL;
if (common->utf)
jump = CMP(SLJIT_C_GREATER, TMP1, 0, SLJIT_IMM, 255);
@@ -2208,10 +2241,12 @@
OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), common->ctypes);
OP2(SLJIT_LSHR, TMP2, 0, TMP2, 0, SLJIT_IMM, 4 /* ctype_word */);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
-#ifdef SUPPORT_UTF8
+#ifndef COMPILE_PCRE8
+ JUMPHERE(jump);
+#elif defined SUPPORT_UTF
if (jump != NULL)
JUMPHERE(jump);
-#endif
+#endif /* COMPILE_PCRE8 */
}
JUMPHERE(beginend);
@@ -2314,18 +2349,18 @@
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP1(SLJIT_MOV, TMP3, 0, CHAR1, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR2, 0);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_C_NOT_ZERO, label);
JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP1(SLJIT_MOV, CHAR1, 0, TMP3, 0);
OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -2346,20 +2381,30 @@
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, CHAR1, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, CHAR2, 0);
OP1(SLJIT_MOV, LCC_TABLE, 0, SLJIT_IMM, common->lcc);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
-OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
label = LABEL();
-OP1(SLJIT_MOVU_UB, CHAR1, 0, SLJIT_MEM1(TMP1), 1);
-OP1(SLJIT_MOVU_UB, CHAR2, 0, SLJIT_MEM1(STR_PTR), 1);
+OP1(MOVU_UCHAR, CHAR1, 0, SLJIT_MEM1(TMP1), IN_UCHARS(1));
+OP1(MOVU_UCHAR, CHAR2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+#ifndef COMPILE_PCRE8
+jump = CMP(SLJIT_C_GREATER, CHAR1, 0, SLJIT_IMM, 255);
+#endif
OP1(SLJIT_MOV_UB, CHAR1, 0, SLJIT_MEM2(LCC_TABLE, CHAR1), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+jump = CMP(SLJIT_C_GREATER, CHAR2, 0, SLJIT_IMM, 255);
+#endif
OP1(SLJIT_MOV_UB, CHAR2, 0, SLJIT_MEM2(LCC_TABLE, CHAR2), 0);
+#ifndef COMPILE_PCRE8
+JUMPHERE(jump);
+#endif
jump = CMP(SLJIT_C_NOT_EQUAL, CHAR1, 0, CHAR2, 0);
-OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+OP2(SLJIT_SUB | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPTO(SLJIT_C_NOT_ZERO, label);
JUMPHERE(jump);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP1(SLJIT_MOV, LCC_TABLE, 0, TMP3, 0);
OP1(SLJIT_MOV, CHAR1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
OP1(SLJIT_MOV, CHAR2, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1);
@@ -2378,7 +2423,7 @@
/* This function would be ineffective to do in JIT level. */
int c1, c2;
const pcre_uchar *src2 = args->ptr;
-const pcre_uchar *end2 = (pcre_uchar *)args->end;
+const pcre_uchar *end2 = args->end;
while (src1 < end1)
{
@@ -2976,7 +3021,7 @@
{
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff));
JUMPHERE(jump[1]);
JUMPHERE(jump[0]);
@@ -3037,9 +3082,9 @@
read_char(common);
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
jump[1] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
jump[2] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
jump[3] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
check_newlinechar(common, common->bsr_nltype, fallbacks, FALSE);
@@ -3089,36 +3134,37 @@
jump[0] = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
}
else if (common->nltype == NLTYPE_FIXED)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, STR_END, 0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
else
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
jump[1] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP2, 0, STR_END, 0);
jump[2] = JUMP(SLJIT_C_GREATER);
add_jump(compiler, fallbacks, JUMP(SLJIT_C_LESS));
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 1);
+ /* Equal. */
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
jump[3] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL);
add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
JUMPHERE(jump[1]);
if (common->nltype == NLTYPE_ANYCRLF)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, STR_END, 0));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
}
@@ -3158,15 +3204,13 @@
jump[0] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[1]);
- OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, end));
- add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, STR_PTR, 0));
-
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, STR_PTR, 0, STR_END, 0));
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS, TMP2, 0, TMP1, 0));
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), -2);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), -1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
}
@@ -3200,10 +3244,10 @@
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 2);
+ OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, TMP2, 0, STR_END, 0));
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, common->newline & 0xff));
}
@@ -6382,7 +6426,7 @@
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
common->utf = (re->options & PCRE_UTF8) != 0;
#ifdef SUPPORT_UCP
-common->useucp = (re->options & PCRE_UCP) != 0;
+common->use_ucp = (re->options & PCRE_UCP) != 0;
#endif
common->utfreadchar = NULL;
#ifdef COMPILE_PCRE8
Modified: code/branches/pcre16/pcre_newline.c
===================================================================
--- code/branches/pcre16/pcre_newline.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_newline.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -77,7 +77,15 @@
BOOL utf)
{
int c;
-if (utf) { GETCHAR(c, ptr); } else c = *ptr;
+(void)utf;
+#ifdef SUPPORT_UTF
+if (utf)
+ {
+ GETCHAR(c, ptr);
+ }
+else
+#endif /* SUPPORT_UTF */
+ c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@@ -96,9 +104,15 @@
case 0x000c: *lenptr = 1; return TRUE; /* FF */
case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
return TRUE; /* CR */
+#ifdef COMPILE_PCRE8
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
+#else
+ case 0x0085: /* NEL */
+ case 0x2028: /* LS */
+ case 0x2029: *lenptr = 1; return TRUE; /* PS */
+#endif /* COMPILE_PCRE8 */
default: return FALSE;
}
}
@@ -127,17 +141,17 @@
BOOL utf)
{
int c;
+(void)utf;
ptr--;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf)
{
BACKCHAR(ptr);
GETCHAR(c, ptr);
}
-else c = *ptr;
-#else /* no UTF-8 support */
-c = *ptr;
+else
#endif /* SUPPORT_UTF8 */
+ c = *ptr;
if (type == NLTYPE_ANYCRLF) switch(c)
{
@@ -154,9 +168,15 @@
case 0x000b: /* VT */
case 0x000c: /* FF */
case 0x000d: *lenptr = 1; return TRUE; /* CR */
+#ifdef COMPILE_PCRE8
case 0x0085: *lenptr = utf? 2 : 1; return TRUE; /* NEL */
case 0x2028: /* LS */
case 0x2029: *lenptr = 3; return TRUE; /* PS */
+#else
+ case 0x0085: /* NEL */
+ case 0x2028: /* LS */
+ case 0x2029: *lenptr = 1; return TRUE; /* PS */
+#endif /* COMPILE_PCRE8 */
default: return FALSE;
}
}
Modified: code/branches/pcre16/pcre_printint.src
===================================================================
--- code/branches/pcre16/pcre_printint.src 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_printint.src 2011-12-05 20:12:24 UTC (rev 785)
@@ -123,7 +123,9 @@
if (!utf || (c & 0xfc00) != 0xd800)
{
- if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
+ if (PRINTABLE(c)) fprintf(f, "%c", c);
+ else if (c <= 0xff) fprintf(f, "\\x%02x", c);
+ else fprintf(f, "\\x{%x}", c);
return 0;
}
else
Modified: code/branches/pcre16/pcre_study.c
===================================================================
--- code/branches/pcre16/pcre_study.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_study.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -224,7 +224,7 @@
case OP_NOTPOSPLUSI:
branchlength++;
cc += 2;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -245,7 +245,7 @@
case OP_NOTEXACTI:
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -293,7 +293,7 @@
appear, but leave the code, just in case.) */
case OP_ANYBYTE:
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf) return -1;
#endif
branchlength++;
@@ -486,7 +486,7 @@
case OP_NOTPOSQUERYI:
cc += PRIV(OP_lengths)[op];
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -549,9 +549,10 @@
{
unsigned int c = *p;
+#ifdef COMPILE_PCRE8
SET_BIT(c);
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
if (utf && c > 127)
{
GETCHARINC(c, p);
@@ -572,6 +573,33 @@
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
+#endif
+
+#ifdef COMPILE_PCRE16
+if (c > 0xff)
+ c = 0xff;
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ c = UCD_OTHERCASE(c);
+ if (c > 0xff)
+ c = 0xff;
+ SET_BIT(c);
+ }
+#endif
+ return p;
+ }
+#endif
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+#endif
}
@@ -602,7 +630,7 @@
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
@@ -644,7 +672,9 @@
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+#endif
}
@@ -679,7 +709,11 @@
{
register int c;
int yield = SSB_DONE;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
int table_limit = utf? 16:32;
+#else
+int table_limit = 32;
+#endif
#if 0
/* ========================================================================= */
@@ -951,14 +985,23 @@
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -968,12 +1011,21 @@
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0x85);
try_next = FALSE;
break;
@@ -1058,14 +1110,23 @@
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -1074,12 +1135,21 @@
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
+#ifdef SUPPORT_UTF
if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
@@ -1126,13 +1196,16 @@
character with a value > 255. */
case OP_NCLASS:
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (utf)
{
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
}
#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
/* Fall through */
case OP_CLASS:
@@ -1147,7 +1220,7 @@
value is > 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (utf)
{
for (c = 0; c < 16; c++) start_bits[c] |= map[c];
@@ -1161,12 +1234,10 @@
}
}
}
-
- /* In non-UTF-8 mode, the two bit maps are completely compatible. */
-
else
#endif
{
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
for (c = 0; c < 32; c++) start_bits[c] |= map[c];
}
@@ -1342,6 +1413,18 @@
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
+#ifdef PCRE_DEBUG
+ if (bits_set)
+ {
+ pcre_uint8 *ptr = (pcre_uint8 *)start_bits;
+ int i;
+
+ printf("Start bits:\n");
+ for (i = 0; i < 32; i++)
+ printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
+ }
+#endif
+
/* Always set the minlength value in the block, because the JIT compiler
makes use of it. However, don't set the bit unless the length is greater than
zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
Modified: code/branches/pcre16/pcre_version.c
===================================================================
--- code/branches/pcre16/pcre_version.c 2011-12-05 12:33:44 UTC (rev 784)
+++ code/branches/pcre16/pcre_version.c 2011-12-05 20:12:24 UTC (rev 785)
@@ -79,8 +79,13 @@
pre-processor time. This hack uses a standard trick for avoiding calling
the STRING macro with an empty argument when doing the test. */
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
pcre_version(void)
+#else
+PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION
+pcre16_version(void)
+#endif
{
return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :