[Pcre-svn] [757] code/branches/pcre16: More 16-bit patches

Startseite
Nachricht löschen
Autor: Subversion repository
Datum:  
To: pcre-svn
Betreff: [Pcre-svn] [757] code/branches/pcre16: More 16-bit patches
Revision: 757
          http://vcs.pcre.org/viewvc?view=rev&revision=757
Author:   ph10
Date:     2011-11-21 11:44:55 +0000 (Mon, 21 Nov 2011)


Log Message:
-----------
More 16-bit patches

Modified Paths:
--------------
    code/branches/pcre16/Makefile.am
    code/branches/pcre16/configure.ac
    code/branches/pcre16/libpcre.pc.in
    code/branches/pcre16/pcre.h.in
    code/branches/pcre16/pcre_compile.c
    code/branches/pcre16/pcre_internal.h


Added Paths:
-----------
    code/branches/pcre16/libpcre16.pc.in
    code/branches/pcre16/pcre16_compile.c
    code/branches/pcre16/pcre16_convert_utf16.c
    code/branches/pcre16/pcre16_valid_utf16.c


Property Changed:
----------------
    code/branches/pcre16/



Property changes on: code/branches/pcre16
___________________________________________________________________
Name: svn:ignore
- .deps
.libs
CMakeCache.txt
CMakeFiles
DartTestfile.txt
INSTALL
Makefile
Makefile.in
Testing
aclocal.m4
autom4te.cache
cmake_install.cmake
config.guess
config.h
config.h.generic
config.h.in
config.log
config.status
config.sub
configure
depcomp
dftables
install-sh
libpcre.pc
libpcre.so
libpcrecpp.pc
libpcrecpp.so
libpcreposix.pc
libpcreposix.so
libtool
ltmain.sh
m4
missing
pcre.h
pcre.h.generic
pcre_chartables.c
pcre-config
pcre_jit_test
pcre_scanner_unittest
pcre_stringpiece.h
pcre_stringpiece_unittest
pcrecpparg.h
pcrecpp_unittest
pcredemo
pcregrep
pcretest
progress.make
stamp-h1
test3input
test3output
testNinput
testsavedregex
teststderr
teststdout
testtry

+ .deps
.libs
CMakeCache.txt
CMakeFiles
DartTestfile.txt
INSTALL
Makefile
Makefile.in
Testing
aclocal.m4
autom4te.cache
cmake_install.cmake
config.guess
config.h
config.h.generic
config.h.in
config.log
config.status
config.sub
configure
depcomp
dftables
install-sh
libpcre.pc
libpcre16.pc
libpcre.so
libpcrecpp.pc
libpcrecpp.so
libpcreposix.pc
libpcreposix.so
libtool
ltmain.sh
m4
missing
pcre.h
pcre.h.generic
pcre_chartables.c
pcre-config
pcre_jit_test
pcre_scanner_unittest
pcre_stringpiece.h
pcre_stringpiece_unittest
pcrecpparg.h
pcrecpp_unittest
pcredemo
pcregrep
pcretest
progress.make
stamp-h1
test3input
test3output
testNinput
testsavedregex
teststderr
teststdout
testtry


Modified: code/branches/pcre16/Makefile.am
===================================================================
--- code/branches/pcre16/Makefile.am    2011-11-21 10:48:42 UTC (rev 756)
+++ code/branches/pcre16/Makefile.am    2011-11-21 11:44:55 UTC (rev 757)
@@ -79,7 +79,7 @@
 dist_noinst_SCRIPTS =


# Some of the binaries we make are to be installed, and others are
-# (non-user-visible) helper programs needed to build libpcre.
+# (non-user-visible) helper programs needed to build libpcre or libpcre16.
bin_PROGRAMS =
noinst_PROGRAMS =

@@ -170,6 +170,9 @@


## The main pcre library
+
+# Build the 8 bit library if it is enabled.
+if WITH_PCRE8
lib_LTLIBRARIES += libpcre.la
libpcre_la_SOURCES = \
pcre_compile.c \
@@ -199,6 +202,22 @@
nodist_libpcre_la_SOURCES = \
pcre_chartables.c

+endif # WITH_PCRE8
+
+# Build the 16 bit library if it is enabled.
+if WITH_PCRE16
+lib_LTLIBRARIES += libpcre16.la
+libpcre16_la_SOURCES = \
+ pcre16_compile.c \
+ pcre16_convert_utf16.c \
+ pcre16_valid_utf16.c
+
+## This file is generated as part of the building process, so don't distribute.
+nodist_libpcre16_la_SOURCES = \
+ pcre_chartables.c
+
+endif # WITH_PCRE16
+
# The pcre_printint.src file is #included by some source files, so it must be
# distributed. The pcre_chartables.c.dist file is the default version of
# pcre_chartables.c, used unless --enable-rebuild-chartables is specified.
@@ -224,7 +243,12 @@
sljit/sljitNativeX86_common.c \
sljit/sljitUtils.c

+if WITH_PCRE8
libpcre_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)
+endif # WITH_PCRE8
+if WITH_PCRE16
+libpcre16_la_LDFLAGS = $(EXTRA_LIBPCRE_LDFLAGS)
+endif # WITH_PCRE16

CLEANFILES += pcre_chartables.c

@@ -233,15 +257,23 @@
TESTS += pcre_jit_test
noinst_PROGRAMS += pcre_jit_test
pcre_jit_test_SOURCES = pcre_jit_test.c
-pcre_jit_test_LDADD = libpcre.la
+pcre_jit_test_LDADD =
+if WITH_PCRE8
+pcre_jit_test_LDADD += libpcre.la
+endif # WITH_PCRE8
+if WITH_PCRE16
+pcre_jit_test_LDADD += libpcre16.la
+endif # WITH_PCRE16
endif # WITH_JIT

## A version of the main pcre library that has a posix re API.
+if WITH_PCRE8
lib_LTLIBRARIES += libpcreposix.la
libpcreposix_la_SOURCES = \
pcreposix.c
libpcreposix_la_LDFLAGS = $(EXTRA_LIBPCREPOSIX_LDFLAGS)
libpcreposix_la_LIBADD = libpcre.la
+endif # WITH_PCRE8

## There's a C++ library as well.
if WITH_PCRE_CPP
@@ -282,13 +314,19 @@
EXTRA_DIST += RunTest.bat
bin_PROGRAMS += pcretest
pcretest_SOURCES = pcretest.c
-pcretest_LDADD = libpcreposix.la $(LIBREADLINE)
+pcretest_LDADD = $(LIBREADLINE)
+if WITH_PCRE8
+pcretest_LDADD += libpcreposix.la
+endif # WITH_PCRE8

TESTS += RunGrepTest
dist_noinst_SCRIPTS += RunGrepTest
bin_PROGRAMS += pcregrep
pcregrep_SOURCES = pcregrep.c
-pcregrep_LDADD = libpcreposix.la $(LIBZ) $(LIBBZ2)
+pcregrep_LDADD = $(LIBZ) $(LIBBZ2)
+if WITH_PCRE8
+pcregrep_LDADD += libpcreposix.la
+endif # WITH_PCRE8

EXTRA_DIST += \
testdata/grepinput \
@@ -378,6 +416,9 @@
# We have .pc files for pkg-config users.
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpcre.pc libpcreposix.pc
+if WITH_PCRE16
+pkgconfig_DATA += libpcre16.pc
+endif
if WITH_PCRE_CPP
pkgconfig_DATA += libpcrecpp.pc
endif

Modified: code/branches/pcre16/configure.ac
===================================================================
--- code/branches/pcre16/configure.ac    2011-11-21 10:48:42 UTC (rev 756)
+++ code/branches/pcre16/configure.ac    2011-11-21 11:44:55 UTC (rev 757)
@@ -104,6 +104,18 @@
   htmldir='${docdir}/html'
 fi


+# Handle --disable-pcre8 (enabled by default)
+AC_ARG_ENABLE(pcre8,
+              AS_HELP_STRING([--disable-pcre8],
+                             [enable 8 bit character support]),
+              , enable_pcre8=unset)
+
+# Handle --enable-pcre16 (disabled by default)
+AC_ARG_ENABLE(pcre16,
+              AS_HELP_STRING([--enable-pcre16],
+                             [enable 16 bit character support]),
+              , enable_pcre16=unset)
+
 # Handle --disable-cpp. The substitution of enable_cpp is needed for use in
 # pcre-config.
 AC_ARG_ENABLE(cpp,
@@ -136,10 +148,16 @@
                              [enable UTF-8 support (incompatible with --enable-ebcdic)]),
               , enable_utf8=unset)


+# Handle --enable-utf16 (disabled by default)
+AC_ARG_ENABLE(utf16,
+              AS_HELP_STRING([--enable-utf16],
+                             [enable UTF-16 support (incompatible with --enable-ebcdic)]),
+              , enable_utf16=unset)
+
 # Handle --enable-unicode-properties
 AC_ARG_ENABLE(unicode-properties,
               AS_HELP_STRING([--enable-unicode-properties],
-                             [enable Unicode properties support (implies --enable-utf8)]),
+                             [enable Unicode properties support (implies --enable-utf8 and --enable-utf16)]),
               , enable_unicode_properties=no)


 # Handle --enable-newline=NL
@@ -245,8 +263,46 @@
                            [default limit on internal recursion (default=MATCH_LIMIT)]),
             , with_match_limit_recursion=MATCH_LIMIT)


-# Make sure that if enable_unicode_properties was set, that UTF-8 support
-# is enabled.
+# Make sure that if enable_utf8 was set, that enable_pcre8 support is enabled
+if test "x$enable_utf8" = "xyes"
+then
+  if test "x$enable_pcre8" = "xno"
+  then
+    AC_MSG_ERROR([support for UTF-8 requires pcre library with 8 bit characters])
+  fi
+  enable_pcre8=yes
+fi
+
+# Make sure that if enable_utf16 was set, that enable_pcre16 support is enabled
+if test "x$enable_utf16" = "xyes"
+then
+  if test "x$enable_pcre16" = "xno"
+  then
+    AC_MSG_ERROR([support for UTF-16 requires pcre library with 16 bit characters])
+  fi
+  enable_pcre16=yes
+fi
+
+# Set the default value for pcre8
+if test "x$enable_pcre8" = "xunset"
+then
+  enable_pcre8=yes
+fi
+
+# Set the default value for pcre16
+if test "x$enable_pcre16" = "xunset"
+then
+  enable_pcre16=no
+fi
+
+# Make sure enable_pcre8 or enable_pcre16 was set
+if test "x$enable_pcre8$enable_pcre16" = "xnono"
+then
+  AC_MSG_ERROR([Either 8 or 16 bit (or both) pcre library must be enabled])
+fi
+
+# Make sure that if enable_unicode_properties was set, that UTF-8 or UTF-16
+# support is enabled.
 #
 if test "x$enable_unicode_properties" = "xyes"
 then
@@ -254,17 +310,44 @@
   then
     AC_MSG_ERROR([support for Unicode properties requires UTF-8 support])
   fi
-  enable_utf8=yes
+  if test "x$enable_utf16" = "xno"
+  then
+    AC_MSG_ERROR([support for Unicode properties requires UTF-16 support])
+  fi
+  if test "x$enable_pcre8" = "xyes"
+  then
+    enable_utf8=yes
+  fi
+  if test "x$enable_pcre16" = "xyes"
+  then
+    enable_utf16=yes
+  fi
 fi


+# enable_utf8 is disabled by default.
if test "x$enable_utf8" = "xunset"
then
enable_utf8=no
fi

+# enable_utf16 is disabled by default.
+if test "x$enable_utf16" = "xunset"
+then
+  enable_utf16=no
+fi
+
+# Make sure that if enable_cpp was set, that enable_pcre8 support is enabled
+if test "x$enable_cpp" = "xyes"
+then
+  if test "x$enable_pcre8" = "xno"
+  then
+    AC_MSG_ERROR([C++ library requires pcre library with 8 bit characters])
+  fi
+fi
+
 # Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
-# Also check that UTF-8 support is not requested, because PCRE cannot handle
-# EBCDIC and UTF-8 in the same build. To do so it would need to use different
+# Also check that UTF-8 or UTF-16 support is not requested, because PCRE cannot
+# handle EBCDIC and UTF in the same build. To do so it would need to use different
 # character constants depending on the mode.
 #
 if test "x$enable_ebcdic" = "xyes"
@@ -274,6 +357,10 @@
   then
     AC_MSG_ERROR([support for EBCDIC and UTF-8 cannot be enabled at the same time])
   fi
+  if test "x$enable_utf16" = "xyes"
+  then
+    AC_MSG_ERROR([support for EBCDIC and UTF-16 cannot be enabled at the same time])
+  fi
 fi


# Convert the newline identifier into the appropriate integer value.
@@ -410,10 +497,13 @@
AC_SUBST(pcre_have_bits_type_traits)

# Conditional compilation
+AM_CONDITIONAL(WITH_PCRE8, test "x$enable_pcre8" = "xyes")
+AM_CONDITIONAL(WITH_PCRE16, test "x$enable_pcre16" = "xyes")
AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes")
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes")
AM_CONDITIONAL(WITH_UTF8, test "x$enable_utf8" = "xyes")
+AM_CONDITIONAL(WITH_UTF16, test "x$enable_utf16" = "xyes")

# Checks for typedefs, structures, and compiler characteristics.

@@ -482,6 +572,16 @@

# Here is where pcre specific defines are handled

+if test "$enable_pcre8" = "yes"; then
+  AC_DEFINE([SUPPORT_PCRE8], [], [
+    Define to enable the 8 bit PCRE library.])
+fi
+
+if test "$enable_pcre16" = "yes"; then
+  AC_DEFINE([SUPPORT_PCRE16], [], [
+    Define to enable the 16 bit PCRE library.])
+fi
+
 if test "$enable_jit" = "yes"; then
   AC_DEFINE([SUPPORT_JIT], [], [
     Define to enable support for Just-In-Time compiling.])
@@ -502,6 +602,14 @@
     *or* ASCII/UTF-8, but not both at once.])
 fi


+if test "$enable_utf16" = "yes"; then
+  AC_DEFINE([SUPPORT_UTF16], [], [
+    Define to enable support for the UTF-16 Unicode encoding. This will
+    work even in an EBCDIC environment, but it is incompatible with
+    the EBCDIC macro. That is, PCRE can support *either* EBCDIC code
+    *or* ASCII/UTF-16, but not both at once.])
+fi
+
 if test "$enable_unicode_properties" = "yes"; then
   AC_DEFINE([SUPPORT_UCP], [], [
     Define to enable support for Unicode properties.])
@@ -720,7 +828,8 @@
 AC_CONFIG_FILES(
     Makefile
     libpcre.pc
-        libpcreposix.pc
+    libpcre16.pc
+    libpcreposix.pc
     libpcrecpp.pc
     pcre-config
     pcre.h
@@ -756,9 +865,12 @@
     Linker flags .................... : ${LDFLAGS}
     Extra libraries ................. : ${LIBS}


+    Build 8 bit pcre library ........ : ${enable_pcre8}
+    Build 16 bit pcre library ....... : ${enable_pcre16}
     Build C++ library ............... : ${enable_cpp}
     Enable JIT compiling support .... : ${enable_jit}
     Enable UTF-8 support ............ : ${enable_utf8}
+    Enable UTF-16 support ........... : ${enable_utf16}
     Unicode properties .............. : ${enable_unicode_properties}
     Newline char/sequence ........... : ${enable_newline}
     \R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}


Modified: code/branches/pcre16/libpcre.pc.in
===================================================================
--- code/branches/pcre16/libpcre.pc.in    2011-11-21 10:48:42 UTC (rev 756)
+++ code/branches/pcre16/libpcre.pc.in    2011-11-21 11:44:55 UTC (rev 757)
@@ -6,7 +6,7 @@
 includedir=@includedir@


Name: libpcre
-Description: PCRE - Perl compatible regular expressions C library
+Description: PCRE - Perl compatible regular expressions C library with 8 bit character support
Version: @PACKAGE_VERSION@
Libs: -L${libdir} -lpcre
Cflags: -I${includedir} @PCRE_STATIC_CFLAG@

Added: code/branches/pcre16/libpcre16.pc.in
===================================================================
--- code/branches/pcre16/libpcre16.pc.in                            (rev 0)
+++ code/branches/pcre16/libpcre16.pc.in    2011-11-21 11:44:55 UTC (rev 757)
@@ -0,0 +1,12 @@
+# Package Information for pkg-config
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libpcre16
+Description: PCRE - Perl compatible regular expressions C library with 16 bit character support
+Version: @PACKAGE_VERSION@
+Libs: -L${libdir} -lpcre
+Cflags: -I${includedir} @PCRE_STATIC_CFLAG@


Modified: code/branches/pcre16/pcre.h.in
===================================================================
--- code/branches/pcre16/pcre.h.in    2011-11-21 10:48:42 UTC (rev 756)
+++ code/branches/pcre16/pcre.h.in    2011-11-21 11:44:55 UTC (rev 757)
@@ -111,7 +111,8 @@
 #define PCRE_NOTEOL             0x00000100  /* Exec, DFA exec */
 #define PCRE_UNGREEDY           0x00000200  /* Compile */
 #define PCRE_NOTEMPTY           0x00000400  /* Exec, DFA exec */
-#define PCRE_UTF8               0x00000800  /* Compile */
+#define PCRE_UTF8               0x00000800  /* Compile (Same as PCRE_UTF16) */
+#define PCRE_UTF16              0x00000800  /* Compile (Same as PCRE_UTF8) */
 #define PCRE_NO_AUTO_CAPTURE    0x00001000  /* Compile */
 #define PCRE_NO_UTF8_CHECK      0x00002000  /* Compile, exec, DFA exec */
 #define PCRE_AUTO_CALLOUT       0x00004000  /* Compile */
@@ -191,6 +192,14 @@
 #define PCRE_UTF8_ERR20             20
 #define PCRE_UTF8_ERR21             21


+/* Specific error codes for UTF-16 validity checks */
+
+#define PCRE_UTF16_ERR0              0
+#define PCRE_UTF16_ERR1              1
+#define PCRE_UTF16_ERR2              2
+#define PCRE_UTF16_ERR3              3
+#define PCRE_UTF16_ERR4              4
+
 /* Request types for pcre_fullinfo() */


 #define PCRE_INFO_OPTIONS            0
@@ -250,6 +259,17 @@
 struct real_pcre_jit_stack;       /* declaration; the definition is private  */
 typedef struct real_pcre_jit_stack pcre_jit_stack;


+/* If PCRE is compiled with 16 bit character support, PCRE_SCHAR16 must contain
+a 16 bit wide signed data type. Otherwise it can be a dummy data type since
+pcre16 functions are not implemented. There is a check for this in pcre_internal.h. */
+#ifndef PCRE_SCHAR16
+#define PCRE_SCHAR16 short
+#endif
+
+#ifndef PCRE_SPTR16
+#define PCRE_SPTR16 const PCRE_SCHAR16 *
+#endif
+
/* When PCRE is compiled as a C++ library, the subject pointer type can be
replaced with a custom type. For conventional use, the public interface is a
const char *. */
@@ -326,8 +346,12 @@

 PCRE_EXP_DECL pcre *pcre_compile(const char *, int, const char **, int *,
                   const unsigned char *);
+PCRE_EXP_DECL pcre *pcre16_compile(PCRE_SPTR16, int, const char **, int *,
+                  const unsigned char *);
 PCRE_EXP_DECL pcre *pcre_compile2(const char *, int, int *, const char **,
                   int *, const unsigned char *);
+PCRE_EXP_DECL pcre *pcre16_compile2(PCRE_SPTR16, int, int *, const short **,
+                  int *, const unsigned char *);
 PCRE_EXP_DECL int  pcre_config(int, void *);
 PCRE_EXP_DECL int  pcre_copy_named_substring(const pcre *, const char *,
                   int *, int, const char *, char *, int);
@@ -353,6 +377,8 @@
 PCRE_EXP_DECL int  pcre_info(const pcre *, int *, int *);
 PCRE_EXP_DECL const unsigned char *pcre_maketables(void);
 PCRE_EXP_DECL int  pcre_refcount(pcre *, int);
+PCRE_EXP_DECL int  pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *,
+                  PCRE_SPTR16, int, int);
 PCRE_EXP_DECL pcre_extra *pcre_study(const pcre *, int, const char **);
 PCRE_EXP_DECL void pcre_free_study(pcre_extra *);
 PCRE_EXP_DECL const char *pcre_version(void);


Added: code/branches/pcre16/pcre16_compile.c
===================================================================
--- code/branches/pcre16/pcre16_compile.c                            (rev 0)
+++ code/branches/pcre16/pcre16_compile.c    2011-11-21 11:44:55 UTC (rev 757)
@@ -0,0 +1,45 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2011 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* Generate code with 16 bit character support. */
+#define COMPILE_PCRE16
+
+#include "pcre_compile.c"
+
+/* End of pcre16_compile.c */


Added: code/branches/pcre16/pcre16_convert_utf16.c
===================================================================
--- code/branches/pcre16/pcre16_convert_utf16.c                            (rev 0)
+++ code/branches/pcre16/pcre16_convert_utf16.c    2011-11-21 11:44:55 UTC (rev 757)
@@ -0,0 +1,87 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2009 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains a function for converting any UTF-16 character
+strings to host byte order. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+int
+pcre16_utf16_to_host_byte_order(PCRE_SCHAR16 *output, PCRE_SPTR16 input, int length, int keep_boms)
+{
+#ifdef SUPPORT_UTF16
+/* This function converts any UTF-16 string to host byte order and optionally removes
+any Byte Order Marks (BOMs). Returns the remaining length. */
+BOOL same_bo = TRUE;
+PCRE_SPTR16 end = input + length;
+/* The c variable must be unsigned. */
+register uschar c;
+
+while (input < end)
+  {
+  c = *input++;
+  if (c == 0xfeff || c == 0xfffe)
+    {
+    /* Detecting the byte order of the machine is unnecessary; it is
+    enough to know whether the UTF-16 string has the same byte order or not. */
+    same_bo = c == 0xfeff;
+    if (keep_boms != 0)
+      *output++ = 0xfeff;
+    else
+      length--;
+    }
+  else
+    *output++ = same_bo ? c : ((c >> 8) | (c << 8)); /* Flip bytes if needed. */
+  }
+
+#else
+(void)(output);  /* Keep picky compilers happy */
+(void)(input);
+(void)(keep_boms);
+#endif
+return length;
+}
+
+/* End of pcre16_convert_utf16.c */


Added: code/branches/pcre16/pcre16_valid_utf16.c
===================================================================
--- code/branches/pcre16/pcre16_valid_utf16.c                            (rev 0)
+++ code/branches/pcre16/pcre16_valid_utf16.c    2011-11-21 11:44:55 UTC (rev 757)
@@ -0,0 +1,143 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+           Copyright (c) 1997-2009 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+
+/* This module contains an internal function for validating UTF-16 character
+strings. */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre_internal.h"
+
+
+/*************************************************
+*         Validate a UTF-16 string                *
+*************************************************/
+
+/* This function is called (optionally) at the start of compile or match, to
+check that a supposed UTF-16 string is actually valid. The early check means
+that subsequent code can assume it is dealing with a valid string. The check
+can be turned off for maximum performance, but the consequences of supplying an
+invalid string are then undefined.
+
+From release 8.21 more information about the details of the error is passed
+back in the returned value:
+
+PCRE_UTF16_ERR0  No error
+PCRE_UTF16_ERR1  Missing low surrogate at the end of the string
+PCRE_UTF16_ERR2  Invalid low surrogate
+PCRE_UTF16_ERR3  Isolated low surrogate
+PCRE_UTF16_ERR4  Not allowed character.
+
+Arguments:
+  string       points to the string
+  length       length of string, or -1 if the string is zero-terminated
+  erroroffset  pointer to an error position offset variable
+
+Returns:       = 0    if the string is a valid UTF-16 string
+               > 0    otherwise, setting the offset of the bad character
+*/
+
+int
+_pcre16_valid_utf16(USPTR string, int length, int *erroroffset)
+{
+#ifdef SUPPORT_UTF16
+register USPTR p;
+register uschar c;
+
+if (length < 0)
+  {
+  for (p = string; *p != 0; p++);
+  length = p - string;
+  }
+
+for (p = string; length-- > 0; p++)
+  {
+  c = *p;
+
+  if ((c & 0xf800) != 0xd800)
+    {
+    /* Normal UTF-16 code point. Neither high nor low surrogate. */
+
+    /* This is probably a BOM from a different byte-order.
+    Regardless, the string is rejected. */
+    if (c == 0xfffe)
+      {
+      *erroroffset = p - string;
+      return PCRE_UTF16_ERR4;
+      }
+    }
+  else if ((c & 0x0400) == 0)
+    {
+    /* High surrogate. */
+
+    /* Must be followed by a low surrogate. */
+    if (length == 0)
+      {
+      *erroroffset = p - string;
+      return PCRE_UTF16_ERR1;
+      }
+    p++;
+    length--;
+    if ((*p & 0xfc00) != 0xdc00)
+      {
+      *erroroffset = p - string;
+      return PCRE_UTF16_ERR2;
+      }
+    }
+  else
+    {
+    /* Isolated low surrogate. Always an error. */
+    *erroroffset = p - string;
+    return PCRE_UTF16_ERR3;
+    }
+  }
+
+#else  /* SUPPORT_UTF16 */
+(void)(string);  /* Keep picky compilers happy */
+(void)(length);
+#endif
+
+return PCRE_UTF16_ERR0;   /* This indicates success */
+}
+
+/* End of pcre16_valid_utf16.c */


Modified: code/branches/pcre16/pcre_compile.c
===================================================================
--- code/branches/pcre16/pcre_compile.c    2011-11-21 10:48:42 UTC (rev 756)
+++ code/branches/pcre16/pcre_compile.c    2011-11-21 11:44:55 UTC (rev 757)
@@ -7213,17 +7213,33 @@
                 with errorptr and erroroffset set
 */


+#ifndef COMPILE_PCRE16
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
+#else
+PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
+pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
+ int *erroroffset, const unsigned char *tables)
+#endif
{
+#ifndef COMPILE_PCRE16
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
+#else
+return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
+#endif
}


+#ifndef COMPILE_PCRE16
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
const char **errorptr, int *erroroffset, const unsigned char *tables)
+#else
+PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
+pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
+ const char **errorptr, int *erroroffset, const unsigned char *tables)
+#endif
{
real_pcre *re;
int length = 1; /* For final END opcode */

Modified: code/branches/pcre16/pcre_internal.h
===================================================================
--- code/branches/pcre16/pcre_internal.h    2011-11-21 10:48:42 UTC (rev 756)
+++ code/branches/pcre16/pcre_internal.h    2011-11-21 11:44:55 UTC (rev 757)
@@ -51,11 +51,11 @@
 #define PCRE_DEBUG
 #endif


-/* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
+/* We do not support both EBCDIC and UTF-8/16 at the same time. The "configure"
script prevents both being selected, but not everybody uses "configure". */

-#if defined EBCDIC && defined SUPPORT_UTF8
-#error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
+#if defined EBCDIC && (defined SUPPORT_UTF8 || defined SUPPORT_UTF16)
+#error The use of both EBCDIC and SUPPORT_UTF8/16 is not supported.
#endif

/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
@@ -208,11 +208,26 @@

/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
-However, we leave the interface to the outside world as char *, because that
-should make things easier for callers. */
+However, we leave the interface to the outside world as char * or short *,
+because that should make things easier for callers. We define a short type
+for the current character representation (either 8 or 16 bit) to save lots
+of typing. I tried "uchar", but it causes problems on Digital Unix, where
+it is defined in sys/types, so use "uschar" instead. */

+#ifndef COMPILE_PCRE16
typedef unsigned char pcre_uchar;
+#else
+#if USHRT_MAX != 65535
+/* This is a warning message. Change PCRE_SCHAR16 to a 16 bit data type in
+pcre.h(.in) and disable (comment out) this message. */
+#error Warning: PCRE_SCHAR16 is not a 16 bit data type.
+#endif
+typedef pcre_uint16 uschar;
+#endif

+/* An 8 bit unsigned data type. */
+typedef unsigned char pcre_uint8;
+
/* This is an unsigned int value that no character can ever have. UTF-8
characters only go up to 0x7fffffff (though Unicode doesn't go beyond
0x0010ffff). */
@@ -270,10 +285,11 @@
#define PCRE_PUCHAR CUSTOM_SUBJECT_PTR
#else
#define PCRE_PUCHAR const pcre_uchar *
+
+/* PCRE_SPTR is defined in pcre.h. */
+#define USPTR const uschar *
#endif

-
-
/* Include the public PCRE header and the definitions of UCP character property
values. */

@@ -1936,7 +1952,11 @@
 extern int               _pcre_ord2utf8(int, pcre_uint8 *);
 extern real_pcre        *_pcre_try_flipped(const real_pcre *, real_pcre *,
                            const pcre_study_data *, pcre_study_data *);
+#ifndef COMPILE_PCRE16
 extern int               _pcre_valid_utf8(PCRE_PUCHAR, int, int *);
+#else
+extern int               _pcre16_valid_utf16(PCRE_PUCHAR, int, int *);
+#endif
 extern BOOL              _pcre_was_newline(PCRE_PUCHAR, int, PCRE_PUCHAR,
                            int *, BOOL);
 extern BOOL              _pcre_xclass(int, const pcre_uchar *);