Revision: 1028
http://vcs.pcre.org/viewvc?view=rev&revision=1028
Author: ph10
Date: 2012-09-06 17:55:38 +0100 (Thu, 06 Sep 2012)
Log Message:
-----------
Set config.h NEWLINE values appropriately for EBCDIC, adding
--enable-ebcdic-nl25 (and CMake equivalent) for the alternate NL encoding.
Modified Paths:
--------------
code/trunk/CMakeLists.txt
code/trunk/NON-AUTOTOOLS-BUILD
code/trunk/config-cmake.h.in
code/trunk/configure.ac
code/trunk/pcre_internal.h
Modified: code/trunk/CMakeLists.txt
===================================================================
--- code/trunk/CMakeLists.txt 2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/CMakeLists.txt 2012-09-06 16:55:38 UTC (rev 1028)
@@ -57,6 +57,7 @@
# 2012-01-17 PH applied Stephen Kelly's patch to parse the version data out
# of the configure.ac file
# 2012-02-26 PH added support for libedit
+# 2012-09-06 PH added support for PCRE_EBCDIC_NL25
PROJECT(PCRE C CXX)
@@ -115,8 +116,11 @@
OPTION(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)." ON)
SET(PCRE_EBCDIC OFF CACHE BOOL
- "Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems)")
+ "Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems.)")
+SET(PCRE_EBCDIC_NL25 OFF CACHE BOOL
+ "Use 0x25 as EBCDIC NL character instead of 0x15; implies EBCDIC.")
+
SET(PCRE_LINK_SIZE "2" CACHE STRING
"Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details.")
@@ -326,8 +330,25 @@
IF(PCRE_EBCDIC)
SET(EBCDIC 1)
+IF(PCRE_NEWLINE STREQUAL "LF")
+ SET(NEWLINE "21")
+ENDIF(PCRE_NEWLINE STREQUAL "LF")
+IF(PCRE_NEWLINE STREQUAL "CRLF")
+ SET(NEWLINE "3349")
+ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
ENDIF(PCRE_EBCDIC)
+IF(PCRE_EBCDIC_NL25)
+ SET(EBCDIC 1)
+ SET(EBCDIC_NL25 1)
+IF(PCRE_NEWLINE STREQUAL "LF")
+ SET(NEWLINE "37")
+ENDIF(PCRE_NEWLINE STREQUAL "LF")
+IF(PCRE_NEWLINE STREQUAL "CRLF")
+ SET(NEWLINE "3365")
+ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
+ENDIF(PCRE_EBCDIC_NL25)
+
IF(PCRE_NO_RECURSE)
SET(NO_RECURSE 1)
ENDIF(PCRE_NO_RECURSE)
@@ -822,6 +843,7 @@
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE_NEWLINE}")
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE_SUPPORT_BSR_ANYCRLF}")
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE_EBCDIC}")
+ MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE_EBCDIC_NL25}")
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE_REBUILD_CHARTABLES}")
MESSAGE(STATUS " No stack recursion .............. : ${PCRE_NO_RECURSE}")
MESSAGE(STATUS " POSIX mem threshold ............. : ${PCRE_POSIX_MALLOC_THRESHOLD}")
Modified: code/trunk/NON-AUTOTOOLS-BUILD
===================================================================
--- code/trunk/NON-AUTOTOOLS-BUILD 2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/NON-AUTOTOOLS-BUILD 2012-09-06 16:55:38 UTC (rev 1028)
@@ -55,11 +55,16 @@
(1) Copy or rename the file config.h.generic as config.h, and edit the macro
settings that it contains to whatever is appropriate for your environment.
- In particular, if you want to force a specific value for newline, you can
- define the NEWLINE macro. When you compile any of the PCRE modules, you
- must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
- in the sources.
+ In particular, you can alter the definition of the NEWLINE macro to
+ specify what character(s) you want to be interpreted as line terminators.
+ In an EBCDIC environment, you MUST change NEWLINE, because its default
+ value is 10, an ASCII LF. The usual EBCDIC newline character is 21 (0x15,
+ NL), though in some cases it may be 37 (0x25).
+
+ When you compile any of the PCRE modules, you must specify -DHAVE_CONFIG_H
+ to your compiler so that config.h is included in the sources.
+
An alternative approach is not to edit config.h, but to use -D on the
compiler command line to make any changes that you need to the
configuration options. In this case -DHAVE_CONFIG_H must not be set.
@@ -588,4 +593,4 @@
==========================
-Last Updated: 31 August 2012
+Last Updated: 04 September 2012
Modified: code/trunk/config-cmake.h.in
===================================================================
--- code/trunk/config-cmake.h.in 2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/config-cmake.h.in 2012-09-06 16:55:38 UTC (rev 1028)
@@ -25,6 +25,7 @@
#cmakedefine SUPPORT_UTF 1
#cmakedefine SUPPORT_UCP 1
#cmakedefine EBCDIC 1
+#cmakedefine EBCDIC_NL25 1
#cmakedefine BSR_ANYCRLF 1
#cmakedefine NO_RECURSE 1
Modified: code/trunk/configure.ac
===================================================================
--- code/trunk/configure.ac 2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/configure.ac 2012-09-06 16:55:38 UTC (rev 1028)
@@ -209,6 +209,12 @@
[assume EBCDIC coding rather than ASCII; incompatible with --enable-utf; use only in (uncommon) EBCDIC environments; it implies --enable-rebuild-chartables]),
, enable_ebcdic=no)
+# Handle --enable-ebcdic-nl25
+AC_ARG_ENABLE(ebcdic-nl25,
+ AS_HELP_STRING([--enable-ebcdic-nl25],
+ [set EBCDIC code for NL to 0x25 instead of 0x15; it implies --enable-ebcdic]),
+ , enable_ebcdic_nl25=no)
+
# Handle --disable-stack-for-recursion
AC_ARG_ENABLE(stack-for-recursion,
AS_HELP_STRING([--disable-stack-for-recursion],
@@ -335,21 +341,10 @@
fi
fi
-# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
-# Also check that UTF support is not requested, because PCRE cannot handle
-# EBCDIC and UTF in the same build. To do so it would need to use different
-# character constants depending on the mode.
-#
-if test "x$enable_ebcdic" = "xyes"
-then
- enable_rebuild_chartables=yes
- if test "x$enable_utf" = "xyes"
- then
- AC_MSG_ERROR([support for EBCDIC and UTF-8/16 cannot be enabled at the same time])
- fi
-fi
+# Convert the newline identifier into the appropriate integer value. The first
+# three are ASCII values 0x0a, 0x0d, and 0x0d0a; if EBCDIC is enabled, the LF
+# and CRLF values are replaced by their EBCDIC equivalents below.
-# Convert the newline identifier into the appropriate integer value.
case "$enable_newline" in
lf) ac_pcre_newline_value=10 ;;
cr) ac_pcre_newline_value=13 ;;
@@ -361,6 +356,37 @@
;;
esac
+# --enable-ebcdic-nl25 implies --enable-ebcdic
+if test "x$enable_ebcdic_nl25" = "xyes"; then
+ enable_ebcdic=yes
+fi
+
+# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled,
+# and the newline value is adjusted appropriately (CR is still 13, but LF is
+# 21 or 37). Also check that UTF support is not requested, because PCRE cannot
+# handle EBCDIC and UTF in the same build. To do so it would need to use
+# different character constants depending on the mode.
+#
+if test "x$enable_ebcdic" = "xyes"; then
+ enable_rebuild_chartables=yes
+
+ if test "x$enable_utf" = "xyes"; then
+ AC_MSG_ERROR([support for EBCDIC and UTF-8/16 cannot be enabled at the same time])
+ fi
+
+ if test "x$enable_ebcdic_nl25" = "xno"; then
+ case "$ac_pcre_newline_value" in
+ 10) ac_pcre_newline_value=21 ;;
+ 3338) ac_pcre_newline_value=3349 ;;
+ esac
+ else
+ case "$ac_pcre_newline_value" in
+ 10) ac_pcre_newline_value=37 ;;
+ 3338) ac_pcre_newline_value=3365 ;;
+ esac
+ fi
+fi
+
# Check argument to --with-link-size
case "$with_link_size" in
2|3|4) ;;
@@ -681,8 +707,12 @@
AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
The value of NEWLINE determines the newline character sequence. On
systems that support it, "configure" can be used to override the
- default, which is 10. The possible values are 10 (LF), 13 (CR),
- 3338 (CRLF), -1 (ANY), or -2 (ANYCRLF).])
+ default, which is LF. In ASCII environments, the value can be 10 (LF),
+ 13 (CR), or 3338 (CRLF); in EBCDIC environments the value can be 21 or 37
+ (LF), 13 (CR), or 3349 or 3365 (CRLF) because there are two alternative
+ codepoints (0x15 and 0x25) that are used as the NL line terminator that is
+ equivalent to ASCII LF. In both ASCII and EBCDIC environments the value can
+ also be -1 (ANY), or -2 (ANYCRLF).])
if test "$enable_bsr_anycrlf" = "yes"; then
AC_DEFINE([BSR_ANYCRLF], [], [
@@ -768,6 +798,13 @@
supports both EBCDIC and UTF-8/16.])
fi
+if test "$enable_ebcdic_nl25" = "yes"; then
+ AC_DEFINE_UNQUOTED([EBCDIC_NL25], [], [
+ In an EBCDIC environment, define this macro to any value to arrange for
+ the NL character to be 0x25 instead of the default 0x15. NL plays the role
+ that LF does in an ASCII/Unicode environment.])
+fi
+
# Platform specific issues
NO_UNDEFINED=
EXPORT_ALL_SYMBOLS=
@@ -891,9 +928,16 @@
AC_OUTPUT
-# Print out a nice little message after configure is run displaying your
+# Print out a nice little message after configure is run displaying the
# chosen options.
+ebcdic_nl_code=n/a
+if test "$enable_ebcdic_nl25" = "yes"; then
+ ebcdic_nl_code=0x25
+elif test "$enable_ebcdic" = "yes"; then
+ ebcdic_nl_code=0x15
+fi
+
cat <<EOF
$PACKAGE-$VERSION configuration summary:
@@ -919,6 +963,7 @@
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
EBCDIC coding ................... : ${enable_ebcdic}
+ EBCDIC code for NL .............. : ${ebcdic_nl_code}
Rebuild char tables ............. : ${enable_rebuild_chartables}
Use stack recursion ............. : ${enable_stack_for_recursion}
POSIX mem threshold ............. : ${with_posix_malloc_threshold}
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/pcre_internal.h 2012-09-06 16:55:38 UTC (rev 1028)
@@ -945,22 +945,69 @@
#ifndef SUPPORT_UTF
/* UTF-8 support is not enabled; use the platform-dependent character literals
-so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
+so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF
+mode. Newline characters are problematic in EBCDIC. Though it has CR and LF
+characters, a common practice has been to use its NL (0x15) character as the
+line terminator in C-like processing environments. However, sometimes the LF
+(0x25) character is used instead, according to this Unicode document:
+http://unicode.org/standard/reports/tr13/tr13-5.html
+
+PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25
+instead. Whichever is *not* chosen is defined as NEL.
+
+In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the
+same code point. */
+
+#ifdef EBCDIC
+
+#ifndef EBCDIC_NL25
+#define CHAR_NL '\x15'
+#define CHAR_NEL '\x25'
+#define STR_NL "\x15"
+#define STR_NEL "\x25"
+#else
+#define CHAR_NL '\x25'
+#define CHAR_NEL '\x15'
+#define STR_NL "\x25"
+#define STR_NEL "\x15"
+#endif
+
+#define CHAR_LF CHAR_NL
+#define STR_LF STR_NL
+
+#define CHAR_ESC '\047'
+#define CHAR_DEL '\007'
+#define STR_ESC "\047"
+#define STR_DEL "\007"
+
+#else /* Not EBCDIC */
+
+/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for
+compatibility. NEL is the Unicode "next line" character (U+0085). */
+
+#define CHAR_LF '\n'
+#define CHAR_NL CHAR_LF
+#define CHAR_NEL '\x85'
+#define CHAR_ESC '\033'
+#define CHAR_DEL '\177'
+
+#define STR_LF "\n"
+#define STR_NL STR_LF
+#define STR_NEL "\x85"
+#define STR_ESC "\033"
+#define STR_DEL "\177"
+
+#endif /* EBCDIC */
+
+/* The remaining definitions work in both environments. */
+
#define CHAR_HT '\t'
#define CHAR_VT '\v'
#define CHAR_FF '\f'
#define CHAR_CR '\r'
-#define CHAR_NL '\n'
#define CHAR_BS '\b'
#define CHAR_BEL '\a'
-#ifdef EBCDIC
-#define CHAR_ESC '\047'
-#define CHAR_DEL '\007'
-#else
-#define CHAR_ESC '\033'
-#define CHAR_DEL '\177'
-#endif
#define CHAR_SPACE ' '
#define CHAR_EXCLAMATION_MARK '!'
@@ -1062,16 +1109,8 @@
#define STR_VT "\v"
#define STR_FF "\f"
#define STR_CR "\r"
-#define STR_NL "\n"
#define STR_BS "\b"
#define STR_BEL "\a"
-#ifdef EBCDIC
-#define STR_ESC "\047"
-#define STR_DEL "\007"
-#else
-#define STR_ESC "\033"
-#define STR_DEL "\177"
-#endif
#define STR_SPACE " "
#define STR_EXCLAMATION_MARK "!"
@@ -1221,7 +1260,9 @@
#define CHAR_VT '\013'
#define CHAR_FF '\014'
#define CHAR_CR '\015'
-#define CHAR_NL '\012'
+#define CHAR_LF '\012'
+#define CHAR_NL CHAR_LF
+#define CHAR_NEL '\x85'
#define CHAR_BS '\010'
#define CHAR_BEL '\007'
#define CHAR_ESC '\033'
@@ -1484,7 +1525,7 @@
#endif
#ifndef ESC_n
-#define ESC_n CHAR_NL
+#define ESC_n CHAR_LF
#endif
#ifndef ESC_r