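[Pcre-svn] [1028] code/trunk: Set config.h NEWLINE values appropriately for EBCDIC, adding --enable-ebcdic-nl25 (and CMake equivalent) for the alternate NL encoding.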

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1028] code/trunk: Set config.h NEWLINE values appropriately for EBCDIC, adding --enable-ebcdic-nl25 (and CMake equivalent) for the alternate NL encoding.
Revision: 1028
          http://vcs.pcre.org/viewvc?view=rev&revision=1028
Author:   ph10
Date:     2012-09-06 17:55:38 +0100 (Thu, 06 Sep 2012)


Log Message:
-----------
Set config.h NEWLINE values appropriately for EBCDIC, adding
--enable-ebcdic-nl25 (and CMake equivalent) for the alternate NL encoding.

Modified Paths:
--------------
    code/trunk/CMakeLists.txt
    code/trunk/NON-AUTOTOOLS-BUILD
    code/trunk/config-cmake.h.in
    code/trunk/configure.ac
    code/trunk/pcre_internal.h


Modified: code/trunk/CMakeLists.txt
===================================================================
--- code/trunk/CMakeLists.txt    2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/CMakeLists.txt    2012-09-06 16:55:38 UTC (rev 1028)
@@ -57,6 +57,7 @@
 # 2012-01-17 PH applied Stephen Kelly's patch to parse the version data out
 #            of the configure.ac file
 # 2012-02-26 PH added support for libedit
+# 2012-09-06 PH added support for PCRE_EBCDIC_NL25


PROJECT(PCRE C CXX)

@@ -115,8 +116,11 @@
OPTION(PCRE_BUILD_PCRECPP "Build the PCRE C++ library (pcrecpp)." ON)

 SET(PCRE_EBCDIC OFF CACHE BOOL
-    "Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems)")
+    "Use EBCDIC coding instead of ASCII. (This is rarely used outside of mainframe systems.)")


+SET(PCRE_EBCDIC_NL25 OFF CACHE BOOL
+    "Use 0x25 as EBCDIC NL character instead of 0x15; implies EBCDIC.")
+
 SET(PCRE_LINK_SIZE "2" CACHE STRING
     "Internal link size (2, 3 or 4 allowed). See LINK_SIZE in config.h.in for details.")


@@ -326,8 +330,25 @@

 IF(PCRE_EBCDIC)
         SET(EBCDIC 1)
+IF(PCRE_NEWLINE STREQUAL "LF")
+        SET(NEWLINE "21")
+ENDIF(PCRE_NEWLINE STREQUAL "LF")
+IF(PCRE_NEWLINE STREQUAL "CRLF")
+        SET(NEWLINE "3349")
+ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
 ENDIF(PCRE_EBCDIC)


+IF(PCRE_EBCDIC_NL25)
+        SET(EBCDIC 1)
+        SET(EBCDIC_NL25 1)
+IF(PCRE_NEWLINE STREQUAL "LF")
+        SET(NEWLINE "37")
+ENDIF(PCRE_NEWLINE STREQUAL "LF")
+IF(PCRE_NEWLINE STREQUAL "CRLF")
+        SET(NEWLINE "3365")
+ENDIF(PCRE_NEWLINE STREQUAL "CRLF")
+ENDIF(PCRE_EBCDIC_NL25)          
+
 IF(PCRE_NO_RECURSE)
         SET(NO_RECURSE 1)
 ENDIF(PCRE_NO_RECURSE)
@@ -822,6 +843,7 @@
   MESSAGE(STATUS "  Newline char/sequence ........... : ${PCRE_NEWLINE}")
   MESSAGE(STATUS "  \\R matches only ANYCRLF ......... : ${PCRE_SUPPORT_BSR_ANYCRLF}")
   MESSAGE(STATUS "  EBCDIC coding ................... : ${PCRE_EBCDIC}")
+  MESSAGE(STATUS "  EBCDIC coding with NL=0x25 ...... : ${PCRE_EBCDIC_NL25}")
   MESSAGE(STATUS "  Rebuild char tables ............. : ${PCRE_REBUILD_CHARTABLES}")
   MESSAGE(STATUS "  No stack recursion .............. : ${PCRE_NO_RECURSE}")
   MESSAGE(STATUS "  POSIX mem threshold ............. : ${PCRE_POSIX_MALLOC_THRESHOLD}")


Modified: code/trunk/NON-AUTOTOOLS-BUILD
===================================================================
--- code/trunk/NON-AUTOTOOLS-BUILD    2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/NON-AUTOTOOLS-BUILD    2012-09-06 16:55:38 UTC (rev 1028)
@@ -55,11 +55,16 @@


  (1) Copy or rename the file config.h.generic as config.h, and edit the macro
      settings that it contains to whatever is appropriate for your environment.
-     In particular, if you want to force a specific value for newline, you can
-     define the NEWLINE macro. When you compile any of the PCRE modules, you
-     must specify -DHAVE_CONFIG_H to your compiler so that config.h is included
-     in the sources.


+     In particular, you can alter the definition of the NEWLINE macro to 
+     specify what character(s) you want to be interpreted as line terminators.
+     In an EBCDIC environment, you MUST change NEWLINE, because its default
+     value is 10, an ASCII LF. The usual EBCDIC newline character is 21 (0x15,
+     NL), though in some cases it may be 37 (0x25).
+      
+     When you compile any of the PCRE modules, you must specify -DHAVE_CONFIG_H
+     to your compiler so that config.h is included in the sources.
+
      An alternative approach is not to edit config.h, but to use -D on the
      compiler command line to make any changes that you need to the
      configuration options. In this case -DHAVE_CONFIG_H must not be set.
@@ -588,4 +593,4 @@



==========================
-Last Updated: 31 August 2012
+Last Updated: 04 September 2012

Modified: code/trunk/config-cmake.h.in
===================================================================
--- code/trunk/config-cmake.h.in    2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/config-cmake.h.in    2012-09-06 16:55:38 UTC (rev 1028)
@@ -25,6 +25,7 @@
 #cmakedefine SUPPORT_UTF 1
 #cmakedefine SUPPORT_UCP 1
 #cmakedefine EBCDIC 1
+#cmakedefine EBCDIC_NL25 1
 #cmakedefine BSR_ANYCRLF 1
 #cmakedefine NO_RECURSE 1



Modified: code/trunk/configure.ac
===================================================================
--- code/trunk/configure.ac    2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/configure.ac    2012-09-06 16:55:38 UTC (rev 1028)
@@ -209,6 +209,12 @@
                              [assume EBCDIC coding rather than ASCII; incompatible with --enable-utf; use only in (uncommon) EBCDIC environments; it implies --enable-rebuild-chartables]),
               , enable_ebcdic=no)


+# Handle --enable-ebcdic-nl25
+AC_ARG_ENABLE(ebcdic-nl25,
+              AS_HELP_STRING([--enable-ebcdic-nl25],
+                             [set EBCDIC code for NL to 0x25 instead of 0x15; it implies --enable-ebcdic]),
+              , enable_ebcdic_nl25=no)
+
 # Handle --disable-stack-for-recursion
 AC_ARG_ENABLE(stack-for-recursion,
               AS_HELP_STRING([--disable-stack-for-recursion],
@@ -335,21 +341,10 @@
   fi
 fi


-# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
-# Also check that UTF support is not requested, because PCRE cannot handle
-# EBCDIC and UTF in the same build. To do so it would need to use different
-# character constants depending on the mode.
-#
-if test "x$enable_ebcdic" = "xyes"
-then
-  enable_rebuild_chartables=yes
-  if test "x$enable_utf" = "xyes"
-  then
-    AC_MSG_ERROR([support for EBCDIC and UTF-8/16 cannot be enabled at the same time])
-  fi
-fi
+# Convert the newline identifier into the appropriate integer value. The first 
+# three are ASCII values 0x0a, 0x0d, and 0x0d0a, but if EBCDIC is enabled, they
+# are changed below.


-# Convert the newline identifier into the appropriate integer value.
 case "$enable_newline" in
   lf)      ac_pcre_newline_value=10   ;;
   cr)      ac_pcre_newline_value=13   ;;
@@ -361,6 +356,37 @@
   ;;
 esac


+# --enable-ebcdic-nl25 implies --enable-ebcdic
+if test "x$enable_ebcdic_nl25" = "xyes"; then
+  enable_ebcdic=yes
+fi   
+
+# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled,
+# and the newline value is adjusted appropriately (CR is still 13, but LF is
+# 21 or 37). Also check that UTF support is not requested, because PCRE cannot
+# handle EBCDIC and UTF in the same build. To do so it would need to use
+# different character constants depending on the mode.
+#
+if test "x$enable_ebcdic" = "xyes"; then
+  enable_rebuild_chartables=yes
+
+  if test "x$enable_utf" = "xyes"; then
+    AC_MSG_ERROR([support for EBCDIC and UTF-8/16 cannot be enabled at the same time])
+  fi
+
+  if test "x$enable_ebcdic_nl25" = "xno"; then
+    case "$ac_pcre_newline_value" in
+      10)   ac_pcre_newline_value=21 ;;
+      3338) ac_pcre_newline_value=3349 ;;
+    esac    
+  else   
+    case "$ac_pcre_newline_value" in
+      10)   ac_pcre_newline_value=37 ;;
+      3338) ac_pcre_newline_value=3365 ;;
+    esac    
+  fi   
+fi
+
 # Check argument to --with-link-size
 case "$with_link_size" in
   2|3|4) ;;
@@ -681,8 +707,12 @@
 AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
   The value of NEWLINE determines the newline character sequence. On
   systems that support it, "configure" can be used to override the
-  default, which is 10. The possible values are 10 (LF), 13 (CR),
-  3338 (CRLF), -1 (ANY), or -2 (ANYCRLF).])
+  default, which is LF. In ASCII environments, the value can be 10 (LF), 
+  13 (CR), or 3338 (CRLF); in EBCDIC environments the value can be 21 or 37
+  (LF), 13 (CR), or 3349 or 3365 (CRLF) because there are two alternative
+  codepoints (0x15 and 0x25) that are used as the NL line terminator that is 
+  equivalent to ASCII LF. In both ASCII and EBCDIC environments the value can
+  also be -1 (ANY), or -2 (ANYCRLF).])


 if test "$enable_bsr_anycrlf" = "yes"; then
   AC_DEFINE([BSR_ANYCRLF], [], [
@@ -768,6 +798,13 @@
     supports both EBCDIC and UTF-8/16.])
 fi


+if test "$enable_ebcdic_nl25" = "yes"; then
+  AC_DEFINE_UNQUOTED([EBCDIC_NL25], [], [
+    In an EBCDIC environment, define this macro to any value to arrange for
+    the NL character to be 0x25 instead of the default 0x15. NL plays the role 
+    that LF does in an ASCII/Unicode environment.])
+fi       
+
 # Platform specific issues
 NO_UNDEFINED=
 EXPORT_ALL_SYMBOLS=
@@ -891,9 +928,16 @@


AC_OUTPUT

-# Print out a nice little message after configure is run displaying your
+# Print out a nice little message after configure is run displaying the
# chosen options.

+ebcdic_nl_code=n/a
+if test "$enable_ebcdic_nl25" = "yes"; then
+ ebcdic_nl_code=0x25
+elif test "$enable_ebcdic" = "yes"; then
+ ebcdic_nl_code=0x15
+fi
+
cat <<EOF

 $PACKAGE-$VERSION configuration summary:
@@ -919,6 +963,7 @@
     Newline char/sequence ........... : ${enable_newline}
     \R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
     EBCDIC coding ................... : ${enable_ebcdic}
+    EBCDIC code for NL .............. : ${ebcdic_nl_code} 
     Rebuild char tables ............. : ${enable_rebuild_chartables}
     Use stack recursion ............. : ${enable_stack_for_recursion}
     POSIX mem threshold ............. : ${with_posix_malloc_threshold}


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2012-09-03 14:01:38 UTC (rev 1027)
+++ code/trunk/pcre_internal.h    2012-09-06 16:55:38 UTC (rev 1028)
@@ -945,22 +945,69 @@
 #ifndef SUPPORT_UTF


/* UTF-8 support is not enabled; use the platform-dependent character literals
-so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
+so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF
+mode. Newline characters are problematic in EBCDIC. Though it has CR and LF
+characters, a common practice has been to use its NL (0x15) character as the
+line terminator in C-like processing environments. However, sometimes the LF
+(0x25) character is used instead, according to this Unicode document:

+http://unicode.org/standard/reports/tr13/tr13-5.html
+
+PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25 
+instead. Whichever is *not* chosen is defined as NEL. 
+
+In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the
+same code point. */
+
+#ifdef EBCDIC
+
+#ifndef EBCDIC_NL25
+#define CHAR_NL                     '\x15'
+#define CHAR_NEL                    '\x25'
+#define STR_NL                      "\x15"
+#define STR_NEL                     "\x25"
+#else
+#define CHAR_NL                     '\x25'
+#define CHAR_NEL                    '\x15'
+#define STR_NL                      "\x25"
+#define STR_NEL                     "\x15"
+#endif
+
+#define CHAR_LF                     CHAR_NL
+#define STR_LF                      STR_NL
+
+#define CHAR_ESC                    '\047'
+#define CHAR_DEL                    '\007'
+#define STR_ESC                     "\047"
+#define STR_DEL                     "\007"
+
+#else  /* Not EBCDIC */
+
+/* In ASCII/Unicode, linefeed is '\n' and we equate this to NL for 
+compatibility. NEL is the Unicode newline character. */
+
+#define CHAR_LF                     '\n'
+#define CHAR_NL                     CHAR_LF
+#define CHAR_NEL                    '\x85'
+#define CHAR_ESC                    '\033'
+#define CHAR_DEL                    '\177'
+
+#define STR_LF                      "\n"
+#define STR_NL                      STR_LF
+#define STR_NEL                     "\x85"
+#define STR_ESC                     "\033"
+#define STR_DEL                     "\177"
+
+#endif  /* EBCDIC */
+
+/* The remaining definitions work in both environments. */
+
 #define CHAR_HT                     '\t'
 #define CHAR_VT                     '\v'
 #define CHAR_FF                     '\f'
 #define CHAR_CR                     '\r'
-#define CHAR_NL                     '\n'
 #define CHAR_BS                     '\b'
 #define CHAR_BEL                    '\a'
-#ifdef EBCDIC
-#define CHAR_ESC                    '\047'
-#define CHAR_DEL                    '\007'
-#else
-#define CHAR_ESC                    '\033'
-#define CHAR_DEL                    '\177'
-#endif


 #define CHAR_SPACE                  ' '
 #define CHAR_EXCLAMATION_MARK       '!'
@@ -1062,16 +1109,8 @@
 #define STR_VT                      "\v"
 #define STR_FF                      "\f"
 #define STR_CR                      "\r"
-#define STR_NL                      "\n"
 #define STR_BS                      "\b"
 #define STR_BEL                     "\a"
-#ifdef EBCDIC
-#define STR_ESC                     "\047"
-#define STR_DEL                     "\007"
-#else
-#define STR_ESC                     "\033"
-#define STR_DEL                     "\177"
-#endif


 #define STR_SPACE                   " "
 #define STR_EXCLAMATION_MARK        "!"
@@ -1221,7 +1260,9 @@
 #define CHAR_VT                     '\013'
 #define CHAR_FF                     '\014'
 #define CHAR_CR                     '\015'
-#define CHAR_NL                     '\012'
+#define CHAR_LF                     '\012'
+#define CHAR_NL                     CHAR_LF
+#define CHAR_NEL                    '\x85'
 #define CHAR_BS                     '\010'
 #define CHAR_BEL                    '\007'
 #define CHAR_ESC                    '\033'
@@ -1484,7 +1525,7 @@
 #endif


#ifndef ESC_n
-#define ESC_n CHAR_NL
+#define ESC_n CHAR_LF
#endif

#ifndef ESC_r