Revision: 903
http://vcs.pcre.org/viewvc?view=rev&revision=903
Author: ph10
Date: 2012-01-21 16:37:17 +0000 (Sat, 21 Jan 2012)
Log Message:
-----------
Source file tidies for 8.30-RC1 release; fix Makefile.am bugs for building
symbolic links to man pages.
Modified Paths:
--------------
code/trunk/CMakeLists.txt
code/trunk/ChangeLog
code/trunk/Makefile.am
code/trunk/NEWS
code/trunk/README
code/trunk/RunTest
code/trunk/configure.ac
code/trunk/doc/html/index.html
code/trunk/doc/html/pcre-config.html
code/trunk/doc/html/pcre.html
code/trunk/doc/html/pcre16.html
code/trunk/doc/html/pcre_config.html
code/trunk/doc/html/pcre_fullinfo.html
code/trunk/doc/html/pcre_jit_stack_alloc.html
code/trunk/doc/html/pcre_pattern_to_host_byte_order.html
code/trunk/doc/html/pcre_utf16_to_host_byte_order.html
code/trunk/doc/html/pcreapi.html
code/trunk/doc/html/pcrebuild.html
code/trunk/doc/html/pcrecallout.html
code/trunk/doc/html/pcrecpp.html
code/trunk/doc/html/pcrejit.html
code/trunk/doc/html/pcrematching.html
code/trunk/doc/html/pcrepartial.html
code/trunk/doc/html/pcrepattern.html
code/trunk/doc/html/pcreposix.html
code/trunk/doc/html/pcreprecompile.html
code/trunk/doc/html/pcrestack.html
code/trunk/doc/html/pcresyntax.html
code/trunk/doc/html/pcretest.html
code/trunk/doc/html/pcreunicode.html
code/trunk/doc/pcre-config.1
code/trunk/doc/pcre.3
code/trunk/doc/pcre.txt
code/trunk/doc/pcre16.3
code/trunk/doc/pcre_config.3
code/trunk/doc/pcre_fullinfo.3
code/trunk/doc/pcre_jit_stack_alloc.3
code/trunk/doc/pcre_pattern_to_host_byte_order.3
code/trunk/doc/pcre_utf16_to_host_byte_order.3
code/trunk/doc/pcreapi.3
code/trunk/doc/pcrebuild.3
code/trunk/doc/pcrecallout.3
code/trunk/doc/pcrecpp.3
code/trunk/doc/pcrejit.3
code/trunk/doc/pcrematching.3
code/trunk/doc/pcrepartial.3
code/trunk/doc/pcrepattern.3
code/trunk/doc/pcreposix.3
code/trunk/doc/pcreprecompile.3
code/trunk/doc/pcrestack.3
code/trunk/doc/pcresyntax.3
code/trunk/doc/pcretest.1
code/trunk/doc/pcreunicode.3
code/trunk/doc/perltest.txt
code/trunk/pcre-config.in
code/trunk/pcre_compile.c
code/trunk/pcre_exec.c
code/trunk/pcreposix.c
code/trunk/pcretest.c
code/trunk/perltest.pl
Modified: code/trunk/CMakeLists.txt
===================================================================
--- code/trunk/CMakeLists.txt 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/CMakeLists.txt 2012-01-21 16:37:17 UTC (rev 903)
@@ -393,7 +393,7 @@
pcre_newline.c
pcre_ord2utf8.c
pcre_refcount.c
- pcre_string_utils.c
+ pcre_string_utils.c
pcre_study.c
pcre_tables.c
pcre_ucd.c
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/ChangeLog 2012-01-21 16:37:17 UTC (rev 903)
@@ -32,35 +32,35 @@
8. Ovector size of 2 is also supported by JIT based pcre_exec (the ovector size
rounding is not applied in this particular case).
-
+
9. The invalid Unicode surrogate codepoints U+D800 to U+DFFF are now rejected
if they appear, or are escaped, in patterns.
-
-10. Get rid of a number of -Wunused-but-set-variable warnings.
-11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark
- "x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all.
- Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern
- also returns the mark "x". This bug applied to capturing parentheses,
- non-capturing parentheses, and atomic parentheses. It also applied to some
+10. Get rid of a number of -Wunused-but-set-variable warnings.
+
+11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark
+ "x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all.
+ Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern
+ also returns the mark "x". This bug applied to capturing parentheses,
+ non-capturing parentheses, and atomic parentheses. It also applied to some
assertions.
-
-12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version
- information out of configure.ac instead of relying on pcre.h.generic, which
- is not stored in the repository.
-
+
+12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version
+ information out of configure.ac instead of relying on pcre.h.generic, which
+ is not stored in the repository.
+
13. Applied Dmitry V. Levin's patch for a more portable method for linking with
-lreadline.
-
-14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C.
-15. Applied Graycode's patch to put the top-level frame on the stack rather
- than the heap when not using the stack for recursion. This gives a
+14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C.
+
+15. Applied Graycode's patch to put the top-level frame on the stack rather
+ than the heap when not using the stack for recursion. This gives a
performance improvement in many cases when recursion is not deep.
-
-16. Experimental code added to "pcretest -C" to output the stack frame size.
+16. Experimental code added to "pcretest -C" to output the stack frame size.
+
Version 8.21 12-Dec-2011
------------------------
Modified: code/trunk/Makefile.am
===================================================================
--- code/trunk/Makefile.am 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/Makefile.am 2012-01-21 16:37:17 UTC (rev 903)
@@ -520,34 +520,32 @@
doc/pcreunicode.3
# Arrange for the per-function man pages to have 16-bit names as well.
-install-exec-hook:
- pushd $(DESTDIR)($man3dir)
- ln -s pcre_assign_jit_stack.3 pcre16_assign_jit_stack.3
- ln -s pcre_compile.3 pcre16_compile.3
- ln -s pcre_compile2.3 pcre16_compile2.3
- ln -s pcre_config.3 pcre16_config.3
- ln -s pcre_copy_named_substring.3 pcre16_copy_named_substring.3
- ln -s pcre_copy_substring.3 pcre16_copy_substring.3
- ln -s pcre_dfa_exec.3 pcre16_dfa_exec.3
- ln -s pcre_exec.3 pcre16_exec.3
- ln -s pcre_free_study.3 pcre16_free_study.3
- ln -s pcre_free_substring.3 pcre16_free_substring.3
- ln -s pcre_free_substring_list.3 pcre16_free_substring_list.3
- ln -s pcre_fullinfo.3 pcre16_fullinfo.3
- ln -s pcre_get_named_substring.3 pcre16_get_named_substring.3
- ln -s pcre_get_stringnumber.3 pcre16_get_stringnumber.3
- ln -s pcre_get_stringtable_entries.3 pcre16_get_stringtable_entries.3
- ln -s pcre_get_substring.3 pcre16_get_substring.3
- ln -s pcre_get_substring_list.3 pcre16_get_substring_list.3
- ln -s pcre_jit_stack_alloc.3 pcre16_jit_stack_alloc.3
- ln -s pcre_jit_stack_free.3 pcre16_jit_stack_free.3
- ln -s pcre_maketables.3 pcre16_maketables.3
- ln -s pcre_pattern_to_host_byte_order.3 pcre16_pattern_to_host_byte_order.3
- ln -s pcre_refcount.3 pcre16_refcount.3
- ln -s pcre_study.3 pcre16_study.3
- ln -s pcre_utf16_to_host_byte_order.3 pcre16_utf16_to_host_byte_order.3
- ln -s pcre_version.3 pcre16_version.3
- popd
+install-data-hook:
+ ln -s $(DESTDIR)$(man3dir)/pcre_assign_jit_stack.3 $(DESTDIR)$(man3dir)/pcre16_assign_jit_stack.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_compile.3 $(DESTDIR)$(man3dir)/pcre16_compile.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_compile2.3 $(DESTDIR)$(man3dir)/pcre16_compile2.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_config.3 $(DESTDIR)$(man3dir)/pcre16_config.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_copy_named_substring.3 $(DESTDIR)$(man3dir)/pcre16_copy_named_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_copy_substring.3 $(DESTDIR)$(man3dir)/pcre16_copy_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_dfa_exec.3 $(DESTDIR)$(man3dir)/pcre16_dfa_exec.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_exec.3 $(DESTDIR)$(man3dir)/pcre16_exec.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_free_study.3 $(DESTDIR)$(man3dir)/pcre16_free_study.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_free_substring.3 $(DESTDIR)$(man3dir)/pcre16_free_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_free_substring_list.3 $(DESTDIR)$(man3dir)/pcre16_free_substring_list.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_fullinfo.3 $(DESTDIR)$(man3dir)/pcre16_fullinfo.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_named_substring.3 $(DESTDIR)$(man3dir)/pcre16_get_named_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_stringnumber.3 $(DESTDIR)$(man3dir)/pcre16_get_stringnumber.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_stringtable_entries.3 $(DESTDIR)$(man3dir)/pcre16_get_stringtable_entries.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_substring.3 $(DESTDIR)$(man3dir)/pcre16_get_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_substring_list.3 $(DESTDIR)$(man3dir)/pcre16_get_substring_list.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_jit_stack_alloc.3 $(DESTDIR)$(man3dir)/pcre16_jit_stack_alloc.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_jit_stack_free.3 $(DESTDIR)$(man3dir)/pcre16_jit_stack_free.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_maketables.3 $(DESTDIR)$(man3dir)/pcre16_maketables.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_pattern_to_host_byte_order.3 $(DESTDIR)$(man3dir)/pcre16_pattern_to_host_byte_order.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_refcount.3 $(DESTDIR)$(man3dir)/pcre16_refcount.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_study.3 $(DESTDIR)$(man3dir)/pcre16_study.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_utf16_to_host_byte_order.3 $(DESTDIR)$(man3dir)/pcre16_utf16_to_host_byte_order.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_version.3 $(DESTDIR)$(man3dir)/pcre16_version.3
pcrecpp_man = doc/pcrecpp.3
EXTRA_DIST += $(pcrecpp_man)
Modified: code/trunk/NEWS
===================================================================
--- code/trunk/NEWS 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/NEWS 2012-01-21 16:37:17 UTC (rev 903)
@@ -5,20 +5,20 @@
------------
Release 8.30 introduces a major new feature: support for 16-bit character
-strings, compiled as a separate library. There are a few changes to the
+strings, compiled as a separate library. There are a few changes to the
8-bit library, in addition to some bug fixes.
. The pcre_info() function, which has been obsolete for over 10 years, has
been removed.
. When a compiled pattern was saved to a file and later reloaded on a host
- with different endianness, PCRE used automatically to swap the bytes in some
+ with different endianness, PCRE used automatically to swap the bytes in some
of the data fields. With the advent of the 16-bit library, where more of this
swapping is needed, it is no longer done automatically. Instead, the bad
endianness is detected and a specific error is given. The user can then call
a new function called pcre_pattern_to_host_byte_order() (or an equivalent
16-bit function) to do the swap.
-
+
. In UTF-8 mode, the values 0xd800 to 0xdfff are not legal Unicode
code points and are now faulted. (They are the so-called "surrogates"
that are reserved for coding high values in UTF-16.)
Modified: code/trunk/README
===================================================================
--- code/trunk/README 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/README 2012-01-21 16:37:17 UTC (rev 903)
@@ -201,7 +201,7 @@
platforms. It is not possible to use both --enable-utf and --enable-ebcdic at
the same time.
-. There are no separate options for enabling UTF-8 and UTF-16 independently
+. There are no separate options for enabling UTF-8 and UTF-16 independently
because that would allow ridiculous settings such as requesting UTF-16
support while building only the 8-bit library. However, the option
--enable-utf8 is retained for backwards compatibility with earlier releases
@@ -669,7 +669,7 @@
The twentieth test is run only in 16-bit mode. It tests some specific 16-bit
features of the DFA matching engine.
-The twenty-first and twenty-second tests are run only in 16-bit mode, when the
+The twenty-first and twenty-second tests are run only in 16-bit mode, when the
link size is set to 2. They test reloading pre-compiled patterns.
Modified: code/trunk/RunTest
===================================================================
--- code/trunk/RunTest 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/RunTest 2012-01-21 16:37:17 UTC (rev 903)
@@ -275,7 +275,7 @@
do19=yes
do20=yes
do21=yes
- do22=yes
+ do22=yes
fi
# Show which release and which test data
Modified: code/trunk/configure.ac
===================================================================
--- code/trunk/configure.ac 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/configure.ac 2012-01-21 16:37:17 UTC (rev 903)
@@ -11,7 +11,7 @@
m4_define(pcre_major, [8])
m4_define(pcre_minor, [30])
m4_define(pcre_prerelease, [-RC1])
-m4_define(pcre_date, [2012-01-20])
+m4_define(pcre_date, [2012-01-21])
# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.
@@ -774,8 +774,9 @@
AC_SUBST(EXTRA_LIBPCREPOSIX_LDFLAGS)
AC_SUBST(EXTRA_LIBPCRECPP_LDFLAGS)
-# When we run 'make distcheck', use these arguments.
-DISTCHECK_CONFIGURE_FLAGS="--enable-pcre16 --enable-jit --enable-cpp --enable-unicode-properties"
+# When we run 'make distcheck', use these arguments. Turning off compiler
+# optimization makes it run faster.
+DISTCHECK_CONFIGURE_FLAGS="CFLAGS='' CXXFLAGS='' --enable-pcre16 --enable-jit --enable-cpp --enable-unicode-properties"
AC_SUBST(DISTCHECK_CONFIGURE_FLAGS)
# Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is
Modified: code/trunk/doc/html/index.html
===================================================================
--- code/trunk/doc/html/index.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/index.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -1,10 +1,10 @@
<html>
-<!-- This is a manually maintained file that is the root of the HTML version of
- the PCRE documentation. When the HTML documents are built from the man
- page versions, the entire doc/html directory is emptied, this file is then
- copied into doc/html/index.html, and the remaining files therein are
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
--->
+-->
<head>
<title>PCRE specification</title>
</head>
@@ -86,11 +86,11 @@
</table>
<p>
-There are also individual pages that summarize the interface for each function
+There are also individual pages that summarize the interface for each function
in the library. There is a single page for each pair of 8-bit/16-bit functions.
</p>
-<table>
+<table>
<tr><td><a href="pcre_assign_jit_stack.html">pcre_assign_jit_stack</a></td>
<td> Assign stack for JIT matching</td></tr>
@@ -153,7 +153,7 @@
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td> Build character tables in current locale</td></tr>
-
+
<tr><td><a href="pcre_pattern_to_host_byte_order.html">pcre_pattern_to_host_byte_order</a></td>
<td> Convert compiled pattern to host byte order if necessary</td></tr>
Modified: code/trunk/doc/html/pcre-config.html
===================================================================
--- code/trunk/doc/html/pcre-config.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre-config.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -23,15 +23,15 @@
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>pcre-config [--prefix] [--exec-prefix] [--version] [--libs]</b>
-<b>[--libs16] [--libs-cpp] [--libs-posix] [--cflags] </b>
+<b>[--libs16] [--libs-cpp] [--libs-posix] [--cflags]</b>
<b>[--cflags-posix]</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
<b>pcre-config</b> returns the configuration of the installed PCRE
-libraries and the options required to compile a program to use them. Some of
-the options apply only to the 8-bit or 16-bit libraries, respectively, and are
-not available if only one of those libraries has been built. If an unavailable
+libraries and the options required to compile a program to use them. Some of
+the options apply only to the 8-bit or 16-bit libraries, respectively, and are
+not available if only one of those libraries has been built. If an unavailable
option is encountered, the "usage" information is output.
</P>
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
Modified: code/trunk/doc/html/pcre.html
===================================================================
--- code/trunk/doc/html/pcre.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -28,11 +28,11 @@
for requesting some minor changes that give better JavaScript compatibility.
</P>
<P>
-Starting with release 8.30, it is possible to compile two separate PCRE
+Starting with release 8.30, it is possible to compile two separate PCRE
libraries: the original, which supports 8-bit character strings (including
UTF-8 strings), and a second library that supports 16-bit character strings
(including UTF-16 strings). The build process allows either one or both to be
-built. The majority of the work to make this possible was done by Zoltan
+built. The majority of the work to make this possible was done by Zoltan
Herczeg.
</P>
<P>
@@ -42,8 +42,8 @@
documentation describes the 8-bit library, with the differences for the 16-bit
library described separately in the
<a href="pcre16.html"><b>pcre16</b></a>
-page. References to functions or structures of the form <i>pcre[16]_xxx</i>
-should be read as meaning "<i>pcre_xxx</i> when using the 8-bit library and
+page. References to functions or structures of the form <i>pcre[16]_xxx</i>
+should be read as meaning "<i>pcre_xxx</i> when using the 8-bit library and
<i>pcre16_xxx</i> when using the 16-bit library".
</P>
<P>
@@ -109,7 +109,7 @@
of searching. The sections are as follows:
<pre>
pcre this document
- pcre16 details of the 16-bit library
+ pcre16 details of the 16-bit library
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
Modified: code/trunk/doc/html/pcre16.html
===================================================================
--- code/trunk/doc/html/pcre16.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre16.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -160,7 +160,7 @@
<br><a name="SEC5" href="#TOC1">PCRE 16-BIT API 16-BIT-ONLY FUNCTION</a><br>
<P>
<b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b>
-<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>, </b>
+<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>,</b>
<b>int <i>keep_boms</i>);</b>
</P>
<br><a name="SEC6" href="#TOC1">THE PCRE 16-BIT LIBRARY</a><br>
@@ -177,8 +177,8 @@
16-bit library.
</P>
<P>
-WARNING: A single application can be linked with both libraries, but you must
-take care when processing any particular pattern to use functions from just one
+WARNING: A single application can be linked with both libraries, but you must
+take care when processing any particular pattern to use functions from just one
library. For example, if you want to study a pattern that was compiled with
<b>pcre16_compile()</b>, you must do so with <b>pcre16_study()</b>, not
<b>pcre_study()</b>, and you must free the study data with
@@ -186,52 +186,52 @@
</P>
<br><a name="SEC7" href="#TOC1">THE HEADER FILE</a><br>
<P>
-There is only one header file, <b>pcre.h</b>. It contains prototypes for all the
+There is only one header file, <b>pcre.h</b>. It contains prototypes for all the
functions in both libraries, as well as definitions of flags, structures, error
codes, etc.
</P>
<br><a name="SEC8" href="#TOC1">THE LIBRARY NAME</a><br>
<P>
-In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can
-normally be accessed by adding <b>-lpcre16</b> to the command for linking an
+In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can
+normally be accessed by adding <b>-lpcre16</b> to the command for linking an
application that uses PCRE.
</P>
<br><a name="SEC9" href="#TOC1">STRING TYPES</a><br>
<P>
-In the 8-bit library, strings are passed to PCRE library functions as vectors
-of bytes with the C type "char *". In the 16-bit library, strings are passed as
-vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
-appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
-very many environments, "short int" is a 16-bit data type. When PCRE is built,
-it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
-data type. If it is not, the build fails with an error message telling the
+In the 8-bit library, strings are passed to PCRE library functions as vectors
+of bytes with the C type "char *". In the 16-bit library, strings are passed as
+vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
+appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
+very many environments, "short int" is a 16-bit data type. When PCRE is built,
+it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
+data type. If it is not, the build fails with an error message telling the
maintainer to modify the definition appropriately.
</P>
<br><a name="SEC10" href="#TOC1">STRUCTURE TYPES</a><br>
<P>
-The types of the opaque structures that are used for compiled 16-bit patterns
-and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The
-type of the user-accessible structure that is returned by <b>pcre16_study()</b>
+The types of the opaque structures that are used for compiled 16-bit patterns
+and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The
+type of the user-accessible structure that is returned by <b>pcre16_study()</b>
is <b>pcre16_extra</b>, and the type of the structure that is used for passing
-data to a callout function is <b>pcre16_callout_block</b>. These structures
-contain the same fields, with the same names, as their 8-bit counterparts. The
-only difference is that pointers to character strings are 16-bit instead of
+data to a callout function is <b>pcre16_callout_block</b>. These structures
+contain the same fields, with the same names, as their 8-bit counterparts. The
+only difference is that pointers to character strings are 16-bit instead of
8-bit types.
</P>
<br><a name="SEC11" href="#TOC1">16-BIT FUNCTIONS</a><br>
<P>
For every function in the 8-bit library there is a corresponding function in
-the 16-bit library with a name that starts with <b>pcre16_</b> instead of
+the 16-bit library with a name that starts with <b>pcre16_</b> instead of
<b>pcre_</b>. The prototypes are listed above. In addition, there is one extra
-function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function
-that converts a UTF-16 character string to host byte order if necessary. The
-other 16-bit functions expect the strings they are passed to be in host byte
-order.
+function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function
+that converts a UTF-16 character string to host byte order if necessary. The
+other 16-bit functions expect the strings they are passed to be in host byte
+order.
</P>
<P>
The <i>input</i> and <i>output</i> arguments of
-<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is,
-conversion in place is supported. The output buffer must be at least as long as
+<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is,
+conversion in place is supported. The output buffer must be at least as long as
the input.
</P>
<P>
@@ -239,18 +239,18 @@
input string; a negative value specifies a zero-terminated string.
</P>
<P>
-If <i>byte_order</i> is NULL, it is assumed that the string starts off in host
+If <i>byte_order</i> is NULL, it is assumed that the string starts off in host
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
string (commonly as the first character).
</P>
<P>
-If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it
-points means that the input starts off in host byte order, otherwise the
-opposite order is assumed. Again, BOMs in the string can change this. The final
-byte order is passed back at the end of processing.
+If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it
+points means that the input starts off in host byte order, otherwise the
+opposite order is assumed. Again, BOMs in the string can change this. The final
+byte order is passed back at the end of processing.
</P>
<P>
-If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied
+If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied
into the output string. Otherwise they are discarded.
</P>
<P>
@@ -259,14 +259,14 @@
</P>
<br><a name="SEC12" href="#TOC1">SUBJECT STRING OFFSETS</a><br>
<P>
-The offsets within subject strings that are returned by the matching functions
+The offsets within subject strings that are returned by the matching functions
are in 16-bit units rather than bytes.
</P>
<br><a name="SEC13" href="#TOC1">NAMED SUBPATTERNS</a><br>
<P>
-The name-to-number translation table that is maintained for named subpatterns
-uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function
-returns the length of each entry in the table as the number of 16-bit data
+The name-to-number translation table that is maintained for named subpatterns
+uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function
+returns the length of each entry in the table as the number of 16-bit data
units.
</P>
<br><a name="SEC14" href="#TOC1">OPTION NAMES</a><br>
@@ -276,27 +276,27 @@
fact, these new options define the same bits in the options word.
</P>
<P>
-For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16
+For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16
that returns 1 if UTF-16 support is configured, otherwise 0. If this option is
given to <b>pcre_config()</b>, or if the PCRE_CONFIG_UTF8 option is given to
<b>pcre16_config()</b>, the result is the PCRE_ERROR_BADOPTION error.
</P>
<br><a name="SEC15" href="#TOC1">CHARACTER CODES</a><br>
<P>
-In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
-same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
-from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
-0xff can therefore be influenced by the locale in the same way as before.
-Characters greater than 0xff have only one case, and no "type" (such as letter
+In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
+same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
+from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
+0xff can therefore be influenced by the locale in the same way as before.
+Characters greater than 0xff have only one case, and no "type" (such as letter
or digit).
</P>
<P>
-In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
-the exception of values in the range 0xd800 to 0xdfff because those are
+In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
+the exception of values in the range 0xd800 to 0xdfff because those are
"surrogate" values that are used in pairs to encode values greater than 0xffff.
</P>
<P>
-A UTF-16 string can indicate its endianness by a special code known as a
+A UTF-16 string can indicate its endianness by a special code known as a
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
to be in host byte order. A utility function called
<b>pcre16_utf16_to_host_byte_order()</b> is provided to help with this (see
@@ -304,18 +304,18 @@
</P>
<br><a name="SEC16" href="#TOC1">ERROR NAMES</a><br>
<P>
-The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
+The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled
pattern is passed to a function that processes patterns in the other
-mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to
+mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to
<b>pcre16_exec()</b>.
</P>
<P>
There are new error codes whose names begin with PCRE_UTF16_ERR for invalid
-UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
+UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
are described in the section entitled
<a href="pcreapi.html#badutf8reasons">"Reason codes for invalid UTF-8 strings"</a>
-in the main
+in the main
<a href="pcreapi.html"><b>pcreapi</b></a>
page. The UTF-16 errors are:
<pre>
@@ -327,8 +327,8 @@
</P>
<br><a name="SEC17" href="#TOC1">ERROR TEXTS</a><br>
<P>
-If there is an error while compiling a pattern, the error text that is passed
-back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit
+If there is an error while compiling a pattern, the error text that is passed
+back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit
character string, zero-terminated.
</P>
<br><a name="SEC18" href="#TOC1">CALLOUTS</a><br>
@@ -338,23 +338,23 @@
</P>
<br><a name="SEC19" href="#TOC1">TESTING</a><br>
<P>
-The <b>pcretest</b> program continues to operate with 8-bit input and output
-files, but it can be used for testing the 16-bit library. If it is run with the
-command line option <b>-16</b>, patterns and subject strings are converted from
-8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
-are used instead of the 8-bit ones. Returned 16-bit strings are converted to
+The <b>pcretest</b> program continues to operate with 8-bit input and output
+files, but it can be used for testing the 16-bit library. If it is run with the
+command line option <b>-16</b>, patterns and subject strings are converted from
+8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
+are used instead of the 8-bit ones. Returned 16-bit strings are converted to
8-bit for output. If the 8-bit library was not compiled, <b>pcretest</b>
defaults to 16-bit and the <b>-16</b> option is ignored.
</P>
<P>
-When PCRE is being built, the <b>RunTest</b> script that is called by "make
+When PCRE is being built, the <b>RunTest</b> script that is called by "make
check" uses the <b>pcretest</b> <b>-C</b> option to discover which of the 8-bit
and 16-bit libraries has been built, and runs the tests appropriately.
</P>
<br><a name="SEC20" href="#TOC1">NOT SUPPORTED IN 16-BIT MODE</a><br>
<P>
-Not all the features of the 8-bit library are available with the 16-bit
-library. The C++ and POSIX wrapper functions support only the 8-bit library,
+Not all the features of the 8-bit library are available with the 16-bit
+library. The C++ and POSIX wrapper functions support only the 8-bit library,
and the <b>pcregrep</b> program is at present 8-bit only.
</P>
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
Modified: code/trunk/doc/html/pcre_config.html
===================================================================
--- code/trunk/doc/html/pcre_config.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre_config.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -41,6 +41,9 @@
<pre>
PCRE_CONFIG_JIT Availability of just-in-time compiler
support (1=yes 0=no)
+ PCRE_CONFIG_JITTARGET String containing information about the
+ target architecture for the JIT compiler,
+ or NULL if there is no JIT support
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
@@ -66,7 +69,7 @@
Availability of Unicode property support
(1=yes 0=no)
</pre>
-The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
+The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
is also given if PCRE_CONFIG_UTF16 is passed to <b>pcre_config()</b> or if
PCRE_CONFIG_UTF8 is passed to <b>pcre16_config()</b>.
</P>
Modified: code/trunk/doc/html/pcre_fullinfo.html
===================================================================
--- code/trunk/doc/html/pcre_fullinfo.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre_fullinfo.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -50,7 +50,7 @@
PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_JIT Return 1 after successful JIT compilation
- PCRE_INFO_JITSIZE Size of JIT compiled code
+ PCRE_INFO_JITSIZE Size of JIT compiled code
PCRE_INFO_LASTLITERAL Literal last data unit required
PCRE_INFO_MINLENGTH Lower bound length of matching strings
PCRE_INFO_NAMECOUNT Number of named subpatterns
Modified: code/trunk/doc/html/pcre_jit_stack_alloc.html
===================================================================
--- code/trunk/doc/html/pcre_jit_stack_alloc.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre_jit_stack_alloc.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -23,7 +23,7 @@
<b>int <i>maxsize</i>);</b>
</P>
<P>
-<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>, </b>
+<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>,</b>
<b>int <i>maxsize</i>);</b>
</P>
<br><b>
Modified: code/trunk/doc/html/pcre_pattern_to_host_byte_order.html
===================================================================
--- code/trunk/doc/html/pcre_pattern_to_host_byte_order.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre_pattern_to_host_byte_order.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -20,7 +20,7 @@
</P>
<P>
<b>int pcre_pattern_to_host_byte_order(pcre *<i>code</i>,</b>
-<b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>); </b>
+<b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>);</b>
</P>
<P>
<b>int pcre16_pattern_to_host_byte_order(pcre16 *<i>code</i>,</b>
@@ -31,8 +31,8 @@
</b><br>
<P>
This function ensures that the bytes in 2-byte and 4-byte values in a compiled
-pattern are in the correct order for the current host. It is useful when a
-pattern that has been compiled on one host is transferred to another that might
+pattern are in the correct order for the current host. It is useful when a
+pattern that has been compiled on one host is transferred to another that might
have different endianness. The arguments are:
<pre>
<i>code</i> A compiled regular expression
Modified: code/trunk/doc/html/pcre_utf16_to_host_byte_order.html
===================================================================
--- code/trunk/doc/html/pcre_utf16_to_host_byte_order.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcre_utf16_to_host_byte_order.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -20,15 +20,15 @@
</P>
<P>
<b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b>
-<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>host_byte_order</i>, </b>
+<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>host_byte_order</i>,</b>
<b>int <i>keep_boms</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
-This function, which exists only in the 16-bit library, converts a UTF-16
-string to the correct order for the current host, taking account of any byte
+This function, which exists only in the 16-bit library, converts a UTF-16
+string to the correct order for the current host, taking account of any byte
order marks (BOMs) within the string. Its arguments are:
<pre>
<i>output</i> pointer to output buffer, may be the same as <i>input</i>
Modified: code/trunk/doc/html/pcreapi.html
===================================================================
--- code/trunk/doc/html/pcreapi.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcreapi.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -34,10 +34,11 @@
<li><a name="TOC19" href="#SEC19">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
<li><a name="TOC20" href="#SEC20">DUPLICATE SUBPATTERN NAMES</a>
<li><a name="TOC21" href="#SEC21">FINDING ALL POSSIBLE MATCHES</a>
-<li><a name="TOC22" href="#SEC22">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
-<li><a name="TOC23" href="#SEC23">SEE ALSO</a>
-<li><a name="TOC24" href="#SEC24">AUTHOR</a>
-<li><a name="TOC25" href="#SEC25">REVISION</a>
+<li><a name="TOC22" href="#SEC22">OBTAINING AN ESTIMATE OF STACK USAGE</a>
+<li><a name="TOC23" href="#SEC23">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
+<li><a name="TOC24" href="#SEC24">SEE ALSO</a>
+<li><a name="TOC25" href="#SEC25">AUTHOR</a>
+<li><a name="TOC26" href="#SEC26">REVISION</a>
</ul>
<P>
<b>#include &lt;pcre.h&gt;</b>
@@ -174,7 +175,7 @@
start with <b>pcre16_</b> instead of <b>pcre_</b>. For every option that has UTF8
in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with
UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit
-option names define the same bit values.
+option names define the same bit values.
</P>
<P>
References to bytes and UTF-8 in this document should be read as references to
@@ -182,7 +183,7 @@
specified otherwise. More details of the specific differences for the 16-bit
library are given in the
<a href="pcre16.html"><b>pcre16</b></a>
-page.
+page.
</P>
<br><a name="SEC6" href="#TOC1">PCRE API OVERVIEW</a><br>
<P>
@@ -397,7 +398,7 @@
PCRE_CONFIG_UTF8
</pre>
The output is an integer that is set to one if UTF-8 support is available;
-otherwise it is set to zero. If this option is given to the 16-bit version of
+otherwise it is set to zero. If this option is given to the 16-bit version of
this function, <b>pcre16_config()</b>, the result is PCRE_ERROR_BADOPTION.
<pre>
PCRE_CONFIG_UTF16
@@ -417,6 +418,13 @@
The output is an integer that is set to one if support for just-in-time
compiling is available; otherwise it is set to zero.
<pre>
+ PCRE_CONFIG_JITTARGET
+</pre>
+The output is a pointer to a zero-terminated "const char *" string. If JIT
+support is available, the string contains the name of the architecture for
+which the JIT compiler is configured, for example "x86 32bit (little endian +
+unaligned)". If JIT support is not available, the result is NULL.
+<pre>
PCRE_CONFIG_NEWLINE
</pre>
The output is an integer whose value specifies the default character sequence
@@ -738,7 +746,7 @@
that any Unicode newline sequence should be recognized. The Unicode newline
sequences are the three just mentioned, plus the single characters VT (vertical
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
-separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
+separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
library, the last two are recognized only in UTF-8 mode.
</P>
<P>
@@ -808,7 +816,7 @@
<pre>
PCRE_NO_UTF8_CHECK
</pre>
-When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
+When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
string is automatically checked. There is a discussion about the
<a href="pcreunicode.html#utf8strings">validity of UTF-8 strings</a>
in the
@@ -825,7 +833,7 @@
<P>
The following table lists the error codes that may be returned by
<b>pcre_compile2()</b>, along with the error messages that may be returned by
-both compiling functions. Note that error messages are always 8-bit ASCII
+both compiling functions. Note that error messages are always 8-bit ASCII
strings, even in 16-bit mode. As PCRE has developed, some error codes have
fallen out of use. To avoid confusion, they have not been re-used.
<pre>
@@ -899,14 +907,14 @@
65 different names for subpatterns of the same number are
not allowed
66 (*MARK) must have an argument
- 67 this version of PCRE is not compiled with Unicode property
+ 67 this version of PCRE is not compiled with Unicode property
support
68 \c must be followed by an ASCII character
69 \k is not followed by a braced, angle-bracketed, or quoted name
70 internal error: unknown opcode in find_fixedlength()
71 \N is not supported in a class
72 too many forward references
- 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
+ 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
74 invalid UTF-16 string (specifically UTF-16)
</pre>
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
@@ -1101,12 +1109,12 @@
PCRE_ERROR_NULL the argument <i>code</i> was NULL
the argument <i>where</i> was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
+ PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
endianness
PCRE_ERROR_BADOPTION the value of <i>what</i> was invalid
</pre>
The "magic number" is placed at the start of each compiled pattern as a simple
-check against passing an arbitrary memory pointer. The endianness error can
+check against passing an arbitrary memory pointer. The endianness error can
occur if a compiled pattern is saved and reloaded on a different host. Here is
a typical call of <b>pcre_fullinfo()</b>, to obtain the length of the compiled
pattern:
@@ -1150,8 +1158,8 @@
</P>
<P>
If there is a fixed first value, for example, the letter "c" from a pattern
-such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
-value is always less than 256; in the 16-bit library the value can be up to
+such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
+value is always less than 256; in the 16-bit library the value can be up to
0xffff.
</P>
<P>
@@ -1427,7 +1435,7 @@
const unsigned char *<i>tables</i>;
unsigned char **<i>mark</i>;
</pre>
-In the 16-bit version of this structure, the <i>mark</i> field has type
+In the 16-bit version of this structure, the <i>mark</i> field has type
"PCRE_UCHAR16 **".
</P>
<P>
@@ -2067,14 +2075,14 @@
<pre>
PCRE_ERROR_BADMODE (-28)
</pre>
-This error is given if a pattern that was compiled by the 8-bit library is
+This error is given if a pattern that was compiled by the 8-bit library is
passed to a 16-bit library function, or vice versa.
<pre>
PCRE_ERROR_BADENDIANNESS (-29)
</pre>
-This error is given if a pattern that was compiled and saved is reloaded on a
-host with different endianness. The utility function
-<b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern
+This error is given if a pattern that was compiled and saved is reloaded on a
+host with different endianness. The utility function
+<b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern
so that it runs on the new host.
</P>
<P>
@@ -2084,7 +2092,7 @@
Reason codes for invalid UTF-8 strings
</b><br>
<P>
-This section applies only to the 8-bit library. The corresponding information
+This section applies only to the 8-bit library. The corresponding information
for the 16-bit library is given in the
<a href="pcre16.html"><b>pcre16</b></a>
page.
@@ -2374,8 +2382,32 @@
substring. Then return 1, which forces <b>pcre_exec()</b> to backtrack and try
other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b>
will yield PCRE_ERROR_NOMATCH.
+</P>
+<br><a name="SEC22" href="#TOC1">OBTAINING AN ESTIMATE OF STACK USAGE</a><br>
+<P>
+Matching certain patterns using <b>pcre_exec()</b> can use a lot of process
+stack, which in certain environments can be rather limited in size. Some users
+find it helpful to have an estimate of the amount of stack that is used by
+<b>pcre_exec()</b>, to help them set recursion limits, as described in the
+<a href="pcrestack.html"><b>pcrestack</b></a>
+documentation. The estimate that is output by <b>pcretest</b> when called with
+the <b>-m</b> and <b>-C</b> options is obtained by calling <b>pcre_exec()</b> with
+the values NULL, NULL, NULL, -999, and -999 for its first five arguments.
+</P>
+<P>
+Normally, if its first argument is NULL, <b>pcre_exec()</b> immediately returns
+the negative error code PCRE_ERROR_NULL, but with this special combination of
+arguments, it returns instead a negative number whose absolute value is the
+approximate stack frame size in bytes. (A negative number is used so that it is
+clear that no match has happened.) The value is approximate because in some
+cases, recursive calls to <b>pcre_exec()</b> occur when there are one or two
+additional variables on the stack.
+</P>
+<P>
+If PCRE has been compiled to use the heap instead of the stack for recursion,
+the value returned is the size of each block that is obtained from the heap.
<a name="dfamatch"></a></P>
-<br><a name="SEC22" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
+<br><a name="SEC23" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
<P>
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
@@ -2550,13 +2582,13 @@
error is given if the output vector is not large enough. This should be
extremely rare, as a vector of size 1000 is used.
</P>
-<br><a name="SEC23" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC24" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre16</b>(3), <b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).
</P>
-<br><a name="SEC24" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC25" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -2565,9 +2597,9 @@
Cambridge CB2 3QH, England.
<br>
</P>
-<br><a name="SEC25" href="#TOC1">REVISION</a><br>
+<br><a name="SEC26" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 07 January 2012
+Last updated: 21 January 2012
<br>
Copyright © 1997-2012 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcrebuild.html
===================================================================
--- code/trunk/doc/html/pcrebuild.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrebuild.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -66,11 +66,11 @@
</P>
<br><a name="SEC2" href="#TOC1">BUILDING 8-BIT and 16-BIT LIBRARIES</a><br>
<P>
-By default, a library called <b>libpcre</b> is built, containing functions that
-take string arguments contained in vectors of bytes, either as single-byte
+By default, a library called <b>libpcre</b> is built, containing functions that
+take string arguments contained in vectors of bytes, either as single-byte
characters, or interpreted as UTF-8 strings. You can also build a separate
-library, called <b>libpcre16</b>, in which strings are contained in vectors of
-16-bit data units and interpreted either as single-unit characters or UTF-16
+library, called <b>libpcre16</b>, in which strings are contained in vectors of
+16-bit data units and interpreted either as single-unit characters or UTF-16
strings, by adding
<pre>
--enable-pcre16
@@ -97,7 +97,7 @@
<P>
By default, if the 8-bit library is being built, the <b>configure</b> script
will search for a C++ compiler and C++ header files. If it finds them, it
-automatically builds the C++ wrapper library (which supports only 8-bit
+automatically builds the C++ wrapper library (which supports only 8-bit
strings). You can disable this by adding
<pre>
--disable-cpp
@@ -122,7 +122,7 @@
<P>
Of itself, this setting does not make PCRE treat strings as UTF-8 or UTF-16. As
well as compiling PCRE with this option, you also have to set the
-PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
+PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
functions.
</P>
<P>
Modified: code/trunk/doc/html/pcrecallout.html
===================================================================
--- code/trunk/doc/html/pcrecallout.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrecallout.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -31,7 +31,7 @@
PCRE provides a feature called "callout", which is a means of temporarily
passing control to the caller of PCRE in the middle of pattern matching. The
caller of PCRE provides an external function by putting its entry point in the
-global variable <i>pcre_callout</i> (<i>pcre16_callout</i> for the 16-bit
+global variable <i>pcre_callout</i> (<i>pcre16_callout</i> for the 16-bit
library). By default, this variable contains NULL, which disables all calling
out.
</P>
@@ -105,7 +105,7 @@
int <i>callout_number</i>;
int *<i>offset_vector</i>;
const char *<i>subject</i>; (8-bit version)
- PCRE_SPTR16 <i>subject</i>; (16-bit version)
+ PCRE_SPTR16 <i>subject</i>; (16-bit version)
int <i>subject_length</i>;
int <i>start_match</i>;
int <i>current_position</i>;
@@ -129,7 +129,7 @@
</P>
<P>
The <i>offset_vector</i> field is a pointer to the vector of offsets that was
-passed by the caller to the matching function. When <b>pcre_exec()</b> or
+passed by the caller to the matching function. When <b>pcre_exec()</b> or
<b>pcre16_exec()</b> is used, the contents can be inspected, in order to extract
substrings that have been matched so far, in the same way as for extracting
substrings after a match has completed. For the DFA matching functions, this
Modified: code/trunk/doc/html/pcrecpp.html
===================================================================
--- code/trunk/doc/html/pcrecpp.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrecpp.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -35,7 +35,7 @@
The C++ wrapper for PCRE was provided by Google Inc. Some additional
functionality was added by Giuseppe Maxia. This brief man page was constructed
from the notes in the <i>pcrecpp.h</i> file, which should be consulted for
-further details. Note that the C++ wrapper supports only the original 8-bit
+further details. Note that the C++ wrapper supports only the original 8-bit
PCRE library. There is no 16-bit support at present.
</P>
<br><a name="SEC3" href="#TOC1">MATCHING INTERFACE</a><br>
Modified: code/trunk/doc/html/pcrejit.html
===================================================================
--- code/trunk/doc/html/pcrejit.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrejit.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -45,10 +45,10 @@
</P>
<br><a name="SEC2" href="#TOC1">8-BIT and 16-BIT SUPPORT</a><br>
<P>
-JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
-this documentation simple, only the 8-bit interface is described in what
-follows. If you are using the 16-bit library, substitute the 16-bit functions
-and 16-bit structures (for example, <i>pcre16_jit_stack</i> instead of
+JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
+this documentation simple, only the 8-bit interface is described in what
+follows. If you are using the 16-bit library, substitute the 16-bit functions
+and 16-bit structures (for example, <i>pcre16_jit_stack</i> instead of
<i>pcre_jit_stack</i>).
</P>
<br><a name="SEC3" href="#TOC1">AVAILABILITY OF JIT SUPPORT</a><br>
Modified: code/trunk/doc/html/pcrematching.html
===================================================================
--- code/trunk/doc/html/pcrematching.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrematching.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -28,13 +28,13 @@
for matching a compiled regular expression against a given subject string. The
"standard" algorithm is the one provided by the <b>pcre_exec()</b> and
<b>pcre16_exec()</b> functions. These work in the same way as Perl's matching
-function, and provide a Perl-compatible matching operation. The just-in-time
+function, and provide a Perl-compatible matching operation. The just-in-time
(JIT) optimization that is described in the
<a href="pcrejit.html"><b>pcrejit</b></a>
documentation is compatible with these functions.
</P>
<P>
-An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> and
+An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> and
<b>pcre16_dfa_exec()</b> functions; they operate in a different way, and are not
Perl-compatible. This alternative has advantages and disadvantages compared
with the standard algorithm, and these are described below.
Modified: code/trunk/doc/html/pcrepartial.html
===================================================================
--- code/trunk/doc/html/pcrepartial.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrepartial.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -50,7 +50,7 @@
</P>
<P>
PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and
-PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
+PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
functions. For backwards compatibility, PCRE_PARTIAL is a synonym for
PCRE_PARTIAL_SOFT. The essential difference between the two options is whether
or not a partial match is preferred to an alternative complete match, though
@@ -70,7 +70,7 @@
</P>
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()</a><br>
<P>
-A partial match occurs during a call to <b>pcre_exec()</b> or
+A partial match occurs during a call to <b>pcre_exec()</b> or
<b>pcre16_exec()</b> when the end of the subject string is reached successfully,
but matching cannot continue because more characters are needed. However, at
least one character in the subject must have been inspected. This character
@@ -144,7 +144,8 @@
this reason, the assumption is made that the end of the supplied subject string
may not be the true end of the available data, and so, if \z, \Z, \b, \B,
or $ are encountered at the end of the subject, the result is
-PCRE_ERROR_PARTIAL.
+PCRE_ERROR_PARTIAL, provided that at least one character in the subject has
+been inspected.
</P>
<P>
Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16
@@ -294,7 +295,7 @@
<P>
You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
PCRE_DFA_RESTART to continue partial matching over multiple segments. This
-facility can be used to pass very long subject strings to the DFA matching
+facility can be used to pass very long subject strings to the DFA matching
functions.
</P>
<br><a name="SEC8" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre_exec() OR pcre16_exec()</a><br>
@@ -434,7 +435,7 @@
</P>
<br><a name="SEC11" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 08 January 2012
+Last updated: 21 January 2012
<br>
Copyright © 1997-2012 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcrepattern.html
===================================================================
--- code/trunk/doc/html/pcrepattern.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrepattern.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -65,11 +65,11 @@
second library that supports 16-bit and UTF-16 character strings. To use these
features, PCRE must be built to include appropriate support. When using UTF
strings you must either call the compiling function with the PCRE_UTF8 or
-PCRE_UTF16 option, or the pattern must start with one of these special
+PCRE_UTF16 option, or the pattern must start with one of these special
sequences:
<pre>
(*UTF8)
- (*UTF16)
+ (*UTF16)
</pre>
Starting a pattern with such a sequence is equivalent to setting the relevant
option. This feature is not Perl-compatible. How setting a UTF mode affects
@@ -292,7 +292,7 @@
16-bit non-UTF mode less than 0x10000
16-bit UTF-16 mode less than 0x10ffff and a valid codepoint
</pre>
-Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
+Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
"surrogate" codepoints).
</P>
<P>
@@ -335,7 +335,7 @@
Inside a character class, or if the decimal number is greater than 9 and there
have not been that many capturing subpatterns, PCRE re-reads up to three octal
digits following the backslash, and uses them to generate a data character. Any
-subsequent digits stand for themselves. The value of the character is
+subsequent digits stand for themselves. The value of the character is
constrained in the same way as characters specified in hexadecimal.
For example:
<pre>
@@ -503,8 +503,8 @@
U+2028 Line separator
U+2029 Paragraph separator
</pre>
-In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
-relevant.
+In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
+relevant.
<a name="newlineseq"></a></P>
<br><b>
Newline sequences
@@ -970,7 +970,7 @@
<P>
Outside a character class, a dot in the pattern matches any one character in
the subject string except (by default) a character that signifies the end of a
-line.
+line.
</P>
<P>
When a line ending is defined as a single character, dot never matches that
@@ -1103,7 +1103,7 @@
</P>
<P>
Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\000-\037]. Ranges
+used for characters specified numerically, for example [\000-\037]. Ranges
can include any characters that are valid for the current mode.
</P>
<P>
@@ -1298,8 +1298,8 @@
<br>
2. It sets up the subpattern as a capturing subpattern. This means that, when
the whole pattern matches, that portion of the subject string that matched the
-subpattern is passed back to the caller via the <i>ovector</i> argument of the
-matching function. (This applies only to the traditional matching functions;
+subpattern is passed back to the caller via the <i>ovector</i> argument of the
+matching function. (This applies only to the traditional matching functions;
the DFA matching functions do not support capturing.)
</P>
<P>
@@ -2505,7 +2505,7 @@
<P>
PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
code. The feature is called "callout". The caller of PCRE provides an external
-function by putting its entry point in the global variable <i>pcre_callout</i>
+function by putting its entry point in the global variable <i>pcre_callout</i>
(8-bit library) or <i>pcre16_callout</i> (16-bit library). By default, this
variable contains NULL, which disables all calling out.
</P>
Modified: code/trunk/doc/html/pcreposix.html
===================================================================
--- code/trunk/doc/html/pcreposix.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcreposix.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -48,7 +48,7 @@
expression 8-bit library. See the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation for a description of PCRE's native API, which contains much
-additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
+additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
library.
</P>
<P>
Modified: code/trunk/doc/html/pcreprecompile.html
===================================================================
--- code/trunk/doc/html/pcreprecompile.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcreprecompile.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -120,7 +120,7 @@
</P>
<P>
If you did not provide custom character tables when the pattern was compiled,
-the pointer in the compiled pattern is NULL, which causes the matching
+the pointer in the compiled pattern is NULL, which causes the matching
functions to use PCRE's internal tables. Thus, you do not need to take any
special action at run time in this case.
</P>
Modified: code/trunk/doc/html/pcrestack.html
===================================================================
--- code/trunk/doc/html/pcrestack.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcrestack.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -130,9 +130,9 @@
</P>
<P>
As a very rough rule of thumb, you should reckon on about 500 bytes per
-recursion. Thus, if you want to limit your stack usage to 8Mb, you
-should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
-support around 128000 recursions.
+recursion. Thus, if you want to limit your stack usage to 8Mb, you should set
+the limit at 16000 recursions. A 64Mb stack, on the other hand, can support
+around 128000 recursions.
</P>
<P>
In Unix-like environments, the <b>pcretest</b> test program has a command line
@@ -143,6 +143,32 @@
limits.
</P>
<br><b>
+Obtaining an estimate of stack usage
+</b><br>
+<P>
+The actual amount of stack used per recursion can vary quite a lot, depending
+on the compiler that was used to build PCRE and the optimization or debugging
+options that were set for it. The rule of thumb value of 500 bytes mentioned
+above may be larger or smaller than what is actually needed. A better
+approximation can be obtained by running this command:
+<pre>
+ pcretest -m -C
+</pre>
+The <b>-C</b> option causes <b>pcretest</b> to output information about the
+options with which PCRE was compiled. When <b>-m</b> is also given (before
+<b>-C</b>), information about stack use is given in a line like this:
+<pre>
+ Match recursion uses stack: approximate frame size = 640 bytes
+</pre>
+The value is approximate because some recursions need a bit more (up to perhaps
+16 more bytes).
+</P>
+<P>
+If the above command is given when PCRE is compiled to use the heap instead of
+the stack for recursion, the value that is output is the size of each block
+that is obtained from the heap.
+</P>
+<br><b>
Changing stack size in Unix-like systems
</b><br>
<P>
@@ -190,7 +216,7 @@
REVISION
</b><br>
<P>
-Last updated: 10 January 2012
+Last updated: 21 January 2012
<br>
Copyright © 1997-2012 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcresyntax.html
===================================================================
--- code/trunk/doc/html/pcresyntax.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcresyntax.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -448,12 +448,12 @@
<pre>
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
- (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
+ (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
(*SKIP) advance to current matching position
(*SKIP:NAME) advance to position corresponding to an earlier
- (*MARK:NAME); if not found, the (*SKIP) is ignored
+ (*MARK:NAME); if not found, the (*SKIP) is ignored
(*THEN) local failure, backtrack to next alternation
- (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
+ (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
</PRE>
</P>
<br><a name="SEC22" href="#TOC1">NEWLINE CONVENTIONS</a><br>
Modified: code/trunk/doc/html/pcretest.html
===================================================================
--- code/trunk/doc/html/pcretest.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcretest.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -52,26 +52,26 @@
</P>
<br><a name="SEC2" href="#TOC1">PCRE's 8-BIT and 16-BIT LIBRARIES</a><br>
<P>
-From release 8.30, two separate PCRE libraries can be built. The original one
-supports 8-bit character strings, whereas the newer 16-bit library supports
-character strings encoded in 16-bit units. The <b>pcretest</b> program can be
+From release 8.30, two separate PCRE libraries can be built. The original one
+supports 8-bit character strings, whereas the newer 16-bit library supports
+character strings encoded in 16-bit units. The <b>pcretest</b> program can be
used to test both libraries. However, it is itself still an 8-bit program,
reading 8-bit input and writing 8-bit output. When testing the 16-bit library,
the patterns and data strings are converted to 16-bit format before being
-passed to the PCRE library functions. Results are converted to 8-bit for
+passed to the PCRE library functions. Results are converted to 8-bit for
output.
</P>
<P>
-References to functions and structures of the form <b>pcre[16]_xx</b> below
-mean "<b>pcre_xx</b> when using the 8-bit library or <b>pcre16_xx</b> when using
+References to functions and structures of the form <b>pcre[16]_xx</b> below
+mean "<b>pcre_xx</b> when using the 8-bit library or <b>pcre16_xx</b> when using
the 16-bit library".
</P>
<br><a name="SEC3" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P>
<b>-16</b>
-If both the 8-bit and the 16-bit libraries have been built, this option causes
-the 16-bit library to be used. If only the 16-bit library has been built, this
-is the default (so has no effect). If only the 8-bit library has been built,
+If both the 8-bit and the 16-bit libraries have been built, this option causes
+the 16-bit library to be used. If only the 16-bit library has been built, this
+is the default (so has no effect). If only the 8-bit library has been built,
this option causes an error.
</P>
<P>
@@ -82,25 +82,25 @@
<P>
<b>-C</b>
Output the version number of the PCRE library, and all available information
-about the optional features that are included, and then exit. All other options
+about the optional features that are included, and then exit. All other options
are ignored.
</P>
<P>
<b>-C</b> <i>option</i>
-Output information about a specific build-time option, then exit. This
-functionality is intended for use in scripts such as <b>RunTest</b>. The
+Output information about a specific build-time option, then exit. This
+functionality is intended for use in scripts such as <b>RunTest</b>. The
following options output the value indicated:
<pre>
linksize the internal link size (2, 3, or 4)
- newline the default newline setting:
- CR, LF, CRLF, ANYCRLF, or ANY
+ newline the default newline setting:
+ CR, LF, CRLF, ANYCRLF, or ANY
</pre>
The following options output 1 for true or zero for false:
<pre>
jit just-in-time support is available
pcre16 the 16-bit library was built
pcre8 the 8-bit library was built
- ucp Unicode property support is available
+ ucp Unicode property support is available
utf UTF-8 and/or UTF-16 support is available
</PRE>
</P>
@@ -134,7 +134,7 @@
<P>
<b>-m</b>
Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding <b>/M</b> to each regular expression. The size is given in
+equivalent to adding <b>/M</b> to each regular expression. The size is given in
bytes for both libraries.
</P>
<P>
@@ -172,7 +172,7 @@
neither <b>-i</b> nor <b>-d</b> is present on the command line. This behaviour
means that the output from tests that are run with and without <b>-s</b> should
be identical, except when options that output information about the actual
-running of a match are set.
+running of a match are set.
<br>
<br>
The <b>-M</b>, <b>-t</b>, and <b>-tm</b> options, which give information about
@@ -276,7 +276,7 @@
The following table shows additional modifiers for setting PCRE compile-time
options that do not correspond to anything in Perl:
<pre>
- <b>/8</b> PCRE_UTF8 ) when using the 8-bit
+ <b>/8</b> PCRE_UTF8 ) when using the 8-bit
<b>/?</b> PCRE_NO_UTF8_CHECK ) library
<b>/8</b> PCRE_UTF16 ) when using the 16-bit
@@ -309,7 +309,7 @@
</pre>
As well as turning on the PCRE_UTF8/16 option, the <b>/8</b> modifier causes
all non-printing characters in output strings to be printed using the
-\x{hh...} notation. Otherwise, those less than 0x100 are output in hex without
+\x{hh...} notation. Otherwise, those less than 0x100 are output in hex without
the curly brackets.
</P>
<P>
@@ -661,7 +661,7 @@
2: b
</pre>
If the strings contain any non-printing characters, they are output as \xhh
-escapes if the value is less than 256 and UTF mode is not set. Otherwise they
+escapes if the value is less than 256 and UTF mode is not set. Otherwise they
are output as \x{hh...} escapes. See below for the definition of non-printing
characters. If the pattern has the <b>/+</b> modifier, the output for substring
0 is followed by the rest of the subject string, identified by "0+" like
@@ -881,15 +881,15 @@
You can copy a file written by <b>pcretest</b> to a different host and reload it
there, even if the new host has opposite endianness to the one on which the
pattern was compiled. For example, you can compile on an i86 machine and run on
-a SPARC machine. When a pattern is reloaded on a host with different
+a SPARC machine. When a pattern is reloaded on a host with different
endianness, the confirmation message is changed to:
<pre>
Compiled pattern (byte-inverted) loaded from /some/file
</pre>
-The test suite contains some saved pre-compiled patterns with different
-endianness. These are reloaded using "<!" instead of just "<". This suppresses
-the "(byte-inverted)" text so that the output is the same on all hosts. It also
-forces debugging output once the pattern has been reloaded.
+The test suite contains some saved pre-compiled patterns with different
+endianness. These are reloaded using "<!" instead of just "<". This suppresses
+the "(byte-inverted)" text so that the output is the same on all hosts. It also
+forces debugging output once the pattern has been reloaded.
</P>
<P>
File names for saving and reloading can be absolute or relative, but note that
Modified: code/trunk/doc/html/pcreunicode.html
===================================================================
--- code/trunk/doc/html/pcreunicode.html 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/html/pcreunicode.html 2012-01-21 16:37:17 UTC (rev 903)
@@ -17,7 +17,7 @@
</b><br>
<P>
From Release 8.30, in addition to its previous UTF-8 support, PCRE also
-supports UTF-16 by means of a separate 16-bit library. This can be built as
+supports UTF-16 by means of a separate 16-bit library. This can be built as
well as, or instead of, the 8-bit library.
</P>
<br><b>
@@ -82,7 +82,7 @@
</P>
<P>
The excluded code points are the "Surrogate Area" of Unicode. They are reserved
-for use by UTF-16, where they are used in pairs to encode codepoints with
+for use by UTF-16, where they are used in pairs to encode codepoints with
values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
are available independently in the UTF-8 encoding. (In other words, the whole
surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
@@ -161,7 +161,7 @@
data units, for example: \x{100}{3}.
</P>
<P>
-4. The dot metacharacter matches one UTF character instead of a single data
+4. The dot metacharacter matches one UTF character instead of a single data
unit.
</P>
<P>
@@ -179,7 +179,7 @@
<P>
6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
test characters of any code value, but, by default, the characters that PCRE
-recognizes as digits, spaces, or word characters remain the same set as in
+recognizes as digits, spaces, or word characters remain the same set as in
non-UTF mode, all with values less than 256. This remains true even when PCRE
is built to include Unicode property support, because to do otherwise would
slow down PCRE in many common cases. Note in particular that this applies to
Modified: code/trunk/doc/pcre-config.1
===================================================================
--- code/trunk/doc/pcre-config.1 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre-config.1 2012-01-21 16:37:17 UTC (rev 903)
@@ -6,7 +6,7 @@
.sp
.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
.ti +5n
-.B [--libs16] [--libs-cpp] [--libs-posix] [--cflags]
+.B [--libs16] [--libs-cpp] [--libs-posix] [--cflags]
.ti +5n
.B [--cflags-posix]
.
@@ -15,9 +15,9 @@
.rs
.sp
\fBpcre-config\fP returns the configuration of the installed PCRE
-libraries and the options required to compile a program to use them. Some of
-the options apply only to the 8-bit or 16-bit libraries, respectively, and are
-not available if only one of those libraries has been built. If an unavailable
+libraries and the options required to compile a program to use them. Some of
+the options apply only to the 8-bit or 16-bit libraries, respectively, and are
+not available if only one of those libraries has been built. If an unavailable
option is encountered, the "usage" information is output.
.
.
Modified: code/trunk/doc/pcre.3
===================================================================
--- code/trunk/doc/pcre.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -11,11 +11,11 @@
support for one or two .NET and Oniguruma syntax items, and there is an option
for requesting some minor changes that give better JavaScript compatibility.
.P
-Starting with release 8.30, it is possible to compile two separate PCRE
+Starting with release 8.30, it is possible to compile two separate PCRE
libraries: the original, which supports 8-bit character strings (including
UTF-8 strings), and a second library that supports 16-bit character strings
(including UTF-16 strings). The build process allows either one or both to be
-built. The majority of the work to make this possible was done by Zoltan
+built. The majority of the work to make this possible was done by Zoltan
Herczeg.
.P
The two libraries contain identical sets of functions, except that the names in
@@ -26,8 +26,8 @@
.\" HREF
\fBpcre16\fP
.\"
-page. References to functions or structures of the form \fIpcre[16]_xxx\fP
-should be read as meaning "\fIpcre_xxx\fP when using the 8-bit library and
+page. References to functions or structures of the form \fIpcre[16]_xxx\fP
+should be read as meaning "\fIpcre_xxx\fP when using the 8-bit library and
\fIpcre16_xxx\fP when using the 16-bit library".
.P
The current implementation of PCRE corresponds approximately with Perl 5.12,
@@ -106,7 +106,7 @@
of searching. The sections are as follows:
.sp
pcre this document
- pcre16 details of the 16-bit library
+ pcre16 details of the 16-bit library
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
Modified: code/trunk/doc/pcre.txt
===================================================================
--- code/trunk/doc/pcre.txt 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre.txt 2012-01-21 16:37:17 UTC (rev 903)
@@ -138,8 +138,8 @@
Last updated: 10 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE(3) PCRE(3)
@@ -463,8 +463,8 @@
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREBUILD(3) PCREBUILD(3)
@@ -859,8 +859,8 @@
Last updated: 07 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREMATCHING(3) PCREMATCHING(3)
@@ -1066,8 +1066,8 @@
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREAPI(3) PCREAPI(3)
@@ -1405,6 +1405,14 @@
The output is an integer that is set to one if support for just-in-time
compiling is available; otherwise it is set to zero.
+ PCRE_CONFIG_JITTARGET
+
+ The output is a pointer to a zero-terminated "const char *" string. If
+ JIT support is available, the string contains the name of the architec-
+ ture for which the JIT compiler is configured, for example "x86 32bit
+ (little endian + unaligned)". If JIT support is not available, the
+ result is NULL.
+
PCRE_CONFIG_NEWLINE
The output is an integer whose value specifies the default character
@@ -3255,6 +3263,31 @@
matches, pcre_exec() will yield PCRE_ERROR_NOMATCH.
+OBTAINING AN ESTIMATE OF STACK USAGE
+
+ Matching certain patterns using pcre_exec() can use a lot of process
+ stack, which in certain environments can be rather limited in size.
+ Some users find it helpful to have an estimate of the amount of stack
+ that is used by pcre_exec(), to help them set recursion limits, as
+ described in the pcrestack documentation. The estimate that is output
+ by pcretest when called with the -m and -C options is obtained by call-
+ ing pcre_exec with the values NULL, NULL, NULL, -999, and -999 for its
+ first five arguments.
+
+ Normally, if its first argument is NULL, pcre_exec() immediately
+ returns the negative error code PCRE_ERROR_NULL, but with this special
+ combination of arguments, it returns instead a negative number whose
+ absolute value is the approximate stack frame size in bytes. (A nega-
+ tive number is used so that it is clear that no match has happened.)
+ The value is approximate because in some cases, recursive calls to
+ pcre_exec() occur when there are one or two additional variables on the
+ stack.
+
+ If PCRE has been compiled to use the heap instead of the stack for
+ recursion, the value returned is the size of each block that is
+ obtained from the heap.
+
+
MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
@@ -3436,11 +3469,11 @@
REVISION
- Last updated: 07 January 2012
+ Last updated: 21 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECALLOUT(3) PCRECALLOUT(3)
@@ -3638,8 +3671,8 @@
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECOMPAT(3) PCRECOMPAT(3)
@@ -3813,8 +3846,8 @@
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPATTERN(3) PCREPATTERN(3)
@@ -6418,8 +6451,8 @@
Last updated: 09 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESYNTAX(3) PCRESYNTAX(3)
@@ -6794,8 +6827,8 @@
Last updated: 10 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREUNICODE(3) PCREUNICODE(3)
@@ -6992,8 +7025,8 @@
Last updated: 13 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREJIT(3) PCREJIT(3)
@@ -7348,8 +7381,8 @@
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPARTIAL(3) PCREPARTIAL(3)
@@ -7469,111 +7502,112 @@
plete match. For this reason, the assumption is made that the end of
the supplied subject string may not be the true end of the available
data, and so, if \z, \Z, \b, \B, or $ are encountered at the end of the
- subject, the result is PCRE_ERROR_PARTIAL.
+ subject, the result is PCRE_ERROR_PARTIAL, provided that at least one
+ character in the subject has been inspected.
Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16 subject
- strings are checked for validity. Normally, an invalid sequence causes
- the error PCRE_ERROR_BADUTF8 or PCRE_ERROR_BADUTF16. However, in the
- special case of a truncated character at the end of the subject,
- PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_SHORTUTF16 is returned when
+ strings are checked for validity. Normally, an invalid sequence causes
+ the error PCRE_ERROR_BADUTF8 or PCRE_ERROR_BADUTF16. However, in the
+ special case of a truncated character at the end of the subject,
+ PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_SHORTUTF16 is returned when
PCRE_PARTIAL_HARD is set.
Comparing hard and soft partial matching
- The difference between the two partial matching options can be illus-
+ The difference between the two partial matching options can be illus-
trated by a pattern such as:
/dog(sbody)?/
- This matches either "dog" or "dogsbody", greedily (that is, it prefers
- the longer string if possible). If it is matched against the string
- "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog".
+ This matches either "dog" or "dogsbody", greedily (that is, it prefers
+ the longer string if possible). If it is matched against the string
+ "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog".
However, if PCRE_PARTIAL_HARD is set, the result is PCRE_ERROR_PARTIAL.
- On the other hand, if the pattern is made ungreedy the result is dif-
+ On the other hand, if the pattern is made ungreedy the result is dif-
ferent:
/dog(sbody)??/
- In this case the result is always a complete match because that is
- found first, and matching never continues after finding a complete
+ In this case the result is always a complete match because that is
+ found first, and matching never continues after finding a complete
match. It might be easier to follow this explanation by thinking of the
two patterns like this:
/dog(sbody)?/ is the same as /dogsbody|dog/
/dog(sbody)??/ is the same as /dog|dogsbody/
- The second pattern will never match "dogsbody", because it will always
+ The second pattern will never match "dogsbody", because it will always
find the shorter match first.
PARTIAL MATCHING USING pcre_dfa_exec() OR pcre16_dfa_exec()
The DFA functions move along the subject string character by character,
- without backtracking, searching for all possible matches simultane-
- ously. If the end of the subject is reached before the end of the pat-
- tern, there is the possibility of a partial match, again provided that
+ without backtracking, searching for all possible matches simultane-
+ ously. If the end of the subject is reached before the end of the pat-
+ tern, there is the possibility of a partial match, again provided that
at least one character has been inspected.
- When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if
- there have been no complete matches. Otherwise, the complete matches
- are returned. However, if PCRE_PARTIAL_HARD is set, a partial match
- takes precedence over any complete matches. The portion of the string
- that was inspected when the longest partial match was found is set as
+ When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if
+ there have been no complete matches. Otherwise, the complete matches
+ are returned. However, if PCRE_PARTIAL_HARD is set, a partial match
+ takes precedence over any complete matches. The portion of the string
+ that was inspected when the longest partial match was found is set as
the first matching string, provided there are at least two slots in the
offsets vector.
- Because the DFA functions always search for all possible matches, and
- there is no difference between greedy and ungreedy repetition, their
- behaviour is different from the standard functions when PCRE_PAR-
- TIAL_HARD is set. Consider the string "dog" matched against the
+ Because the DFA functions always search for all possible matches, and
+ there is no difference between greedy and ungreedy repetition, their
+ behaviour is different from the standard functions when PCRE_PAR-
+ TIAL_HARD is set. Consider the string "dog" matched against the
ungreedy pattern shown above:
/dog(sbody)??/
- Whereas the standard functions stop as soon as they find the complete
- match for "dog", the DFA functions also find the partial match for
+ Whereas the standard functions stop as soon as they find the complete
+ match for "dog", the DFA functions also find the partial match for
"dogsbody", and so return that when PCRE_PARTIAL_HARD is set.
PARTIAL MATCHING AND WORD BOUNDARIES
- If a pattern ends with one of sequences \b or \B, which test for word
- boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter-
+ If a pattern ends with one of the sequences \b or \B, which test for word
+ boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter-
intuitive results. Consider this pattern:
/\bcat\b/
This matches "cat", provided there is a word boundary at either end. If
the subject string is "the cat", the comparison of the final "t" with a
- following character cannot take place, so a partial match is found.
- However, normal matching carries on, and \b matches at the end of the
- subject when the last character is a letter, so a complete match is
- found. The result, therefore, is not PCRE_ERROR_PARTIAL. Using
- PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL, because
+ following character cannot take place, so a partial match is found.
+ However, normal matching carries on, and \b matches at the end of the
+ subject when the last character is a letter, so a complete match is
+ found. The result, therefore, is not PCRE_ERROR_PARTIAL. Using
+ PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL, because
then the partial match takes precedence.
FORMERLY RESTRICTED PATTERNS
For releases of PCRE prior to 8.00, because of the way certain internal
- optimizations were implemented in the pcre_exec() function, the
- PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be
- used with all patterns. From release 8.00 onwards, the restrictions no
- longer apply, and partial matching with can be requested for any pat-
+ optimizations were implemented in the pcre_exec() function, the
+ PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be
+ used with all patterns. From release 8.00 onwards, the restrictions no
+ longer apply, and partial matching can be requested for any pat-
tern.
Items that were formerly restricted were repeated single characters and
- repeated metasequences. If PCRE_PARTIAL was set for a pattern that did
- not conform to the restrictions, pcre_exec() returned the error code
- PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The
- PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled
+ repeated metasequences. If PCRE_PARTIAL was set for a pattern that did
+ not conform to the restrictions, pcre_exec() returned the error code
+ PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The
+ PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled
pattern can be used for partial matching now always returns 1.
EXAMPLE OF PARTIAL MATCHING USING PCRETEST
- If the escape sequence \P is present in a pcretest data line, the
- PCRE_PARTIAL_SOFT option is used for the match. Here is a run of
+ If the escape sequence \P is present in a pcretest data line, the
+ PCRE_PARTIAL_SOFT option is used for the match. Here is a run of
pcretest that uses the date example quoted above:
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -7589,24 +7623,24 @@
data> j\P
No match
- The first data string is matched completely, so pcretest shows the
- matched substrings. The remaining four strings do not match the com-
+ The first data string is matched completely, so pcretest shows the
+ matched substrings. The remaining four strings do not match the com-
plete pattern, but the first two are partial matches. Similar output is
obtained if DFA matching is used.
- If the escape sequence \P is present more than once in a pcretest data
+ If the escape sequence \P is present more than once in a pcretest data
line, the PCRE_PARTIAL_HARD option is set for the match.
MULTI-SEGMENT MATCHING WITH pcre_dfa_exec() OR pcre16_dfa_exec()
- When a partial match has been found using a DFA matching function, it
- is possible to continue the match by providing additional subject data
- and calling the function again with the same compiled regular expres-
- sion, this time setting the PCRE_DFA_RESTART option. You must pass the
+ When a partial match has been found using a DFA matching function, it
+ is possible to continue the match by providing additional subject data
+ and calling the function again with the same compiled regular expres-
+ sion, this time setting the PCRE_DFA_RESTART option. You must pass the
same working space as before, because this is where details of the pre-
- vious partial match are stored. Here is an example using pcretest,
- using the \R escape sequence to set the PCRE_DFA_RESTART option (\D
+ vious partial match are stored. Here is an example using pcretest,
+ using the \R escape sequence to set the PCRE_DFA_RESTART option (\D
specifies the use of the DFA matching function):
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -7615,47 +7649,47 @@
data> n05\R\D
0: n05
- The first call has "23ja" as the subject, and requests partial match-
- ing; the second call has "n05" as the subject for the continued
- (restarted) match. Notice that when the match is complete, only the
- last part is shown; PCRE does not retain the previously partially-
- matched string. It is up to the calling program to do that if it needs
+ The first call has "23ja" as the subject, and requests partial match-
+ ing; the second call has "n05" as the subject for the continued
+ (restarted) match. Notice that when the match is complete, only the
+ last part is shown; PCRE does not retain the previously partially-
+ matched string. It is up to the calling program to do that if it needs
to.
- You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
- PCRE_DFA_RESTART to continue partial matching over multiple segments.
- This facility can be used to pass very long subject strings to the DFA
+ You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
+ PCRE_DFA_RESTART to continue partial matching over multiple segments.
+ This facility can be used to pass very long subject strings to the DFA
matching functions.
MULTI-SEGMENT MATCHING WITH pcre_exec() OR pcre16_exec()
- From release 8.00, the standard matching functions can also be used to
+ From release 8.00, the standard matching functions can also be used to
do multi-segment matching. Unlike the DFA functions, it is not possible
- to restart the previous match with a new segment of data. Instead, new
+ to restart the previous match with a new segment of data. Instead, new
data must be added to the previous subject string, and the entire match
- re-run, starting from the point where the partial match occurred. Ear-
+ re-run, starting from the point where the partial match occurred. Ear-
lier data can be discarded.
- It is best to use PCRE_PARTIAL_HARD in this situation, because it does
- not treat the end of a segment as the end of the subject when matching
- \z, \Z, \b, \B, and $. Consider an unanchored pattern that matches
+ It is best to use PCRE_PARTIAL_HARD in this situation, because it does
+ not treat the end of a segment as the end of the subject when matching
+ \z, \Z, \b, \B, and $. Consider an unanchored pattern that matches
dates:
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
data> The date is 23ja\P\P
Partial match: 23ja
- At this stage, an application could discard the text preceding "23ja",
- add on text from the next segment, and call the matching function
- again. Unlike the DFA matching functions the entire matching string
- must always be available, and the complete matching process occurs for
+ At this stage, an application could discard the text preceding "23ja",
+ add on text from the next segment, and call the matching function
+ again. Unlike the DFA matching functions the entire matching string
+ must always be available, and the complete matching process occurs for
each call, so more memory and more processing time are needed.
- Note: If the pattern contains lookbehind assertions, or \K, or starts
+ Note: If the pattern contains lookbehind assertions, or \K, or starts
with \b or \B, the string that is returned for a partial match includes
- characters that precede the partially matched string itself, because
- these must be retained when adding on more characters for a subsequent
+ characters that precede the partially matched string itself, because
+ these must be retained when adding on more characters for a subsequent
matching attempt.
@@ -7665,28 +7699,28 @@
whichever matching function is used.
1. If the pattern contains a test for the beginning of a line, you need
- to pass the PCRE_NOTBOL option when the subject string for any call
- does start at the beginning of a line. There is also a PCRE_NOTEOL
+ to pass the PCRE_NOTBOL option when the subject string for any call
+ does not start at the beginning of a line. There is also a PCRE_NOTEOL
option, but in practice when doing multi-segment matching you should be
using PCRE_PARTIAL_HARD, which includes the effect of PCRE_NOTEOL.
- 2. Lookbehind assertions at the start of a pattern are catered for in
- the offsets that are returned for a partial match. However, in theory,
- a lookbehind assertion later in the pattern could require even earlier
- characters to be inspected, and it might not have been reached when a
- partial match occurs. This is probably an extremely unlikely case; you
- could guard against it to a certain extent by always including extra
+ 2. Lookbehind assertions at the start of a pattern are catered for in
+ the offsets that are returned for a partial match. However, in theory,
+ a lookbehind assertion later in the pattern could require even earlier
+ characters to be inspected, and it might not have been reached when a
+ partial match occurs. This is probably an extremely unlikely case; you
+ could guard against it to a certain extent by always including extra
characters at the start.
- 3. Matching a subject string that is split into multiple segments may
- not always produce exactly the same result as matching over one single
- long string, especially when PCRE_PARTIAL_SOFT is used. The section
- "Partial Matching and Word Boundaries" above describes an issue that
- arises if the pattern ends with \b or \B. Another kind of difference
- may occur when there are multiple matching possibilities, because (for
- PCRE_PARTIAL_SOFT) a partial match result is given only when there are
+ 3. Matching a subject string that is split into multiple segments may
+ not always produce exactly the same result as matching over one single
+ long string, especially when PCRE_PARTIAL_SOFT is used. The section
+ "Partial Matching and Word Boundaries" above describes an issue that
+ arises if the pattern ends with \b or \B. Another kind of difference
+ may occur when there are multiple matching possibilities, because (for
+ PCRE_PARTIAL_SOFT) a partial match result is given only when there are
no completed matches. This means that as soon as the shortest match has
- been found, continuation to a new subject segment is no longer possi-
+ been found, continuation to a new subject segment is no longer possi-
ble. Consider again this pcretest example:
re> /dog(sbody)?/
@@ -7700,18 +7734,18 @@
0: dogsbody
1: dog
- The first data line passes the string "dogsb" to a standard matching
- function, setting the PCRE_PARTIAL_SOFT option. Although the string is
- a partial match for "dogsbody", the result is not PCRE_ERROR_PARTIAL,
- because the shorter string "dog" is a complete match. Similarly, when
- the subject is presented to a DFA matching function in several parts
- ("do" and "gsb" being the first two) the match stops when "dog" has
- been found, and it is not possible to continue. On the other hand, if
- "dogsbody" is presented as a single string, a DFA matching function
+ The first data line passes the string "dogsb" to a standard matching
+ function, setting the PCRE_PARTIAL_SOFT option. Although the string is
+ a partial match for "dogsbody", the result is not PCRE_ERROR_PARTIAL,
+ because the shorter string "dog" is a complete match. Similarly, when
+ the subject is presented to a DFA matching function in several parts
+ ("do" and "gsb" being the first two) the match stops when "dog" has
+ been found, and it is not possible to continue. On the other hand, if
+ "dogsbody" is presented as a single string, a DFA matching function
finds both matches.
- Because of these problems, it is best to use PCRE_PARTIAL_HARD when
- matching multi-segment data. The example above then behaves differ-
+ Because of these problems, it is best to use PCRE_PARTIAL_HARD when
+ matching multi-segment data. The example above then behaves differ-
ently:
re> /dog(sbody)?/
@@ -7723,25 +7757,25 @@
Partial match: gsb
4. Patterns that contain alternatives at the top level which do not all
- start with the same pattern item may not work as expected when
+ start with the same pattern item may not work as expected when
PCRE_DFA_RESTART is used. For example, consider this pattern:
1234|3789
- If the first part of the subject is "ABC123", a partial match of the
- first alternative is found at offset 3. There is no partial match for
+ If the first part of the subject is "ABC123", a partial match of the
+ first alternative is found at offset 3. There is no partial match for
the second alternative, because such a match does not start at the same
- point in the subject string. Attempting to continue with the string
- "7890" does not yield a match because only those alternatives that
- match at one point in the subject are remembered. The problem arises
- because the start of the second alternative matches within the first
- alternative. There is no problem with anchored patterns or patterns
+ point in the subject string. Attempting to continue with the string
+ "7890" does not yield a match because only those alternatives that
+ match at one point in the subject are remembered. The problem arises
+ because the start of the second alternative matches within the first
+ alternative. There is no problem with anchored patterns or patterns
such as:
1234|ABCD
- where no string can be a partial match for both alternatives. This is
- not a problem if a standard matching function is used, because the
+ where no string can be a partial match for both alternatives. This is
+ not a problem if a standard matching function is used, because the
entire match has to be rerun each time:
re> /1234|3789/
@@ -7751,10 +7785,10 @@
0: 3789
Of course, instead of using PCRE_DFA_RESTART, the same technique of re-
- running the entire match can also be used with the DFA matching func-
- tions. Another possibility is to work with two buffers. If a partial
- match at offset n in the first buffer is followed by "no match" when
- PCRE_DFA_RESTART is used on the second buffer, you can then try a new
+ running the entire match can also be used with the DFA matching func-
+ tions. Another possibility is to work with two buffers. If a partial
+ match at offset n in the first buffer is followed by "no match" when
+ PCRE_DFA_RESTART is used on the second buffer, you can then try a new
match starting at offset n+1 in the first buffer.
@@ -7767,11 +7801,11 @@
REVISION
- Last updated: 08 January 2012
+ Last updated: 21 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPRECOMPILE(3) PCREPRECOMPILE(3)
@@ -7905,8 +7939,8 @@
Last updated: 10 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPERFORM(3) PCREPERFORM(3)
@@ -8075,8 +8109,8 @@
Last updated: 09 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPOSIX(3) PCREPOSIX(3)
@@ -8339,8 +8373,8 @@
Last updated: 09 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECPP(3) PCRECPP(3)
@@ -8681,8 +8715,8 @@
Last updated: 08 January 2012
------------------------------------------------------------------------------
-
-
+
+
PCRESAMPLE(3) PCRESAMPLE(3)
@@ -8825,8 +8859,8 @@
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESTACK(3) PCRESTACK(3)
@@ -8944,6 +8978,30 @@
subject string. This is done by calling pcre[16]_exec() repeatedly with
different limits.
+ Obtaining an estimate of stack usage
+
+ The actual amount of stack used per recursion can vary quite a lot,
+ depending on the compiler that was used to build PCRE and the optimiza-
+ tion or debugging options that were set for it. The rule of thumb value
+ of 500 bytes mentioned above may be larger or smaller than what is
+ actually needed. A better approximation can be obtained by running this
+ command:
+
+ pcretest -m -C
+
+ The -C option causes pcretest to output information about the options
+ with which PCRE was compiled. When -m is also given (before -C), infor-
+ mation about stack use is given in a line like this:
+
+ Match recursion uses stack: approximate frame size = 640 bytes
+
+ The value is approximate because some recursions need a bit more (up to
+ perhaps 16 more bytes).
+
+ If the above command is given when PCRE is compiled to use the heap
+ instead of the stack for recursion, the value that is output is the
+ size of each block that is obtained from the heap.
+
Changing stack size in Unix-like systems
In Unix-like environments, there is not often a problem with the stack
@@ -8983,8 +9041,8 @@
REVISION
- Last updated: 10 January 2012
+ Last updated: 21 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
Modified: code/trunk/doc/pcre16.3
===================================================================
--- code/trunk/doc/pcre16.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre16.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -139,7 +139,7 @@
.sp
.B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP,
.ti +5n
-.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP,
+.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP,
.ti +5n
.B int \fIkeep_boms\fP);
.
@@ -158,8 +158,8 @@
to the 16-bit library. This page describes what is different when you use the
16-bit library.
.P
-WARNING: A single application can be linked with both libraries, but you must
-take care when processing any particular pattern to use functions from just one
+WARNING: A single application can be linked with both libraries, but you must
+take care when processing any particular pattern to use functions from just one
library. For example, if you want to study a pattern that was compiled with
\fBpcre16_compile()\fP, you must do so with \fBpcre16_study()\fP, not
\fBpcre_study()\fP, and you must free the study data with
@@ -169,7 +169,7 @@
.SH "THE HEADER FILE"
.rs
.sp
-There is only one header file, \fBpcre.h\fP. It contains prototypes for all the
+There is only one header file, \fBpcre.h\fP. It contains prototypes for all the
functions in both libraries, as well as definitions of flags, structures, error
codes, etc.
.
@@ -177,34 +177,34 @@
.SH "THE LIBRARY NAME"
.rs
.sp
-In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can
-normally be accesss by adding \fB-lpcre16\fP to the command for linking an
+In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can
+normally be accessed by adding \fB-lpcre16\fP to the command for linking an
application that uses PCRE.
.
.
.SH "STRING TYPES"
.rs
.sp
-In the 8-bit library, strings are passed to PCRE library functions as vectors
-of bytes with the C type "char *". In the 16-bit library, strings are passed as
-vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
-appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
-very many environments, "short int" is a 16-bit data type. When PCRE is built,
-it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
-data type. If it is not, the build fails with an error message telling the
+In the 8-bit library, strings are passed to PCRE library functions as vectors
+of bytes with the C type "char *". In the 16-bit library, strings are passed as
+vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
+appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
+very many environments, "short int" is a 16-bit data type. When PCRE is built,
+it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
+data type. If it is not, the build fails with an error message telling the
maintainer to modify the definition appropriately.
.
.
.SH "STRUCTURE TYPES"
.rs
.sp
-The types of the opaque structures that are used for compiled 16-bit patterns
-and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The
-type of the user-accessible structure that is returned by \fBpcre16_study()\fP
+The types of the opaque structures that are used for compiled 16-bit patterns
+and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The
+type of the user-accessible structure that is returned by \fBpcre16_study()\fP
is \fBpcre16_extra\fP, and the type of the structure that is used for passing
-data to a callout function is \fBpcre16_callout_block\fP. These structures
-contain the same fields, with the same names, as their 8-bit counterparts. The
-only difference is that pointers to character strings are 16-bit instead of
+data to a callout function is \fBpcre16_callout_block\fP. These structures
+contain the same fields, with the same names, as their 8-bit counterparts. The
+only difference is that pointers to character strings are 16-bit instead of
8-bit types.
.
.
@@ -212,31 +212,31 @@
.rs
.sp
For every function in the 8-bit library there is a corresponding function in
-the 16-bit library with a name that starts with \fBpcre16_\fP instead of
+the 16-bit library with a name that starts with \fBpcre16_\fP instead of
\fBpcre_\fP. The prototypes are listed above. In addition, there is one extra
-function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function
-that converts a UTF-16 character string to host byte order if necessary. The
-other 16-bit functions expect the strings they are passed to be in host byte
-order.
+function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function
+that converts a UTF-16 character string to host byte order if necessary. The
+other 16-bit functions expect the strings they are passed to be in host byte
+order.
.P
The \fIinput\fP and \fIoutput\fP arguments of
-\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is,
-conversion in place is supported. The output buffer must be at least as long as
+\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is,
+conversion in place is supported. The output buffer must be at least as long as
the input.
.P
The \fIlength\fP argument specifies the number of 16-bit data units in the
input string; a negative value specifies a zero-terminated string.
.P
-If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host
+If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
string (commonly as the first character).
.P
-If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it
-points means that the input starts off in host byte order, otherwise the
-opposite order is assumed. Again, BOMs in the string can change this. The final
-byte order is passed back at the end of processing.
+If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it
+points means that the input starts off in host byte order, otherwise the
+opposite order is assumed. Again, BOMs in the string can change this. The final
+byte order is passed back at the end of processing.
.P
-If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied
+If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied
into the output string. Otherwise they are discarded.
.P
The result of the function is the number of 16-bit units placed into the output
@@ -246,16 +246,16 @@
.SH "SUBJECT STRING OFFSETS"
.rs
.sp
-The offsets within subject strings that are returned by the matching functions
+The offsets within subject strings that are returned by the matching functions
are in 16-bit units rather than bytes.
.
.
.SH "NAMED SUBPATTERNS"
.rs
.sp
-The name-to-number translation table that is maintained for named subpatterns
-uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function
-returns the length of each entry in the table as the number of 16-bit data
+The name-to-number translation table that is maintained for named subpatterns
+uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function
+returns the length of each entry in the table as the number of 16-bit data
units.
.
.
@@ -266,7 +266,7 @@
which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In
fact, these new options define the same bits in the options word.
.P
-For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16
+For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16
that returns 1 if UTF-16 support is configured, otherwise 0. If this option is
given to \fBpcre_config()\fP, or if the PCRE_CONFIG_UTF8 option is given to
\fBpcre16_config()\fP, the result is the PCRE_ERROR_BADOPTION error.
@@ -275,18 +275,18 @@
.SH "CHARACTER CODES"
.rs
.sp
-In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
-same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
-from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
-0xff can therefore be influenced by the locale in the same way as before.
-Characters greater than 0xff have only one case, and no "type" (such as letter
+In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
+same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
+from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
+0xff can therefore be influenced by the locale in the same way as before.
+Characters greater than 0xff have only one case, and no "type" (such as letter
or digit).
.P
-In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
-the exception of values in the range 0xd800 to 0xdfff because those are
+In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
+the exception of values in the range 0xd800 to 0xdfff because those are
"surrogate" values that are used in pairs to encode values greater than 0xffff.
.P
-A UTF-16 string can indicate its endianness by special code knows as a
+A UTF-16 string can indicate its endianness by a special code known as a
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
to be in host byte order. A utility function called
\fBpcre16_utf16_to_host_byte_order()\fP is provided to help with this (see
@@ -296,20 +296,20 @@
.SH "ERROR NAMES"
.rs
.sp
-The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
+The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled
pattern is passed to a function that processes patterns in the other
-mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to
+mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to
\fBpcre16_exec()\fP.
.P
There are new error codes whose names begin with PCRE_UTF16_ERR for invalid
-UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
+UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
are described in the section entitled
.\" HTML <a href="pcreapi.html#badutf8reasons">
.\" </a>
"Reason codes for invalid UTF-8 strings"
.\"
-in the main
+in the main
.\" HREF
\fBpcreapi\fP
.\"
@@ -324,8 +324,8 @@
.SH "ERROR TEXTS"
.rs
.sp
-If there is an error while compiling a pattern, the error text that is passed
-back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit
+If there is an error while compiling a pattern, the error text that is passed
+back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit
character string, zero-terminated.
.
.
@@ -339,15 +339,15 @@
.SH "TESTING"
.rs
.sp
-The \fBpcretest\fP program continues to operate with 8-bit input and output
-files, but it can be used for testing the 16-bit library. If it is run with the
-command line option \fB-16\fP, patterns and subject strings are converted from
-8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
-are used instead of the 8-bit ones. Returned 16-bit strings are converted to
+The \fBpcretest\fP program continues to operate with 8-bit input and output
+files, but it can be used for testing the 16-bit library. If it is run with the
+command line option \fB-16\fP, patterns and subject strings are converted from
+8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
+are used instead of the 8-bit ones. Returned 16-bit strings are converted to
8-bit for output. If the 8-bit library was not compiled, \fBpcretest\fP
defaults to 16-bit and the \fB-16\fP option is ignored.
.P
-When PCRE is being built, the \fBRunTest\fP script that is called by "make
+When PCRE is being built, the \fBRunTest\fP script that is called by "make
check" uses the \fBpcretest\fP \fB-C\fP option to discover which of the 8-bit
and 16-bit libraries has been built, and runs the tests appropriately.
.
@@ -355,8 +355,8 @@
.SH "NOT SUPPORTED IN 16-BIT MODE"
.rs
.sp
-Not all the features of the 8-bit library are available with the 16-bit
-library. The C++ and POSIX wrapper functions support only the 8-bit library,
+Not all the features of the 8-bit library are available with the 16-bit
+library. The C++ and POSIX wrapper functions support only the 8-bit library,
and the \fBpcregrep\fP program is at present 8-bit only.
.
.
Modified: code/trunk/doc/pcre_config.3
===================================================================
--- code/trunk/doc/pcre_config.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre_config.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -28,7 +28,7 @@
PCRE_CONFIG_JIT Availability of just-in-time compiler
support (1=yes 0=no)
PCRE_CONFIG_JITTARGET String containing information about the
- target architecture for the JIT compiler,
+ target architecture for the JIT compiler,
or NULL if there is no JIT support
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
@@ -55,7 +55,7 @@
Availability of Unicode property support
(1=yes 0=no)
.sp
-The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
+The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
is also given if PCRE_CONFIG_UTF16 is passed to \fBpcre_config()\fP or if
PCRE_CONFIG_UTF8 is passed to \fBpcre16_config()\fP.
.P
Modified: code/trunk/doc/pcre_fullinfo.3
===================================================================
--- code/trunk/doc/pcre_fullinfo.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre_fullinfo.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -38,7 +38,7 @@
PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_JIT Return 1 after successful JIT compilation
- PCRE_INFO_JITSIZE Size of JIT compiled code
+ PCRE_INFO_JITSIZE Size of JIT compiled code
PCRE_INFO_LASTLITERAL Literal last data unit required
PCRE_INFO_MINLENGTH Lower bound length of matching strings
PCRE_INFO_NAMECOUNT Number of named subpatterns
Modified: code/trunk/doc/pcre_jit_stack_alloc.3
===================================================================
--- code/trunk/doc/pcre_jit_stack_alloc.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre_jit_stack_alloc.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -11,7 +11,7 @@
.ti +5n
.B int \fImaxsize\fP);
.PP
-.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP,
+.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP,
.ti +5n
.B int \fImaxsize\fP);
.
Modified: code/trunk/doc/pcre_pattern_to_host_byte_order.3
===================================================================
--- code/trunk/doc/pcre_pattern_to_host_byte_order.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre_pattern_to_host_byte_order.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -9,7 +9,7 @@
.SM
.B int pcre_pattern_to_host_byte_order(pcre *\fIcode\fP,
.ti +5n
-.B pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP);
+.B pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP);
.PP
.B int pcre16_pattern_to_host_byte_order(pcre16 *\fIcode\fP,
.ti +5n
@@ -20,8 +20,8 @@
.rs
.sp
This function ensures that the bytes in 2-byte and 4-byte values in a compiled
-pattern are in the correct order for the current host. It is useful when a
-pattern that has been compiled on one host is transferred to another that might
+pattern are in the correct order for the current host. It is useful when a
+pattern that has been compiled on one host is transferred to another that might
have different endianness. The arguments are:
.sp
\fIcode\fP A compiled regular expression
Modified: code/trunk/doc/pcre_utf16_to_host_byte_order.3
===================================================================
--- code/trunk/doc/pcre_utf16_to_host_byte_order.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcre_utf16_to_host_byte_order.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -9,7 +9,7 @@
.SM
.B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP,
.ti +5n
-.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIhost_byte_order\fP,
+.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIhost_byte_order\fP,
.ti +5n
.B int \fIkeep_boms\fP);
.
@@ -17,8 +17,8 @@
.SH DESCRIPTION
.rs
.sp
-This function, which exists only in the 16-bit library, converts a UTF-16
-string to the correct order for the current host, taking account of any byte
+This function, which exists only in the 16-bit library, converts a UTF-16
+string to the correct order for the current host, taking account of any byte
order marks (BOMs) within the string. Its arguments are:
.sp
\fIoutput\fP pointer to output buffer, may be the same as \fIinput\fP
Modified: code/trunk/doc/pcreapi.3
===================================================================
--- code/trunk/doc/pcreapi.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcreapi.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -148,7 +148,7 @@
start with \fBpcre16_\fP instead of \fBpcre_\fP. For every option that has UTF8
in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with
UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit
-option names define the same bit values.
+option names define the same bit values.
.P
References to bytes and UTF-8 in this document should be read as references to
16-bit data quantities and UTF-16 when using the 16-bit library, unless
@@ -157,7 +157,7 @@
.\" HREF
\fBpcre16\fP
.\"
-page.
+page.
.
.
.SH "PCRE API OVERVIEW"
@@ -392,7 +392,7 @@
PCRE_CONFIG_UTF8
.sp
The output is an integer that is set to one if UTF-8 support is available;
-otherwise it is set to zero. If this option is given to the 16-bit version of
+otherwise it is set to zero. If this option is given to the 16-bit version of
this function, \fBpcre16_config()\fP, the result is PCRE_ERROR_BADOPTION.
.sp
PCRE_CONFIG_UTF16
@@ -415,8 +415,8 @@
PCRE_CONFIG_JITTARGET
.sp
The output is a pointer to a zero-terminated "const char *" string. If JIT
-support is available, the string contains the name of the architecture for
-which the JIT compiler is configured, for example "x86 32bit (little endian +
+support is available, the string contains the name of the architecture for
+which the JIT compiler is configured, for example "x86 32bit (little endian +
unaligned)". If JIT support is not available, the result is NULL.
.sp
PCRE_CONFIG_NEWLINE
@@ -742,7 +742,7 @@
that any Unicode newline sequence should be recognized. The Unicode newline
sequences are the three just mentioned, plus the single characters VT (vertical
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
-separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
+separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
library, the last two are recognized only in UTF-8 mode.
.P
The newline setting in the options word uses three bits that are treated
@@ -819,11 +819,11 @@
.sp
PCRE_NO_UTF8_CHECK
.sp
-When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
+When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
string is automatically checked. There is a discussion about the
.\" HTML <a href="pcreunicode.html#utf8strings">
.\" </a>
-validity of UTF-8 strings
+validity of UTF-8 strings
.\"
in the
.\" HREF
@@ -843,7 +843,7 @@
.sp
The following table lists the error codes that may be returned by
\fBpcre_compile2()\fP, along with the error messages that may be returned by
-both compiling functions. Note that error messages are always 8-bit ASCII
+both compiling functions. Note that error messages are always 8-bit ASCII
strings, even in 16-bit mode. As PCRE has developed, some error codes have
fallen out of use. To avoid confusion, they have not been re-used.
.sp
@@ -917,14 +917,14 @@
65 different names for subpatterns of the same number are
not allowed
66 (*MARK) must have an argument
- 67 this version of PCRE is not compiled with Unicode property
+ 67 this version of PCRE is not compiled with Unicode property
support
68 \ec must be followed by an ASCII character
69 \ek is not followed by a braced, angle-bracketed, or quoted name
70 internal error: unknown opcode in find_fixedlength()
71 \eN is not supported in a class
72 too many forward references
- 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
+ 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
74 invalid UTF-16 string (specifically UTF-16)
.sp
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
@@ -1120,12 +1120,12 @@
PCRE_ERROR_NULL the argument \fIcode\fP was NULL
the argument \fIwhere\fP was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
+ PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
endianness
PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
.sp
The "magic number" is placed at the start of each compiled pattern as a simple
-check against passing an arbitrary memory pointer. The endianness error can
+check against passing an arbitrary memory pointer. The endianness error can
occur if a compiled pattern is saved and reloaded on a different host. Here is
a typical call of \fBpcre_fullinfo()\fP, to obtain the length of the compiled
pattern:
@@ -1168,8 +1168,8 @@
variable.
.P
If there is a fixed first value, for example, the letter "c" from a pattern
-such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
-value is always less than 256; in the 16-bit library the value can be up to
+such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
+value is always less than 256; in the 16-bit library the value can be up to
0xffff.
.P
If there is no fixed first value, and if either
@@ -1459,7 +1459,7 @@
const unsigned char *\fItables\fP;
unsigned char **\fImark\fP;
.sp
-In the 16-bit version of this structure, the \fImark\fP field has type
+In the 16-bit version of this structure, the \fImark\fP field has type
"PCRE_UCHAR16 **".
.P
The \fIflags\fP field is a bitmap that specifies which of the other fields
@@ -2092,14 +2092,14 @@
.sp
PCRE_ERROR_BADMODE (-28)
.sp
-This error is given if a pattern that was compiled by the 8-bit library is
+This error is given if a pattern that was compiled by the 8-bit library is
passed to a 16-bit library function, or vice versa.
.sp
PCRE_ERROR_BADENDIANNESS (-29)
-.sp
-This error is given if a pattern that was compiled and saved is reloaded on a
-host with different endianness. The utility function
-\fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern
+.sp
+This error is given if a pattern that was compiled and saved is reloaded on a
+host with different endianness. The utility function
+\fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern
so that it runs on the new host.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
@@ -2109,7 +2109,7 @@
.SS "Reason codes for invalid UTF-8 strings"
.rs
.sp
-This section applies only to the 8-bit library. The corresponding information
+This section applies only to the 8-bit library. The corresponding information
for the 16-bit library is given in the
.\" HREF
\fBpcre16\fP
@@ -2417,14 +2417,14 @@
.rs
.sp
Matching certain patterns using \fBpcre_exec()\fP can use a lot of process
-stack, which in certain environments can be rather limited in size. Some users
-find it helpful to have an estimate of the amount of stack that is used by
+stack, which in certain environments can be rather limited in size. Some users
+find it helpful to have an estimate of the amount of stack that is used by
\fBpcre_exec()\fP, to help them set recursion limits, as described in the
.\" HREF
\fBpcrestack\fP
.\"
-documentation. The estimate that is output by \fBpcretest\fP when called with
-the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec\fP with
+documentation. The estimate that is output by \fBpcretest\fP when called with
+the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec\fP with
the values NULL, NULL, NULL, -999, and -999 for its first five arguments.
.P
Normally, if its first argument is NULL, \fBpcre_exec()\fP immediately returns
@@ -2432,10 +2432,10 @@
arguments, it returns instead a negative number whose absolute value is the
approximate stack frame size in bytes. (A negative number is used so that it is
clear that no match has happened.) The value is approximate because in some
-cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two
+cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two
additional variables on the stack.
.P
-If PCRE has been compiled to use the heap instead of the stack for recursion,
+If PCRE has been compiled to use the heap instead of the stack for recursion,
the value returned is the size of each block that is obtained from the heap.
.
.
Modified: code/trunk/doc/pcrebuild.3
===================================================================
--- code/trunk/doc/pcrebuild.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrebuild.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -35,11 +35,11 @@
.SH "BUILDING 8-BIT and 16-BIT LIBRARIES"
.rs
.sp
-By default, a library called \fBlibpcre\fP is built, containing functions that
-take string arguments contained in vectors of bytes, either as single-byte
+By default, a library called \fBlibpcre\fP is built, containing functions that
+take string arguments contained in vectors of bytes, either as single-byte
characters, or interpreted as UTF-8 strings. You can also build a separate
-library, called \fBlibpcre16\fP, in which strings are contained in vectors of
-16-bit data units and interpreted either as single-unit characters or UTF-16
+library, called \fBlibpcre16\fP, in which strings are contained in vectors of
+16-bit data units and interpreted either as single-unit characters or UTF-16
strings, by adding
.sp
--enable-pcre16
@@ -70,7 +70,7 @@
.sp
By default, if the 8-bit library is being built, the \fBconfigure\fP script
will search for a C++ compiler and C++ header files. If it finds them, it
-automatically builds the C++ wrapper library (which supports only 8-bit
+automatically builds the C++ wrapper library (which supports only 8-bit
strings). You can disable this by adding
.sp
--disable-cpp
@@ -96,7 +96,7 @@
.P
Of itself, this setting does not make PCRE treat strings as UTF-8 or UTF-16. As
well as compiling PCRE with this option, you also have to set the
-PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
+PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
functions.
.P
If you set --enable-utf when compiling in an EBCDIC environment, PCRE expects
Modified: code/trunk/doc/pcrecallout.3
===================================================================
--- code/trunk/doc/pcrecallout.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrecallout.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -11,7 +11,7 @@
PCRE provides a feature called "callout", which is a means of temporarily
passing control to the caller of PCRE in the middle of pattern matching. The
caller of PCRE provides an external function by putting its entry point in the
-global variable \fIpcre_callout\fP (\fIpcre16_callout\fP for the 16-bit
+global variable \fIpcre_callout\fP (\fIpcre16_callout\fP for the 16-bit
library). By default, this variable contains NULL, which disables all calling
out.
.P
@@ -85,7 +85,7 @@
int \fIcallout_number\fP;
int *\fIoffset_vector\fP;
const char *\fIsubject\fP; (8-bit version)
- PCRE_SPTR16 \fIsubject\fP; (16-bit version)
+ PCRE_SPTR16 \fIsubject\fP; (16-bit version)
int \fIsubject_length\fP;
int \fIstart_match\fP;
int \fIcurrent_position\fP;
@@ -107,7 +107,7 @@
automatically generated callouts).
.P
The \fIoffset_vector\fP field is a pointer to the vector of offsets that was
-passed by the caller to the matching function. When \fBpcre_exec()\fP or
+passed by the caller to the matching function. When \fBpcre_exec()\fP or
\fBpcre16_exec()\fP is used, the contents can be inspected, in order to extract
substrings that have been matched so far, in the same way as for extracting
substrings after a match has completed. For the DFA matching functions, this
Modified: code/trunk/doc/pcrecpp.3
===================================================================
--- code/trunk/doc/pcrecpp.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrecpp.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -12,7 +12,7 @@
The C++ wrapper for PCRE was provided by Google Inc. Some additional
functionality was added by Giuseppe Maxia. This brief man page was constructed
from the notes in the \fIpcrecpp.h\fP file, which should be consulted for
-further details. Note that the C++ wrapper supports only the original 8-bit
+further details. Note that the C++ wrapper supports only the original 8-bit
PCRE library. There is no 16-bit support at present.
.
.
Modified: code/trunk/doc/pcrejit.3
===================================================================
--- code/trunk/doc/pcrejit.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrejit.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -21,10 +21,10 @@
.SH "8-BIT and 16-BIT SUPPORT"
.rs
.sp
-JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
-this documentation simple, only the 8-bit interface is described in what
-follows. If you are using the 16-bit library, substitute the 16-bit functions
-and 16-bit structures (for example, \fIpcre16_jit_stack\fP instead of
+JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
+this documentation simple, only the 8-bit interface is described in what
+follows. If you are using the 16-bit library, substitute the 16-bit functions
+and 16-bit structures (for example, \fIpcre16_jit_stack\fP instead of
\fIpcre_jit_stack\fP).
.
.
Modified: code/trunk/doc/pcrematching.3
===================================================================
--- code/trunk/doc/pcrematching.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrematching.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -8,14 +8,14 @@
for matching a compiled regular expression against a given subject string. The
"standard" algorithm is the one provided by the \fBpcre_exec()\fP and
\fBpcre16_exec()\fP functions. These work in the same way as Perl's matching
-function, and provide a Perl-compatible matching operation. The just-in-time
+function, and provide a Perl-compatible matching operation. The just-in-time
(JIT) optimization that is described in the
.\" HREF
\fBpcrejit\fP
.\"
documentation is compatible with these functions.
.P
-An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP and
+An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP and
\fBpcre16_dfa_exec()\fP functions; they operate in a different way, and are not
Perl-compatible. This alternative has advantages and disadvantages compared
with the standard algorithm, and these are described below.
Modified: code/trunk/doc/pcrepartial.3
===================================================================
--- code/trunk/doc/pcrepartial.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrepartial.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -25,7 +25,7 @@
long and is not all available at once.
.P
PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and
-PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
+PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
functions. For backwards compatibility, PCRE_PARTIAL is a synonym for
PCRE_PARTIAL_SOFT. The essential difference between the two options is whether
or not a partial match is preferred to an alternative complete match, though
@@ -46,7 +46,7 @@
.SH "PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()"
.rs
.sp
-A partial match occurs during a call to \fBpcre_exec()\fP or
+A partial match occurs during a call to \fBpcre_exec()\fP or
\fBpcre16_exec()\fP when the end of the subject string is reached successfully,
but matching cannot continue because more characters are needed. However, at
least one character in the subject must have been inspected. This character
@@ -115,7 +115,7 @@
this reason, the assumption is made that the end of the supplied subject string
may not be the true end of the available data, and so, if \ez, \eZ, \eb, \eB,
or $ are encountered at the end of the subject, the result is
-PCRE_ERROR_PARTIAL, provided that at least one character in the subject has
+PCRE_ERROR_PARTIAL, provided that at least one character in the subject has
been inspected.
.P
Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16
@@ -270,7 +270,7 @@
.P
You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
PCRE_DFA_RESTART to continue partial matching over multiple segments. This
-facility can be used to pass very long subject strings to the DFA matching
+facility can be used to pass very long subject strings to the DFA matching
functions.
.
.
Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrepattern.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -25,11 +25,11 @@
second library that supports 16-bit and UTF-16 character strings. To use these
features, PCRE must be built to include appropriate support. When using UTF
strings you must either call the compiling function with the PCRE_UTF8 or
-PCRE_UTF16 option, or the pattern must start with one of these special
+PCRE_UTF16 option, or the pattern must start with one of these special
sequences:
.sp
(*UTF8)
- (*UTF16)
+ (*UTF16)
.sp
Starting a pattern with such a sequence is equivalent to setting the relevant
option. This feature is not Perl-compatible. How setting a UTF mode affects
@@ -263,8 +263,8 @@
8-bit UTF-8 mode less than 0x10ffff and a valid codepoint
16-bit non-UTF mode less than 0x10000
16-bit UTF-16 mode less than 0x10ffff and a valid codepoint
-.sp
-Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
+.sp
+Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
"surrogate" codepoints).
.P
If characters other than hexadecimal digits appear between \ex{ and }, or if
@@ -307,7 +307,7 @@
Inside a character class, or if the decimal number is greater than 9 and there
have not been that many capturing subpatterns, PCRE re-reads up to three octal
digits following the backslash, and uses them to generate a data character. Any
-subsequent digits stand for themselves. The value of the character is
+subsequent digits stand for themselves. The value of the character is
constrained in the same way as characters specified in hexadecimal.
For example:
.sp
@@ -499,8 +499,8 @@
U+2028 Line separator
U+2029 Paragraph separator
.sp
-In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
-relevant.
+In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
+relevant.
.
.
.\" HTML <a name="newlineseq"></a>
@@ -974,7 +974,7 @@
.sp
Outside a character class, a dot in the pattern matches any one character in
the subject string except (by default) a character that signifies the end of a
-line.
+line.
.P
When a line ending is defined as a single character, dot never matches that
character; when the two-character sequence CRLF is used, dot does not match CR
@@ -1104,7 +1104,7 @@
"]" can also be used to end a range.
.P
Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. Ranges
+used for characters specified numerically, for example [\e000-\e037]. Ranges
can include any characters that are valid for the current mode.
.P
If a range that includes letters is used when caseless matching is set, it
@@ -1305,8 +1305,8 @@
.sp
2. It sets up the subpattern as a capturing subpattern. This means that, when
the whole pattern matches, that portion of the subject string that matched the
-subpattern is passed back to the caller via the \fIovector\fP argument of the
-matching function. (This applies only to the traditional matching functions;
+subpattern is passed back to the caller via the \fIovector\fP argument of the
+matching function. (This applies only to the traditional matching functions;
the DFA matching functions do not support capturing.)
.P
Opening parentheses are counted from left to right (starting from 1) to obtain
@@ -2538,7 +2538,7 @@
.P
PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
code. The feature is called "callout". The caller of PCRE provides an external
-function by putting its entry point in the global variable \fIpcre_callout\fP
+function by putting its entry point in the global variable \fIpcre_callout\fP
(8-bit library) or \fIpcre16_callout\fP (16-bit library). By default, this
variable contains NULL, which disables all calling out.
.P
Modified: code/trunk/doc/pcreposix.3
===================================================================
--- code/trunk/doc/pcreposix.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcreposix.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -30,7 +30,7 @@
\fBpcreapi\fP
.\"
documentation for a description of PCRE's native API, which contains much
-additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
+additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
library.
.P
The functions described here are just wrapper functions that ultimately call
Modified: code/trunk/doc/pcreprecompile.3
===================================================================
--- code/trunk/doc/pcreprecompile.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcreprecompile.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -111,7 +111,7 @@
documentation.
.P
If you did not provide custom character tables when the pattern was compiled,
-the pointer in the compiled pattern is NULL, which causes the matching
+the pointer in the compiled pattern is NULL, which causes the matching
functions to use PCRE's internal tables. Thus, you do not need to take any
special action at run time in this case.
.P
Modified: code/trunk/doc/pcrestack.3
===================================================================
--- code/trunk/doc/pcrestack.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcrestack.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -140,24 +140,24 @@
.sp
The actual amount of stack used per recursion can vary quite a lot, depending
on the compiler that was used to build PCRE and the optimization or debugging
-options that were set for it. The rule of thumb value of 500 bytes mentioned
-above may be larger or smaller than what is actually needed. A better
+options that were set for it. The rule of thumb value of 500 bytes mentioned
+above may be larger or smaller than what is actually needed. A better
approximation can be obtained by running this command:
.sp
pcretest -m -C
.sp
-The \fB-C\fP option causes \fBpcretest\fP to output information about the
-options with which PCRE was compiled. When \fB-m\fP is also given (before
+The \fB-C\fP option causes \fBpcretest\fP to output information about the
+options with which PCRE was compiled. When \fB-m\fP is also given (before
\fB-C\fP), information about stack use is given in a line like this:
.sp
Match recursion uses stack: approximate frame size = 640 bytes
-.sp
-The value is approximate because some recursions need a bit more (up to perhaps
+.sp
+The value is approximate because some recursions need a bit more (up to perhaps
16 more bytes).
.P
-If the above command is given when PCRE is compiled to use the heap instead of
-the stack for recursion, the value that is output is the size of each block
-that is obtained from the heap.
+If the above command is given when PCRE is compiled to use the heap instead of
+the stack for recursion, the value that is output is the size of each block
+that is obtained from the heap.
.
.
.SS "Changing stack size in Unix-like systems"
Modified: code/trunk/doc/pcresyntax.3
===================================================================
--- code/trunk/doc/pcresyntax.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcresyntax.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -420,12 +420,12 @@
.sp
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
- (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
+ (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
(*SKIP) advance to current matching position
(*SKIP:NAME) advance to position corresponding to an earlier
- (*MARK:NAME); if not found, the (*SKIP) is ignored
+ (*MARK:NAME); if not found, the (*SKIP) is ignored
(*THEN) local failure, backtrack to next alternation
- (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
+ (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
.
.
.SH "NEWLINE CONVENTIONS"
Modified: code/trunk/doc/pcretest.1
===================================================================
--- code/trunk/doc/pcretest.1 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcretest.1 2012-01-21 16:37:17 UTC (rev 903)
@@ -31,17 +31,17 @@
.SH "PCRE's 8-BIT and 16-BIT LIBRARIES"
.rs
.sp
-From release 8.30, two separate PCRE libraries can be built. The original one
-supports 8-bit character strings, whereas the newer 16-bit library supports
-character strings encoded in 16-bit units. The \fBpcretest\fP program can be
+From release 8.30, two separate PCRE libraries can be built. The original one
+supports 8-bit character strings, whereas the newer 16-bit library supports
+character strings encoded in 16-bit units. The \fBpcretest\fP program can be
used to test both libraries. However, it is itself still an 8-bit program,
reading 8-bit input and writing 8-bit output. When testing the 16-bit library,
the patterns and data strings are converted to 16-bit format before being
-passed to the PCRE library functions. Results are converted to 8-bit for
+passed to the PCRE library functions. Results are converted to 8-bit for
output.
.P
-References to functions and structures of the form \fBpcre[16]_xx\fP below
-mean "\fBpcre_xx\fP when using the 8-bit library or \fBpcre16_xx\fP when using
+References to functions and structures of the form \fBpcre[16]_xx\fP below
+mean "\fBpcre_xx\fP when using the 8-bit library or \fBpcre16_xx\fP when using
the 16-bit library".
.
.
@@ -49,9 +49,9 @@
.rs
.TP 10
\fB-16\fP
-If both the 8-bit and the 16-bit libraries have been built, this option causes
-the 16-bit library to be used. If only the 16-bit library has been built, this
-is the default (so has no effect). If only the 8-bit library has been built,
+If both the 8-bit and the 16-bit libraries have been built, this option causes
+the 16-bit library to be used. If only the 16-bit library has been built, this
+is the default (so has no effect). If only the 8-bit library has been built,
this option causes an error.
.TP 10
\fB-b\fP
@@ -60,24 +60,24 @@
.TP 10
\fB-C\fP
Output the version number of the PCRE library, and all available information
-about the optional features that are included, and then exit. All other options
+about the optional features that are included, and then exit. All other options
are ignored.
.TP 10
\fB-C\fP \fIoption\fP
-Output information about a specific build-time option, then exit. This
-functionality is intended for use in scripts such as \fBRunTest\fP. The
+Output information about a specific build-time option, then exit. This
+functionality is intended for use in scripts such as \fBRunTest\fP. The
following options output the value indicated:
.sp
linksize the internal link size (2, 3, or 4)
- newline the default newline setting:
- CR, LF, CRLF, ANYCRLF, or ANY
+ newline the default newline setting:
+ CR, LF, CRLF, ANYCRLF, or ANY
.sp
The following options output 1 for true or zero for false:
-.sp
+.sp
jit just-in-time support is available
pcre16 the 16-bit library was built
pcre8 the 8-bit library was built
- ucp Unicode property support is available
+ ucp Unicode property support is available
utf UTF-8 and/or UTF-16 support is available
.TP 10
\fB-d\fP
@@ -104,7 +104,7 @@
.TP 10
\fB-m\fP
Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding \fB/M\fP to each regular expression. The size is given in
+equivalent to adding \fB/M\fP to each regular expression. The size is given in
bytes for both libraries.
.TP 10
\fB-o\fP \fIosize\fP
@@ -137,7 +137,7 @@
neither \fB-i\fP nor \fB-d\fP is present on the command line. This behaviour
means that the output from tests that are run with and without \fB-s\fP should
be identical, except when options that output information about the actual
-running of a match are set.
+running of a match are set.
.sp
The \fB-M\fP, \fB-t\fP, and \fB-tm\fP options, which give information about
resources used, are likely to produce different output with and without
@@ -237,12 +237,12 @@
The following table shows additional modifiers for setting PCRE compile-time
options that do not correspond to anything in Perl:
.sp
- \fB/8\fP PCRE_UTF8 ) when using the 8-bit
+ \fB/8\fP PCRE_UTF8 ) when using the 8-bit
\fB/?\fP PCRE_NO_UTF8_CHECK ) library
-.sp
+.sp
\fB/8\fP PCRE_UTF16 ) when using the 16-bit
\fB/?\fP PCRE_NO_UTF16_CHECK ) library
-.sp
+.sp
\fB/A\fP PCRE_ANCHORED
\fB/C\fP PCRE_AUTO_CALLOUT
\fB/E\fP PCRE_DOLLAR_ENDONLY
@@ -270,7 +270,7 @@
.sp
As well as turning on the PCRE_UTF8/16 option, the \fB/8\fP modifier causes
all non-printing characters in output strings to be printed using the
-\ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without
+\ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without
the curly brackets.
.P
Full details of the PCRE options are given in the
@@ -663,7 +663,7 @@
2: b
.sp
If the strings contain any non-printing characters, they are output as \exhh
-escapes if the value is less than 256 and UTF mode is not set. Otherwise they
+escapes if the value is less than 256 and UTF mode is not set. Otherwise they
are output as \ex{hh...} escapes. See below for the definition of non-printing
characters. If the pattern has the \fB/+\fP modifier, the output for substring
0 is followed by the rest of the subject string, identified by "0+" like
@@ -890,15 +890,15 @@
You can copy a file written by \fBpcretest\fP to a different host and reload it
there, even if the new host has opposite endianness to the one on which the
pattern was compiled. For example, you can compile on an i86 machine and run on
-a SPARC machine. When a pattern is reloaded on a host with different
+a SPARC machine. When a pattern is reloaded on a host with different
endianness, the confirmation message is changed to:
.sp
Compiled pattern (byte-inverted) loaded from /some/file
.sp
-The test suite contains some saved pre-compiled patterns with different
-endianness. These are reloaded using "<!" instead of just "<". This suppresses
-the "(byte-inverted)" text so that the output is the same on all hosts. It also
-forces debugging output once the pattern has been reloaded.
+The test suite contains some saved pre-compiled patterns with different
+endianness. These are reloaded using "<!" instead of just "<". This suppresses
+the "(byte-inverted)" text so that the output is the same on all hosts. It also
+forces debugging output once the pattern has been reloaded.
.P
File names for saving and reloading can be absolute or relative, but note that
the shell facility of expanding a file name that starts with a tilde (~) is not
Modified: code/trunk/doc/pcreunicode.3
===================================================================
--- code/trunk/doc/pcreunicode.3 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/pcreunicode.3 2012-01-21 16:37:17 UTC (rev 903)
@@ -5,7 +5,7 @@
.rs
.sp
From Release 8.30, in addition to its previous UTF-8 support, PCRE also
-supports UTF-16 by means of a separate 16-bit library. This can be built as
+supports UTF-16 by means of a separate 16-bit library. This can be built as
well as, or instead of, the 8-bit library.
.
.
@@ -77,7 +77,7 @@
range U+0 to U+10FFFF, excluding U+D800 to U+DFFF.
.P
The excluded code points are the "Surrogate Area" of Unicode. They are reserved
-for use by UTF-16, where they are used in pairs to encode codepoints with
+for use by UTF-16, where they are used in pairs to encode codepoints with
values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
are available independently in the UTF-8 encoding. (In other words, the whole
surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
@@ -148,7 +148,7 @@
3. Repeat quantifiers apply to complete UTF characters, not to individual
data units, for example: \ex{100}{3}.
.P
-4. The dot metacharacter matches one UTF character instead of a single data
+4. The dot metacharacter matches one UTF character instead of a single data
unit.
.P
5. The escape sequence \eC can be used to match a single byte in UTF-8 mode, or
@@ -166,7 +166,7 @@
.P
6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
test characters of any code value, but, by default, the characters that PCRE
-recognizes as digits, spaces, or word characters remain the same set as in
+recognizes as digits, spaces, or word characters remain the same set as in
non-UTF mode, all with values less than 256. This remains true even when PCRE
is built to include Unicode property support, because to do otherwise would
slow down PCRE in many common cases. Note in particular that this applies to
Modified: code/trunk/doc/perltest.txt
===================================================================
--- code/trunk/doc/perltest.txt 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/doc/perltest.txt 2012-01-21 16:37:17 UTC (rev 903)
@@ -14,7 +14,7 @@
/W ignored
/S ignored
/SS ignored
- /Y ignored
+ /Y ignored
The pcretest \Y escape in data lines is removed before matching. The data lines
are processed as Perl double-quoted strings, so if they contain " $ or @
@@ -29,7 +29,7 @@
modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput4
and testinput6 files can be fed to perltest to run compatible UTF-8 tests.
However, it is necessary to add "use utf8; require Encode" to the script to
-make this work correctly. I have not managed to find a way to handle this
+make this work correctly. I have not managed to find a way to handle this
automatically.
The other testinput files are not suitable for feeding to perltest.pl, since
Modified: code/trunk/pcre-config.in
===================================================================
--- code/trunk/pcre-config.in 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/pcre-config.in 2012-01-21 16:37:17 UTC (rev 903)
@@ -10,7 +10,7 @@
libs="[--libs-cpp]"
else
libs=
-fi
+fi
if test @enable_pcre16@ = yes ; then
libs="[--libs16] $libs"
@@ -18,7 +18,7 @@
if test @enable_pcre8@ = yes ; then
libs="[--libs] [--libs-posix] $libs"
- cflags="$cflags [--cflags-posix]"
+ cflags="$cflags [--cflags-posix]"
fi
usage="Usage: pcre-config [--prefix] [--exec-prefix] [--version] $libs $cflags"
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/pcre_compile.c 2012-01-21 16:37:17 UTC (rev 903)
@@ -488,7 +488,7 @@
"\\N is not supported in a class\0"
"too many forward references\0"
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
- "invalid UTF-16 string\0"
+ "invalid UTF-16 string\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -998,9 +998,9 @@
c -= CHAR_0;
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
c = c * 8 + *(++ptr) - CHAR_0;
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE8
if (!utf && c > 0xff) *errorcodeptr = ERR51;
-#endif
+#endif
break;
/* \x is complicated. \x{ddd} is a character number which can be greater
@@ -7709,11 +7709,11 @@
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
(errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
{
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE8
errorcode = ERR44;
-#else
+#else
errorcode = ERR74;
-#endif
+#endif
goto PCRE_EARLY_ERROR_RETURN2;
}
#else
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/pcre_exec.c 2012-01-21 16:37:17 UTC (rev 903)
@@ -487,13 +487,13 @@
/* When recursion is not being used, all "local" variables that have to be
preserved over calls to RMATCH() are part of a "frame". We set up the top-level
frame on the stack here; subsequent instantiations are obtained from the heap
-whenever RMATCH() does a "recursion". See the macro definitions above. Putting
-the top-level on the stack rather than malloc-ing them all gives a performance
+whenever RMATCH() does a "recursion". See the macro definitions above. Putting
+the top-level on the stack rather than malloc-ing them all gives a performance
boost in many cases where there is not much "recursion". */
#ifdef NO_RECURSE
-heapframe frame_zero;
-heapframe *frame = &frame_zero;
+heapframe frame_zero;
+heapframe *frame = &frame_zero;
frame->Xprevframe = NULL; /* Marks the top level */
/* Copy in the original argument variables */
@@ -616,7 +616,7 @@
eptrblock newptrb;
-/* There is a special fudge for calling match() in a way that causes it to
+/* There is a special fudge for calling match() in a way that causes it to
measure the size of its basic stack frame when the stack is being used for
recursion. The second argument (ecode) being NULL triggers this behaviour. It
cannot normally ever be NULL. The return is the negated value of the frame
@@ -631,7 +631,7 @@
int len = (char *)&rdepth - (char *)eptr;
return (len > 0)? -len : len;
}
- }
+ }
#endif /* NO_RECURSE */
/* To save space on the stack and in the heap frame, I have doubled up on some
@@ -838,7 +838,7 @@
case OP_ONCE_NC:
prev = ecode;
saved_eptr = eptr;
- save_mark = md->mark;
+ save_mark = md->mark;
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
@@ -857,7 +857,7 @@
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
- md->mark = save_mark;
+ md->mark = save_mark;
}
while (*ecode == OP_ALT);
@@ -937,7 +937,7 @@
save_offset2 = md->offset_vector[offset+1];
save_offset3 = md->offset_vector[md->offset_end - number];
save_capture_last = md->capture_last;
- save_mark = md->mark;
+ save_mark = md->mark;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
md->offset_vector[md->offset_end - number] =
@@ -1043,7 +1043,7 @@
save_mark = md->mark;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
RM2);
-
+
/* See comment in the code for capturing groups above about handling
THEN. */
@@ -1070,7 +1070,7 @@
RRETURN(rrc);
}
ecode += GET(ecode, 1);
- md->mark = save_mark;
+ md->mark = save_mark;
if (*ecode != OP_ALT) break;
}
@@ -1549,7 +1549,7 @@
case OP_ASSERT:
case OP_ASSERTBACK:
- save_mark = md->mark;
+ save_mark = md->mark;
if (md->match_function_type == MATCH_CONDASSERT)
{
condassert = TRUE;
@@ -1571,7 +1571,7 @@
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode, 1);
- md->mark = save_mark;
+ md->mark = save_mark;
}
while (*ecode == OP_ALT);
@@ -1595,7 +1595,7 @@
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
- save_mark = md->mark;
+ save_mark = md->mark;
if (md->match_function_type == MATCH_CONDASSERT)
{
condassert = TRUE;
@@ -1606,7 +1606,7 @@
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
- md->mark = save_mark;
+ md->mark = save_mark;
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
{
@@ -6207,21 +6207,21 @@
const pcre_study_data *study;
const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
-/* Check for the special magic call that measures the size of the stack used
+/* Check for the special magic call that measures the size of the stack used
per recursive call of match(). */
if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
start_offset == -999)
#ifdef NO_RECURSE
return -sizeof(heapframe);
-#else
+#else
return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
-#endif
+#endif
/* Plausibility checks */
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
-if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
+if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
Modified: code/trunk/pcreposix.c
===================================================================
--- code/trunk/pcreposix.c 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/pcreposix.c 2012-01-21 16:37:17 UTC (rev 903)
@@ -158,7 +158,7 @@
REG_BADPAT, /* \N is not supported in a class */
REG_BADPAT, /* too many forward references */
REG_BADPAT, /* disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) */
- REG_BADPAT /* invalid UTF-16 string (should not occur) */
+ REG_BADPAT /* invalid UTF-16 string (should not occur) */
};
/* Table of texts corresponding to POSIX error codes */
Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/pcretest.c 2012-01-21 16:37:17 UTC (rev 903)
@@ -2412,9 +2412,9 @@
if (rc)
{
const char *arch;
- (void)PCRE_CONFIG(PCRE_CONFIG_JITTARGET, &arch);
+ (void)PCRE_CONFIG(PCRE_CONFIG_JITTARGET, &arch);
printf(" Just-in-time compiler support: %s\n", arch);
- }
+ }
else
printf(" No just-in-time compiler support\n");
(void)PCRE_CONFIG(PCRE_CONFIG_NEWLINE, &rc);
@@ -2438,11 +2438,11 @@
(void)PCRE_CONFIG(PCRE_CONFIG_STACKRECURSE, &rc);
printf(" Match recursion uses %s", rc? "stack" : "heap");
if (showstore)
- {
+ {
PCRE_EXEC(stack_size, NULL, NULL, NULL, -999, -999, 0, NULL, 0);
- printf(": %sframe size = %d bytes", rc? "approximate " : "", -stack_size);
+ printf(": %sframe size = %d bytes", rc? "approximate " : "", -stack_size);
}
- printf("\n");
+ printf("\n");
goto EXIT;
}
else if (strcmp(argv[op], "-help") == 0 ||
@@ -3385,10 +3385,10 @@
cn16ptr = copynames;
gn16ptr = getnames;
#endif
-#ifdef SUPPORT_PCRE8
+#ifdef SUPPORT_PCRE8
cn8ptr = copynames8;
gn8ptr = getnames8;
-#endif
+#endif
SET_PCRE_CALLOUT(callout);
first_callout = 1;
@@ -3483,9 +3483,9 @@
{
if (++i == 9)
fprintf(outfile, "** Too many hex digits in \\x{...} item; "
- "using only the first eight.\n");
+ "using only the first eight.\n");
else c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10);
- }
+ }
if (*pt == '}')
{
p = pt + 1;
Modified: code/trunk/perltest.pl
===================================================================
--- code/trunk/perltest.pl 2012-01-21 15:59:35 UTC (rev 902)
+++ code/trunk/perltest.pl 2012-01-21 16:37:17 UTC (rev 903)
@@ -23,7 +23,7 @@
foreach $c (@p)
{
if ($c >= 32 && $c < 127) { $t .= chr $c; }
- else { $t .= sprintf("\\x{%02x}", $c);
+ else { $t .= sprintf("\\x{%02x}", $c);
}
}
}
@@ -216,16 +216,16 @@
}
splice(@subs, 0, 18);
}
-
+
# It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
# set and the input pattern was a UTF-8 string. We can, however, force
- # it to be so marked.
-
+ # it to be so marked.
+
if (defined $REGMARK && $REGMARK != 1)
{
- $xx = $REGMARK;
- $xx = Encode::decode_utf8($xx) if $utf8;
- printf $outfile ("MK: %s\n", &pchars($xx));
+ $xx = $REGMARK;
+ $xx = Encode::decode_utf8($xx) if $utf8;
+ printf $outfile ("MK: %s\n", &pchars($xx));
}
}
}