Revision: 691
http://vcs.pcre.org/viewvc?view=rev&revision=691
Author: ph10
Date: 2011-09-11 15:31:21 +0100 (Sun, 11 Sep 2011)
Log Message:
-----------
Final source and document tidies for 8.20-RC1.
Modified Paths:
--------------
code/trunk/CMakeLists.txt
code/trunk/ChangeLog
code/trunk/NEWS
code/trunk/NON-UNIX-USE
code/trunk/PrepareRelease
code/trunk/README
code/trunk/RunTest
code/trunk/configure.ac
code/trunk/doc/html/index.html
code/trunk/doc/html/pcre.html
code/trunk/doc/html/pcre_assign_jit_stack.html
code/trunk/doc/html/pcre_config.html
code/trunk/doc/html/pcre_dfa_exec.html
code/trunk/doc/html/pcre_exec.html
code/trunk/doc/html/pcre_free_study.html
code/trunk/doc/html/pcre_fullinfo.html
code/trunk/doc/html/pcre_jit_stack_alloc.html
code/trunk/doc/html/pcre_jit_stack_free.html
code/trunk/doc/html/pcre_study.html
code/trunk/doc/html/pcreapi.html
code/trunk/doc/html/pcrebuild.html
code/trunk/doc/html/pcrecallout.html
code/trunk/doc/html/pcregrep.html
code/trunk/doc/html/pcrejit.html
code/trunk/doc/html/pcrepartial.html
code/trunk/doc/html/pcreprecompile.html
code/trunk/doc/html/pcrestack.html
code/trunk/doc/html/pcretest.html
code/trunk/doc/html/pcreunicode.html
code/trunk/doc/pcre.3
code/trunk/doc/pcre.txt
code/trunk/doc/pcre_assign_jit_stack.3
code/trunk/doc/pcre_dfa_exec.3
code/trunk/doc/pcre_exec.3
code/trunk/doc/pcre_free_study.3
code/trunk/doc/pcre_fullinfo.3
code/trunk/doc/pcre_jit_stack_alloc.3
code/trunk/doc/pcre_study.3
code/trunk/doc/pcreapi.3
code/trunk/doc/pcrebuild.3
code/trunk/doc/pcrecallout.3
code/trunk/doc/pcregrep.1
code/trunk/doc/pcregrep.txt
code/trunk/doc/pcrejit.3
code/trunk/doc/pcreprecompile.3
code/trunk/doc/pcrestack.3
code/trunk/doc/pcretest.1
code/trunk/doc/pcretest.txt
code/trunk/doc/pcreunicode.3
code/trunk/doc/perltest.txt
code/trunk/pcre_compile.c
code/trunk/pcre_exec.c
code/trunk/pcre_fullinfo.c
code/trunk/pcre_internal.h
code/trunk/pcre_jit_compile.c
code/trunk/pcre_jit_test.c
code/trunk/pcre_study.c
code/trunk/pcregrep.c
code/trunk/pcretest.c
Modified: code/trunk/CMakeLists.txt
===================================================================
--- code/trunk/CMakeLists.txt 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/CMakeLists.txt 2011-09-11 14:31:21 UTC (rev 691)
@@ -117,10 +117,10 @@
SET(PCRE_SUPPORT_JIT OFF CACHE BOOL
"Enable support for Just-in-time compiling.")
-
+
SET(PCRE_SUPPORT_PCREGREP_JIT ON CACHE BOOL
"Enable use of Just-in-time compiling in pcregrep.")
-
+
SET(PCRE_SUPPORT_UNICODE_PROPERTIES OFF CACHE BOOL
"Enable support for Unicode properties. (If set, UTF-8 support will be enabled as well)")
@@ -218,12 +218,12 @@
IF(PCRE_SUPPORT_JIT)
SET(SUPPORT_JIT 1)
ELSE
- SET(PCRE_SUPPORT_PCREGREP_JIT 0)
-ENDIF(PCRE_SUPPORT_JIT)
+ SET(PCRE_SUPPORT_PCREGREP_JIT 0)
+ENDIF(PCRE_SUPPORT_JIT)
IF(PCRE_SUPPORT_PCREGREP_JIT)
SET(SUPPORT_PCREGREP_JIT 1)
-ENDIF(PCRE_SUPPORT_PCREGREP_JIT)
+ENDIF(PCRE_SUPPORT_PCREGREP_JIT)
# This next one used to contain
# SET(PCRETEST_LIBS ${READLINE_LIBRARY})
@@ -449,12 +449,12 @@
ADD_EXECUTABLE(pcretest pcretest.c)
SET(targets ${targets} pcretest)
TARGET_LINK_LIBRARIES(pcretest pcreposix ${PCRETEST_LIBS})
-
+
IF(PCRE_SUPPORT_JIT)
ADD_EXECUTABLE(pcre_jit_test pcre_jit_test.c)
SET(targets ${targets} pcre_jit_test)
TARGET_LINK_LIBRARIES(pcre_jit_test pcre)
- ENDIF(PCRE_SUPPORT_JIT)
+ ENDIF(PCRE_SUPPORT_JIT)
IF(PCRE_BUILD_PCRECPP)
ADD_EXECUTABLE(pcrecpp_unittest pcrecpp_unittest.cc)
@@ -501,7 +501,7 @@
GET_TARGET_PROPERTY(PCRE_JIT_TEST_EXE
pcre_jit_test
DEBUG_LOCATION)
- ENDIF(PCRE_SUPPORT_JIT)
+ ENDIF(PCRE_SUPPORT_JIT)
GET_TARGET_PROPERTY(PCRECPP_UNITTEST_EXE
pcrecpp_unittest
@@ -517,7 +517,7 @@
IF(PCRE_SUPPORT_JIT)
ADD_TEST(pcre_jit_test ${PCRE_JIT_TEST_EXE})
- ENDIF(PCRE_SUPPORT_JIT)
+ ENDIF(PCRE_SUPPORT_JIT)
ADD_TEST(pcrecpp_test ${PCRECPP_UNITTEST_EXE})
ADD_TEST(pcre_scanner_test ${PCRE_SCANNER_UNITTEST_EXE})
ADD_TEST(pcre_stringpiece_test ${PCRE_STRINGPIECE_UNITTEST_EXE})
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/ChangeLog 2011-09-11 14:31:21 UTC (rev 691)
@@ -1,45 +1,46 @@
ChangeLog for PCRE
------------------
-Version 8.20
-------------
+Version 8.20 12-Sep-2011
+------------------------
1. Change 37 of 8.13 broke patterns like [:a]...[b:] because it thought it had
a POSIX class. After further experiments with Perl, which convinced me that
Perl has bugs and confusions, a closing square bracket is no longer allowed
- in a POSIX name.
+ in a POSIX name. This bug also affected patterns with classes that started
+ with full stops.
-2. If a pattern such as /(a)b|ac/ is matched against "ac", there is no captured
- substring, but while checking the failing first alternative, substring 1 is
- temporarily captured. If the output vector supplied to pcre_exec() was not
- big enough for this capture, the yield of the function was still zero
- ("insufficient space for captured substrings"). This cannot be totally fixed
- without adding another stack variable, which seems a lot of expense for a
+2. If a pattern such as /(a)b|ac/ is matched against "ac", there is no captured
+ substring, but while checking the failing first alternative, substring 1 is
+ temporarily captured. If the output vector supplied to pcre_exec() was not
+ big enough for this capture, the yield of the function was still zero
+ ("insufficient space for captured substrings"). This cannot be totally fixed
+ without adding another stack variable, which seems a lot of expense for an
edge case. However, I have improved the situation in cases such as
- /(a)(b)x|abc/ matched against "abc", where the return code indicates that
- fewer than the maximum number of slots in the ovector have been set.
-
-3. Related to (2) above: when there are more back references in a pattern than
- slots in the output vector, pcre_exec() uses temporary memory during
- matching, and copies in the captures as far as possible afterwards. It was
- using the entire output vector, but this conflicts with the specification
- that only 2/3 is used for passing back captured substrings. Now it uses only
- the first 2/3, for compatibility. This is, of course, another edge case.
-
+ /(a)(b)x|abc/ matched against "abc", where the return code indicates that
+ fewer than the maximum number of slots in the ovector have been set.
+
+3. Related to (2) above: when there are more back references in a pattern than
+ slots in the output vector, pcre_exec() uses temporary memory during
+ matching, and copies in the captures as far as possible afterwards. It was
+ using the entire output vector, but this conflicts with the specification
+ that only 2/3 is used for passing back captured substrings. Now it uses only
+ the first 2/3, for compatibility. This is, of course, another edge case.
+
4. Zoltan Herczeg's just-in-time compiler support has been integrated into the
main code base, and can be used by building with --enable-jit. When this is
- done, pcregrep automatically uses it unless --disable-pcregrep-jit or the
+ done, pcregrep automatically uses it unless --disable-pcregrep-jit or the
runtime --no-jit option is given.
-
-5. When the number of matches in a pcre_dfa_exec() run exactly filled the
- ovector, the return from the function was zero, implying that there were
- other matches that did not fit. The correct "exactly full" value is now
- returned.
-
+
+5. When the number of matches in a pcre_dfa_exec() run exactly filled the
+ ovector, the return from the function was zero, implying that there were
+ other matches that did not fit. The correct "exactly full" value is now
+ returned.
+
6. If a subpattern that was called recursively or as a subroutine contained
- (*PRUNE) or any other control that caused it to give a non-standard return,
- invalid errors such as "Error -26 (nested recursion at the same subject
- position)" or even infinite loops could occur.
+ (*PRUNE) or any other control that caused it to give a non-standard return,
+ invalid errors such as "Error -26 (nested recursion at the same subject
+ position)" or even infinite loops could occur.
Version 8.13 16-Aug-2011
Modified: code/trunk/NEWS
===================================================================
--- code/trunk/NEWS 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/NEWS 2011-09-11 14:31:21 UTC (rev 691)
@@ -4,7 +4,7 @@
Release 8.20
------------
-The main change in this release is the inclusion of Zoltan Herczeg's
+The main change in this release is the inclusion of Zoltan Herczeg's
just-in-time compiler support, which can be accessed by building PCRE with
--enable-jit. Large performance benefits can be had in many situations. 8.20
also fixes an unfortunate bug that was introduced in 8.13 as well as tidying up
Modified: code/trunk/NON-UNIX-USE
===================================================================
--- code/trunk/NON-UNIX-USE 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/NON-UNIX-USE 2011-09-11 14:31:21 UTC (rev 691)
@@ -122,12 +122,12 @@
an unusual compiler) so that all included PCRE header files are first
sought in the current directory. Otherwise you run the risk of picking up
a previously-installed file from somewhere else.
-
+
(7) If you have defined SUPPORT_JIT in config.h, you must also compile
-
+
pcre_jit_compile.c
-
- This file #includes sources from the sljit subdirectory, where there
+
+ This file #includes sources from the sljit subdirectory, where there
should be 16 files, all of whose names begin with "sljit".
(8) Now link all the compiled code into an object library in whichever form
@@ -148,12 +148,12 @@
(11) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. Some tests are
- relevant only when certain build-time options are selected. For example,
- test 4 is for UTF-8 support, and will not run if you have build PCRE
- without it. See the comments at the start of each testinput file. If you
- have a suitable Unix-like shell, the RunTest script will run the
+ relevant only when certain build-time options are selected. For example,
+ test 4 is for UTF-8 support, and will not run if you have built PCRE
+ without it. See the comments at the start of each testinput file. If you
+ have a suitable Unix-like shell, the RunTest script will run the
appropriate tests for you.
-
+
Note that the supplied files are in Unix format, with just LF characters
as line terminators. You may need to edit them to change this if your
system uses a different convention. If you are using Windows, you probably
@@ -161,7 +161,7 @@
corresponding output file). This is a locale test; wintestinput3 sets the
locale to "french" rather than "fr_FR", and there are some minor output
differences.
-
+
(12) If you have built PCRE with SUPPORT_JIT, the JIT features will be tested
by the testdata files. However, you might also like to build and run
the JIT test program, pcre_jit_test.c.
Modified: code/trunk/PrepareRelease
===================================================================
--- code/trunk/PrepareRelease 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/PrepareRelease 2011-09-11 14:31:21 UTC (rev 691)
@@ -203,7 +203,7 @@
pcre_globals.c \
pcre_info.c \
pcre_jit_compile.c \
- pcre_jit_test.c \
+ pcre_jit_test.c \
pcre_maketables.c \
pcre_newline.c \
pcre_ord2utf8.c \
Modified: code/trunk/README
===================================================================
--- code/trunk/README 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/README 2011-09-11 14:31:21 UTC (rev 691)
@@ -173,15 +173,15 @@
--disable-cpp to the "configure" command. Otherwise, when "configure" is run,
it will try to find a C++ compiler and C++ header files, and if it succeeds,
it will try to build the C++ wrapper.
-
-. If you want to include support for just-in-time compiling, which can give
- large performance improvements on certain platforms, add --enable-jit to the
- "configure" command. This support is available only for certain hardware
- architectures. If you try to enable it on an unsupported architecture, there
+
+. If you want to include support for just-in-time compiling, which can give
+ large performance improvements on certain platforms, add --enable-jit to the
+ "configure" command. This support is available only for certain hardware
+ architectures. If you try to enable it on an unsupported architecture, there
will be a compile time error.
-
+
. When JIT support is enabled, pcregrep automatically makes use of it, unless
- you add --disable-pcregrep-jit to the "configure" command.
+ you add --disable-pcregrep-jit to the "configure" command.
. If you want to make use of the support for UTF-8 Unicode character strings in
PCRE, you must add --enable-utf8 to the "configure" command. Without it, the
@@ -355,10 +355,10 @@
Once "configure" has run, you can run "make". It builds two libraries, called
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
-command. If a C++ compiler was found on your system, and you did not disable it
+command. If a C++ compiler was found on your system, and you did not disable it
with --disable-cpp, "make" also builds the C++ wrapper library, which is called
libpcrecpp, and some test programs called pcrecpp_unittest,
-pcre_scanner_unittest, and pcre_stringpiece_unittest. If you enabled JIT
+pcre_scanner_unittest, and pcre_stringpiece_unittest. If you enabled JIT
support with --enable-jit, a test program called pcre_jit_test is also built.
The command "make check" runs all the appropriate tests. Details of the PCRE
@@ -394,7 +394,7 @@
Man pages (share/man/man{1,3}):
pcregrep.1
pcretest.1
- pcre-config.1
+ pcre-config.1
pcre.3
pcre*.3 (lots more pages, all starting "pcre")
@@ -412,7 +412,7 @@
pcre.txt (a concatenation of the man(3) pages)
pcretest.txt the pcretest man page
pcregrep.txt the pcregrep man page
- pcre-config.txt the pcre-config man page
+ pcre-config.txt the pcre-config man page
If you want to remove PCRE from your system, you can run "make uninstall".
This removes all the files that "make install" installed. However, it does not
@@ -548,7 +548,7 @@
created by the configuring process. There is also a script called RunGrepTest
that tests the options of the pcregrep command. If the C++ wrapper library is
built, three test programs called pcrecpp_unittest, pcre_scanner_unittest, and
-pcre_stringpiece_unittest are also built. When JIT support is enabled, another
+pcre_stringpiece_unittest are also built. When JIT support is enabled, another
test program called pcre_jit_test is built.
Both the scripts and all the program tests are run if you obey "make check" or
@@ -561,10 +561,10 @@
were selected. For example, the tests for UTF-8 support are run only if
--enable-utf8 was used. RunTest outputs a comment when it skips a test.
-Many of the tests that are not skipped are run up to three times. The second
-run forces pcre_study() to be called for all patterns except for a few in some
-tests that are marked "never study" (see the pcretest program for how this is
-done). If JIT support is available, the non-DFA tests are run a third time,
+Many of the tests that are not skipped are run up to three times. The second
+run forces pcre_study() to be called for all patterns except for a few in some
+tests that are marked "never study" (see the pcretest program for how this is
+done). If JIT support is available, the non-DFA tests are run a third time,
this time with a forced pcre_study() with the PCRE_STUDY_JIT_COMPILE option.
RunTest uses a file called testtry to hold the main output from pcretest
@@ -638,9 +638,9 @@
The thirteenth test checks a number of internals and non-Perl features concerned
with Unicode property support.
-The fourteenth test is run only when JIT support is available, and the
-fifteenth test is run only when JIT support is not available. They test some
-JIT-specific features such as information output from pcretest about JIT
+The fourteenth test is run only when JIT support is available, and the
+fifteenth test is run only when JIT support is not available. They test some
+JIT-specific features such as information output from pcretest about JIT
compilation.
@@ -721,7 +721,7 @@
pcre_get.c ) sources for the functions in the library,
pcre_globals.c ) and some internal functions that they use
pcre_info.c )
- pcre_jit_compile.c )
+ pcre_jit_compile.c )
pcre_maketables.c )
pcre_newline.c )
pcre_ord2utf8.c )
@@ -738,7 +738,7 @@
pcre.h.in template for pcre.h when built by "configure"
pcreposix.h header for the external POSIX wrapper API
pcre_internal.h header for internal use
- sljit/* 16 files that make up the JIT compiler
+ sljit/* 16 files that make up the JIT compiler
ucp.h header for Unicode property handling
config.h.in template for config.h, which is built by "configure"
@@ -805,7 +805,7 @@
mkinstalldirs script for making install directories
perltest.pl Perl test program
pcre-config.in source of script which retains PCRE information
- pcre_jit_test.c test program for the JIT compiler
+ pcre_jit_test.c test program for the JIT compiler
pcrecpp_unittest.cc )
pcre_scanner_unittest.cc ) test programs for the C++ wrapper
pcre_stringpiece_unittest.cc )
Modified: code/trunk/RunTest
===================================================================
--- code/trunk/RunTest 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/RunTest 2011-09-11 14:31:21 UTC (rev 691)
@@ -18,8 +18,8 @@
# two tests for JIT-specific features, one to be run when JIT support is
# available, and one when it is not.
-# The arguments for this script can be individual test numbers, or the word
-# "valgrind", or "sim" followed by an argument to run cross-compiled
+# The arguments for this script can be individual test numbers, or the word
+# "valgrind", or "sim" followed by an argument to run cross-compiled
# executables under a simulator, for example:
#
# RunTest 3 sim "qemu-arm -s 8388608"
Modified: code/trunk/configure.ac
===================================================================
--- code/trunk/configure.ac 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/configure.ac 2011-09-11 14:31:21 UTC (rev 691)
@@ -11,7 +11,7 @@
m4_define(pcre_major, [8])
m4_define(pcre_minor, [20])
m4_define(pcre_prerelease, [-RC1])
-m4_define(pcre_date, [2011-08-18])
+m4_define(pcre_date, [2011-09-12])
# Libtool shared library interface versions (current:revision:age)
m4_define(libpcre_version, [0:1:0])
@@ -491,7 +491,7 @@
if test "$enable_pcregrep_jit" = "yes"; then
AC_DEFINE([SUPPORT_PCREGREP_JIT], [], [
- Define to enable JIT support in pcregrep.])
+ Define to enable JIT support in pcregrep.])
fi
if test "$enable_utf8" = "yes"; then
@@ -771,7 +771,7 @@
Match limit recursion ........... : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
- Use JIT in pcregrep ............. : ${enable_pcregrep_jit}
+ Use JIT in pcregrep ............. : ${enable_pcregrep_jit}
Buffer size for pcregrep ........ : ${with_pcregrep_bufsize}
Link pcregrep with libz ......... : ${enable_pcregrep_libz}
Link pcregrep with libbz2 ....... : ${enable_pcregrep_libbz2}
Modified: code/trunk/doc/html/index.html
===================================================================
--- code/trunk/doc/html/index.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/index.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -1,10 +1,10 @@
<html>
-<!-- This is a manually maintained file that is the root of the HTML version of
- the PCRE documentation. When the HTML documents are built from the man
- page versions, the entire doc/html directory is emptied, this file is then
- copied into doc/html/index.html, and the remaining files therein are
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
--->
+-->
<head>
<title>PCRE specification</title>
</head>
@@ -83,12 +83,15 @@
</table>
<p>
-There are also individual pages that summarize the interface for each function
+There are also individual pages that summarize the interface for each function
in the library:
</p>
-<table>
+<table>
+<tr><td><a href="pcre_assign_jit_stack.html">pcre_assign_jit_stack</a></td>
+ <td> Assign stack for JIT matching</td></tr>
+
<tr><td><a href="pcre_compile.html">pcre_compile</a></td>
<td> Compile a regular expression</td></tr>
@@ -108,6 +111,9 @@
<td> Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
+<tr><td><a href="pcre_free_study.html">pcre_free_study</a></td>
+ <td> Free study data</td></tr>
+
<tr><td><a href="pcre_exec.html">pcre_exec</a></td>
<td> Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
@@ -136,9 +142,15 @@
<tr><td><a href="pcre_info.html">pcre_info</a></td>
<td> Obsolete information extraction function</td></tr>
+<tr><td><a href="pcre_jit_stack_alloc.html">pcre_jit_stack_alloc</a></td>
+ <td> Create a stack for JIT matching</td></tr>
+
+<tr><td><a href="pcre_jit_stack_free.html">pcre_jit_stack_free</a></td>
+ <td> Free a JIT matching stack</td></tr>
+
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td> Build character tables in current locale</td></tr>
-
+
<tr><td><a href="pcre_refcount.html">pcre_refcount</a></td>
<td> Maintain reference count in compiled pattern</td></tr>
Modified: code/trunk/doc/html/pcre.html
===================================================================
--- code/trunk/doc/html/pcre.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -98,8 +98,8 @@
pcrecpp details of the C++ wrapper
pcredemo a demonstration C program that uses PCRE
pcregrep description of the <b>pcregrep</b> command
- pcrejit discussion of the just-in-time optimization support
- pcrelimits details of size and other limits
+ pcrejit discussion of the just-in-time optimization support
+ pcrelimits details of size and other limits
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
pcrepattern syntax and semantics of supported regular expressions
@@ -110,7 +110,7 @@
pcrestack discussion of stack usage
pcresyntax quick syntax reference
pcretest description of the <b>pcretest</b> testing command
- pcreunicode discussion of Unicode and UTF-8 support
+ pcreunicode discussion of Unicode and UTF-8 support
</pre>
In addition, in the "man" and HTML formats, there is a short page for each
C library function, listing its arguments and results.
Modified: code/trunk/doc/html/pcre_assign_jit_stack.html
===================================================================
--- code/trunk/doc/html/pcre_assign_jit_stack.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_assign_jit_stack.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -0,0 +1,68 @@
+<html>
+<head>
+<title>pcre_assign_jit_stack specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre_assign_jit_stack man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<br><b>
+SYNOPSIS
+</b><br>
+<P>
+<b>#include &lt;pcre.h&gt;</b>
+</P>
+<P>
+<b>void pcre_assign_jit_stack(pcre_extra *<i>extra</i>, </b>
+<b>pcre_jit_callback <i>callback</i>, void *<i>data</i>);</b>
+</P>
+<br><b>
+DESCRIPTION
+</b><br>
+<P>
+This function provides control over the memory used as a stack at runtime by a
+call to <b>pcre_exec()</b> with a pattern that has been successfully compiled
+with JIT optimization. The arguments are:
+<pre>
+ extra the data pointer returned by <b>pcre_study()</b>
+ callback a callback function
+ data a JIT stack or a value to be passed to the callback
+ function
+</PRE>
+</P>
+<P>
+If <i>callback</i> is NULL and <i>data</i> is NULL, an internal 32K block on
+the machine stack is used.
+</P>
+<P>
+If <i>callback</i> is NULL and <i>data</i> is not NULL, <i>data</i> must
+be a valid JIT stack, the result of calling <b>pcre_jit_stack_alloc()</b>.
+</P>
+<P>
+If <i>callback</i> is not NULL, it is called with <i>data</i> as an argument at
+the start of matching, in order to set up a JIT stack. If the result is NULL,
+the internal 32K stack is used; otherwise the return value must be a valid JIT
+stack, the result of calling <b>pcre_jit_stack_alloc()</b>.
+</P>
+<P>
+You may safely assign the same JIT stack to multiple patterns, as long as they
+are all matched in the same thread. In a multithread application, each thread
+must use its own JIT stack. For more details, see the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+page.
+</P>
+<P>
+There is a complete description of the PCRE native API in the
+<a href="pcreapi.html"><b>pcreapi</b></a>
+page and a description of the POSIX API in the
+<a href="pcreposix.html"><b>pcreposix</b></a>
+page.
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
Modified: code/trunk/doc/html/pcre_config.html
===================================================================
--- code/trunk/doc/html/pcre_config.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_config.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -26,14 +26,18 @@
</b><br>
<P>
This function makes it possible for a client program to find out which optional
-features are available in the version of the PCRE library it is using. Its
+features are available in the version of the PCRE library it is using. The
arguments are as follows:
<pre>
<i>what</i> A code specifying what information is required
<i>where</i> Points to where to put the data
</pre>
-The available codes are:
+The <i>where</i> argument must point to an integer variable, except for
+PCRE_CONFIG_MATCH_LIMIT and PCRE_CONFIG_MATCH_LIMIT_RECURSION, when it must
+point to an unsigned long integer. The available codes are:
<pre>
+ PCRE_CONFIG_JIT Availability of just-in-time compiler
+ support (1=yes 0=no)
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
@@ -48,9 +52,8 @@
0 all Unicode line endings
1 CR, LF, or CRLF only
PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
- Threshold of return slots, above
- which <b>malloc()</b> is used by
- the POSIX API
+ Threshold of return slots, above which
+ <b>malloc()</b> is used by the POSIX API
PCRE_CONFIG_STACKRECURSE Recursion implementation (1=stack 0=heap)
PCRE_CONFIG_UTF8 Availability of UTF-8 support (1=yes 0=no)
PCRE_CONFIG_UNICODE_PROPERTIES
Modified: code/trunk/doc/html/pcre_dfa_exec.html
===================================================================
--- code/trunk/doc/html/pcre_dfa_exec.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_dfa_exec.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -82,17 +82,21 @@
<P>
A <b>pcre_extra</b> structure contains the following fields:
<pre>
- <i>flags</i> Bits indicating which fields are set
- <i>study_data</i> Opaque data from <b>pcre_study()</b>
- <i>match_limit</i> Limit on internal resource use
+ <i>flags</i> Bits indicating which fields are set
+ <i>study_data</i> Opaque data from <b>pcre_study()</b>
+ <i>match_limit</i> Limit on internal resource use
<i>match_limit_recursion</i> Limit on internal recursion depth
- <i>callout_data</i> Opaque data passed back to callouts
- <i>tables</i> Points to character tables or is NULL
+ <i>callout_data</i> Opaque data passed back to callouts
+ <i>tables</i> Points to character tables or is NULL
+ <i>mark</i> For passing back a *MARK pointer
+ <i>executable_jit</i> Opaque data from JIT compilation
</pre>
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES. For this matching function, the <i>match_limit</i> and
-<i>match_limit_recursion</i> fields are not used, and must not be set.
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA,
+PCRE_EXTRA_TABLES, PCRE_EXTRA_MARK and PCRE_EXTRA_EXECUTABLE_JIT. For this
+matching function, the <i>match_limit</i> and <i>match_limit_recursion</i> fields
+are not used, and must not be set. The PCRE_EXTRA_EXECUTABLE_JIT flag and
+the corresponding variable are ignored.
</P>
<P>
There is a complete description of the PCRE native API in the
Modified: code/trunk/doc/html/pcre_exec.html
===================================================================
--- code/trunk/doc/html/pcre_exec.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_exec.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -70,16 +70,18 @@
<a href="pcrepartial.html"><b>pcrepartial</b></a>
page. A <b>pcre_extra</b> structure contains the following fields:
<pre>
- <i>flags</i> Bits indicating which fields are set
- <i>study_data</i> Opaque data from <b>pcre_study()</b>
- <i>match_limit</i> Limit on internal resource use
+ <i>flags</i> Bits indicating which fields are set
+ <i>study_data</i> Opaque data from <b>pcre_study()</b>
+ <i>match_limit</i> Limit on internal resource use
<i>match_limit_recursion</i> Limit on internal recursion depth
- <i>callout_data</i> Opaque data passed back to callouts
- <i>tables</i> Points to character tables or is NULL
+ <i>callout_data</i> Opaque data passed back to callouts
+ <i>tables</i> Points to character tables or is NULL
+ <i>mark</i> For passing back a *MARK pointer
+ <i>executable_jit</i> Opaque data from JIT compilation
</pre>
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
-PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA, and
-PCRE_EXTRA_TABLES.
+PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA,
+PCRE_EXTRA_TABLES, PCRE_EXTRA_MARK and PCRE_EXTRA_EXECUTABLE_JIT.
</P>
<P>
There is a complete description of the PCRE native API in the
Modified: code/trunk/doc/html/pcre_free_study.html
===================================================================
--- code/trunk/doc/html/pcre_free_study.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_free_study.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -0,0 +1,40 @@
+<html>
+<head>
+<title>pcre_free_study specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre_free_study man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<br><b>
+SYNOPSIS
+</b><br>
+<P>
+<b>#include &lt;pcre.h&gt;</b>
+</P>
+<P>
+<b>void pcre_free_study(pcre_extra *<i>extra</i>);</b>
+</P>
+<br><b>
+DESCRIPTION
+</b><br>
+<P>
+This function is used to free the memory used for the data generated by a call
+to <b>pcre_study()</b> when it is no longer needed. The argument must be the
+result of such a call.
+</P>
+<P>
+There is a complete description of the PCRE native API in the
+<a href="pcreapi.html"><b>pcreapi</b></a>
+page and a description of the POSIX API in the
+<a href="pcreposix.html"><b>pcreposix</b></a>
+page.
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
Modified: code/trunk/doc/html/pcre_fullinfo.html
===================================================================
--- code/trunk/doc/html/pcre_fullinfo.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_fullinfo.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -43,7 +43,9 @@
or after newline, or
-2 otherwise
PCRE_INFO_FIRSTTABLE Table of first bytes (after studying)
+ PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
+ PCRE_INFO_JIT Return 1 after successful JIT compilation
PCRE_INFO_LASTLITERAL Literal last byte required
PCRE_INFO_MINLENGTH Lower bound length of matching strings
PCRE_INFO_NAMECOUNT Number of named subpatterns
@@ -55,6 +57,15 @@
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
</pre>
+The <i>where</i> argument must point to an integer variable, except for the
+following <i>what</i> values:
+<pre>
+ PCRE_INFO_DEFAULT_TABLES const unsigned char *
+ PCRE_INFO_FIRSTTABLE const unsigned char *
+ PCRE_INFO_NAMETABLE const unsigned char *
+ PCRE_INFO_OPTIONS unsigned long int
+ PCRE_INFO_SIZE size_t
+</pre>
The yield of the function is zero on success or:
<pre>
PCRE_ERROR_NULL the argument <i>code</i> was NULL
Modified: code/trunk/doc/html/pcre_jit_stack_alloc.html
===================================================================
--- code/trunk/doc/html/pcre_jit_stack_alloc.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_jit_stack_alloc.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -0,0 +1,47 @@
+<html>
+<head>
+<title>pcre_jit_stack_alloc specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre_jit_stack_alloc man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<br><b>
+SYNOPSIS
+</b><br>
+<P>
+<b>#include &lt;pcre.h&gt;</b>
+</P>
+<P>
+<b>pcre_jit_stack *pcre_jit_stack_alloc(int <i>startsize</i>, </b>
+<b>int <i>maxsize</i>);</b>
+</P>
+<br><b>
+DESCRIPTION
+</b><br>
+<P>
+This function is used to create a stack for use by the code compiled by the JIT
+optimization of <b>pcre_study()</b>. The arguments are a starting size for the
+stack, and a maximum size to which it is allowed to grow. The result can be
+passed to the JIT runtime code by <b>pcre_assign_jit_stack()</b>, or that
+function can set up a callback for obtaining a stack. A maximum stack size of
+512K to 1M should be more than enough for any pattern. For more details, see
+the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+page.
+</P>
+<P>
+There is a complete description of the PCRE native API in the
+<a href="pcreapi.html"><b>pcreapi</b></a>
+page and a description of the POSIX API in the
+<a href="pcreposix.html"><b>pcreposix</b></a>
+page.
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
Modified: code/trunk/doc/html/pcre_jit_stack_free.html
===================================================================
--- code/trunk/doc/html/pcre_jit_stack_free.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_jit_stack_free.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -0,0 +1,42 @@
+<html>
+<head>
+<title>pcre_jit_stack_free specification</title>
+</head>
+<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
+<h1>pcre_jit_stack_free man page</h1>
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
+<p>
+This page is part of the PCRE HTML documentation. It was generated automatically
+from the original man page. If there is any nonsense in it, please consult the
+man page, in case the conversion went wrong.
+<br>
+<br><b>
+SYNOPSIS
+</b><br>
+<P>
+<b>#include &lt;pcre.h&gt;</b>
+</P>
+<P>
+<b>void pcre_jit_stack_free(pcre_jit_stack *<i>stack</i>);</b>
+</P>
+<br><b>
+DESCRIPTION
+</b><br>
+<P>
+This function is used to free a JIT stack that was created by
+<b>pcre_jit_stack_alloc()</b> when it is no longer needed. For more details, see
+the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+page.
+</P>
+<P>
+There is a complete description of the PCRE native API in the
+<a href="pcreapi.html"><b>pcreapi</b></a>
+page and a description of the POSIX API in the
+<a href="pcreposix.html"><b>pcreposix</b></a>
+page.
+<p>
+Return to the <a href="index.html">PCRE index page</a>.
+</p>
Modified: code/trunk/doc/html/pcre_study.html
===================================================================
--- code/trunk/doc/html/pcre_study.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcre_study.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -34,7 +34,7 @@
<i>errptr</i> Where to put an error message
</pre>
If the function succeeds, it returns a value that can be passed to
-<b>pcre_exec()</b> via its <i>extra</i> argument.
+<b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> via their <i>extra</i> arguments.
</P>
<P>
If the function returns NULL, either it could not find any additional
@@ -42,8 +42,11 @@
the error value. It is NULL in the first case.
</P>
<P>
-There are currently no options defined; the value of the second argument should
-always be zero.
+The only option is PCRE_STUDY_JIT_COMPILE. It requests just-in-time compilation
+if possible. If PCRE has been compiled without JIT support, this option is
+ignored. See the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+page for further details.
</P>
<P>
There is a complete description of the PCRE native API in the
Modified: code/trunk/doc/html/pcreapi.html
===================================================================
--- code/trunk/doc/html/pcreapi.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcreapi.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -13,30 +13,32 @@
man page, in case the conversion went wrong.
<br>
<ul>
-<li><a name="TOC1" href="#SEC1">PCRE NATIVE API</a>
-<li><a name="TOC2" href="#SEC2">PCRE API OVERVIEW</a>
-<li><a name="TOC3" href="#SEC3">NEWLINES</a>
-<li><a name="TOC4" href="#SEC4">MULTITHREADING</a>
-<li><a name="TOC5" href="#SEC5">SAVING PRECOMPILED PATTERNS FOR LATER USE</a>
-<li><a name="TOC6" href="#SEC6">CHECKING BUILD-TIME OPTIONS</a>
-<li><a name="TOC7" href="#SEC7">COMPILING A PATTERN</a>
-<li><a name="TOC8" href="#SEC8">COMPILATION ERROR CODES</a>
-<li><a name="TOC9" href="#SEC9">STUDYING A PATTERN</a>
-<li><a name="TOC10" href="#SEC10">LOCALE SUPPORT</a>
-<li><a name="TOC11" href="#SEC11">INFORMATION ABOUT A PATTERN</a>
-<li><a name="TOC12" href="#SEC12">OBSOLETE INFO FUNCTION</a>
-<li><a name="TOC13" href="#SEC13">REFERENCE COUNTS</a>
-<li><a name="TOC14" href="#SEC14">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
-<li><a name="TOC15" href="#SEC15">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
-<li><a name="TOC16" href="#SEC16">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
-<li><a name="TOC17" href="#SEC17">DUPLICATE SUBPATTERN NAMES</a>
-<li><a name="TOC18" href="#SEC18">FINDING ALL POSSIBLE MATCHES</a>
-<li><a name="TOC19" href="#SEC19">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
-<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
-<li><a name="TOC21" href="#SEC21">AUTHOR</a>
-<li><a name="TOC22" href="#SEC22">REVISION</a>
+<li><a name="TOC1" href="#SEC1">PCRE NATIVE API BASIC FUNCTIONS</a>
+<li><a name="TOC2" href="#SEC2">PCRE NATIVE API AUXILIARY FUNCTIONS</a>
+<li><a name="TOC3" href="#SEC3">PCRE NATIVE API INDIRECTED FUNCTIONS</a>
+<li><a name="TOC4" href="#SEC4">PCRE API OVERVIEW</a>
+<li><a name="TOC5" href="#SEC5">NEWLINES</a>
+<li><a name="TOC6" href="#SEC6">MULTITHREADING</a>
+<li><a name="TOC7" href="#SEC7">SAVING PRECOMPILED PATTERNS FOR LATER USE</a>
+<li><a name="TOC8" href="#SEC8">CHECKING BUILD-TIME OPTIONS</a>
+<li><a name="TOC9" href="#SEC9">COMPILING A PATTERN</a>
+<li><a name="TOC10" href="#SEC10">COMPILATION ERROR CODES</a>
+<li><a name="TOC11" href="#SEC11">STUDYING A PATTERN</a>
+<li><a name="TOC12" href="#SEC12">LOCALE SUPPORT</a>
+<li><a name="TOC13" href="#SEC13">INFORMATION ABOUT A PATTERN</a>
+<li><a name="TOC14" href="#SEC14">OBSOLETE INFO FUNCTION</a>
+<li><a name="TOC15" href="#SEC15">REFERENCE COUNTS</a>
+<li><a name="TOC16" href="#SEC16">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
+<li><a name="TOC17" href="#SEC17">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
+<li><a name="TOC18" href="#SEC18">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
+<li><a name="TOC19" href="#SEC19">DUPLICATE SUBPATTERN NAMES</a>
+<li><a name="TOC20" href="#SEC20">FINDING ALL POSSIBLE MATCHES</a>
+<li><a name="TOC21" href="#SEC21">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
+<li><a name="TOC22" href="#SEC22">SEE ALSO</a>
+<li><a name="TOC23" href="#SEC23">AUTHOR</a>
+<li><a name="TOC24" href="#SEC24">REVISION</a>
</ul>
-<br><a name="SEC1" href="#TOC1">PCRE NATIVE API</a><br>
+<br><a name="SEC1" href="#TOC1">PCRE NATIVE API BASIC FUNCTIONS</a><br>
<P>
<b>#include <pcre.h></b>
</P>
@@ -56,11 +58,25 @@
<b>const char **<i>errptr</i>);</b>
</P>
<P>
+<b>void pcre_free_study(pcre_extra *<i>extra</i>);</b>
+</P>
+<P>
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>);</b>
</P>
+<br><a name="SEC2" href="#TOC1">PCRE NATIVE API AUXILIARY FUNCTIONS</a><br>
<P>
+<b>pcre_jit_stack *pcre_jit_stack_alloc(int <i>startsize</i>, int <i>maxsize</i>);</b>
+</P>
+<P>
+<b>void pcre_jit_stack_free(pcre_jit_stack *<i>stack</i>);</b>
+</P>
+<P>
+<b>void pcre_assign_jit_stack(pcre_extra *<i>extra</i>, </b>
+<b>pcre_jit_callback <i>callback</i>, void *<i>data</i>);</b>
+</P>
+<P>
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
<b>int <i>options</i>, int *<i>ovector</i>, int <i>ovecsize</i>,</b>
@@ -126,6 +142,7 @@
<P>
<b>char *pcre_version(void);</b>
</P>
+<br><a name="SEC3" href="#TOC1">PCRE NATIVE API INDIRECTED FUNCTIONS</a><br>
<P>
<b>void *(*pcre_malloc)(size_t);</b>
</P>
@@ -141,14 +158,15 @@
<P>
<b>int (*pcre_callout)(pcre_callout_block *);</b>
</P>
-<br><a name="SEC2" href="#TOC1">PCRE API OVERVIEW</a><br>
+<br><a name="SEC4" href="#TOC1">PCRE API OVERVIEW</a><br>
<P>
PCRE has its own native API, which is described in this document. There are
also some wrapper functions that correspond to the POSIX regular expression
-API. These are described in the
+API, but they do not give access to all the functionality. They are described
+in the
<a href="pcreposix.html"><b>pcreposix</b></a>
documentation. Both of these APIs define a set of C function calls. A C++
-wrapper is distributed with PCRE. It is documented in the
+wrapper is also distributed with PCRE. It is documented in the
<a href="pcrecpp.html"><b>pcrecpp</b></a>
page.
</P>
@@ -179,6 +197,18 @@
documentation describes how to compile and run it.
</P>
<P>
+Just-in-time compiler support is an optional feature of PCRE that can be built
+in appropriate hardware environments. It greatly speeds up the matching
+performance of many patterns. Simple programs can easily request that it be
+used if available, by setting an option that is ignored when it is not
+relevant. More complicated programs might need to make use of the functions
+<b>pcre_jit_stack_alloc()</b>, <b>pcre_jit_stack_free()</b>, and
+<b>pcre_assign_jit_stack()</b> in order to control the JIT code's memory usage.
+These functions are discussed in the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation.
+</P>
+<P>
A second matching function, <b>pcre_dfa_exec()</b>, which is not
Perl-compatible, is also provided. This uses a different algorithm for the
matching. The alternative algorithm finds all possible matches (at a given
@@ -254,7 +284,7 @@
<a href="pcrecallout.html"><b>pcrecallout</b></a>
documentation.
<a name="newlines"></a></P>
-<br><a name="SEC3" href="#TOC1">NEWLINES</a><br>
+<br><a name="SEC5" href="#TOC1">NEWLINES</a><br>
<P>
PCRE supports five different conventions for indicating line breaks in
strings: a single CR (carriage return) character, a single LF (linefeed)
@@ -293,7 +323,7 @@
the \n or \r escape sequences, nor does it affect what \R matches, which is
controlled in a similar way, but by separate options.
</P>
-<br><a name="SEC4" href="#TOC1">MULTITHREADING</a><br>
+<br><a name="SEC6" href="#TOC1">MULTITHREADING</a><br>
<P>
The PCRE functions can be used in multi-threading applications, with the
proviso that the memory management functions pointed to by <b>pcre_malloc</b>,
@@ -304,8 +334,14 @@
The compiled form of a regular expression is not altered during matching, so
the same compiled pattern can safely be used by several threads at once.
</P>
-<br><a name="SEC5" href="#TOC1">SAVING PRECOMPILED PATTERNS FOR LATER USE</a><br>
<P>
+If the just-in-time optimization feature is being used, it needs separate
+memory stack areas for each thread. See the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation for more details.
+</P>
+<br><a name="SEC7" href="#TOC1">SAVING PRECOMPILED PATTERNS FOR LATER USE</a><br>
+<P>
The compiled form of a regular expression can be saved and re-used at a later
time, possibly by a different program, and even on a host other than the one on
which it was compiled. Details are given in the
@@ -314,7 +350,7 @@
for use with a different version is not guaranteed to work and may cause
crashes.
</P>
-<br><a name="SEC6" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
+<br><a name="SEC8" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
<P>
<b>int pcre_config(int <i>what</i>, void *<i>where</i>);</b>
</P>
@@ -339,6 +375,11 @@
The output is an integer that is set to one if support for Unicode character
properties is available; otherwise it is set to zero.
<pre>
+ PCRE_CONFIG_JIT
+</pre>
+The output is an integer that is set to one if support for just-in-time
+compiling is available; otherwise it is set to zero.
+<pre>
PCRE_CONFIG_NEWLINE
</pre>
The output is an integer whose value specifies the default character sequence
@@ -393,7 +434,7 @@
<b>pcre_stack_free</b> are called to manage memory blocks on the heap, thus
avoiding the use of the stack.
</P>
-<br><a name="SEC7" href="#TOC1">COMPILING A PATTERN</a><br>
+<br><a name="SEC9" href="#TOC1">COMPILING A PATTERN</a><br>
<P>
<b>pcre *pcre_compile(const char *<i>pattern</i>, int <i>options</i>,</b>
<b>const char **<i>errptr</i>, int *<i>erroffset</i>,</b>
@@ -724,7 +765,7 @@
can also be passed to <b>pcre_exec()</b> and <b>pcre_dfa_exec()</b>, to suppress
the UTF-8 validity checking of subject strings.
</P>
-<br><a name="SEC8" href="#TOC1">COMPILATION ERROR CODES</a><br>
+<br><a name="SEC10" href="#TOC1">COMPILATION ERROR CODES</a><br>
<P>
The following table lists the error codes that may be returned by
<b>pcre_compile2()</b>, along with the error messages that may be returned by
@@ -808,7 +849,7 @@
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
be used if the limits were changed when PCRE was built.
<a name="studyingapattern"></a></P>
-<br><a name="SEC9" href="#TOC1">STUDYING A PATTERN</a><br>
+<br><a name="SEC11" href="#TOC1">STUDYING A PATTERN</a><br>
<P>
<b>pcre_extra *pcre_study(const pcre *<i>code</i>, int <i>options</i>,</b>
<b>const char **<i>errptr</i>);</b>
@@ -837,10 +878,24 @@
<b>pcre_dfa_exec()</b>, it must set up its own <b>pcre_extra</b> block.
</P>
<P>
-The second argument of <b>pcre_study()</b> contains option bits. At present, no
-options are defined, and this argument should always be zero.
+The second argument of <b>pcre_study()</b> contains option bits. There is only
+one option: PCRE_STUDY_JIT_COMPILE. If this is set, and the just-in-time
+compiler is available, the pattern is further compiled into machine code that
+executes much faster than the <b>pcre_exec()</b> matching function. If
+the just-in-time compiler is not available, this option is ignored. All other
+bits in the <i>options</i> argument must be zero.
</P>
<P>
+JIT compilation is a heavyweight optimization. It can take some time for
+patterns to be analyzed, and for one-off matches and simple patterns the
+benefit of faster execution might be offset by a much slower study time.
+Not all patterns can be optimized by the JIT compiler. For those that cannot be
+handled, matching automatically falls back to the <b>pcre_exec()</b>
+interpreter. For more details, see the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation.
+</P>
+<P>
The third argument for <b>pcre_study()</b> is a pointer for an error message. If
studying succeeds (even if no data is returned), the variable it points to is
set to NULL. Otherwise it is set to point to a textual error message. This is a
@@ -849,13 +904,30 @@
sure that it has run successfully.
</P>
<P>
-This is a typical call to <b>pcre_study</b>():
+When you are finished with a pattern, you can free the memory used for the
+study data by calling <b>pcre_free_study()</b>. This function was added to the
+API for release 8.20. For earlier versions, the memory could be freed with
+<b>pcre_free()</b>, just like the pattern itself. This will still work in cases
+where PCRE_STUDY_JIT_COMPILE is not used, but it is advisable to change to the
+new function when convenient.
+</P>
+<P>
+This is a typical way in which <b>pcre_study</b>() is used (except that in a
+real application there should be tests for errors):
<pre>
- pcre_extra *pe;
- pe = pcre_study(
+ int rc;
+ pcre *re;
+ pcre_extra *sd;
+ re = pcre_compile("pattern", 0, &error, &erroroffset, NULL);
+ sd = pcre_study(
re, /* result of pcre_compile() */
- 0, /* no options exist */
+ 0, /* no options */
&error); /* set to NULL or points to a message */
+ rc = pcre_exec( /* see below for details of pcre_exec() options */
+ re, sd, "subject", 7, 0, 0, ovector, 30);
+ ...
+ pcre_free_study(sd);
+ pcre_free(re);
</pre>
Studying a pattern does two things: first, a lower bound for the length of
subject string that is needed to match the pattern is computed. This does not
@@ -872,14 +944,18 @@
matching.
</P>
<P>
-The two optimizations just described can be disabled by setting the
-PCRE_NO_START_OPTIMIZE option when calling <b>pcre_exec()</b> or
+These two optimizations apply to both <b>pcre_exec()</b> and
+<b>pcre_dfa_exec()</b>. However, they are not used by <b>pcre_exec()</b> if
+<b>pcre_study()</b> is called with the PCRE_STUDY_JIT_COMPILE option, and
+just-in-time compiling is successful. The optimizations can be disabled by
+setting the PCRE_NO_START_OPTIMIZE option when calling <b>pcre_exec()</b> or
<b>pcre_dfa_exec()</b>. You might want to do this if your pattern contains
-callouts or (*MARK), and you want to make use of these facilities in cases
-where matching fails. See the discussion of PCRE_NO_START_OPTIMIZE
+callouts or (*MARK) (which cannot be handled by the JIT compiler), and you want
+to make use of these facilities in cases where matching fails. See the
+discussion of PCRE_NO_START_OPTIMIZE
<a href="#execoptions">below.</a>
<a name="localesupport"></a></P>
-<br><a name="SEC10" href="#TOC1">LOCALE SUPPORT</a><br>
+<br><a name="SEC12" href="#TOC1">LOCALE SUPPORT</a><br>
<P>
PCRE handles caseless matching, and determines whether characters are letters,
digits, or whatever, by reference to a set of tables, indexed by character
@@ -940,7 +1016,7 @@
one in which it was compiled. Passing table pointers at run time is discussed
below in the section on matching a pattern.
<a name="infoaboutpattern"></a></P>
-<br><a name="SEC11" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br>
+<br><a name="SEC13" href="#TOC1">INFORMATION ABOUT A PATTERN</a><br>
<P>
<b>int pcre_fullinfo(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>int <i>what</i>, void *<i>where</i>);</b>
@@ -971,7 +1047,7 @@
size_t length;
rc = pcre_fullinfo(
re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
+ sd, /* result of pcre_study(), or NULL */
PCRE_INFO_SIZE, /* what is required */
&length); /* where to put the data */
</pre>
@@ -1040,6 +1116,17 @@
0. The fourth argument should point to an <b>int</b> variable. (?J) and
(?-J) set and unset the local PCRE_DUPNAMES option, respectively.
<pre>
+ PCRE_INFO_JIT
+</pre>
+Return 1 if the pattern was studied with the PCRE_STUDY_JIT_COMPILE option, and
+just-in-time compiling was successful. The fourth argument should point to an
+<b>int</b> variable. A return value of 0 means that JIT support is not available
+in this version of PCRE, or that the pattern was not studied with the
+PCRE_STUDY_JIT_COMPILE option, or that the JIT compiler could not handle this
+particular pattern. See the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation for details of what can and cannot be handled.
+<pre>
PCRE_INFO_LASTLITERAL
</pre>
Return the value of the rightmost literal byte that must exist in any matched
@@ -1166,7 +1253,7 @@
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
documentation for details).
</P>
-<br><a name="SEC12" href="#TOC1">OBSOLETE INFO FUNCTION</a><br>
+<br><a name="SEC14" href="#TOC1">OBSOLETE INFO FUNCTION</a><br>
<P>
<b>int pcre_info(const pcre *<i>code</i>, int *<i>optptr</i>, int</b>
<b>*<i>firstcharptr</i>);</b>
@@ -1190,7 +1277,7 @@
it is used to pass back information about the first character of any matched
string (see PCRE_INFO_FIRSTBYTE above).
</P>
-<br><a name="SEC13" href="#TOC1">REFERENCE COUNTS</a><br>
+<br><a name="SEC15" href="#TOC1">REFERENCE COUNTS</a><br>
<P>
<b>int pcre_refcount(pcre *<i>code</i>, int <i>adjust</i>);</b>
</P>
@@ -1214,7 +1301,7 @@
pattern is compiled on one host and then transferred to a host whose byte-order
is different. (This seems a highly unlikely scenario.)
</P>
-<br><a name="SEC14" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
+<br><a name="SEC16" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
<P>
<b>int pcre_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
@@ -1267,6 +1354,7 @@
<pre>
unsigned long int <i>flags</i>;
void *<i>study_data</i>;
+ void *<i>executable_jit</i>;
unsigned long int <i>match_limit</i>;
unsigned long int <i>match_limit_recursion</i>;
void *<i>callout_data</i>;
@@ -1277,16 +1365,18 @@
are set. The flag bits are:
<pre>
PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_EXECUTABLE_JIT
PCRE_EXTRA_MATCH_LIMIT
PCRE_EXTRA_MATCH_LIMIT_RECURSION
PCRE_EXTRA_CALLOUT_DATA
PCRE_EXTRA_TABLES
PCRE_EXTRA_MARK
</pre>
-Other flag bits should be set to zero. The <i>study_data</i> field is set in the
-<b>pcre_extra</b> block that is returned by <b>pcre_study()</b>, together with
-the appropriate flag bit. You should not set this yourself, but you may add to
-the block by setting the other fields and their corresponding flag bits.
+Other flag bits should be set to zero. The <i>study_data</i> field and sometimes
+the <i>executable_jit</i> field are set in the <b>pcre_extra</b> block that is
+returned by <b>pcre_study()</b>, together with the appropriate flag bits. You
+should not set these yourself, but you may add to the block by setting the
+other fields and their corresponding flag bits.
</P>
<P>
The <i>match_limit</i> field provides a means of preventing PCRE from using up a
@@ -1295,14 +1385,22 @@
classic example is a pattern that uses nested unlimited repeats.
</P>
<P>
-Internally, PCRE uses a function called <b>match()</b> which it calls repeatedly
-(sometimes recursively). The limit set by <i>match_limit</i> is imposed on the
-number of times this function is called during a match, which has the effect of
-limiting the amount of backtracking that can take place. For patterns that are
-not anchored, the count restarts from zero for each position in the subject
-string.
+Internally, <b>pcre_exec()</b> uses a function called <b>match()</b>, which it
+calls repeatedly (sometimes recursively). The limit set by <i>match_limit</i> is
+imposed on the number of times this function is called during a match, which
+has the effect of limiting the amount of backtracking that can take place. For
+patterns that are not anchored, the count restarts from zero for each position
+in the subject string.
</P>
<P>
+When <b>pcre_exec()</b> is called with a pattern that was successfully studied
+with the PCRE_STUDY_JIT_COMPILE option, the way that the matching is executed
+is entirely different. However, there is still the possibility of runaway
+matching that goes on for a very long time, and so the <i>match_limit</i> value
+is also used in this case (but in a different way) to limit how long the
+matching can continue.
+</P>
+<P>
The default value for the limit can be set when PCRE is built; the default
default is 10 million, which handles all but the most extreme cases. You can
override the default by supplying <b>pcre_exec()</b> with a <b>pcre_extra</b>
@@ -1318,9 +1416,11 @@
This limit is of use only if it is set smaller than <i>match_limit</i>.
</P>
<P>
-Limiting the recursion depth limits the amount of stack that can be used, or,
-when PCRE has been compiled to use memory on the heap instead of the stack, the
-amount of heap memory that can be used.
+Limiting the recursion depth limits the amount of machine stack that can be
+used, or, when PCRE has been compiled to use memory on the heap instead of the
+stack, the amount of heap memory that can be used. This limit is not relevant,
+and is ignored, if the pattern was successfully studied with
+PCRE_STUDY_JIT_COMPILE.
</P>
<P>
The default value for <i>match_limit_recursion</i> can be set when PCRE is
@@ -1373,6 +1473,14 @@
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, PCRE_NOTEMPTY_ATSTART,
PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and
PCRE_PARTIAL_HARD.
+</P>
+<P>
+If the pattern was successfully studied with the PCRE_STUDY_JIT_COMPILE option,
+the only supported options for JIT execution are PCRE_NO_UTF8_CHECK,
+PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NOTEMPTY_ATSTART. Note in
+particular that partial matching is not supported. If an unsupported option is
+used, JIT execution is disabled and the normal interpretive code in
+<b>pcre_exec()</b> is run.
<pre>
PCRE_ANCHORED
</pre>
@@ -1684,14 +1792,30 @@
<P>
If the vector is too small to hold all the captured substring offsets, it is
used as far as possible (up to two-thirds of its length), and the function
-returns a value of zero. If the substring offsets are not of interest,
-<b>pcre_exec()</b> may be called with <i>ovector</i> passed as NULL and
-<i>ovecsize</i> as zero. However, if the pattern contains back references and
-the <i>ovector</i> is not big enough to remember the related substrings, PCRE
-has to get additional memory for use during matching. Thus it is usually
-advisable to supply an <i>ovector</i>.
+returns a value of zero. If neither the actual string matched nor any captured
+substrings are of interest, <b>pcre_exec()</b> may be called with <i>ovector</i>
+passed as NULL and <i>ovecsize</i> as zero. However, if the pattern contains
+back references and the <i>ovector</i> is not big enough to remember the related
+substrings, PCRE has to get additional memory for use during matching. Thus it
+is usually advisable to supply an <i>ovector</i> of reasonable size.
</P>
<P>
+There are some cases where zero is returned (indicating vector overflow) when
+in fact the vector is exactly the right size for the final match. For example,
+consider the pattern
+<pre>
+ (a)(?:(b)c|bd)
+</pre>
+If a vector of 6 elements (allowing for only 1 captured substring) is given
+with subject string "abd", <b>pcre_exec()</b> will try to set the second
+captured string, thereby recording a vector overflow, before failing to match
+"c" and backing up to try the second alternative. The zero return, however,
+does correctly indicate that the maximum number of slots (namely 2) have been
+filled. In similar cases where there is temporary overflow, but the final
+number of used slots is actually less than the maximum, a non-zero value is
+returned.
+</P>
+<P>
The <b>pcre_fullinfo()</b> function can be used to find out how many capturing
subpatterns there are in a compiled pattern. The smallest size for
<i>ovector</i> that will allow for <i>n</i> captured substrings, in addition to
@@ -1714,11 +1838,11 @@
(assuming the vector is large enough, of course) are set to -1.
</P>
<P>
-<b>Note</b>: Elements of <i>ovector</i> that do not correspond to capturing
-parentheses in the pattern are never changed. That is, if a pattern contains
-<i>n</i> capturing parentheses, no more than <i>ovector[0]</i> to
-<i>ovector[2n+1]</i> are set by <b>pcre_exec()</b>. The other elements retain
-whatever values they previously had.
+<b>Note</b>: Elements in the first two-thirds of <i>ovector</i> that do not
+correspond to capturing parentheses in the pattern are never changed. That is,
+if a pattern contains <i>n</i> capturing parentheses, no more than
+<i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by <b>pcre_exec()</b>. The other
+elements (in the first two-thirds) retain whatever values they previously had.
</P>
<P>
Some convenience functions are provided for extracting the captured substrings
@@ -1864,6 +1988,14 @@
faulted at compile time, but more complicated cases, in particular mutual
recursions between two different subpatterns, cannot be detected until run
time.
+<pre>
+ PCRE_ERROR_JIT_STACKLIMIT (-27)
+</pre>
+This error is returned when a pattern that was successfully studied using the
+PCRE_STUDY_JIT_COMPILE option is being matched, but the memory available for
+the just-in-time processing stack is not large enough. See the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation for more details.
</P>
<P>
Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
@@ -1941,7 +2073,7 @@
The first byte of a character has the value 0xfe or 0xff. These values can
never occur in a valid UTF-8 string.
</P>
-<br><a name="SEC15" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
+<br><a name="SEC17" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
<P>
<b>int pcre_copy_substring(const char *<i>subject</i>, int *<i>ovector</i>,</b>
<b>int <i>stringcount</i>, int <i>stringnumber</i>, char *<i>buffer</i>,</b>
@@ -2036,7 +2168,7 @@
<b>pcre_free</b> directly; it is for these cases that the functions are
provided.
</P>
-<br><a name="SEC16" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
+<br><a name="SEC18" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
<P>
<b>int pcre_get_stringnumber(const pcre *<i>code</i>,</b>
<b>const char *<i>name</i>);</b>
@@ -2100,7 +2232,7 @@
numbers. For this reason, the use of different names for subpatterns of the
same number causes an error at compile time.
</P>
-<br><a name="SEC17" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
+<br><a name="SEC19" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
<P>
<b>int pcre_get_stringtable_entries(const pcre *<i>code</i>,</b>
<b>const char *<i>name</i>, char **<i>first</i>, char **<i>last</i>);</b>
@@ -2138,7 +2270,7 @@
Given all the relevant entries for the name, you can extract each of their
numbers, and hence the captured data, if any.
</P>
-<br><a name="SEC18" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>
+<br><a name="SEC20" href="#TOC1">FINDING ALL POSSIBLE MATCHES</a><br>
<P>
The traditional matching function uses a similar algorithm to Perl, which stops
when it finds the first match, starting at a given point in the subject. If you
@@ -2157,7 +2289,7 @@
other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b>
will yield PCRE_ERROR_NOMATCH.
<a name="dfamatch"></a></P>
-<br><a name="SEC19" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
+<br><a name="SEC21" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
<P>
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
@@ -2288,7 +2420,8 @@
The strings are returned in reverse order of length; that is, the longest
matching string is given first. If there were too many matches to fit into
<i>ovector</i>, the yield of the function is zero, and the vector is filled with
-the longest matches.
+the longest matches. Unlike <b>pcre_exec()</b>, <b>pcre_dfa_exec()</b> can use
+the entire <i>ovector</i> for returning matched strings.
</P>
<br><b>
Error returns from <b>pcre_dfa_exec()</b>
@@ -2315,8 +2448,9 @@
PCRE_ERROR_DFA_UMLIMIT (-18)
</pre>
This return is given if <b>pcre_dfa_exec()</b> is called with an <i>extra</i>
-block that contains a setting of the <i>match_limit</i> field. This is not
-supported (it is meaningless).
+block that contains a setting of the <i>match_limit</i> or
+<i>match_limit_recursion</i> fields. This is not supported (these fields are
+meaningless for DFA matching).
<pre>
PCRE_ERROR_DFA_WSSIZE (-19)
</pre>
@@ -2330,13 +2464,13 @@
error is given if the output vector is not large enough. This should be
extremely rare, as a vector of size 1000 is used.
</P>
-<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC22" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).
</P>
-<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC23" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -2345,9 +2479,9 @@
Cambridge CB2 3QH, England.
<br>
</P>
-<br><a name="SEC22" href="#TOC1">REVISION</a><br>
+<br><a name="SEC24" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 13 August 2011
+Last updated: 06 September 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcrebuild.html
===================================================================
--- code/trunk/doc/html/pcrebuild.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcrebuild.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -18,20 +18,21 @@
<li><a name="TOC3" href="#SEC3">C++ SUPPORT</a>
<li><a name="TOC4" href="#SEC4">UTF-8 SUPPORT</a>
<li><a name="TOC5" href="#SEC5">UNICODE CHARACTER PROPERTY SUPPORT</a>
-<li><a name="TOC6" href="#SEC6">CODE VALUE OF NEWLINE</a>
-<li><a name="TOC7" href="#SEC7">WHAT \R MATCHES</a>
-<li><a name="TOC8" href="#SEC8">POSIX MALLOC USAGE</a>
-<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
-<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
-<li><a name="TOC11" href="#SEC11">LIMITING PCRE RESOURCE USAGE</a>
-<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
-<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
-<li><a name="TOC14" href="#SEC14">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
-<li><a name="TOC15" href="#SEC15">PCREGREP BUFFER SIZE</a>
-<li><a name="TOC16" href="#SEC16">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
-<li><a name="TOC17" href="#SEC17">SEE ALSO</a>
-<li><a name="TOC18" href="#SEC18">AUTHOR</a>
-<li><a name="TOC19" href="#SEC19">REVISION</a>
+<li><a name="TOC6" href="#SEC6">JUST-IN-TIME COMPILER SUPPORT</a>
+<li><a name="TOC7" href="#SEC7">CODE VALUE OF NEWLINE</a>
+<li><a name="TOC8" href="#SEC8">WHAT \R MATCHES</a>
+<li><a name="TOC9" href="#SEC9">POSIX MALLOC USAGE</a>
+<li><a name="TOC10" href="#SEC10">HANDLING VERY LARGE PATTERNS</a>
+<li><a name="TOC11" href="#SEC11">AVOIDING EXCESSIVE STACK USAGE</a>
+<li><a name="TOC12" href="#SEC12">LIMITING PCRE RESOURCE USAGE</a>
+<li><a name="TOC13" href="#SEC13">CREATING CHARACTER TABLES AT BUILD TIME</a>
+<li><a name="TOC14" href="#SEC14">USING EBCDIC CODE</a>
+<li><a name="TOC15" href="#SEC15">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
+<li><a name="TOC16" href="#SEC16">PCREGREP BUFFER SIZE</a>
+<li><a name="TOC17" href="#SEC17">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a>
+<li><a name="TOC18" href="#SEC18">SEE ALSO</a>
+<li><a name="TOC19" href="#SEC19">AUTHOR</a>
+<li><a name="TOC20" href="#SEC20">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE BUILD-TIME OPTIONS</a><br>
<P>
@@ -120,8 +121,25 @@
<a href="pcrepattern.html"><b>pcrepattern</b></a>
documentation.
</P>
-<br><a name="SEC6" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
+<br><a name="SEC6" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
<P>
+Just-in-time compiler support is included in the build by specifying
+<pre>
+ --enable-jit
+</pre>
+This support is available only for certain hardware architectures. If this
+option is set for an unsupported architecture, a compile time error occurs.
+See the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation for a discussion of JIT usage. When JIT support is enabled,
+pcregrep automatically makes use of it, unless you add
+<pre>
+ --disable-pcregrep-jit
+</pre>
+to the "configure" command.
+</P>
+<br><a name="SEC7" href="#TOC1">CODE VALUE OF NEWLINE</a><br>
+<P>
By default, PCRE interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can
compile PCRE to use carriage return (CR) instead, by adding
@@ -153,7 +171,7 @@
overridden when the library functions are called. At build time it is
conventional to use the standard for your operating system.
</P>
-<br><a name="SEC7" href="#TOC1">WHAT \R MATCHES</a><br>
+<br><a name="SEC8" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
By default, the sequence \R in a pattern matches any Unicode newline sequence,
whatever has been selected as the line ending sequence. If you specify
@@ -164,7 +182,7 @@
selected when PCRE is built can be overridden when the library functions are
called.
</P>
-<br><a name="SEC8" href="#TOC1">POSIX MALLOC USAGE</a><br>
+<br><a name="SEC9" href="#TOC1">POSIX MALLOC USAGE</a><br>
<P>
When PCRE is called through the POSIX interface (see the
<a href="pcreposix.html"><b>pcreposix</b></a>
@@ -180,7 +198,7 @@
</pre>
to the <b>configure</b> command.
</P>
-<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
+<br><a name="SEC10" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<P>
Within a compiled pattern, offset values are used to point from one part to
another (for example, from an opening parenthesis to an alternation
@@ -196,7 +214,7 @@
longer offsets slows down the operation of PCRE because it has to load
additional bytes when handling them.
</P>
-<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
+<br><a name="SEC11" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<P>
When matching with the <b>pcre_exec()</b> function, PCRE implements backtracking
by making recursive calls to an internal function called <b>match()</b>. In
@@ -227,7 +245,7 @@
slowly when built in this way. This option affects only the <b>pcre_exec()</b>
function; it is not relevant for <b>pcre_dfa_exec()</b>.
</P>
-<br><a name="SEC11" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
+<br><a name="SEC12" href="#TOC1">LIMITING PCRE RESOURCE USAGE</a><br>
<P>
Internally, PCRE has a function called <b>match()</b>, which it calls repeatedly
(sometimes recursively) when matching a pattern with the <b>pcre_exec()</b>
@@ -256,7 +274,7 @@
</pre>
to the <b>configure</b> command. This value can also be overridden at run time.
</P>
-<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
+<br><a name="SEC13" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<P>
PCRE uses fixed tables for processing characters whose code values are less
than 256. By default, PCRE is built with a set of tables that are distributed
@@ -273,7 +291,7 @@
create alternative tables when cross compiling, you will have to do so "by
hand".)
</P>
-<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
+<br><a name="SEC14" href="#TOC1">USING EBCDIC CODE</a><br>
<P>
PCRE assumes by default that it will run in an environment where the character
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
@@ -287,7 +305,7 @@
an EBCDIC environment (for example, an IBM mainframe operating system). The
--enable-ebcdic option is incompatible with --enable-utf8.
</P>
-<br><a name="SEC14" href="#TOC1">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
+<br><a name="SEC15" href="#TOC1">PCREGREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<P>
By default, <b>pcregrep</b> reads all files as plain text. You can build it so
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
@@ -300,7 +318,7 @@
relevant libraries are installed on your system. Configuration will fail if
they are not.
</P>
-<br><a name="SEC15" href="#TOC1">PCREGREP BUFFER SIZE</a><br>
+<br><a name="SEC16" href="#TOC1">PCREGREP BUFFER SIZE</a><br>
<P>
<b>pcregrep</b> uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when it
@@ -315,7 +333,7 @@
to the <b>configure</b> command. The caller of <b>pcregrep</b> can, however,
override this value by specifying a run-time option.
</P>
-<br><a name="SEC16" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
+<br><a name="SEC17" href="#TOC1">PCRETEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<P>
If you add
<pre>
@@ -346,11 +364,11 @@
</pre>
immediately before the <b>configure</b> command.
</P>
-<br><a name="SEC17" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC18" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcreapi</b>(3), <b>pcre_config</b>(3).
</P>
-<br><a name="SEC18" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC19" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -359,9 +377,9 @@
Cambridge CB2 3QH, England.
<br>
</P>
-<br><a name="SEC19" href="#TOC1">REVISION</a><br>
+<br><a name="SEC20" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 02 August 2011
+Last updated: 06 September 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcrecallout.html
===================================================================
--- code/trunk/doc/html/pcrecallout.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcrecallout.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -60,6 +60,11 @@
indicates how the pattern is matched. This is useful information when you are
trying to optimize the performance of a particular pattern.
</P>
+<P>
+The use of callouts in a pattern makes it ineligible for optimization by the
+just-in-time compiler. Studying such a pattern with the PCRE_STUDY_JIT_COMPILE
+option always fails.
+</P>
<br><a name="SEC2" href="#TOC1">MISSING CALLOUTS</a><br>
<P>
You should be aware that, because of optimizations in the way PCRE matches
@@ -214,7 +219,7 @@
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 31 July 2011
+Last updated: 26 August 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcregrep.html
===================================================================
--- code/trunk/doc/html/pcregrep.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcregrep.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -456,6 +456,14 @@
<b>--line-offsets</b> is used.
</P>
<P>
+<b>--no-jit</b>
+If the PCRE library is built with support for just-in-time compiling (which
+speeds up matching), <b>pcregrep</b> automatically makes use of this, unless it
+was explicitly disabled at build time. This option can be used to disable the
+use of JIT at run time. It is provided for testing and working round problems.
+It should never be needed in normal use.
+</P>
+<P>
<b>-o</b>, <b>--only-matching</b>
Show only the part of the line that matched a pattern instead of the whole
line. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and
@@ -634,7 +642,7 @@
</P>
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 30 July 2011
+Last updated: 06 September 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcrejit.html
===================================================================
--- code/trunk/doc/html/pcrejit.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcrejit.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -13,7 +13,262 @@
man page, in case the conversion went wrong.
<br>
<ul>
+<li><a name="TOC1" href="#SEC1">PCRE JUST-IN-TIME COMPILER SUPPORT</a>
+<li><a name="TOC2" href="#SEC2">AVAILABILITY OF JIT SUPPORT</a>
+<li><a name="TOC3" href="#SEC3">SIMPLE USE OF JIT</a>
+<li><a name="TOC4" href="#SEC4">UNSUPPORTED OPTIONS AND PATTERN ITEMS</a>
+<li><a name="TOC5" href="#SEC5">RETURN VALUES FROM JIT EXECUTION</a>
+<li><a name="TOC6" href="#SEC6">SAVING AND RESTORING COMPILED PATTERNS</a>
+<li><a name="TOC7" href="#SEC7">CONTROLLING THE JIT STACK</a>
+<li><a name="TOC8" href="#SEC8">EXAMPLE CODE</a>
+<li><a name="TOC9" href="#SEC9">SEE ALSO</a>
+<li><a name="TOC10" href="#SEC10">AUTHOR</a>
+<li><a name="TOC11" href="#SEC11">REVISION</a>
</ul>
+<br><a name="SEC1" href="#TOC1">PCRE JUST-IN-TIME COMPILER SUPPORT</a><br>
+<P>
+Just-in-time compiling is a heavyweight optimization that can greatly speed up
+pattern matching. However, it comes at the cost of extra processing before the
+match is performed. Therefore, it is of most benefit when the same pattern is
+going to be matched many times. This does not necessarily mean many calls of
+<b>pcre_exec()</b>; if the pattern is not anchored, matching attempts may take
+place many times at various positions in the subject, even for a single call to
+<b>pcre_exec()</b>. If the subject string is very long, it may still pay to use
+JIT for one-off matches.
+</P>
+<P>
+JIT support applies only to the traditional matching function,
+<b>pcre_exec()</b>. It does not apply when <b>pcre_dfa_exec()</b> is being used.
+The code for this support was written by Zoltan Herczeg.
+</P>
+<br><a name="SEC2" href="#TOC1">AVAILABILITY OF JIT SUPPORT</a><br>
+<P>
+JIT support is an optional feature of PCRE. The "configure" option --enable-jit
+(or equivalent CMake option) must be set when PCRE is built if you want to use
+JIT. The support is limited to the following hardware platforms:
+<pre>
+ ARM v5, v7, and Thumb2
+ Intel x86 32-bit and 64-bit
+ MIPS 32-bit
+ Power PC 32-bit and 64-bit
+</pre>
+If --enable-jit is set on an unsupported platform, compilation fails.
+</P>
+<P>
+A program can tell if JIT support is available by calling <b>pcre_config()</b>
+with the PCRE_CONFIG_JIT option. The result is 1 when JIT is available, and 0
+otherwise. However, a simple program does not need to check this in order to
+use JIT. The API is implemented in a way that falls back to the ordinary PCRE
+code if JIT is not available.
+</P>
+<br><a name="SEC3" href="#TOC1">SIMPLE USE OF JIT</a><br>
+<P>
+You have to do two things to make use of the JIT support in the simplest way:
+<pre>
+ (1) Call <b>pcre_study()</b> with the PCRE_STUDY_JIT_COMPILE option for
+ each compiled pattern, and pass the resulting <b>pcre_extra</b> block to
+ <b>pcre_exec()</b>.
+
+ (2) Use <b>pcre_free_study()</b> to free the <b>pcre_extra</b> block when it is
+ no longer needed instead of just freeing it yourself. This
+ ensures that any JIT data is also freed.
+</pre>
+In some circumstances you may need to call additional functions. These are
+described in the section entitled
+<a href="#stackcontrol">"Controlling the JIT stack"</a>
+below.
+</P>
+<P>
+If JIT support is not available, PCRE_STUDY_JIT_COMPILE is ignored, and no JIT
+data is set up. Otherwise, the compiled pattern is passed to the JIT compiler,
+which turns it into machine code that executes much faster than the normal
+interpretive code. When <b>pcre_exec()</b> is passed a <b>pcre_extra</b> block
+containing a pointer to JIT code, it obeys that instead of the normal code. The
+result is identical, but the code runs much faster.
+</P>
+<P>
+There are some <b>pcre_exec()</b> options that are not supported for JIT
+execution. There are also some pattern items that JIT cannot handle. Details
+are given below. In both cases, execution automatically falls back to the
+interpretive code.
+</P>
+<P>
+If the JIT compiler finds an unsupported item, no JIT data is generated. You
+can find out if JIT execution is available after studying a pattern by calling
+<b>pcre_fullinfo()</b> with the PCRE_INFO_JIT option. A result of 1 means that
+JIT compilation was successful. A result of 0 means that JIT support is not
+available, or the pattern was not studied with PCRE_STUDY_JIT_COMPILE, or the
+JIT compiler was not able to handle the pattern.
+</P>
+<br><a name="SEC4" href="#TOC1">UNSUPPORTED OPTIONS AND PATTERN ITEMS</a><br>
+<P>
+The only <b>pcre_exec()</b> options that are supported for JIT execution are
+PCRE_NO_UTF8_CHECK, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and
+PCRE_NOTEMPTY_ATSTART. Note in particular that partial matching is not
+supported.
+</P>
+<P>
+The unsupported pattern items are:
+<pre>
+ \C match a single byte, even in UTF-8 mode
+ (?Cn) callouts
+ (?(&lt;name&gt;)... conditional test on setting of a named subpattern
+ (?(R)... conditional test on whole pattern recursion
+ (?(Rn)... conditional test on recursion, by number
+ (?(R&name)... conditional test on recursion, by name
+ (*COMMIT) )
+ (*MARK) )
+ (*PRUNE) ) the backtracking control verbs
+ (*SKIP) )
+ (*THEN) )
+</pre>
+Support for some of these may be added in future.
+</P>
+<br><a name="SEC5" href="#TOC1">RETURN VALUES FROM JIT EXECUTION</a><br>
+<P>
+When a pattern is matched using JIT execution, the return values are the same
+as those given by the interpretive <b>pcre_exec()</b> code, with the addition of
+one new error code: PCRE_ERROR_JIT_STACKLIMIT. This means that the memory used
+for the JIT stack was insufficient. See
+<a href="#stackcontrol">"Controlling the JIT stack"</a>
+below for a discussion of JIT stack usage. For compatibility with the
+interpretive <b>pcre_exec()</b> code, no more than two-thirds of the
+<i>ovector</i> argument is used for passing back captured substrings.
+</P>
+<P>
+The error code PCRE_ERROR_MATCHLIMIT is returned by the JIT code if searching a
+very large pattern tree goes on for too long, as it is in the same circumstance
+when JIT is not used, but the details of exactly what is counted are not the
+same. The PCRE_ERROR_RECURSIONLIMIT error code is never returned by JIT
+execution.
+</P>
+<br><a name="SEC6" href="#TOC1">SAVING AND RESTORING COMPILED PATTERNS</a><br>
+<P>
+The code that is generated by the JIT compiler is architecture-specific, and is
+also position dependent. For those reasons it cannot be saved and restored like
+the bytecode and other data of a compiled pattern. You should be able to run
+<b>pcre_study()</b> on a saved and restored pattern, and thereby recreate the
+JIT data, but because JIT compilation uses significant resources, it is
+probably not worth doing this.
+<a name="stackcontrol"></a></P>
+<br><a name="SEC7" href="#TOC1">CONTROLLING THE JIT STACK</a><br>
+<P>
+When the compiled JIT code runs, it needs a block of memory to use as a stack.
+By default, it uses 32K on the machine stack. However, some large or
+complicated patterns need more than this. The error PCRE_ERROR_JIT_STACKLIMIT
+is given when there is not enough stack. Three functions are provided for
+managing blocks of memory for use as JIT stacks.
+</P>
+<P>
+The <b>pcre_jit_stack_alloc()</b> function creates a JIT stack. Its arguments
+are a starting size and a maximum size, and it returns a pointer to an opaque
+structure of type <b>pcre_jit_stack</b>, or NULL if there is an error. The
+<b>pcre_jit_stack_free()</b> function can be used to free a stack that is no
+longer needed. (For the technically minded: the address space is allocated by
+mmap or VirtualAlloc.)
+</P>
+<P>
+JIT uses far less memory for recursion than the interpretive code,
+and a maximum stack size of 512K to 1M should be more than enough for any
+pattern.
+</P>
+<P>
+The <b>pcre_assign_jit_stack()</b> function specifies which stack JIT code
+should use. Its arguments are as follows:
+<pre>
+ pcre_extra *extra
+ pcre_jit_callback callback
+ void *data
+</pre>
+The <i>extra</i> argument must be the result of studying a pattern with
+PCRE_STUDY_JIT_COMPILE. There are three cases for the values of the other two
+arguments:
+<pre>
+ (1) If <i>callback</i> is NULL and <i>data</i> is NULL, an internal 32K block
+ on the machine stack is used.
+
+ (2) If <i>callback</i> is NULL and <i>data</i> is not NULL, <i>data</i> must be
+ a valid JIT stack, the result of calling <b>pcre_jit_stack_alloc()</b>.
+
+ (3) If <i>callback</i> is not NULL, it must point to a function that is called
+ with <i>data</i> as an argument at the start of matching, in order to
+ set up a JIT stack. If the result is NULL, the internal 32K stack
+ is used; otherwise the return value must be a valid JIT stack,
+ the result of calling <b>pcre_jit_stack_alloc()</b>.
+</pre>
+You may safely assign the same JIT stack to more than one pattern, as long as
+they are all matched sequentially in the same thread. In a multithread
+application, each thread must use its own JIT stack.
+</P>
+<P>
+Strictly speaking, even more is allowed. You can assign the same stack to any
+number of patterns as long as they are not used for matching by multiple
+threads at the same time. For example, you can assign the same stack to all
+compiled patterns, and use a global mutex in the callback to wait until the
+stack is available for use. However, this is an inefficient solution, and
+not recommended.
+</P>
+<P>
+This is a suggestion for how a typical multithreaded program might operate:
+<pre>
+ During thread initialization
+ thread_local_var = pcre_jit_stack_alloc(...)
+
+ During thread exit
+ pcre_jit_stack_free(thread_local_var)
+
+ Use a one-line callback function
+ return thread_local_var
+</pre>
+All the functions described in this section do nothing if JIT is not available,
+and <b>pcre_assign_jit_stack()</b> does nothing unless the <b>extra</b> argument
+is non-NULL and points to a <b>pcre_extra</b> block that is the result of a
+successful study with PCRE_STUDY_JIT_COMPILE.
+</P>
+<br><a name="SEC8" href="#TOC1">EXAMPLE CODE</a><br>
+<P>
+This is a single-threaded example that specifies a JIT stack without using a
+callback.
+<pre>
+ int rc;
+ int ovector[30];
+ pcre *re;
+ pcre_extra *extra;
+ pcre_jit_stack *jit_stack;
+
+ re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
+ /* Check for errors */
+ extra = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &error);
+ jit_stack = pcre_jit_stack_alloc(32*1024, 512*1024);
+ /* Check for error (NULL) */
+ pcre_assign_jit_stack(extra, NULL, jit_stack);
+ rc = pcre_exec(re, extra, subject, length, 0, 0, ovector, 30);
+ /* Check results */
+ pcre_free(re);
+ pcre_free_study(extra);
+ pcre_jit_stack_free(jit_stack);
+
+</PRE>
+</P>
+<br><a name="SEC9" href="#TOC1">SEE ALSO</a><br>
+<P>
+<b>pcreapi</b>(3)
+</P>
+<br><a name="SEC10" href="#TOC1">AUTHOR</a><br>
+<P>
+Philip Hazel
+<br>
+University Computing Service
+<br>
+Cambridge CB2 3QH, England.
+<br>
+</P>
+<br><a name="SEC11" href="#TOC1">REVISION</a><br>
+<P>
+Last updated: 06 September 2011
+<br>
+Copyright © 1997-2011 University of Cambridge.
+<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
Modified: code/trunk/doc/html/pcrepartial.html
===================================================================
--- code/trunk/doc/html/pcrepartial.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcrepartial.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -58,13 +58,15 @@
are set, PCRE_PARTIAL_HARD takes precedence.
</P>
<P>
-Setting a partial matching option disables two of PCRE's optimizations. PCRE
-remembers the last literal byte in a pattern, and abandons matching immediately
-if such a byte is not present in the subject string. This optimization cannot
-be used for a subject string that might match only partially. If the pattern
-was studied, PCRE knows the minimum length of a matching string, and does not
-bother to run the matching function on shorter strings. This optimization is
-also disabled for partial matching.
+Setting a partial matching option for <b>pcre_exec()</b> disables the use of any
+just-in-time code that was set up by calling <b>pcre_study()</b> with the
+PCRE_STUDY_JIT_COMPILE option. It also disables two of PCRE's standard
+optimizations. PCRE remembers the last literal byte in a pattern, and abandons
+matching immediately if such a byte is not present in the subject string. This
+optimization cannot be used for a subject string that might match only
+partially. If the pattern was studied, PCRE knows the minimum length of a
+matching string, and does not bother to run the matching function on shorter
+strings. This optimization is also disabled for partial matching.
</P>
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre_exec()</a><br>
<P>
@@ -434,9 +436,9 @@
</P>
<br><a name="SEC11" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 07 November 2010
+Last updated: 26 August 2011
<br>
-Copyright © 1997-2010 University of Cambridge.
+Copyright © 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
Modified: code/trunk/doc/html/pcreprecompile.html
===================================================================
--- code/trunk/doc/html/pcreprecompile.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcreprecompile.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -28,7 +28,9 @@
If you are not using any private character tables (see the
<a href="pcre_maketables.html"><b>pcre_maketables()</b></a>
documentation), this is relatively straightforward. If you are using private
-tables, it is a little bit more complicated.
+tables, it is a little bit more complicated. However, if you are using the
+just-in-time optimization feature of <b>pcre_study()</b>, it is not possible to
+save and reload the JIT data.
</P>
<P>
If you save compiled patterns to a file, you can copy them to a different host
@@ -36,7 +38,8 @@
to the one on which the patterns were compiled. There may be a small
performance penalty, but it should be insignificant. However, compiling regular
expressions with one version of PCRE for use with a different version is not
-guaranteed to work and may cause crashes.
+guaranteed to work and may cause crashes, and saving and restoring a compiled
+pattern loses any JIT optimization data.
</P>
<br><a name="SEC2" href="#TOC1">SAVING A COMPILED PATTERN</a><br>
<P>
@@ -76,9 +79,11 @@
them.
</P>
<P>
-If the pattern has been studied, it is also possible to save the study data in
-a similar way to the compiled pattern itself. When studying generates
-additional information, <b>pcre_study()</b> returns a pointer to a
+If the pattern has been studied, it is also possible to save the normal study
+data in a similar way to the compiled pattern itself. However, if the
+PCRE_STUDY_JIT_COMPILE option was used, the just-in-time data that is created cannot
+be saved because it is too dependent on the current environment. When studying
+generates additional information, <b>pcre_study()</b> returns a pointer to a
<b>pcre_extra</b> data block. Its format is defined in the
<a href="pcreapi.html#extradata">section on matching a pattern</a>
in the
@@ -120,7 +125,8 @@
reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
<i>flags</i> field to indicate that study data is present. Then pass the
<b>pcre_extra</b> block to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b> in the
-usual way.
+usual way. If the pattern was studied for just-in-time optimization, that data
+cannot be saved, and so is lost by a save/restore cycle.
</P>
<br><a name="SEC4" href="#TOC1">COMPATIBILITY WITH DIFFERENT PCRE RELEASES</a><br>
<P>
@@ -138,9 +144,9 @@
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 17 November 2010
+Last updated: 26 August 2011
<br>
-Copyright © 1997-2010 University of Cambridge.
+Copyright © 1997-2011 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE index page</a>.
Modified: code/trunk/doc/html/pcrestack.html
===================================================================
--- code/trunk/doc/html/pcrestack.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcrestack.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -33,6 +33,16 @@
current call (a "tail recursion"), the function is just restarted instead.
</P>
<P>
+The above comments apply when <b>pcre_exec()</b> is run in its normal
+interpretive manner. If the pattern was studied with the
+PCRE_STUDY_JIT_COMPILE option, and just-in-time compiling was successful, and
+the options passed to <b>pcre_exec()</b> were not incompatible, the matching
+process uses the JIT-compiled code instead of the <b>match()</b> function. In
+this case, the memory requirements are handled entirely differently. See the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation for details.
+</P>
+<P>
The <b>pcre_dfa_exec()</b> function operates in an entirely different way, and
uses recursion only when there is a regular expression recursion or subroutine
call in the pattern. This includes the processing of assertion and "once-only"
@@ -45,7 +55,7 @@
</P>
<P>
The comments that follow do NOT apply to <b>pcre_dfa_exec()</b>; they are
-relevant only for <b>pcre_exec()</b>.
+relevant only for <b>pcre_exec()</b> without the JIT optimization.
</P>
<br><b>
Reducing <b>pcre_exec()</b>'s stack usage
@@ -179,7 +189,7 @@
REVISION
</b><br>
<P>
-Last updated: 22 July 2011
+Last updated: 26 August 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcretest.html
===================================================================
--- code/trunk/doc/html/pcretest.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcretest.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -115,22 +115,25 @@
megabytes.
</P>
<P>
-<b>-s</b>
+<b>-s</b> or <b>-s+</b>
Behave as if each pattern has the <b>/S</b> modifier; in other words, force each
-pattern to be studied. If the <b>/I</b> or <b>/D</b> option is present on a
-pattern (requesting output about the compiled pattern), information about the
-result of studying is not included when studying is caused only by <b>-s</b> and
-neither <b>-i</b> nor <b>-d</b> is present on the command line. This behaviour
-means that the output from tests that are run with and without <b>-s</b> should
-be identical, except when options that output information about the actual
-running of a match are set. The <b>-M</b>, <b>-t</b>, and <b>-tm</b> options,
-which give information about resources used, are likely to produce different
-output with and without <b>-s</b>. Output may also differ if the <b>/C</b> option
-is present on an individual pattern. This uses callouts to trace the the
-matching process, and this may be different between studied and non-studied
-patterns. If the pattern contains (*MARK) items there may also be differences,
-for the same reason. The <b>-s</b> command line option can be overridden for
-specific patterns that should never be studied (see the /S option below).
+pattern to be studied. If <b>-s+</b> is used, the PCRE_STUDY_JIT_COMPILE flag is
+passed to <b>pcre_study()</b>, causing just-in-time optimization to be set up if
+it is available. If the <b>/I</b> or <b>/D</b> option is present on a pattern
+(requesting output about the compiled pattern), information about the result of
+studying is not included when studying is caused only by <b>-s</b> and neither
+<b>-i</b> nor <b>-d</b> is present on the command line. This behaviour means that
+the output from tests that are run with and without <b>-s</b> should be
+identical, except when options that output information about the actual running
+of a match are set. The <b>-M</b>, <b>-t</b>, and <b>-tm</b> options, which give
+information about resources used, are likely to produce different output with
+and without <b>-s</b>. Output may also differ if the <b>/C</b> option is present
+on an individual pattern. This uses callouts to trace the matching process,
+and this may be different between studied and non-studied patterns. If the
+pattern contains (*MARK) items there may also be differences, for the same
+reason. The <b>-s</b> command line option can be overridden for specific
+patterns that should never be studied (see the <b>/S</b> pattern modifier
+below).
</P>
<P>
<b>-t</b>
@@ -296,7 +299,8 @@
contains multiple copies of the same substring. If the <b>+</b> modifier appears
twice, the same action is taken for captured substrings. In each case the
remainder is output on the following line with a plus character following the
-capture number.
+capture number. Note that this modifier must not immediately follow the /S
+modifier because /S+ has another meaning.
</P>
<P>
The <b>/=</b> modifier requests that the values of all potential captured
@@ -372,6 +376,19 @@
files in a few cases where the output is different when the pattern is studied.
</P>
<P>
+If the <b>/S</b> modifier is immediately followed by a + character, the call to
+<b>pcre_study()</b> is made with the PCRE_STUDY_JIT_COMPILE option, requesting
+just-in-time optimization support if it is available. Note that there is also a
+<b>/+</b> modifier; it must not be given immediately after <b>/S</b> because this
+will be misinterpreted. If JIT studying is successful, it will automatically be
+used when <b>pcre_exec()</b> is run, except when incompatible run-time options
+are specified. These include the partial matching options; a complete list is
+given in the
+<a href="pcrejit.html"><b>pcrejit</b></a>
+documentation. See also the <b>\J</b> escape sequence below for a way of
+setting the size of the JIT stack.
+</P>
+<P>
The <b>/T</b> modifier must be followed by a single digit. It causes a specific
set of built-in character tables to be passed to <b>pcre_compile()</b>. It is
used in the standard PCRE tests to check behaviour with different character
@@ -440,6 +457,7 @@
\Gdd call pcre_get_substring() for substring dd after a successful match (number less than 32)
\Gname call pcre_get_named_substring() for substring "name" after a successful match (name termin-
ated by next non-alphanumeric character)
+ \Jdd set up a JIT stack of dd kilobytes maximum (any number of digits)
\L call pcre_get_substringlist() after a successful match
\M discover the minimum MATCH_LIMIT and MATCH_LIMIT_RECURSION settings
\N pass the PCRE_NOTEMPTY option to <b>pcre_exec()</b> or <b>pcre_dfa_exec()</b>; if used twice, pass the
@@ -479,19 +497,30 @@
input.
</P>
<P>
+The <b>\J</b> escape provides a way of setting the maximum stack size that is
+used by the just-in-time optimization code. It is ignored if JIT optimization
+is not being used. Providing a stack that is larger than the default 32K is
+necessary only for very complicated patterns.
+</P>
+<P>
If \M is present, <b>pcretest</b> calls <b>pcre_exec()</b> several times, with
different values in the <i>match_limit</i> and <i>match_limit_recursion</i>
fields of the <b>pcre_extra</b> data structure, until it finds the minimum
-numbers for each parameter that allow <b>pcre_exec()</b> to complete. The
-<i>match_limit</i> number is a measure of the amount of backtracking that takes
-place, and checking it out can be instructive. For most simple matches, the
-number is quite small, but for patterns with very large numbers of matching
-possibilities, it can become large very quickly with increasing length of
-subject string. The <i>match_limit_recursion</i> number is a measure of how much
-stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is needed
-to complete the match attempt.
+numbers for each parameter that allow <b>pcre_exec()</b> to complete without
+error. Because this is testing a specific feature of the normal interpretive
+<b>pcre_exec()</b> execution, the use of any JIT optimization that might have
+been set up by the <b>/S+</b> qualifier or the <b>-s+</b> option is disabled.
</P>
<P>
+The <i>match_limit</i> number is a measure of the amount of backtracking
+that takes place, and checking it out can be instructive. For most simple
+matches, the number is quite small, but for patterns with very large numbers of
+matching possibilities, it can become large very quickly with increasing length
+of subject string. The <i>match_limit_recursion</i> number is a measure of how
+much stack (or, if PCRE is compiled with NO_RECURSE, how much heap) memory is
+needed to complete the match attempt.
+</P>
+<P>
When \O is used, the value specified may be higher or lower than the size set
by the <b>-O</b> command line option (or defaulted to 45); \O applies only to
the call of <b>pcre_exec()</b> for the line in which it appears.
@@ -761,6 +790,8 @@
See the
<a href="pcreprecompile.html"><b>pcreprecompile</b></a>
documentation for a discussion about saving and re-using compiled patterns.
+Note that if the pattern was successfully studied with JIT optimization, the
+JIT data cannot be saved.
</P>
<P>
The data that is written is binary. The first eight bytes are the length of the
@@ -769,8 +800,8 @@
there is no study data (either the pattern was not studied, or studying did not
return any data), the second length is zero. The lengths are followed by an
exact copy of the compiled pattern. If there is additional study data, this
-follows immediately after the compiled pattern. After writing the file,
-<b>pcretest</b> expects to read a new pattern.
+(excluding any JIT data) follows immediately after the compiled pattern. After
+writing the file, <b>pcretest</b> expects to read a new pattern.
</P>
<P>
A saved pattern can be reloaded into <b>pcretest</b> by specifying < and a file
@@ -783,8 +814,9 @@
Compiled pattern loaded from /some/file
No study data
</pre>
-When the pattern has been loaded, <b>pcretest</b> proceeds to read data lines in
-the usual way.
+If the pattern was previously studied with the JIT optimization, the JIT
+information cannot be saved and restored, and so is lost. When the pattern has
+been loaded, <b>pcretest</b> proceeds to read data lines in the usual way.
</P>
<P>
You can copy a file written by <b>pcretest</b> to a different host and reload it
@@ -809,8 +841,9 @@
</P>
<br><a name="SEC13" href="#TOC1">SEE ALSO</a><br>
<P>
-<b>pcre</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrematching</b>(3),
-<b>pcrepartial</b>(d), <b>pcrepattern</b>(3), <b>pcreprecompile</b>(3).
+<b>pcre</b>(3), <b>pcreapi</b>(3), <b>pcrecallout</b>(3), <b>pcrejit</b>(3),
+<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcrepattern</b>(3),
+<b>pcreprecompile</b>(3).
</P>
<br><a name="SEC14" href="#TOC1">AUTHOR</a><br>
<P>
@@ -823,7 +856,7 @@
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 01 August 2011
+Last updated: 26 August 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/html/pcreunicode.html
===================================================================
--- code/trunk/doc/html/pcreunicode.html 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/html/pcreunicode.html 2011-09-11 14:31:21 UTC (rev 691)
@@ -22,7 +22,7 @@
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
(*UTF8). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF-8 strings instead of
-strings of 1-byte characters. PCRE does not support any other formats (in
+strings of 1-byte characters. PCRE does not support any other formats (in
particular, it does not support UTF-16).
</P>
<P>
@@ -83,16 +83,20 @@
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set, what
happens depends on why the string is invalid. If the string conforms to the
"old" definition of UTF-8 (RFC 2279), it is processed as a string of characters
-in the range 0 to 0x7FFFFFFF. In other words, apart from the initial validity
-test, PCRE (when in UTF-8 mode) handles strings according to the more liberal
-rules of RFC 2279. However, if the string does not even conform to RFC 2279,
-the result is undefined. Your program may crash.
+in the range 0 to 0x7FFFFFFF by <b>pcre_dfa_exec()</b> and the interpreted
+version of <b>pcre_exec()</b>. In other words, apart from the initial validity
+test, these functions (when in UTF-8 mode) handle strings according to the more
+liberal rules of RFC 2279. However, the just-in-time (JIT) optimization for
+<b>pcre_exec()</b> supports only RFC 3629. If you are using JIT optimization, or
+if the string does not even conform to RFC 2279, the result is undefined. Your
+program may crash.
</P>
<P>
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
encoded in a UTF-8-like manner as per the old RFC, you can set
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
-situation, you will have to apply your own validity check.
+situation, you will have to apply your own validity check, and avoid the use of
+JIT optimization.
</P>
<br><b>
General comments about UTF-8 mode
@@ -115,7 +119,10 @@
<P>
5. The escape sequence \C can be used to match a single byte in UTF-8 mode,
but its use can lead to some strange effects. This facility is not available in
-the alternative matching function, <b>pcre_dfa_exec()</b>.
+the alternative matching function, <b>pcre_dfa_exec()</b>, nor is it supported
+by the JIT optimization of <b>pcre_exec()</b>. If JIT optimization is requested
+for a pattern that contains \C, it will not succeed, and so the matching will
+be carried out by the normal interpretive function.
</P>
<P>
6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
@@ -168,7 +175,7 @@
REVISION
</b><br>
<P>
-Last updated: 24 August 2011
+Last updated: 06 September 2011
<br>
Copyright © 1997-2011 University of Cambridge.
<br>
Modified: code/trunk/doc/pcre.3
===================================================================
--- code/trunk/doc/pcre.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -95,8 +95,8 @@
pcrecpp details of the C++ wrapper
pcredemo a demonstration C program that uses PCRE
pcregrep description of the \fBpcregrep\fP command
- pcrejit discussion of the just-in-time optimization support
- pcrelimits details of size and other limits
+ pcrejit discussion of the just-in-time optimization support
+ pcrelimits details of size and other limits
pcrematching discussion of the two matching algorithms
pcrepartial details of the partial matching facility
.\" JOIN
@@ -109,7 +109,7 @@
pcrestack discussion of stack usage
pcresyntax quick syntax reference
pcretest description of the \fBpcretest\fP testing command
- pcreunicode discussion of Unicode and UTF-8 support
+ pcreunicode discussion of Unicode and UTF-8 support
.sp
In addition, in the "man" and HTML formats, there is a short page for each
C library function, listing its arguments and results.
Modified: code/trunk/doc/pcre.txt
===================================================================
--- code/trunk/doc/pcre.txt 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre.txt 2011-09-11 14:31:21 UTC (rev 691)
@@ -120,8 +120,8 @@
Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREBUILD(3) PCREBUILD(3)
@@ -217,6 +217,23 @@
are supported. Details are given in the pcrepattern documentation.
+JUST-IN-TIME COMPILER SUPPORT
+
+ Just-in-time compiler support is included in the build by specifying
+
+ --enable-jit
+
+ This support is available only for certain hardware architectures. If
+ this option is set for an unsupported architecture, a compile time
+ error occurs. See the pcrejit documentation for a discussion of JIT
+ usage. When JIT support is enabled, pcregrep automatically makes use of
+ it, unless you add
+
+ --disable-pcregrep-jit
+
+ to the "configure" command.
+
+
CODE VALUE OF NEWLINE
By default, PCRE interprets the linefeed (LF) character as indicating
@@ -464,11 +481,11 @@
REVISION
- Last updated: 02 August 2011
+ Last updated: 06 September 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREMATCHING(3) PCREMATCHING(3)
@@ -671,8 +688,8 @@
Last updated: 17 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREAPI(3) PCREAPI(3)
@@ -680,7 +697,7 @@
PCRE - Perl-compatible regular expressions
-PCRE NATIVE API
+PCRE NATIVE API BASIC FUNCTIONS
#include <pcre.h>
@@ -696,10 +713,22 @@
pcre_extra *pcre_study(const pcre *code, int options,
const char **errptr);
+ void pcre_free_study(pcre_extra *extra);
+
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize);
+
+PCRE NATIVE API AUXILIARY FUNCTIONS
+
+ pcre_jit_stack *pcre_jit_stack_alloc(int startsize, int maxsize);
+
+ void pcre_jit_stack_free(pcre_jit_stack *stack);
+
+ void pcre_assign_jit_stack(pcre_extra *extra,
+ pcre_jit_callback callback, void *data);
+
int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize,
@@ -749,6 +778,9 @@
char *pcre_version(void);
+
+PCRE NATIVE API INDIRECTED FUNCTIONS
+
void *(*pcre_malloc)(size_t);
void (*pcre_free)(void *);
@@ -764,42 +796,53 @@
PCRE has its own native API, which is described in this document. There
are also some wrapper functions that correspond to the POSIX regular
- expression API. These are described in the pcreposix documentation.
- Both of these APIs define a set of C function calls. A C++ wrapper is
- distributed with PCRE. It is documented in the pcrecpp page.
+ expression API, but they do not give access to all the functionality.
+ They are described in the pcreposix documentation. Both of these APIs
+ define a set of C function calls. A C++ wrapper is also distributed
+ with PCRE. It is documented in the pcrecpp page.
- The native API C function prototypes are defined in the header file
- pcre.h, and on Unix systems the library itself is called libpcre. It
+ The native API C function prototypes are defined in the header file
+ pcre.h, and on Unix systems the library itself is called libpcre. It
can normally be accessed by adding -lpcre to the command for linking an
application that uses PCRE. The header file defines the macros
- PCRE_MAJOR and PCRE_MINOR to contain the major and minor release num-
- bers for the library. Applications can use these to include support
+ PCRE_MAJOR and PCRE_MINOR to contain the major and minor release num-
+ bers for the library. Applications can use these to include support
for different releases of PCRE.
In a Windows environment, if you want to statically link an application
- program against a non-dll pcre.a file, you must define PCRE_STATIC
- before including pcre.h or pcrecpp.h, because otherwise the pcre_mal-
+ program against a non-dll pcre.a file, you must define PCRE_STATIC
+ before including pcre.h or pcrecpp.h, because otherwise the pcre_mal-
loc() and pcre_free() exported functions will be declared
__declspec(dllimport), with unwanted results.
- The functions pcre_compile(), pcre_compile2(), pcre_study(), and
- pcre_exec() are used for compiling and matching regular expressions in
- a Perl-compatible manner. A sample program that demonstrates the sim-
- plest way of using them is provided in the file called pcredemo.c in
+ The functions pcre_compile(), pcre_compile2(), pcre_study(), and
+ pcre_exec() are used for compiling and matching regular expressions in
+ a Perl-compatible manner. A sample program that demonstrates the sim-
+ plest way of using them is provided in the file called pcredemo.c in
the PCRE source distribution. A listing of this program is given in the
- pcredemo documentation, and the pcresample documentation describes how
+ pcredemo documentation, and the pcresample documentation describes how
to compile and run it.
+ Just-in-time compiler support is an optional feature of PCRE that can
+ be built in appropriate hardware environments. It greatly speeds up the
+ matching performance of many patterns. Simple programs can easily
+ request that it be used if available, by setting an option that is
+ ignored when it is not relevant. More complicated programs might need
+ to make use of the functions pcre_jit_stack_alloc(),
+ pcre_jit_stack_free(), and pcre_assign_jit_stack() in order to control
+ the JIT code's memory usage. These functions are discussed in the
+ pcrejit documentation.
+
A second matching function, pcre_dfa_exec(), which is not Perl-compati-
- ble, is also provided. This uses a different algorithm for the match-
- ing. The alternative algorithm finds all possible matches (at a given
- point in the subject), and scans the subject just once (unless there
- are lookbehind assertions). However, this algorithm does not return
- captured substrings. A description of the two matching algorithms and
- their advantages and disadvantages is given in the pcrematching docu-
+ ble, is also provided. This uses a different algorithm for the match-
+ ing. The alternative algorithm finds all possible matches (at a given
+ point in the subject), and scans the subject just once (unless there
+ are lookbehind assertions). However, this algorithm does not return
+ captured substrings. A description of the two matching algorithms and
+ their advantages and disadvantages is given in the pcrematching docu-
mentation.
- In addition to the main compiling and matching functions, there are
+ In addition to the main compiling and matching functions, there are
convenience functions for extracting captured substrings from a subject
string that is matched by pcre_exec(). They are:
@@ -814,102 +857,106 @@
pcre_free_substring() and pcre_free_substring_list() are also provided,
to free the memory used for extracted strings.
- The function pcre_maketables() is used to build a set of character
- tables in the current locale for passing to pcre_compile(),
- pcre_exec(), or pcre_dfa_exec(). This is an optional facility that is
- provided for specialist use. Most commonly, no special tables are
- passed, in which case internal tables that are generated when PCRE is
+ The function pcre_maketables() is used to build a set of character
+ tables in the current locale for passing to pcre_compile(),
+ pcre_exec(), or pcre_dfa_exec(). This is an optional facility that is
+ provided for specialist use. Most commonly, no special tables are
+ passed, in which case internal tables that are generated when PCRE is
built are used.
- The function pcre_fullinfo() is used to find out information about a
- compiled pattern; pcre_info() is an obsolete version that returns only
- some of the available information, but is retained for backwards com-
- patibility. The function pcre_version() returns a pointer to a string
+ The function pcre_fullinfo() is used to find out information about a
+ compiled pattern; pcre_info() is an obsolete version that returns only
+ some of the available information, but is retained for backwards com-
+ patibility. The function pcre_version() returns a pointer to a string
containing the version of PCRE and its date of release.
- The function pcre_refcount() maintains a reference count in a data
- block containing a compiled pattern. This is provided for the benefit
+ The function pcre_refcount() maintains a reference count in a data
+ block containing a compiled pattern. This is provided for the benefit
of object-oriented applications.
- The global variables pcre_malloc and pcre_free initially contain the
- entry points of the standard malloc() and free() functions, respec-
+ The global variables pcre_malloc and pcre_free initially contain the
+ entry points of the standard malloc() and free() functions, respec-
tively. PCRE calls the memory management functions via these variables,
- so a calling program can replace them if it wishes to intercept the
+ so a calling program can replace them if it wishes to intercept the
calls. This should be done before calling any PCRE functions.
- The global variables pcre_stack_malloc and pcre_stack_free are also
- indirections to memory management functions. These special functions
- are used only when PCRE is compiled to use the heap for remembering
+ The global variables pcre_stack_malloc and pcre_stack_free are also
+ indirections to memory management functions. These special functions
+ are used only when PCRE is compiled to use the heap for remembering
data, instead of recursive function calls, when running the pcre_exec()
- function. See the pcrebuild documentation for details of how to do
- this. It is a non-standard way of building PCRE, for use in environ-
- ments that have limited stacks. Because of the greater use of memory
- management, it runs more slowly. Separate functions are provided so
- that special-purpose external code can be used for this case. When
- used, these functions are always called in a stack-like manner (last
- obtained, first freed), and always for memory blocks of the same size.
- There is a discussion about PCRE's stack usage in the pcrestack docu-
+ function. See the pcrebuild documentation for details of how to do
+ this. It is a non-standard way of building PCRE, for use in environ-
+ ments that have limited stacks. Because of the greater use of memory
+ management, it runs more slowly. Separate functions are provided so
+ that special-purpose external code can be used for this case. When
+ used, these functions are always called in a stack-like manner (last
+ obtained, first freed), and always for memory blocks of the same size.
+ There is a discussion about PCRE's stack usage in the pcrestack docu-
mentation.
The global variable pcre_callout initially contains NULL. It can be set
- by the caller to a "callout" function, which PCRE will then call at
- specified points during a matching operation. Details are given in the
+ by the caller to a "callout" function, which PCRE will then call at
+ specified points during a matching operation. Details are given in the
pcrecallout documentation.
NEWLINES
- PCRE supports five different conventions for indicating line breaks in
- strings: a single CR (carriage return) character, a single LF (line-
+ PCRE supports five different conventions for indicating line breaks in
+ strings: a single CR (carriage return) character, a single LF (line-
feed) character, the two-character sequence CRLF, any of the three pre-
- ceding, or any Unicode newline sequence. The Unicode newline sequences
- are the three just mentioned, plus the single characters VT (vertical
- tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
+ ceding, or any Unicode newline sequence. The Unicode newline sequences
+ are the three just mentioned, plus the single characters VT (vertical
+ tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
separator, U+2028), and PS (paragraph separator, U+2029).
- Each of the first three conventions is used by at least one operating
- system as its standard newline sequence. When PCRE is built, a default
- can be specified. The default default is LF, which is the Unix stan-
- dard. When PCRE is run, the default can be overridden, either when a
+ Each of the first three conventions is used by at least one operating
+ system as its standard newline sequence. When PCRE is built, a default
+ can be specified. The default default is LF, which is the Unix stan-
+ dard. When PCRE is run, the default can be overridden, either when a
pattern is compiled, or when it is matched.
At compile time, the newline convention can be specified by the options
- argument of pcre_compile(), or it can be specified by special text at
+ argument of pcre_compile(), or it can be specified by special text at
the start of the pattern itself; this overrides any other settings. See
the pcrepattern page for details of the special character sequences.
In the PCRE documentation the word "newline" is used to mean "the char-
- acter or pair of characters that indicate a line break". The choice of
- newline convention affects the handling of the dot, circumflex, and
+ acter or pair of characters that indicate a line break". The choice of
+ newline convention affects the handling of the dot, circumflex, and
dollar metacharacters, the handling of #-comments in /x mode, and, when
- CRLF is a recognized line ending sequence, the match position advance-
+ CRLF is a recognized line ending sequence, the match position advance-
ment for a non-anchored pattern. There is more detail about this in the
section on pcre_exec() options below.
- The choice of newline convention does not affect the interpretation of
- the \n or \r escape sequences, nor does it affect what \R matches,
+ The choice of newline convention does not affect the interpretation of
+ the \n or \r escape sequences, nor does it affect what \R matches,
which is controlled in a similar way, but by separate options.
MULTITHREADING
- The PCRE functions can be used in multi-threading applications, with
+ The PCRE functions can be used in multi-threading applications, with
the proviso that the memory management functions pointed to by
pcre_malloc, pcre_free, pcre_stack_malloc, and pcre_stack_free, and the
callout function pointed to by pcre_callout, are shared by all threads.
- The compiled form of a regular expression is not altered during match-
+ The compiled form of a regular expression is not altered during match-
ing, so the same compiled pattern can safely be used by several threads
at once.
+ If the just-in-time optimization feature is being used, it needs sepa-
+ rate memory stack areas for each thread. See the pcrejit documentation
+ for more details.
+
SAVING PRECOMPILED PATTERNS FOR LATER USE
The compiled form of a regular expression can be saved and re-used at a
- later time, possibly by a different program, and even on a host other
- than the one on which it was compiled. Details are given in the
- pcreprecompile documentation. However, compiling a regular expression
- with one version of PCRE for use with a different version is not guar-
+ later time, possibly by a different program, and even on a host other
+ than the one on which it was compiled. Details are given in the
+ pcreprecompile documentation. However, compiling a regular expression
+ with one version of PCRE for use with a different version is not guar-
anteed to work and may cause crashes.
@@ -917,26 +964,31 @@
int pcre_config(int what, void *where);
- The function pcre_config() makes it possible for a PCRE client to dis-
+ The function pcre_config() makes it possible for a PCRE client to dis-
cover which optional features have been compiled into the PCRE library.
- The pcrebuild documentation has more details about these optional fea-
+ The pcrebuild documentation has more details about these optional fea-
tures.
- The first argument for pcre_config() is an integer, specifying which
+ The first argument for pcre_config() is an integer, specifying which
information is required; the second argument is a pointer to a variable
- into which the information is placed. The following information is
+ into which the information is placed. The following information is
available:
PCRE_CONFIG_UTF8
- The output is an integer that is set to one if UTF-8 support is avail-
+ The output is an integer that is set to one if UTF-8 support is avail-
able; otherwise it is set to zero.
PCRE_CONFIG_UNICODE_PROPERTIES
- The output is an integer that is set to one if support for Unicode
+ The output is an integer that is set to one if support for Unicode
character properties is available; otherwise it is set to zero.
+ PCRE_CONFIG_JIT
+
+ The output is an integer that is set to one if support for just-in-time
+ compiling is available; otherwise it is set to zero.
+
PCRE_CONFIG_NEWLINE
The output is an integer whose value specifies the default character
@@ -1423,9 +1475,21 @@
wants to pass any of the other fields to pcre_exec() or
pcre_dfa_exec(), it must set up its own pcre_extra block.
- The second argument of pcre_study() contains option bits. At present,
- no options are defined, and this argument should always be zero.
+ The second argument of pcre_study() contains option bits. There is only
+ one option: PCRE_STUDY_JIT_COMPILE. If this is set, and the just-in-
+ time compiler is available, the pattern is further compiled into
+ machine code that executes much faster than the pcre_exec() matching
+ function. If the just-in-time compiler is not available, this option is
+ ignored. All other bits in the options argument must be zero.
+ JIT compilation is a heavyweight optimization. It can take some time
+ for patterns to be analyzed, and for one-off matches and simple pat-
+ terns the benefit of faster execution might be offset by a much slower
+ study time. Not all patterns can be optimized by the JIT compiler. For
+ those that cannot be handled, matching automatically falls back to the
+ pcre_exec() interpreter. For more details, see the pcrejit documenta-
+ tion.
+
The third argument for pcre_study() is a pointer for an error message.
If studying succeeds (even if no data is returned), the variable it
points to is set to NULL. Otherwise it is set to point to a textual
@@ -1433,13 +1497,29 @@
must not try to free it. You should test the error pointer for NULL
after calling pcre_study(), to be sure that it has run successfully.
- This is a typical call to pcre_study():
+ When you are finished with a pattern, you can free the memory used for
+ the study data by calling pcre_free_study(). This function was added to
+ the API for release 8.20. For earlier versions, the memory could be
+ freed with pcre_free(), just like the pattern itself. This will still
+ work in cases where PCRE_STUDY_JIT_COMPILE is not used, but it is
+ advisable to change to the new function when convenient.
- pcre_extra *pe;
- pe = pcre_study(
+ This is a typical way in which pcre_study() is used (except that in a
+ real application there should be tests for errors):
+
+ int rc;
+ pcre *re;
+ pcre_extra *sd;
+ re = pcre_compile("pattern", 0, &error, &erroroffset, NULL);
+ sd = pcre_study(
re, /* result of pcre_compile() */
- 0, /* no options exist */
+ 0, /* no options */
&error); /* set to NULL or points to a message */
+ rc = pcre_exec( /* see below for details of pcre_exec() options */
+ re, sd, "subject", 7, 0, 0, ovector, 30);
+ ...
+ pcre_free_study(sd);
+ pcre_free(re);
Studying a pattern does two things: first, a lower bound for the length
of subject string that is needed to match the pattern is computed. This
@@ -1454,68 +1534,71 @@
bytes is created. This speeds up finding a position in the subject at
which to start matching.
- The two optimizations just described can be disabled by setting the
- PCRE_NO_START_OPTIMIZE option when calling pcre_exec() or
- pcre_dfa_exec(). You might want to do this if your pattern contains
- callouts or (*MARK), and you want to make use of these facilities in
- cases where matching fails. See the discussion of PCRE_NO_START_OPTI-
- MIZE below.
+ These two optimizations apply to both pcre_exec() and pcre_dfa_exec().
+ However, they are not used by pcre_exec() if pcre_study() is called
+ with the PCRE_STUDY_JIT_COMPILE option, and just-in-time compiling is
+ successful. The optimizations can be disabled by setting the
+ PCRE_NO_START_OPTIMIZE option when calling pcre_exec() or
+ pcre_dfa_exec(). You might want to do this if your pattern contains
+ callouts or (*MARK) (which cannot be handled by the JIT compiler), and
+ you want to make use of these facilities in cases where matching fails.
+ See the discussion of PCRE_NO_START_OPTIMIZE below.
LOCALE SUPPORT
- PCRE handles caseless matching, and determines whether characters are
- letters, digits, or whatever, by reference to a set of tables, indexed
- by character value. When running in UTF-8 mode, this applies only to
- characters with codes less than 128. By default, higher-valued codes
+ PCRE handles caseless matching, and determines whether characters are
+ letters, digits, or whatever, by reference to a set of tables, indexed
+ by character value. When running in UTF-8 mode, this applies only to
+ characters with codes less than 128. By default, higher-valued codes
never match escapes such as \w or \d, but they can be tested with \p if
- PCRE is built with Unicode character property support. Alternatively,
- the PCRE_UCP option can be set at compile time; this causes \w and
+ PCRE is built with Unicode character property support. Alternatively,
+ the PCRE_UCP option can be set at compile time; this causes \w and
friends to use Unicode property support instead of built-in tables. The
use of locales with Unicode is discouraged. If you are handling charac-
- ters with codes greater than 128, you should either use UTF-8 and Uni-
+ ters with codes greater than 128, you should either use UTF-8 and Uni-
code, or use locales, but not try to mix the two.
- PCRE contains an internal set of tables that are used when the final
- argument of pcre_compile() is NULL. These are sufficient for many
+ PCRE contains an internal set of tables that are used when the final
+ argument of pcre_compile() is NULL. These are sufficient for many
applications. Normally, the internal tables recognize only ASCII char-
acters. However, when PCRE is built, it is possible to cause the inter-
nal tables to be rebuilt in the default "C" locale of the local system,
which may cause them to be different.
- The internal tables can always be overridden by tables supplied by the
+ The internal tables can always be overridden by tables supplied by the
application that calls PCRE. These may be created in a different locale
- from the default. As more and more applications change to using Uni-
+ from the default. As more and more applications change to using Uni-
code, the need for this locale support is expected to die away.
- External tables are built by calling the pcre_maketables() function,
- which has no arguments, in the relevant locale. The result can then be
- passed to pcre_compile() or pcre_exec() as often as necessary. For
- example, to build and use tables that are appropriate for the French
- locale (where accented characters with values greater than 128 are
+ External tables are built by calling the pcre_maketables() function,
+ which has no arguments, in the relevant locale. The result can then be
+ passed to pcre_compile() or pcre_exec() as often as necessary. For
+ example, to build and use tables that are appropriate for the French
+ locale (where accented characters with values greater than 128 are
treated as letters), the following code could be used:
setlocale(LC_CTYPE, "fr_FR");
tables = pcre_maketables();
re = pcre_compile(..., tables);
- The locale name "fr_FR" is used on Linux and other Unix-like systems;
+ The locale name "fr_FR" is used on Linux and other Unix-like systems;
if you are using Windows, the name for the French locale is "french".
- When pcre_maketables() runs, the tables are built in memory that is
- obtained via pcre_malloc. It is the caller's responsibility to ensure
- that the memory containing the tables remains available for as long as
+ When pcre_maketables() runs, the tables are built in memory that is
+ obtained via pcre_malloc. It is the caller's responsibility to ensure
+ that the memory containing the tables remains available for as long as
it is needed.
The pointer that is passed to pcre_compile() is saved with the compiled
- pattern, and the same tables are used via this pointer by pcre_study()
+ pattern, and the same tables are used via this pointer by pcre_study()
and normally also by pcre_exec(). Thus, by default, for any single pat-
tern, compilation, studying and matching all happen in the same locale,
but different patterns can be compiled in different locales.
- It is possible to pass a table pointer or NULL (indicating the use of
- the internal tables) to pcre_exec(). Although not intended for this
- purpose, this facility could be used to match a pattern in a different
+ It is possible to pass a table pointer or NULL (indicating the use of
+ the internal tables) to pcre_exec(). Although not intended for this
+ purpose, this facility could be used to match a pattern in a different
locale from the one in which it was compiled. Passing table pointers at
run time is discussed below in the section on matching a pattern.
@@ -1525,15 +1608,15 @@
int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
int what, void *where);
- The pcre_fullinfo() function returns information about a compiled pat-
+ The pcre_fullinfo() function returns information about a compiled pat-
tern. It replaces the obsolete pcre_info() function, which is neverthe-
less retained for backwards compatibility (and is documented below).
- The first argument for pcre_fullinfo() is a pointer to the compiled
- pattern. The second argument is the result of pcre_study(), or NULL if
- the pattern was not studied. The third argument specifies which piece
- of information is required, and the fourth argument is a pointer to a
- variable to receive the data. The yield of the function is zero for
+ The first argument for pcre_fullinfo() is a pointer to the compiled
+ pattern. The second argument is the result of pcre_study(), or NULL if
+ the pattern was not studied. The third argument specifies which piece
+ of information is required, and the fourth argument is a pointer to a
+ variable to receive the data. The yield of the function is zero for
success, or one of the following negative numbers:
PCRE_ERROR_NULL the argument code was NULL
@@ -1541,144 +1624,154 @@
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of what was invalid
- The "magic number" is placed at the start of each compiled pattern as
- an simple check against passing an arbitrary memory pointer. Here is a
- typical call of pcre_fullinfo(), to obtain the length of the compiled
+ The "magic number" is placed at the start of each compiled pattern as
+ a simple check against passing an arbitrary memory pointer. Here is a
+ typical call of pcre_fullinfo(), to obtain the length of the compiled
pattern:
int rc;
size_t length;
rc = pcre_fullinfo(
re, /* result of pcre_compile() */
- pe, /* result of pcre_study(), or NULL */
+ sd, /* result of pcre_study(), or NULL */
PCRE_INFO_SIZE, /* what is required */
&length); /* where to put the data */
- The possible values for the third argument are defined in pcre.h, and
+ The possible values for the third argument are defined in pcre.h, and
are as follows:
PCRE_INFO_BACKREFMAX
- Return the number of the highest back reference in the pattern. The
- fourth argument should point to an int variable. Zero is returned if
+ Return the number of the highest back reference in the pattern. The
+ fourth argument should point to an int variable. Zero is returned if
there are no back references.
PCRE_INFO_CAPTURECOUNT
- Return the number of capturing subpatterns in the pattern. The fourth
+ Return the number of capturing subpatterns in the pattern. The fourth
argument should point to an int variable.
PCRE_INFO_DEFAULT_TABLES
- Return a pointer to the internal default character tables within PCRE.
- The fourth argument should point to an unsigned char * variable. This
+ Return a pointer to the internal default character tables within PCRE.
+ The fourth argument should point to an unsigned char * variable. This
information call is provided for internal use by the pcre_study() func-
- tion. External callers can cause PCRE to use its internal tables by
+ tion. External callers can cause PCRE to use its internal tables by
passing a NULL table pointer.
PCRE_INFO_FIRSTBYTE
- Return information about the first byte of any matched string, for a
- non-anchored pattern. The fourth argument should point to an int vari-
- able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name
+ Return information about the first byte of any matched string, for a
+ non-anchored pattern. The fourth argument should point to an int vari-
+ able. (This option used to be called PCRE_INFO_FIRSTCHAR; the old name
is still recognized for backwards compatibility.)
- If there is a fixed first byte, for example, from a pattern such as
+ If there is a fixed first byte, for example, from a pattern such as
(cat|cow|coyote), its value is returned. Otherwise, if either
- (a) the pattern was compiled with the PCRE_MULTILINE option, and every
+ (a) the pattern was compiled with the PCRE_MULTILINE option, and every
branch starts with "^", or
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not
set (if it were set, the pattern would be anchored),
- -1 is returned, indicating that the pattern matches only at the start
- of a subject string or after any newline within the string. Otherwise
+ -1 is returned, indicating that the pattern matches only at the start
+ of a subject string or after any newline within the string. Otherwise
-2 is returned. For anchored patterns, -2 is returned.
PCRE_INFO_FIRSTTABLE
- If the pattern was studied, and this resulted in the construction of a
+ If the pattern was studied, and this resulted in the construction of a
256-bit table indicating a fixed set of bytes for the first byte in any
- matching string, a pointer to the table is returned. Otherwise NULL is
- returned. The fourth argument should point to an unsigned char * vari-
+ matching string, a pointer to the table is returned. Otherwise NULL is
+ returned. The fourth argument should point to an unsigned char * vari-
able.
PCRE_INFO_HASCRORLF
- Return 1 if the pattern contains any explicit matches for CR or LF
- characters, otherwise 0. The fourth argument should point to an int
- variable. An explicit match is either a literal CR or LF character, or
+ Return 1 if the pattern contains any explicit matches for CR or LF
+ characters, otherwise 0. The fourth argument should point to an int
+ variable. An explicit match is either a literal CR or LF character, or
\r or \n.
PCRE_INFO_JCHANGED
- Return 1 if the (?J) or (?-J) option setting is used in the pattern,
- otherwise 0. The fourth argument should point to an int variable. (?J)
+ Return 1 if the (?J) or (?-J) option setting is used in the pattern,
+ otherwise 0. The fourth argument should point to an int variable. (?J)
and (?-J) set and unset the local PCRE_DUPNAMES option, respectively.
+ PCRE_INFO_JIT
+
+ Return 1 if the pattern was studied with the PCRE_STUDY_JIT_COMPILE
+ option, and just-in-time compiling was successful. The fourth argument
+ should point to an int variable. A return value of 0 means that JIT
+ support is not available in this version of PCRE, or that the pattern
+ was not studied with the PCRE_STUDY_JIT_COMPILE option, or that the JIT
+ compiler could not handle this particular pattern. See the pcrejit doc-
+ umentation for details of what can and cannot be handled.
+
PCRE_INFO_LASTLITERAL
- Return the value of the rightmost literal byte that must exist in any
- matched string, other than at its start, if such a byte has been
+ Return the value of the rightmost literal byte that must exist in any
+ matched string, other than at its start, if such a byte has been
recorded. The fourth argument should point to an int variable. If there
- is no such byte, -1 is returned. For anchored patterns, a last literal
- byte is recorded only if it follows something of variable length. For
+ is no such byte, -1 is returned. For anchored patterns, a last literal
+ byte is recorded only if it follows something of variable length. For
example, for the pattern /^a\d+z\d+/ the returned value is "z", but for
/^a\dz\d/ the returned value is -1.
PCRE_INFO_MINLENGTH
- If the pattern was studied and a minimum length for matching subject
- strings was computed, its value is returned. Otherwise the returned
- value is -1. The value is a number of characters, not bytes (this may
- be relevant in UTF-8 mode). The fourth argument should point to an int
- variable. A non-negative value is a lower bound to the length of any
- matching string. There may not be any strings of that length that do
+ If the pattern was studied and a minimum length for matching subject
+ strings was computed, its value is returned. Otherwise the returned
+ value is -1. The value is a number of characters, not bytes (this may
+ be relevant in UTF-8 mode). The fourth argument should point to an int
+ variable. A non-negative value is a lower bound to the length of any
+ matching string. There may not be any strings of that length that do
actually match, but every string that does match is at least that long.
PCRE_INFO_NAMECOUNT
PCRE_INFO_NAMEENTRYSIZE
PCRE_INFO_NAMETABLE
- PCRE supports the use of named as well as numbered capturing parenthe-
- ses. The names are just an additional way of identifying the parenthe-
+ PCRE supports the use of named as well as numbered capturing parenthe-
+ ses. The names are just an additional way of identifying the parenthe-
ses, which still acquire numbers. Several convenience functions such as
- pcre_get_named_substring() are provided for extracting captured sub-
- strings by name. It is also possible to extract the data directly, by
- first converting the name to a number in order to access the correct
+ pcre_get_named_substring() are provided for extracting captured sub-
+ strings by name. It is also possible to extract the data directly, by
+ first converting the name to a number in order to access the correct
pointers in the output vector (described with pcre_exec() below). To do
- the conversion, you need to use the name-to-number map, which is
+ the conversion, you need to use the name-to-number map, which is
described by these three values.
The map consists of a number of fixed-size entries. PCRE_INFO_NAMECOUNT
gives the number of entries, and PCRE_INFO_NAMEENTRYSIZE gives the size
- of each entry; both of these return an int value. The entry size
- depends on the length of the longest name. PCRE_INFO_NAMETABLE returns
- a pointer to the first entry of the table (a pointer to char). The
+ of each entry; both of these return an int value. The entry size
+ depends on the length of the longest name. PCRE_INFO_NAMETABLE returns
+ a pointer to the first entry of the table (a pointer to char). The
first two bytes of each entry are the number of the capturing parenthe-
- sis, most significant byte first. The rest of the entry is the corre-
+ sis, most significant byte first. The rest of the entry is the corre-
sponding name, zero terminated.
- The names are in alphabetical order. Duplicate names may appear if (?|
+ The names are in alphabetical order. Duplicate names may appear if (?|
is used to create multiple groups with the same number, as described in
- the section on duplicate subpattern numbers in the pcrepattern page.
- Duplicate names for subpatterns with different numbers are permitted
- only if PCRE_DUPNAMES is set. In all cases of duplicate names, they
- appear in the table in the order in which they were found in the pat-
- tern. In the absence of (?| this is the order of increasing number;
+ the section on duplicate subpattern numbers in the pcrepattern page.
+ Duplicate names for subpatterns with different numbers are permitted
+ only if PCRE_DUPNAMES is set. In all cases of duplicate names, they
+ appear in the table in the order in which they were found in the pat-
+ tern. In the absence of (?| this is the order of increasing number;
when (?| is used this is not necessarily the case because later subpat-
terns may have lower numbers.
- As a simple example of the name/number table, consider the following
- pattern (assume PCRE_EXTENDED is set, so white space - including new-
+ As a simple example of the name/number table, consider the following
+ pattern (assume PCRE_EXTENDED is set, so white space - including new-
lines - is ignored):
(?<date> (?<year>(\d\d)?\d\d) -
(?<month>\d\d) - (?<day>\d\d) )
- There are four named subpatterns, so the table has four entries, and
- each entry in the table is eight bytes long. The table is as follows,
+ There are four named subpatterns, so the table has four entries, and
+ each entry in the table is eight bytes long. The table is as follows,
with non-printing bytes shown in hexadecimal, and undefined bytes shown
as ??:
@@ -1687,31 +1780,31 @@
00 04 m o n t h 00
00 02 y e a r 00 ??
- When writing code to extract data from named subpatterns using the
- name-to-number map, remember that the length of the entries is likely
+ When writing code to extract data from named subpatterns using the
+ name-to-number map, remember that the length of the entries is likely
to be different for each compiled pattern.
PCRE_INFO_OKPARTIAL
- Return 1 if the pattern can be used for partial matching with
- pcre_exec(), otherwise 0. The fourth argument should point to an int
- variable. From release 8.00, this always returns 1, because the
- restrictions that previously applied to partial matching have been
- lifted. The pcrepartial documentation gives details of partial match-
+ Return 1 if the pattern can be used for partial matching with
+ pcre_exec(), otherwise 0. The fourth argument should point to an int
+ variable. From release 8.00, this always returns 1, because the
+ restrictions that previously applied to partial matching have been
+ lifted. The pcrepartial documentation gives details of partial match-
ing.
PCRE_INFO_OPTIONS
- Return a copy of the options with which the pattern was compiled. The
- fourth argument should point to an unsigned long int variable. These
+ Return a copy of the options with which the pattern was compiled. The
+ fourth argument should point to an unsigned long int variable. These
option bits are those specified in the call to pcre_compile(), modified
by any top-level option settings at the start of the pattern itself. In
- other words, they are the options that will be in force when matching
- starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with
- the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE,
+ other words, they are the options that will be in force when matching
+ starts. For example, if the pattern /(?im)abc(?-i)d/ is compiled with
+ the PCRE_EXTENDED option, the result is PCRE_CASELESS, PCRE_MULTILINE,
and PCRE_EXTENDED.
- A pattern is automatically anchored by PCRE if all of its top-level
+ A pattern is automatically anchored by PCRE if all of its top-level
alternatives begin with one of the following:
^ unless PCRE_MULTILINE is set
@@ -1725,7 +1818,7 @@
PCRE_INFO_SIZE
- Return the size of the compiled pattern, that is, the value that was
+ Return the size of the compiled pattern, that is, the value that was
passed as the argument to pcre_malloc() when PCRE was getting memory in
which to place the compiled data. The fourth argument should point to a
size_t variable.
@@ -1733,12 +1826,12 @@
PCRE_INFO_STUDYSIZE
Return the size of the data block pointed to by the study_data field in
- a pcre_extra block. If pcre_extra is NULL, or there is no study data,
- zero is returned. The fourth argument should point to a size_t vari-
- able. The study_data field is set by pcre_study() to record informa-
- tion that will speed up matching (see the section entitled "Studying a
+ a pcre_extra block. If pcre_extra is NULL, or there is no study data,
+ zero is returned. The fourth argument should point to a size_t vari-
+ able. The study_data field is set by pcre_study() to record informa-
+ tion that will speed up matching (see the section entitled "Studying a
pattern" above). The format of the study_data block is private, but its
- length is made available via this option so that it can be saved and
+ length is made available via this option so that it can be saved and
restored (see the pcreprecompile documentation for details).
@@ -1746,21 +1839,21 @@
int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
- The pcre_info() function is now obsolete because its interface is too
- restrictive to return all the available data about a compiled pattern.
- New programs should use pcre_fullinfo() instead. The yield of
- pcre_info() is the number of capturing subpatterns, or one of the fol-
+ The pcre_info() function is now obsolete because its interface is too
+ restrictive to return all the available data about a compiled pattern.
+ New programs should use pcre_fullinfo() instead. The yield of
+ pcre_info() is the number of capturing subpatterns, or one of the fol-
lowing negative numbers:
PCRE_ERROR_NULL the argument code was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
- If the optptr argument is not NULL, a copy of the options with which
- the pattern was compiled is placed in the integer it points to (see
+ If the optptr argument is not NULL, a copy of the options with which
+ the pattern was compiled is placed in the integer it points to (see
PCRE_INFO_OPTIONS above).
- If the pattern is not anchored and the firstcharptr argument is not
- NULL, it is used to pass back information about the first character of
+ If the pattern is not anchored and the firstcharptr argument is not
+ NULL, it is used to pass back information about the first character of
any matched string (see PCRE_INFO_FIRSTBYTE above).
@@ -1768,21 +1861,21 @@
int pcre_refcount(pcre *code, int adjust);
- The pcre_refcount() function is used to maintain a reference count in
+ The pcre_refcount() function is used to maintain a reference count in
the data block that contains a compiled pattern. It is provided for the
- benefit of applications that operate in an object-oriented manner,
+ benefit of applications that operate in an object-oriented manner,
where different parts of the application may be using the same compiled
pattern, but you want to free the block when they are all done.
When a pattern is compiled, the reference count field is initialized to
- zero. It is changed only by calling this function, whose action is to
- add the adjust value (which may be positive or negative) to it. The
+ zero. It is changed only by calling this function, whose action is to
+ add the adjust value (which may be positive or negative) to it. The
yield of the function is the new value. However, the value of the count
- is constrained to lie between 0 and 65535, inclusive. If the new value
+ is constrained to lie between 0 and 65535, inclusive. If the new value
is outside these limits, it is forced to the appropriate limit value.
- Except when it is zero, the reference count is not correctly preserved
- if a pattern is compiled on one host and then transferred to a host
+ Except when it is zero, the reference count is not correctly preserved
+ if a pattern is compiled on one host and then transferred to a host
whose byte-order is different. (This seems a highly unlikely scenario.)
@@ -1792,18 +1885,18 @@
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize);
- The function pcre_exec() is called to match a subject string against a
- compiled pattern, which is passed in the code argument. If the pattern
- was studied, the result of the study should be passed in the extra
- argument. This function is the main matching facility of the library,
+ The function pcre_exec() is called to match a subject string against a
+ compiled pattern, which is passed in the code argument. If the pattern
+ was studied, the result of the study should be passed in the extra
+ argument. This function is the main matching facility of the library,
and it operates in a Perl-like manner. For specialist use there is also
- an alternative matching function, which is described below in the sec-
+ an alternative matching function, which is described below in the sec-
tion about the pcre_dfa_exec() function.
- In most applications, the pattern will have been compiled (and option-
- ally studied) in the same process that calls pcre_exec(). However, it
+ In most applications, the pattern will have been compiled (and option-
+ ally studied) in the same process that calls pcre_exec(). However, it
is possible to save compiled patterns and study data, and then use them
- later in different processes, possibly even on different hosts. For a
+ later in different processes, possibly even on different hosts. For a
discussion about this, see the pcreprecompile documentation.
Here is an example of a simple call to pcre_exec():
@@ -1822,49 +1915,58 @@
Extra data for pcre_exec()
- If the extra argument is not NULL, it must point to a pcre_extra data
- block. The pcre_study() function returns such a block (when it doesn't
- return NULL), but you can also create one for yourself, and pass addi-
- tional information in it. The pcre_extra block contains the following
+ If the extra argument is not NULL, it must point to a pcre_extra data
+ block. The pcre_study() function returns such a block (when it doesn't
+ return NULL), but you can also create one for yourself, and pass addi-
+ tional information in it. The pcre_extra block contains the following
fields (not necessarily in this order):
unsigned long int flags;
void *study_data;
+ void *executable_jit;
unsigned long int match_limit;
unsigned long int match_limit_recursion;
void *callout_data;
const unsigned char *tables;
unsigned char **mark;
- The flags field is a bitmap that specifies which of the other fields
+ The flags field is a bitmap that specifies which of the other fields
are set. The flag bits are:
PCRE_EXTRA_STUDY_DATA
+ PCRE_EXTRA_EXECUTABLE_JIT
PCRE_EXTRA_MATCH_LIMIT
PCRE_EXTRA_MATCH_LIMIT_RECURSION
PCRE_EXTRA_CALLOUT_DATA
PCRE_EXTRA_TABLES
PCRE_EXTRA_MARK
- Other flag bits should be set to zero. The study_data field is set in
- the pcre_extra block that is returned by pcre_study(), together with
- the appropriate flag bit. You should not set this yourself, but you may
- add to the block by setting the other fields and their corresponding
- flag bits.
+ Other flag bits should be set to zero. The study_data field and some-
+ times the executable_jit field are set in the pcre_extra block that is
+ returned by pcre_study(), together with the appropriate flag bits. You
+ should not set these yourself, but you may add to the block by setting
+ the other fields and their corresponding flag bits.
The match_limit field provides a means of preventing PCRE from using up
- a vast amount of resources when running patterns that are not going to
- match, but which have a very large number of possibilities in their
- search trees. The classic example is a pattern that uses nested unlim-
+ a vast amount of resources when running patterns that are not going to
+ match, but which have a very large number of possibilities in their
+ search trees. The classic example is a pattern that uses nested unlim-
ited repeats.
- Internally, PCRE uses a function called match() which it calls repeat-
- edly (sometimes recursively). The limit set by match_limit is imposed
- on the number of times this function is called during a match, which
- has the effect of limiting the amount of backtracking that can take
- place. For patterns that are not anchored, the count restarts from zero
- for each position in the subject string.
+ Internally, pcre_exec() uses a function called match(), which it calls
+ repeatedly (sometimes recursively). The limit set by match_limit is
+ imposed on the number of times this function is called during a match,
+ which has the effect of limiting the amount of backtracking that can
+ take place. For patterns that are not anchored, the count restarts from
+ zero for each position in the subject string.
+ When pcre_exec() is called with a pattern that was successfully studied
+ with the PCRE_STUDY_JIT_COMPILE option, the way that the matching is
+ executed is entirely different. However, there is still the possibility
+ of runaway matching that goes on for a very long time, and so the
+ match_limit value is also used in this case (but in a different way) to
+ limit how long the matching can continue.
+
The default value for the limit can be set when PCRE is built; the
default default is 10 million, which handles all but the most extreme
cases. You can override the default by supplying pcre_exec() with a
@@ -1878,9 +1980,11 @@
the total number of calls, because not all calls to match() are recur-
sive. This limit is of use only if it is set smaller than match_limit.
- Limiting the recursion depth limits the amount of stack that can be
- used, or, when PCRE has been compiled to use memory on the heap instead
- of the stack, the amount of heap memory that can be used.
+ Limiting the recursion depth limits the amount of machine stack that
+ can be used, or, when PCRE has been compiled to use memory on the heap
+ instead of the stack, the amount of heap memory that can be used. This
+ limit is not relevant, and is ignored, if the pattern was successfully
+ studied with PCRE_STUDY_JIT_COMPILE.
The default value for match_limit_recursion can be set when PCRE is
built; the default default is the same value as the default for
@@ -1923,19 +2027,26 @@
PCRE_NO_START_OPTIMIZE, PCRE_NO_UTF8_CHECK, PCRE_PARTIAL_SOFT, and
PCRE_PARTIAL_HARD.
+ If the pattern was successfully studied with the PCRE_STUDY_JIT_COMPILE
+ option, the only supported options for JIT execution are
+ PCRE_NO_UTF8_CHECK, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and
+ PCRE_NOTEMPTY_ATSTART. Note in particular that partial matching is not
+ supported. If an unsupported option is used, JIT execution is disabled
+ and the normal interpretive code in pcre_exec() is run.
+
PCRE_ANCHORED
- The PCRE_ANCHORED option limits pcre_exec() to matching at the first
- matching position. If a pattern was compiled with PCRE_ANCHORED, or
- turned out to be anchored by virtue of its contents, it cannot be made
+ The PCRE_ANCHORED option limits pcre_exec() to matching at the first
+ matching position. If a pattern was compiled with PCRE_ANCHORED, or
+ turned out to be anchored by virtue of its contents, it cannot be made
unanchored at matching time.
PCRE_BSR_ANYCRLF
PCRE_BSR_UNICODE
These options (which are mutually exclusive) control what the \R escape
- sequence matches. The choice is either to match only CR, LF, or CRLF,
- or to match any Unicode newline sequence. These options override the
+ sequence matches. The choice is either to match only CR, LF, or CRLF,
+ or to match any Unicode newline sequence. These options override the
choice that was made or defaulted when the pattern was compiled.
PCRE_NEWLINE_CR
@@ -1944,319 +2055,335 @@
PCRE_NEWLINE_ANYCRLF
PCRE_NEWLINE_ANY
- These options override the newline definition that was chosen or
- defaulted when the pattern was compiled. For details, see the descrip-
- tion of pcre_compile() above. During matching, the newline choice
- affects the behaviour of the dot, circumflex, and dollar metacharac-
- ters. It may also alter the way the match position is advanced after a
+ These options override the newline definition that was chosen or
+ defaulted when the pattern was compiled. For details, see the descrip-
+ tion of pcre_compile() above. During matching, the newline choice
+ affects the behaviour of the dot, circumflex, and dollar metacharac-
+ ters. It may also alter the way the match position is advanced after a
match failure for an unanchored pattern.
- When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
- set, and a match attempt for an unanchored pattern fails when the cur-
- rent position is at a CRLF sequence, and the pattern contains no
- explicit matches for CR or LF characters, the match position is
+ When PCRE_NEWLINE_CRLF, PCRE_NEWLINE_ANYCRLF, or PCRE_NEWLINE_ANY is
+ set, and a match attempt for an unanchored pattern fails when the cur-
+ rent position is at a CRLF sequence, and the pattern contains no
+ explicit matches for CR or LF characters, the match position is
advanced by two characters instead of one, in other words, to after the
CRLF.
The above rule is a compromise that makes the most common cases work as
- expected. For example, if the pattern is .+A (and the PCRE_DOTALL
+ expected. For example, if the pattern is .+A (and the PCRE_DOTALL
option is not set), it does not match the string "\r\nA" because, after
- failing at the start, it skips both the CR and the LF before retrying.
- However, the pattern [\r\n]A does match that string, because it con-
+ failing at the start, it skips both the CR and the LF before retrying.
+ However, the pattern [\r\n]A does match that string, because it con-
tains an explicit CR or LF reference, and so advances only by one char-
acter after the first failure.
An explicit match for CR or LF is either a literal appearance of one of
- those characters, or one of the \r or \n escape sequences. Implicit
- matches such as [^X] do not count, nor does \s (which includes CR and
+ those characters, or one of the \r or \n escape sequences. Implicit
+ matches such as [^X] do not count, nor does \s (which includes CR and
LF in the characters that it matches).
- Notwithstanding the above, anomalous effects may still occur when CRLF
+ Notwithstanding the above, anomalous effects may still occur when CRLF
is a valid newline sequence and explicit \r or \n escapes appear in the
pattern.
PCRE_NOTBOL
This option specifies that the first character of the subject string is not
- the beginning of a line, so the circumflex metacharacter should not
- match before it. Setting this without PCRE_MULTILINE (at compile time)
- causes circumflex never to match. This option affects only the behav-
+ the beginning of a line, so the circumflex metacharacter should not
+ match before it. Setting this without PCRE_MULTILINE (at compile time)
+ causes circumflex never to match. This option affects only the behav-
iour of the circumflex metacharacter. It does not affect \A.
PCRE_NOTEOL
This option specifies that the end of the subject string is not the end
- of a line, so the dollar metacharacter should not match it nor (except
- in multiline mode) a newline immediately before it. Setting this with-
+ of a line, so the dollar metacharacter should not match it nor (except
+ in multiline mode) a newline immediately before it. Setting this with-
out PCRE_MULTILINE (at compile time) causes dollar never to match. This
- option affects only the behaviour of the dollar metacharacter. It does
+ option affects only the behaviour of the dollar metacharacter. It does
not affect \Z or \z.
PCRE_NOTEMPTY
An empty string is not considered to be a valid match if this option is
- set. If there are alternatives in the pattern, they are tried. If all
- the alternatives match the empty string, the entire match fails. For
+ set. If there are alternatives in the pattern, they are tried. If all
+ the alternatives match the empty string, the entire match fails. For
example, if the pattern
a?b?
- is applied to a string not beginning with "a" or "b", it matches an
- empty string at the start of the subject. With PCRE_NOTEMPTY set, this
+ is applied to a string not beginning with "a" or "b", it matches an
+ empty string at the start of the subject. With PCRE_NOTEMPTY set, this
match is not valid, so PCRE searches further into the string for occur-
rences of "a" or "b".
PCRE_NOTEMPTY_ATSTART
- This is like PCRE_NOTEMPTY, except that an empty string match that is
- not at the start of the subject is permitted. If the pattern is
+ This is like PCRE_NOTEMPTY, except that an empty string match that is
+ not at the start of the subject is permitted. If the pattern is
anchored, such a match can occur only if the pattern contains \K.
- Perl has no direct equivalent of PCRE_NOTEMPTY or
- PCRE_NOTEMPTY_ATSTART, but it does make a special case of a pattern
- match of the empty string within its split() function, and when using
- the /g modifier. It is possible to emulate Perl's behaviour after
+ Perl has no direct equivalent of PCRE_NOTEMPTY or
+ PCRE_NOTEMPTY_ATSTART, but it does make a special case of a pattern
+ match of the empty string within its split() function, and when using
+ the /g modifier. It is possible to emulate Perl's behaviour after
matching a null string by first trying the match again at the same off-
- set with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then if that
+ set with PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED, and then if that
fails, by advancing the starting offset (see below) and trying an ordi-
- nary match again. There is some code that demonstrates how to do this
- in the pcredemo sample program. In the most general case, you have to
- check to see if the newline convention recognizes CRLF as a newline,
- and if so, and the current character is CR followed by LF, advance the
+ nary match again. There is some code that demonstrates how to do this
+ in the pcredemo sample program. In the most general case, you have to
+ check to see if the newline convention recognizes CRLF as a newline,
+ and if so, and the current character is CR followed by LF, advance the
starting offset by two characters instead of one.
PCRE_NO_START_OPTIMIZE
- There are a number of optimizations that pcre_exec() uses at the start
- of a match, in order to speed up the process. For example, if it is
+ There are a number of optimizations that pcre_exec() uses at the start
+ of a match, in order to speed up the process. For example, if it is
known that an unanchored match must start with a specific character, it
- searches the subject for that character, and fails immediately if it
- cannot find it, without actually running the main matching function.
+ searches the subject for that character, and fails immediately if it
+ cannot find it, without actually running the main matching function.
This means that a special item such as (*COMMIT) at the start of a pat-
- tern is not considered until after a suitable starting point for the
- match has been found. When callouts or (*MARK) items are in use, these
+ tern is not considered until after a suitable starting point for the
+ match has been found. When callouts or (*MARK) items are in use, these
"start-up" optimizations can cause them to be skipped if the pattern is
- never actually used. The start-up optimizations are in effect a pre-
+ never actually used. The start-up optimizations are in effect a pre-
scan of the subject that takes place before the pattern is run.
- The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations,
- possibly causing performance to suffer, but ensuring that in cases
- where the result is "no match", the callouts do occur, and that items
+ The PCRE_NO_START_OPTIMIZE option disables the start-up optimizations,
+ possibly causing performance to suffer, but ensuring that in cases
+ where the result is "no match", the callouts do occur, and that items
such as (*COMMIT) and (*MARK) are considered at every possible starting
- position in the subject string. If PCRE_NO_START_OPTIMIZE is set at
+ position in the subject string. If PCRE_NO_START_OPTIMIZE is set at
compile time, it cannot be unset at matching time.
- Setting PCRE_NO_START_OPTIMIZE can change the outcome of a matching
+ Setting PCRE_NO_START_OPTIMIZE can change the outcome of a matching
operation. Consider the pattern
(*COMMIT)ABC
- When this is compiled, PCRE records the fact that a match must start
- with the character "A". Suppose the subject string is "DEFABC". The
- start-up optimization scans along the subject, finds "A" and runs the
- first match attempt from there. The (*COMMIT) item means that the pat-
- tern must match the current starting position, which in this case, it
- does. However, if the same match is run with PCRE_NO_START_OPTIMIZE
- set, the initial scan along the subject string does not happen. The
- first match attempt is run starting from "D" and when this fails,
- (*COMMIT) prevents any further matches being tried, so the overall
- result is "no match". If the pattern is studied, more start-up opti-
- mizations may be used. For example, a minimum length for the subject
+ When this is compiled, PCRE records the fact that a match must start
+ with the character "A". Suppose the subject string is "DEFABC". The
+ start-up optimization scans along the subject, finds "A" and runs the
+ first match attempt from there. The (*COMMIT) item means that the pat-
+ tern must match the current starting position, which in this case, it
+ does. However, if the same match is run with PCRE_NO_START_OPTIMIZE
+ set, the initial scan along the subject string does not happen. The
+ first match attempt is run starting from "D" and when this fails,
+ (*COMMIT) prevents any further matches being tried, so the overall
+ result is "no match". If the pattern is studied, more start-up opti-
+ mizations may be used. For example, a minimum length for the subject
may be recorded. Consider the pattern
(*MARK:A)(X|Y)
- The minimum length for a match is one character. If the subject is
- "ABC", there will be attempts to match "ABC", "BC", "C", and then
- finally an empty string. If the pattern is studied, the final attempt
- does not take place, because PCRE knows that the subject is too short,
- and so the (*MARK) is never encountered. In this case, studying the
- pattern does not affect the overall match result, which is still "no
+ The minimum length for a match is one character. If the subject is
+ "ABC", there will be attempts to match "ABC", "BC", "C", and then
+ finally an empty string. If the pattern is studied, the final attempt
+ does not take place, because PCRE knows that the subject is too short,
+ and so the (*MARK) is never encountered. In this case, studying the
+ pattern does not affect the overall match result, which is still "no
match", but it does affect the auxiliary information that is returned.
PCRE_NO_UTF8_CHECK
When PCRE_UTF8 is set at compile time, the validity of the subject as a
- UTF-8 string is automatically checked when pcre_exec() is subsequently
- called. The value of startoffset is also checked to ensure that it
- points to the start of a UTF-8 character. There is a discussion about
- the validity of UTF-8 strings in the section on UTF-8 support in the
- main pcre page. If an invalid UTF-8 sequence of bytes is found,
- pcre_exec() returns the error PCRE_ERROR_BADUTF8 or, if PCRE_PAR-
- TIAL_HARD is set and the problem is a truncated UTF-8 character at the
- end of the subject, PCRE_ERROR_SHORTUTF8. In both cases, information
- about the precise nature of the error may also be returned (see the
- descriptions of these errors in the section entitled Error return val-
+ UTF-8 string is automatically checked when pcre_exec() is subsequently
+ called. The value of startoffset is also checked to ensure that it
+ points to the start of a UTF-8 character. There is a discussion about
+ the validity of UTF-8 strings in the section on UTF-8 support in the
+ main pcre page. If an invalid UTF-8 sequence of bytes is found,
+ pcre_exec() returns the error PCRE_ERROR_BADUTF8 or, if PCRE_PAR-
+ TIAL_HARD is set and the problem is a truncated UTF-8 character at the
+ end of the subject, PCRE_ERROR_SHORTUTF8. In both cases, information
+ about the precise nature of the error may also be returned (see the
+ descriptions of these errors in the section entitled Error return val-
ues from pcre_exec() below). If startoffset contains a value that does
- not point to the start of a UTF-8 character (or to the end of the sub-
+ not point to the start of a UTF-8 character (or to the end of the sub-
ject), PCRE_ERROR_BADUTF8_OFFSET is returned.
- If you already know that your subject is valid, and you want to skip
- these checks for performance reasons, you can set the
- PCRE_NO_UTF8_CHECK option when calling pcre_exec(). You might want to
- do this for the second and subsequent calls to pcre_exec() if you are
- making repeated calls to find all the matches in a single subject
- string. However, you should be sure that the value of startoffset
- points to the start of a UTF-8 character (or the end of the subject).
- When PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8
- string as a subject or an invalid value of startoffset is undefined.
+ If you already know that your subject is valid, and you want to skip
+ these checks for performance reasons, you can set the
+ PCRE_NO_UTF8_CHECK option when calling pcre_exec(). You might want to
+ do this for the second and subsequent calls to pcre_exec() if you are
+ making repeated calls to find all the matches in a single subject
+ string. However, you should be sure that the value of startoffset
+ points to the start of a UTF-8 character (or the end of the subject).
+ When PCRE_NO_UTF8_CHECK is set, the effect of passing an invalid UTF-8
+ string as a subject or an invalid value of startoffset is undefined.
Your program may crash.
PCRE_PARTIAL_HARD
PCRE_PARTIAL_SOFT
- These options turn on the partial matching feature. For backwards com-
- patibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial
- match occurs if the end of the subject string is reached successfully,
- but there are not enough subject characters to complete the match. If
+ These options turn on the partial matching feature. For backwards com-
+ patibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. A partial
+ match occurs if the end of the subject string is reached successfully,
+ but there are not enough subject characters to complete the match. If
this happens when PCRE_PARTIAL_SOFT (but not PCRE_PARTIAL_HARD) is set,
- matching continues by testing any remaining alternatives. Only if no
- complete match can be found is PCRE_ERROR_PARTIAL returned instead of
- PCRE_ERROR_NOMATCH. In other words, PCRE_PARTIAL_SOFT says that the
- caller is prepared to handle a partial match, but only if no complete
+ matching continues by testing any remaining alternatives. Only if no
+ complete match can be found is PCRE_ERROR_PARTIAL returned instead of
+ PCRE_ERROR_NOMATCH. In other words, PCRE_PARTIAL_SOFT says that the
+ caller is prepared to handle a partial match, but only if no complete
match can be found.
- If PCRE_PARTIAL_HARD is set, it overrides PCRE_PARTIAL_SOFT. In this
- case, if a partial match is found, pcre_exec() immediately returns
- PCRE_ERROR_PARTIAL, without considering any other alternatives. In
- other words, when PCRE_PARTIAL_HARD is set, a partial match is consid-
+ If PCRE_PARTIAL_HARD is set, it overrides PCRE_PARTIAL_SOFT. In this
+ case, if a partial match is found, pcre_exec() immediately returns
+ PCRE_ERROR_PARTIAL, without considering any other alternatives. In
+ other words, when PCRE_PARTIAL_HARD is set, a partial match is consid-
ered to be more important than an alternative complete match.
- In both cases, the portion of the string that was inspected when the
+ In both cases, the portion of the string that was inspected when the
partial match was found is set as the first matching string. There is a
- more detailed discussion of partial and multi-segment matching, with
+ more detailed discussion of partial and multi-segment matching, with
examples, in the pcrepartial documentation.
The string to be matched by pcre_exec()
- The subject string is passed to pcre_exec() as a pointer in subject, a
+ The subject string is passed to pcre_exec() as a pointer in subject, a
length (in bytes) in length, and a starting byte offset in startoffset.
- If this is negative or greater than the length of the subject,
- pcre_exec() returns PCRE_ERROR_BADOFFSET. When the starting offset is
- zero, the search for a match starts at the beginning of the subject,
+ If this is negative or greater than the length of the subject,
+ pcre_exec() returns PCRE_ERROR_BADOFFSET. When the starting offset is
+ zero, the search for a match starts at the beginning of the subject,
and this is by far the most common case. In UTF-8 mode, the byte offset
- must point to the start of a UTF-8 character (or the end of the sub-
- ject). Unlike the pattern string, the subject may contain binary zero
+ must point to the start of a UTF-8 character (or the end of the sub-
+ ject). Unlike the pattern string, the subject may contain binary zero
bytes.
- A non-zero starting offset is useful when searching for another match
- in the same subject by calling pcre_exec() again after a previous suc-
- cess. Setting startoffset differs from just passing over a shortened
- string and setting PCRE_NOTBOL in the case of a pattern that begins
+ A non-zero starting offset is useful when searching for another match
+ in the same subject by calling pcre_exec() again after a previous suc-
+ cess. Setting startoffset differs from just passing over a shortened
+ string and setting PCRE_NOTBOL in the case of a pattern that begins
with any kind of lookbehind. For example, consider the pattern
\Biss\B
- which finds occurrences of "iss" in the middle of words. (\B matches
- only if the current position in the subject is not a word boundary.)
- When applied to the string "Mississipi" the first call to pcre_exec()
- finds the first occurrence. If pcre_exec() is called again with just
- the remainder of the subject, namely "issipi", it does not match,
+ which finds occurrences of "iss" in the middle of words. (\B matches
+ only if the current position in the subject is not a word boundary.)
+ When applied to the string "Mississipi" the first call to pcre_exec()
+ finds the first occurrence. If pcre_exec() is called again with just
+ the remainder of the subject, namely "issipi", it does not match,
because \B is always false at the start of the subject, which is deemed
- to be a word boundary. However, if pcre_exec() is passed the entire
+ to be a word boundary. However, if pcre_exec() is passed the entire
string again, but with startoffset set to 4, it finds the second occur-
- rence of "iss" because it is able to look behind the starting point to
+ rence of "iss" because it is able to look behind the starting point to
discover that it is preceded by a letter.
- Finding all the matches in a subject is tricky when the pattern can
+ Finding all the matches in a subject is tricky when the pattern can
match an empty string. It is possible to emulate Perl's /g behaviour by
- first trying the match again at the same offset, with the
- PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED options, and then if that
- fails, advancing the starting offset and trying an ordinary match
+ first trying the match again at the same offset, with the
+ PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED options, and then if that
+ fails, advancing the starting offset and trying an ordinary match
again. There is some code that demonstrates how to do this in the pcre-
demo sample program. In the most general case, you have to check to see
- if the newline convention recognizes CRLF as a newline, and if so, and
+ if the newline convention recognizes CRLF as a newline, and if so, and
the current character is CR followed by LF, advance the starting offset
by two characters instead of one.
- If a non-zero starting offset is passed when the pattern is anchored,
+ If a non-zero starting offset is passed when the pattern is anchored,
one attempt to match at the given offset is made. This can only succeed
- if the pattern does not require the match to be at the start of the
+ if the pattern does not require the match to be at the start of the
subject.
How pcre_exec() returns captured substrings
- In general, a pattern matches a certain portion of the subject, and in
- addition, further substrings from the subject may be picked out by
- parts of the pattern. Following the usage in Jeffrey Friedl's book,
- this is called "capturing" in what follows, and the phrase "capturing
- subpattern" is used for a fragment of a pattern that picks out a sub-
- string. PCRE supports several other kinds of parenthesized subpattern
+ In general, a pattern matches a certain portion of the subject, and in
+ addition, further substrings from the subject may be picked out by
+ parts of the pattern. Following the usage in Jeffrey Friedl's book,
+ this is called "capturing" in what follows, and the phrase "capturing
+ subpattern" is used for a fragment of a pattern that picks out a sub-
+ string. PCRE supports several other kinds of parenthesized subpattern
that do not cause substrings to be captured.
Captured substrings are returned to the caller via a vector of integers
- whose address is passed in ovector. The number of elements in the vec-
- tor is passed in ovecsize, which must be a non-negative number. Note:
+ whose address is passed in ovector. The number of elements in the vec-
+ tor is passed in ovecsize, which must be a non-negative number. Note:
this argument is NOT the size of ovector in bytes.
- The first two-thirds of the vector is used to pass back captured sub-
- strings, each substring using a pair of integers. The remaining third
- of the vector is used as workspace by pcre_exec() while matching cap-
- turing subpatterns, and is not available for passing back information.
- The number passed in ovecsize should always be a multiple of three. If
+ The first two-thirds of the vector is used to pass back captured sub-
+ strings, each substring using a pair of integers. The remaining third
+ of the vector is used as workspace by pcre_exec() while matching cap-
+ turing subpatterns, and is not available for passing back information.
+ The number passed in ovecsize should always be a multiple of three. If
it is not, it is rounded down.
- When a match is successful, information about captured substrings is
- returned in pairs of integers, starting at the beginning of ovector,
- and continuing up to two-thirds of its length at the most. The first
- element of each pair is set to the byte offset of the first character
- in a substring, and the second is set to the byte offset of the first
- character after the end of a substring. Note: these values are always
+ When a match is successful, information about captured substrings is
+ returned in pairs of integers, starting at the beginning of ovector,
+ and continuing up to two-thirds of its length at the most. The first
+ element of each pair is set to the byte offset of the first character
+ in a substring, and the second is set to the byte offset of the first
+ character after the end of a substring. Note: these values are always
byte offsets, even in UTF-8 mode. They are not character counts.
- The first pair of integers, ovector[0] and ovector[1], identify the
- portion of the subject string matched by the entire pattern. The next
- pair is used for the first capturing subpattern, and so on. The value
+ The first pair of integers, ovector[0] and ovector[1], identify the
+ portion of the subject string matched by the entire pattern. The next
+ pair is used for the first capturing subpattern, and so on. The value
returned by pcre_exec() is one more than the highest numbered pair that
- has been set. For example, if two substrings have been captured, the
- returned value is 3. If there are no capturing subpatterns, the return
+ has been set. For example, if two substrings have been captured, the
+ returned value is 3. If there are no capturing subpatterns, the return
value from a successful match is 1, indicating that just the first pair
of offsets has been set.
If a capturing subpattern is matched repeatedly, it is the last portion
of the string that it matched that is returned.
- If the vector is too small to hold all the captured substring offsets,
+ If the vector is too small to hold all the captured substring offsets,
it is used as far as possible (up to two-thirds of its length), and the
- function returns a value of zero. If the substring offsets are not of
- interest, pcre_exec() may be called with ovector passed as NULL and
- ovecsize as zero. However, if the pattern contains back references and
- the ovector is not big enough to remember the related substrings, PCRE
- has to get additional memory for use during matching. Thus it is usu-
- ally advisable to supply an ovector.
+ function returns a value of zero. If neither the actual string matched
+ nor any captured substrings are of interest, pcre_exec() may be called
+ with ovector passed as NULL and ovecsize as zero. However, if the pat-
+ tern contains back references and the ovector is not big enough to
+ remember the related substrings, PCRE has to get additional memory for
+ use during matching. Thus it is usually advisable to supply an ovector
+ of reasonable size.
+ There are some cases where zero is returned (indicating vector over-
+ flow) when in fact the vector is exactly the right size for the final
+ match. For example, consider the pattern
+
+ (a)(?:(b)c|bd)
+
+ If a vector of 6 elements (allowing for only 1 captured substring) is
+ given with subject string "abd", pcre_exec() will try to set the second
+ captured string, thereby recording a vector overflow, before failing to
+ match "c" and backing up to try the second alternative. The zero
+ return, however, does correctly indicate that the maximum number of
+ slots (namely 2) have been filled. In similar cases where there is tem-
+ porary overflow, but the final number of used slots is actually less
+ than the maximum, a non-zero value is returned.
+
The pcre_fullinfo() function can be used to find out how many capturing
- subpatterns there are in a compiled pattern. The smallest size for
- ovector that will allow for n captured substrings, in addition to the
+ subpatterns there are in a compiled pattern. The smallest size for
+ ovector that will allow for n captured substrings, in addition to the
offsets of the substring matched by the whole pattern, is (n+1)*3.
- It is possible for capturing subpattern number n+1 to match some part
+ It is possible for capturing subpattern number n+1 to match some part
of the subject when subpattern n has not been used at all. For example,
- if the string "abc" is matched against the pattern (a|(z))(bc) the
+ if the string "abc" is matched against the pattern (a|(z))(bc) the
return from the function is 4, and subpatterns 1 and 3 are matched, but
- 2 is not. When this happens, both values in the offset pairs corre-
+ 2 is not. When this happens, both values in the offset pairs corre-
sponding to unused subpatterns are set to -1.
- Offset values that correspond to unused subpatterns at the end of the
- expression are also set to -1. For example, if the string "abc" is
- matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not
- matched. The return from the function is 2, because the highest used
- capturing subpattern number is 1, and the offsets for for the second
- and third capturing subpatterns (assuming the vector is large enough,
+ Offset values that correspond to unused subpatterns at the end of the
+ expression are also set to -1. For example, if the string "abc" is
+ matched against the pattern (abc)(x(yz)?)? subpatterns 2 and 3 are not
+ matched. The return from the function is 2, because the highest used
+ capturing subpattern number is 1, and the offsets for the second
+ and third capturing subpatterns (assuming the vector is large enough,
of course) are set to -1.
- Note: Elements of ovector that do not correspond to capturing parenthe-
- ses in the pattern are never changed. That is, if a pattern contains n
- capturing parentheses, no more than ovector[0] to ovector[2n+1] are set
- by pcre_exec(). The other elements retain whatever values they previ-
- ously had.
+ Note: Elements in the first two-thirds of ovector that do not corre-
+ spond to capturing parentheses in the pattern are never changed. That
+ is, if a pattern contains n capturing parentheses, no more than ovec-
+ tor[0] to ovector[2n+1] are set by pcre_exec(). The other elements (in
+ the first two-thirds) retain whatever values they previously had.
- Some convenience functions are provided for extracting the captured
+ Some convenience functions are provided for extracting the captured
substrings as separate strings. These are described below.
Error return values from pcre_exec()
- If pcre_exec() fails, it returns a negative number. The following are
+ If pcre_exec() fails, it returns a negative number. The following are
defined in the header file:
PCRE_ERROR_NOMATCH (-1)
@@ -2265,7 +2392,7 @@
PCRE_ERROR_NULL (-2)
- Either code or subject was passed as NULL, or ovector was NULL and
+ Either code or subject was passed as NULL, or ovector was NULL and
ovecsize was not zero.
PCRE_ERROR_BADOPTION (-3)
@@ -2274,82 +2401,82 @@
PCRE_ERROR_BADMAGIC (-4)
- PCRE stores a 4-byte "magic number" at the start of the compiled code,
+ PCRE stores a 4-byte "magic number" at the start of the compiled code,
to catch the case when it is passed a junk pointer and to detect when a
pattern that was compiled in an environment of one endianness is run in
- an environment with the other endianness. This is the error that PCRE
+ an environment with the other endianness. This is the error that PCRE
gives when the magic number is not present.
PCRE_ERROR_UNKNOWN_OPCODE (-5)
While running the pattern match, an unknown item was encountered in the
- compiled pattern. This error could be caused by a bug in PCRE or by
+ compiled pattern. This error could be caused by a bug in PCRE or by
overwriting of the compiled pattern.
PCRE_ERROR_NOMEMORY (-6)
- If a pattern contains back references, but the ovector that is passed
+ If a pattern contains back references, but the ovector that is passed
to pcre_exec() is not big enough to remember the referenced substrings,
- PCRE gets a block of memory at the start of matching to use for this
- purpose. If the call via pcre_malloc() fails, this error is given. The
+ PCRE gets a block of memory at the start of matching to use for this
+ purpose. If the call via pcre_malloc() fails, this error is given. The
memory is automatically freed at the end of matching.
- This error is also given if pcre_stack_malloc() fails in pcre_exec().
- This can happen only when PCRE has been compiled with --disable-stack-
+ This error is also given if pcre_stack_malloc() fails in pcre_exec().
+ This can happen only when PCRE has been compiled with --disable-stack-
for-recursion.
PCRE_ERROR_NOSUBSTRING (-7)
- This error is used by the pcre_copy_substring(), pcre_get_substring(),
+ This error is used by the pcre_copy_substring(), pcre_get_substring(),
and pcre_get_substring_list() functions (see below). It is never
returned by pcre_exec().
PCRE_ERROR_MATCHLIMIT (-8)
- The backtracking limit, as specified by the match_limit field in a
- pcre_extra structure (or defaulted) was reached. See the description
+ The backtracking limit, as specified by the match_limit field in a
+ pcre_extra structure (or defaulted) was reached. See the description
above.
PCRE_ERROR_CALLOUT (-9)
This error is never generated by pcre_exec() itself. It is provided for
- use by callout functions that want to yield a distinctive error code.
+ use by callout functions that want to yield a distinctive error code.
See the pcrecallout documentation for details.
PCRE_ERROR_BADUTF8 (-10)
- A string that contains an invalid UTF-8 byte sequence was passed as a
- subject, and the PCRE_NO_UTF8_CHECK option was not set. If the size of
- the output vector (ovecsize) is at least 2, the byte offset to the
- start of the the invalid UTF-8 character is placed in the first ele-
- ment, and a reason code is placed in the second element. The reason
+ A string that contains an invalid UTF-8 byte sequence was passed as a
+ subject, and the PCRE_NO_UTF8_CHECK option was not set. If the size of
+ the output vector (ovecsize) is at least 2, the byte offset to the
+ start of the invalid UTF-8 character is placed in the first ele-
+ ment, and a reason code is placed in the second element. The reason
codes are listed in the following section. For backward compatibility,
- if PCRE_PARTIAL_HARD is set and the problem is a truncated UTF-8 char-
- acter at the end of the subject (reason codes 1 to 5),
+ if PCRE_PARTIAL_HARD is set and the problem is a truncated UTF-8 char-
+ acter at the end of the subject (reason codes 1 to 5),
PCRE_ERROR_SHORTUTF8 is returned instead of PCRE_ERROR_BADUTF8.
PCRE_ERROR_BADUTF8_OFFSET (-11)
- The UTF-8 byte sequence that was passed as a subject was checked and
- found to be valid (the PCRE_NO_UTF8_CHECK option was not set), but the
- value of startoffset did not point to the beginning of a UTF-8 charac-
+ The UTF-8 byte sequence that was passed as a subject was checked and
+ found to be valid (the PCRE_NO_UTF8_CHECK option was not set), but the
+ value of startoffset did not point to the beginning of a UTF-8 charac-
ter or the end of the subject.
PCRE_ERROR_PARTIAL (-12)
- The subject string did not match, but it did match partially. See the
+ The subject string did not match, but it did match partially. See the
pcrepartial documentation for details of partial matching.
PCRE_ERROR_BADPARTIAL (-13)
- This code is no longer in use. It was formerly returned when the
- PCRE_PARTIAL option was used with a compiled pattern containing items
- that were not supported for partial matching. From release 8.00
+ This code is no longer in use. It was formerly returned when the
+ PCRE_PARTIAL option was used with a compiled pattern containing items
+ that were not supported for partial matching. From release 8.00
onwards, there are no restrictions on partial matching.
PCRE_ERROR_INTERNAL (-14)
- An unexpected internal error has occurred. This error could be caused
+ An unexpected internal error has occurred. This error could be caused
by a bug in PCRE or by overwriting of the compiled pattern.
PCRE_ERROR_BADCOUNT (-15)
@@ -2359,7 +2486,7 @@
PCRE_ERROR_RECURSIONLIMIT (-21)
The internal recursion limit, as specified by the match_limit_recursion
- field in a pcre_extra structure (or defaulted) was reached. See the
+ field in a pcre_extra structure (or defaulted) was reached. See the
description above.
PCRE_ERROR_BADNEWLINE (-23)
@@ -2373,24 +2500,31 @@
PCRE_ERROR_SHORTUTF8 (-25)
- This error is returned instead of PCRE_ERROR_BADUTF8 when the subject
- string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD
- option is set. Information about the failure is returned as for
- PCRE_ERROR_BADUTF8. It is in fact sufficient to detect this case, but
- this special error code for PCRE_PARTIAL_HARD precedes the implementa-
- tion of returned information; it is retained for backwards compatibil-
+ This error is returned instead of PCRE_ERROR_BADUTF8 when the subject
+ string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD
+ option is set. Information about the failure is returned as for
+ PCRE_ERROR_BADUTF8. It is in fact sufficient to detect this case, but
+ this special error code for PCRE_PARTIAL_HARD precedes the implementa-
+ tion of returned information; it is retained for backwards compatibil-
ity.
PCRE_ERROR_RECURSELOOP (-26)
This error is returned when pcre_exec() detects a recursion loop within
- the pattern. Specifically, it means that either the whole pattern or a
- subpattern has been called recursively for the second time at the same
+ the pattern. Specifically, it means that either the whole pattern or a
+ subpattern has been called recursively for the second time at the same
position in the subject string. Some simple patterns that might do this
- are detected and faulted at compile time, but more complicated cases,
+ are detected and faulted at compile time, but more complicated cases,
in particular mutual recursions between two different subpatterns, can-
not be detected until run time.
+ PCRE_ERROR_JIT_STACKLIMIT (-27)
+
+ This error is returned when a pattern that was successfully studied
+ using the PCRE_STUDY_JIT_COMPILE option is being matched, but the mem-
+ ory available for the just-in-time processing stack is not large
+ enough. See the pcrejit documentation for more details.
+
Error numbers -16 to -20 and -22 are not used by pcre_exec().
Reason codes for invalid UTF-8 strings
@@ -2785,32 +2919,34 @@
The strings are returned in reverse order of length; that is, the long-
est matching string is given first. If there were too many matches to
fit into ovector, the yield of the function is zero, and the vector is
- filled with the longest matches.
+ filled with the longest matches. Unlike pcre_exec(), pcre_dfa_exec()
+ can use the entire ovector for returning matched strings.
Error returns from pcre_dfa_exec()
- The pcre_dfa_exec() function returns a negative number when it fails.
- Many of the errors are the same as for pcre_exec(), and these are
- described above. There are in addition the following errors that are
+ The pcre_dfa_exec() function returns a negative number when it fails.
+ Many of the errors are the same as for pcre_exec(), and these are
+ described above. There are in addition the following errors that are
specific to pcre_dfa_exec():
PCRE_ERROR_DFA_UITEM (-16)
- This return is given if pcre_dfa_exec() encounters an item in the pat-
- tern that it does not support, for instance, the use of \C or a back
+ This return is given if pcre_dfa_exec() encounters an item in the pat-
+ tern that it does not support, for instance, the use of \C or a back
reference.
PCRE_ERROR_DFA_UCOND (-17)
- This return is given if pcre_dfa_exec() encounters a condition item
- that uses a back reference for the condition, or a test for recursion
+ This return is given if pcre_dfa_exec() encounters a condition item
+ that uses a back reference for the condition, or a test for recursion
in a specific group. These are not supported.
PCRE_ERROR_DFA_UMLIMIT (-18)
- This return is given if pcre_dfa_exec() is called with an extra block
- that contains a setting of the match_limit field. This is not supported
- (it is meaningless).
+ This return is given if pcre_dfa_exec() is called with an extra block
+ that contains a setting of the match_limit or match_limit_recursion
+ fields. This is not supported (these fields are meaningless for DFA
+ matching).
PCRE_ERROR_DFA_WSSIZE (-19)
@@ -2840,11 +2976,11 @@
REVISION
- Last updated: 13 August 2011
+ Last updated: 06 September 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECALLOUT(3) PCRECALLOUT(3)
@@ -2888,7 +3024,11 @@
pattern is matched. This is useful information when you are trying to
optimize the performance of a particular pattern.
+ The use of callouts in a pattern makes it ineligible for optimization
+ by the just-in-time compiler. Studying such a pattern with the
+ PCRE_STUDY_JIT_COMPILE option always fails.
+
MISSING CALLOUTS
You should be aware that, because of optimizations in the way PCRE
@@ -3029,11 +3169,11 @@
REVISION
- Last updated: 31 July 2011
+ Last updated: 26 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECOMPAT(3) PCRECOMPAT(3)
@@ -3198,8 +3338,8 @@
Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPATTERN(3) PCREPATTERN(3)
@@ -5707,8 +5847,8 @@
Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESYNTAX(3) PCRESYNTAX(3)
@@ -6077,8 +6217,8 @@
Last updated: 21 November 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREUNICODE(3) PCREUNICODE(3)
@@ -6150,64 +6290,71 @@
If you pass an invalid UTF-8 string when PCRE_NO_UTF8_CHECK is set,
what happens depends on why the string is invalid. If the string con-
forms to the "old" definition of UTF-8 (RFC 2279), it is processed as a
- string of characters in the range 0 to 0x7FFFFFFF. In other words,
- apart from the initial validity test, PCRE (when in UTF-8 mode) handles
- strings according to the more liberal rules of RFC 2279. However, if
- the string does not even conform to RFC 2279, the result is undefined.
- Your program may crash.
+ string of characters in the range 0 to 0x7FFFFFFF by pcre_dfa_exec()
+ and the interpreted version of pcre_exec(). In other words, apart from
+ the initial validity test, these functions (when in UTF-8 mode) handle
+ strings according to the more liberal rules of RFC 2279. However, the
+ just-in-time (JIT) optimization for pcre_exec() supports only RFC 3629.
+ If you are using JIT optimization, or if the string does not even con-
+ form to RFC 2279, the result is undefined. Your program may crash.
If you want to process strings of values in the full range 0 to
0x7FFFFFFF, encoded in a UTF-8-like manner as per the old RFC, you can
set PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in
- this situation, you will have to apply your own validity check.
+ this situation, you will have to apply your own validity check, and
+ avoid the use of JIT optimization.
General comments about UTF-8 mode
- 1. An unbraced hexadecimal escape sequence (such as \xb3) matches a
+ 1. An unbraced hexadecimal escape sequence (such as \xb3) matches a
two-byte UTF-8 character if the value is greater than 127.
- 2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
+ 2. Octal numbers up to \777 are recognized, and match two-byte UTF-8
characters for values greater than \177.
- 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi-
+ 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi-
vidual bytes, for example: \x{100}{3}.
- 4. The dot metacharacter matches one UTF-8 character instead of a sin-
+ 4. The dot metacharacter matches one UTF-8 character instead of a sin-
gle byte.
- 5. The escape sequence \C can be used to match a single byte in UTF-8
- mode, but its use can lead to some strange effects. This facility is
- not available in the alternative matching function, pcre_dfa_exec().
+ 5. The escape sequence \C can be used to match a single byte in UTF-8
+ mode, but its use can lead to some strange effects. This facility is
+ not available in the alternative matching function, pcre_dfa_exec(),
+ nor is it supported by the JIT optimization of pcre_exec(). If JIT
+ optimization is requested for a pattern that contains \C, it will not
+ succeed, and so the matching will be carried out by the normal inter-
+ pretive function.
- 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
+ 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
test characters of any code value, but, by default, the characters that
- PCRE recognizes as digits, spaces, or word characters remain the same
- set as before, all with values less than 256. This remains true even
- when PCRE is built to include Unicode property support, because to do
+ PCRE recognizes as digits, spaces, or word characters remain the same
+ set as before, all with values less than 256. This remains true even
+ when PCRE is built to include Unicode property support, because to do
otherwise would slow down PCRE in many common cases. Note in particular
that this applies to \b and \B, because they are defined in terms of \w
- and \W. If you really want to test for a wider sense of, say, "digit",
- you can use explicit Unicode property tests such as \p{Nd}. Alterna-
- tively, if you set the PCRE_UCP option, the way that the character
- escapes work is changed so that Unicode properties are used to deter-
- mine which characters match. There are more details in the section on
+ and \W. If you really want to test for a wider sense of, say, "digit",
+ you can use explicit Unicode property tests such as \p{Nd}. Alterna-
+ tively, if you set the PCRE_UCP option, the way that the character
+ escapes work is changed so that Unicode properties are used to deter-
+ mine which characters match. There are more details in the section on
generic character types in the pcrepattern documentation.
- 7. Similarly, characters that match the POSIX named character classes
+ 7. Similarly, characters that match the POSIX named character classes
are all low-valued characters, unless the PCRE_UCP option is set.
- 8. However, the horizontal and vertical whitespace matching escapes
- (\h, \H, \v, and \V) do match all the appropriate Unicode characters,
+ 8. However, the horizontal and vertical whitespace matching escapes
+ (\h, \H, \v, and \V) do match all the appropriate Unicode characters,
whether or not PCRE_UCP is set.
- 9. Case-insensitive matching applies only to characters whose values
- are less than 128, unless PCRE is built with Unicode property support.
- Even when Unicode property support is available, PCRE still uses its
- own character tables when checking the case of low-valued characters,
- so as not to degrade performance. The Unicode property information is
+ 9. Case-insensitive matching applies only to characters whose values
+ are less than 128, unless PCRE is built with Unicode property support.
+ Even when Unicode property support is available, PCRE still uses its
+ own character tables when checking the case of low-valued characters,
+ so as not to degrade performance. The Unicode property information is
used only for characters with higher values. Furthermore, PCRE supports
- case-insensitive matching only when there is a one-to-one mapping
- between a letter's cases. There are a small number of many-to-one map-
+ case-insensitive matching only when there is a one-to-one mapping
+ between a letter's cases. There are a small number of many-to-one map-
pings in Unicode; these are not supported by PCRE.
@@ -6220,14 +6367,262 @@
REVISION
- Last updated: 24 August 2011
+ Last updated: 06 September 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
+PCREJIT(3) PCREJIT(3)
+
+
+NAME
+ PCRE - Perl-compatible regular expressions
+
+
+PCRE JUST-IN-TIME COMPILER SUPPORT
+
+ Just-in-time compiling is a heavyweight optimization that can greatly
+ speed up pattern matching. However, it comes at the cost of extra pro-
+ cessing before the match is performed. Therefore, it is of most benefit
+ when the same pattern is going to be matched many times. This does not
+ necessarily mean many calls of pcre_exec(); if the pattern is not
+ anchored, matching attempts may take place many times at various posi-
+ tions in the subject, even for a single call to pcre_exec(). If the
+ subject string is very long, it may still pay to use JIT for one-off
+ matches.
+
+ JIT support applies only to the traditional matching function,
+ pcre_exec(). It does not apply when pcre_dfa_exec() is being used. The
+ code for this support was written by Zoltan Herczeg.
+
+
+AVAILABILITY OF JIT SUPPORT
+
+ JIT support is an optional feature of PCRE. The "configure" option
+ --enable-jit (or equivalent CMake option) must be set when PCRE is
+ built if you want to use JIT. The support is limited to the following
+ hardware platforms:
+
+ ARM v5, v7, and Thumb2
+ Intel x86 32-bit and 64-bit
+ MIPS 32-bit
+ Power PC 32-bit and 64-bit
+
+ If --enable-jit is set on an unsupported platform, compilation fails.
+
+ A program can tell if JIT support is available by calling pcre_config()
+ with the PCRE_CONFIG_JIT option. The result is 1 when JIT is available,
+ and 0 otherwise. However, a simple program does not need to check this
+ in order to use JIT. The API is implemented in a way that falls back to
+ the ordinary PCRE code if JIT is not available.
+
+
+SIMPLE USE OF JIT
+
+ You have to do two things to make use of the JIT support in the sim-
+ plest way:
+
+ (1) Call pcre_study() with the PCRE_STUDY_JIT_COMPILE option for
+ each compiled pattern, and pass the resulting pcre_extra block to
+ pcre_exec().
+
+ (2) Use pcre_free_study() to free the pcre_extra block when it is
+ no longer needed instead of just freeing it yourself. This
+ ensures that any JIT data is also freed.
+
+ In some circumstances you may need to call additional functions. These
+ are described in the section entitled "Controlling the JIT stack"
+ below.
+
+ If JIT support is not available, PCRE_STUDY_JIT_COMPILE is ignored, and
+ no JIT data is set up. Otherwise, the compiled pattern is passed to the
+ JIT compiler, which turns it into machine code that executes much
+ faster than the normal interpretive code. When pcre_exec() is passed a
+ pcre_extra block containing a pointer to JIT code, it obeys that
+ instead of the normal code. The result is identical, but the code runs
+ much faster.
+
+ There are some pcre_exec() options that are not supported for JIT exe-
+ cution. There are also some pattern items that JIT cannot handle.
+ Details are given below. In both cases, execution automatically falls
+ back to the interpretive code.
+
+ If the JIT compiler finds an unsupported item, no JIT data is gener-
+ ated. You can find out if JIT execution is available after studying a
+ pattern by calling pcre_fullinfo() with the PCRE_INFO_JIT option. A
+ result of 1 means that JIT compilation was successful. A result of 0
+ means that JIT support is not available, or the pattern was not studied
+ with PCRE_STUDY_JIT_COMPILE, or the JIT compiler was not able to handle
+ the pattern.
+
+
+UNSUPPORTED OPTIONS AND PATTERN ITEMS
+
+ The only pcre_exec() options that are supported for JIT execution are
+ PCRE_NO_UTF8_CHECK, PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and
+ PCRE_NOTEMPTY_ATSTART. Note in particular that partial matching is not
+ supported.
+
+ The unsupported pattern items are:
+
+ \C match a single byte, even in UTF-8 mode
+ (?Cn) callouts
+ (?(<name>)... conditional test on setting of a named subpattern
+ (?(R)... conditional test on whole pattern recursion
+ (?(Rn)... conditional test on recursion, by number
+ (?(R&name)... conditional test on recursion, by name
+ (*COMMIT) )
+ (*MARK) )
+ (*PRUNE) ) the backtracking control verbs
+ (*SKIP) )
+ (*THEN) )
+
+ Support for some of these may be added in future.
+
+
+RETURN VALUES FROM JIT EXECUTION
+
+ When a pattern is matched using JIT execution, the return values are
+ the same as those given by the interpretive pcre_exec() code, with the
+ addition of one new error code: PCRE_ERROR_JIT_STACKLIMIT. This means
+ that the memory used for the JIT stack was insufficient. See "Control-
+ ling the JIT stack" below for a discussion of JIT stack usage. For com-
+ patibility with the interpretive pcre_exec() code, no more than two-
+ thirds of the ovector argument is used for passing back captured sub-
+ strings.
+
+ The error code PCRE_ERROR_MATCHLIMIT is returned by the JIT code if
+ searching a very large pattern tree goes on for too long, as it is in
+ the same circumstance when JIT is not used, but the details of exactly
+ what is counted are not the same. The PCRE_ERROR_RECURSIONLIMIT error
+ code is never returned by JIT execution.
+
+
+SAVING AND RESTORING COMPILED PATTERNS
+
+ The code that is generated by the JIT compiler is architecture-spe-
+ cific, and is also position dependent. For those reasons it cannot be
+ saved and restored like the bytecode and other data of a compiled pat-
+ tern. You should be able to run pcre_study() on a saved and restored pat-
+ tern, and thereby recreate the JIT data, but because JIT compilation
+ uses significant resources, it is probably not worth doing this.
+
+
+CONTROLLING THE JIT STACK
+
+ When the compiled JIT code runs, it needs a block of memory to use as a
+ stack. By default, it uses 32K on the machine stack. However, some
+ large or complicated patterns need more than this. The error
+ PCRE_ERROR_JIT_STACKLIMIT is given when there is not enough stack.
+ Three functions are provided for managing blocks of memory for use as
+ JIT stacks.
+
+ The pcre_jit_stack_alloc() function creates a JIT stack. Its arguments
+ are a starting size and a maximum size, and it returns a pointer to an
+ opaque structure of type pcre_jit_stack, or NULL if there is an error.
+ The pcre_jit_stack_free() function can be used to free a stack that is
+ no longer needed. (For the technically minded: the address space is
+ allocated by mmap or VirtualAlloc.)
+
+ JIT uses far less memory for recursion than the interpretive code, and
+ a maximum stack size of 512K to 1M should be more than enough for any
+ pattern.
+
+ The pcre_assign_jit_stack() function specifies which stack JIT code
+ should use. Its arguments are as follows:
+
+ pcre_extra *extra
+ pcre_jit_callback callback
+ void *data
+
+ The extra argument must be the result of studying a pattern with
+ PCRE_STUDY_JIT_COMPILE. There are three cases for the values of the
+ other two arguments:
+
+ (1) If callback is NULL and data is NULL, an internal 32K block
+ on the machine stack is used.
+
+ (2) If callback is NULL and data is not NULL, data must be
+ a valid JIT stack, the result of calling pcre_jit_stack_alloc().
+
+ (3) If callback is not NULL, it must point to a function that is called
+ with data as an argument at the start of matching, in order to
+ set up a JIT stack. If the result is NULL, the internal 32K stack
+ is used; otherwise the return value must be a valid JIT stack,
+ the result of calling pcre_jit_stack_alloc().
+
+ You may safely assign the same JIT stack to more than one pattern, as
+ long as they are all matched sequentially in the same thread. In a mul-
+ tithread application, each thread must use its own JIT stack.
+
+ Strictly speaking, even more is allowed. You can assign the same stack
+ to any number of patterns as long as they are not used for matching by
+ multiple threads at the same time. For example, you can assign the same
+ stack to all compiled patterns, and use a global mutex in the callback
+ to wait until the stack is available for use. However, this is an inef-
+ ficient solution, and not recommended.
+
+ This is a suggestion for how a typical multithreaded program might
+ operate:
+
+ During thread initialization
+ thread_local_var = pcre_jit_stack_alloc(...)
+
+ During thread exit
+ pcre_jit_stack_free(thread_local_var)
+
+ Use a one-line callback function
+ return thread_local_var
+
+ All the functions described in this section do nothing if JIT is not
+ available, and pcre_assign_jit_stack() does nothing unless the extra
+ argument is non-NULL and points to a pcre_extra block that is the
+ result of a successful study with PCRE_STUDY_JIT_COMPILE.
+
+
+EXAMPLE CODE
+
+ This is a single-threaded example that specifies a JIT stack without
+ using a callback.
+
+ int rc;
+ int ovector[30];
+ pcre *re;
+ pcre_extra *extra;
+ pcre_jit_stack *jit_stack;
+
+ re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
+ /* Check for errors */
+ extra = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &error);
+ jit_stack = pcre_jit_stack_alloc(32*1024, 512*1024);
+ /* Check for error (NULL) */
+ pcre_assign_jit_stack(extra, NULL, jit_stack);
+ rc = pcre_exec(re, extra, subject, length, 0, 0, ovector, 30);
+ /* Check results */
+ pcre_free(re);
+ pcre_free_study(extra);
+ pcre_jit_stack_free(jit_stack);
+
+
+SEE ALSO
+
+ pcreapi(3)
+
+
+AUTHOR
+
+ Philip Hazel
+ University Computing Service
+ Cambridge CB2 3QH, England.
+
+
+REVISION
+
+ Last updated: 06 September 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPARTIAL(3) PCREPARTIAL(3)
@@ -6266,14 +6661,16 @@
plete match, though the details differ between the two matching func-
tions. If both options are set, PCRE_PARTIAL_HARD takes precedence.
- Setting a partial matching option disables two of PCRE's optimizations.
- PCRE remembers the last literal byte in a pattern, and abandons match-
- ing immediately if such a byte is not present in the subject string.
- This optimization cannot be used for a subject string that might match
- only partially. If the pattern was studied, PCRE knows the minimum
- length of a matching string, and does not bother to run the matching
- function on shorter strings. This optimization is also disabled for
- partial matching.
+ Setting a partial matching option for pcre_exec() disables the use of
+ any just-in-time code that was set up by calling pcre_study() with the
+ PCRE_STUDY_JIT_COMPILE option. It also disables two of PCRE's standard
+ optimizations. PCRE remembers the last literal byte in a pattern, and
+ abandons matching immediately if such a byte is not present in the sub-
+ ject string. This optimization cannot be used for a subject string that
+ might match only partially. If the pattern was studied, PCRE knows the
+ minimum length of a matching string, and does not bother to run the
+ matching function on shorter strings. This optimization is also dis-
+ abled for partial matching.
PARTIAL MATCHING USING pcre_exec()
@@ -6643,11 +7040,11 @@
REVISION
- Last updated: 07 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 26 August 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPRECOMPILE(3) PCREPRECOMPILE(3)
@@ -6662,7 +7059,9 @@
form instead of having to compile them every time the application is
run. If you are not using any private character tables (see the
pcre_maketables() documentation), this is relatively straightforward.
- If you are using private tables, it is a little bit more complicated.
+ If you are using private tables, it is a little bit more complicated.
+ However, if you are using the just-in-time optimization feature of
+ pcre_study(), it is not possible to save and reload the JIT data.
If you save compiled patterns to a file, you can copy them to a differ-
ent host and run them there. This works even if the new host has the
@@ -6670,16 +7069,17 @@
There may be a small performance penalty, but it should be insignifi-
cant. However, compiling regular expressions with one version of PCRE
for use with a different version is not guaranteed to work and may
- cause crashes.
+ cause crashes, and saving and restoring a compiled pattern loses any
+ JIT optimization data.
SAVING A COMPILED PATTERN
The value returned by pcre_compile() points to a single block of memory
- that holds the compiled pattern and associated data. You can find the
- length of this block in bytes by calling pcre_fullinfo() with an argu-
- ment of PCRE_INFO_SIZE. You can then save the data in any appropriate
- manner. Here is sample code that compiles a pattern and writes it to a
+ that holds the compiled pattern and associated data. You can find the
+ length of this block in bytes by calling pcre_fullinfo() with an argu-
+ ment of PCRE_INFO_SIZE. You can then save the data in any appropriate
+ manner. Here is sample code that compiles a pattern and writes it to a
file. It assumes that the variable fd refers to a file that is open for
output:
@@ -6694,62 +7094,65 @@
rc = fwrite(re, 1, size, fd);
if (rc != size) { ... handle errors ... }
- In this example, the bytes that comprise the compiled pattern are
- copied exactly. Note that this is binary data that may contain any of
- the 256 possible byte values. On systems that make a distinction
+ In this example, the bytes that comprise the compiled pattern are
+ copied exactly. Note that this is binary data that may contain any of
+ the 256 possible byte values. On systems that make a distinction
between binary and non-binary data, be sure that the file is opened for
binary output.
- If you want to write more than one pattern to a file, you will have to
- devise a way of separating them. For binary data, preceding each pat-
- tern with its length is probably the most straightforward approach.
- Another possibility is to write out the data in hexadecimal instead of
+ If you want to write more than one pattern to a file, you will have to
+ devise a way of separating them. For binary data, preceding each pat-
+ tern with its length is probably the most straightforward approach.
+ Another possibility is to write out the data in hexadecimal instead of
binary, one pattern to a line.
- Saving compiled patterns in a file is only one possible way of storing
- them for later use. They could equally well be saved in a database, or
- in the memory of some daemon process that passes them via sockets to
+ Saving compiled patterns in a file is only one possible way of storing
+ them for later use. They could equally well be saved in a database, or
+ in the memory of some daemon process that passes them via sockets to
the processes that want them.
- If the pattern has been studied, it is also possible to save the study
- data in a similar way to the compiled pattern itself. When studying
- generates additional information, pcre_study() returns a pointer to a
- pcre_extra data block. Its format is defined in the section on matching
- a pattern in the pcreapi documentation. The study_data field points to
- the binary study data, and this is what you must save (not the
- pcre_extra block itself). The length of the study data can be obtained
- by calling pcre_fullinfo() with an argument of PCRE_INFO_STUDYSIZE.
- Remember to check that pcre_study() did return a non-NULL value before
- trying to save the study data.
+ If the pattern has been studied, it is also possible to save the normal
+ study data in a similar way to the compiled pattern itself. However, if
+ the PCRE_STUDY_JIT_COMPILE option was used, the just-in-time data that is cre-
+ ated cannot be saved because it is too dependent on the current envi-
+ ronment. When studying generates additional information, pcre_study()
+ returns a pointer to a pcre_extra data block. Its format is defined in
+ the section on matching a pattern in the pcreapi documentation. The
+ study_data field points to the binary study data, and this is what you
+ must save (not the pcre_extra block itself). The length of the study
+ data can be obtained by calling pcre_fullinfo() with an argument of
+ PCRE_INFO_STUDYSIZE. Remember to check that pcre_study() did return a
+ non-NULL value before trying to save the study data.
RE-USING A PRECOMPILED PATTERN
- Re-using a precompiled pattern is straightforward. Having reloaded it
- into main memory, you pass its pointer to pcre_exec() or
- pcre_dfa_exec() in the usual way. This should work even on another
- host, and even if that host has the opposite endianness to the one
+ Re-using a precompiled pattern is straightforward. Having reloaded it
+ into main memory, you pass its pointer to pcre_exec() or
+ pcre_dfa_exec() in the usual way. This should work even on another
+ host, and even if that host has the opposite endianness to the one
where the pattern was compiled.
- However, if you passed a pointer to custom character tables when the
- pattern was compiled (the tableptr argument of pcre_compile()), you
- must now pass a similar pointer to pcre_exec() or pcre_dfa_exec(),
- because the value saved with the compiled pattern will obviously be
+ However, if you passed a pointer to custom character tables when the
+ pattern was compiled (the tableptr argument of pcre_compile()), you
+ must now pass a similar pointer to pcre_exec() or pcre_dfa_exec(),
+ because the value saved with the compiled pattern will obviously be
nonsense. A field in a pcre_extra block is used to pass this data, as
- described in the section on matching a pattern in the pcreapi documen-
+ described in the section on matching a pattern in the pcreapi documen-
tation.
- If you did not provide custom character tables when the pattern was
- compiled, the pointer in the compiled pattern is NULL, which causes
- pcre_exec() to use PCRE's internal tables. Thus, you do not need to
+ If you did not provide custom character tables when the pattern was
+ compiled, the pointer in the compiled pattern is NULL, which causes
+ pcre_exec() to use PCRE's internal tables. Thus, you do not need to
take any special action at run time in this case.
- If you saved study data with the compiled pattern, you need to create
+ If you saved study data with the compiled pattern, you need to create
your own pcre_extra data block and set the study_data field to point to
- the reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA
- bit in the flags field to indicate that study data is present. Then
- pass the pcre_extra block to pcre_exec() or pcre_dfa_exec() in the
- usual way.
+ the reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA
+ bit in the flags field to indicate that study data is present. Then
+ pass the pcre_extra block to pcre_exec() or pcre_dfa_exec() in the
+ usual way. If the pattern was studied for just-in-time optimization,
+ that data cannot be saved, and so is lost by a save/restore cycle.
COMPATIBILITY WITH DIFFERENT PCRE RELEASES
@@ -6768,11 +7171,11 @@
REVISION
- Last updated: 17 November 2010
- Copyright (c) 1997-2010 University of Cambridge.
+ Last updated: 26 August 2011
+ Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPERFORM(3) PCREPERFORM(3)
@@ -6939,8 +7342,8 @@
Last updated: 16 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPOSIX(3) PCREPOSIX(3)
@@ -7202,8 +7605,8 @@
Last updated: 16 May 2010
Copyright (c) 1997-2010 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECPP(3) PCRECPP(3)
@@ -7544,8 +7947,8 @@
Last updated: 17 March 2009
Minor typo fixed: 25 July 2011
------------------------------------------------------------------------------
-
-
+
+
PCRESAMPLE(3) PCRESAMPLE(3)
@@ -7679,8 +8082,8 @@
Last updated: 24 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESTACK(3) PCRESTACK(3)
@@ -7706,6 +8109,14 @@
result of the current call (a "tail recursion"), the function is just
restarted instead.
+ The above comments apply when pcre_exec() is run in its normal inter-
+ pretive manner. If the pattern was studied with the PCRE_STUDY_JIT_COM-
+ PILE option, and just-in-time compiling was successful, and the options
+ passed to pcre_exec() were not incompatible, the matching process uses
+ the JIT-compiled code instead of the match() function. In this case,
+ the memory requirements are handled entirely differently. See the pcre-
+ jit documentation for details.
+
The pcre_dfa_exec() function operates in an entirely different way, and
uses recursion only when there is a regular expression recursion or
subroutine call in the pattern. This includes the processing of asser-
@@ -7717,7 +8128,7 @@
stack. At present, there is no protection against this.
The comments that follow do NOT apply to pcre_dfa_exec(); they are rel-
- evant only for pcre_exec().
+ evant only for pcre_exec() without the JIT optimization.
Reducing pcre_exec()'s stack usage
@@ -7829,8 +8240,8 @@
REVISION
- Last updated: 22 July 2011
+ Last updated: 26 August 2011
Copyright (c) 1997-2011 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
Modified: code/trunk/doc/pcre_assign_jit_stack.3
===================================================================
--- code/trunk/doc/pcre_assign_jit_stack.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_assign_jit_stack.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -7,7 +7,7 @@
.B #include <pcre.h>
.PP
.SM
-.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
+.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
.ti +5n
.B pcre_jit_callback \fIcallback\fP, void *\fIdata\fP);
.
@@ -20,7 +20,7 @@
.sp
extra the data pointer returned by \fBpcre_study()\fP
callback a callback function
- data a JIT stack or a value to be passed to the callback
+ data a JIT stack or a value to be passed to the callback
function
.P
If \fIcallback\fP is NULL and \fIdata\fP is NULL, an internal 32K block on
@@ -30,12 +30,12 @@
be a valid JIT stack, the result of calling \fBpcre_jit_stack_alloc()\fP.
.P
If \fIcallback\fP is not NULL, it is called with \fIdata\fP as an argument at
-the start of matching, in order to set up a JIT stack. If the result is NULL,
-the internal 32K stack is used; otherwise the return value must be a valid JIT
+the start of matching, in order to set up a JIT stack. If the result is NULL,
+the internal 32K stack is used; otherwise the return value must be a valid JIT
stack, the result of calling \fBpcre_jit_stack_alloc()\fP.
.P
-You may safely assign the same JIT stack to multiple patterns, as long as they
-are all matched in the same thread. In a multithread application, each thread
+You may safely assign the same JIT stack to multiple patterns, as long as they
+are all matched in the same thread. In a multithread application, each thread
must use its own JIT stack. For more details, see the
.\" HREF
\fBpcrejit\fP
Modified: code/trunk/doc/pcre_dfa_exec.3
===================================================================
--- code/trunk/doc/pcre_dfa_exec.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_dfa_exec.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -82,13 +82,13 @@
\fIcallout_data\fP Opaque data passed back to callouts
\fItables\fP Points to character tables or is NULL
\fImark\fP For passing back a *MARK pointer
- \fIexecutable_jit\fP Opaque data from JIT compilation
+ \fIexecutable_jit\fP Opaque data from JIT compilation
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA,
PCRE_EXTRA_TABLES, PCRE_EXTRA_MARK and PCRE_EXTRA_EXECUTABLE_JIT. For this
matching function, the \fImatch_limit\fP and \fImatch_limit_recursion\fP fields
-are not used, and must not be set. The PCRE_EXTRA_EXECUTABLE_JIT flag and
+are not used, and must not be set. The PCRE_EXTRA_EXECUTABLE_JIT flag and
the corresponding variable are ignored.
.P
There is a complete description of the PCRE native API in the
Modified: code/trunk/doc/pcre_exec.3
===================================================================
--- code/trunk/doc/pcre_exec.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_exec.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -68,7 +68,7 @@
\fIcallout_data\fP Opaque data passed back to callouts
\fItables\fP Points to character tables or is NULL
\fImark\fP For passing back a *MARK pointer
- \fIexecutable_jit\fP Opaque data from JIT compilation
+ \fIexecutable_jit\fP Opaque data from JIT compilation
.sp
The flag bits are PCRE_EXTRA_STUDY_DATA, PCRE_EXTRA_MATCH_LIMIT,
PCRE_EXTRA_MATCH_LIMIT_RECURSION, PCRE_EXTRA_CALLOUT_DATA,
Modified: code/trunk/doc/pcre_free_study.3
===================================================================
--- code/trunk/doc/pcre_free_study.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_free_study.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -12,8 +12,8 @@
.SH DESCRIPTION
.rs
.sp
-This function is used to free the memory used for the data generated by a call
-to \fBpcre_study()\fP when it is no longer needed. The argument must be the
+This function is used to free the memory used for the data generated by a call
+to \fBpcre_study()\fP when it is no longer needed. The argument must be the
result of such a call.
.P
There is a complete description of the PCRE native API in the
Modified: code/trunk/doc/pcre_fullinfo.3
===================================================================
--- code/trunk/doc/pcre_fullinfo.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_fullinfo.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -45,14 +45,14 @@
PCRE_INFO_SIZE Size of compiled pattern
PCRE_INFO_STUDYSIZE Size of study data
.sp
-The \fIwhere\fP argument must point to an integer variable, except for the
+The \fIwhere\fP argument must point to an integer variable, except for the
following \fIwhat\fP values:
.sp
PCRE_INFO_DEFAULT_TABLES const unsigned char *
PCRE_INFO_FIRSTTABLE const unsigned char *
PCRE_INFO_NAMETABLE const unsigned char *
PCRE_INFO_OPTIONS unsigned long int
- PCRE_INFO_SIZE size_t
+ PCRE_INFO_SIZE size_t
.sp
The yield of the function is zero on success or:
.sp
Modified: code/trunk/doc/pcre_jit_stack_alloc.3
===================================================================
--- code/trunk/doc/pcre_jit_stack_alloc.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_jit_stack_alloc.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -7,7 +7,7 @@
.B #include <pcre.h>
.PP
.SM
-.B pcre_jit_stack *pcre_jit_stack_alloc(int \fIstartsize\fP,
+.B pcre_jit_stack *pcre_jit_stack_alloc(int \fIstartsize\fP,
.ti +5n
.B int \fImaxsize\fP);
.
@@ -19,7 +19,7 @@
stack, and a maximum size to which it is allowed to grow. The result can be
passed to the JIT runtime code by \fBpcre_assign_jit_stack()\fP, or that
function can set up a callback for obtaining a stack. A maximum stack size of
-512K to 1M should be more than enough for any pattern. For more details, see
+512K to 1M should be more than enough for any pattern. For more details, see
the
.\" HREF
\fBpcrejit\fP
Modified: code/trunk/doc/pcre_study.3
===================================================================
--- code/trunk/doc/pcre_study.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcre_study.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -28,10 +28,10 @@
information, or there was an error. You can tell the difference by looking at
the error value. It is NULL in the first case.
.P
-The only option is PCRE_STUDY_JIT_COMPILE. It requests just-in-time compilation
-if possible. If PCRE has been compiled without JIT support, this option is
+The only option is PCRE_STUDY_JIT_COMPILE. It requests just-in-time compilation
+if possible. If PCRE has been compiled without JIT support, this option is
ignored. See the
-.\"HREF
+.\" HREF
\fBpcrejit\fP
.\"
page for further details.
Modified: code/trunk/doc/pcreapi.3
===================================================================
--- code/trunk/doc/pcreapi.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcreapi.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -41,7 +41,7 @@
.PP
.B void pcre_jit_stack_free(pcre_jit_stack *\fIstack\fP);
.PP
-.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
+.B void pcre_assign_jit_stack(pcre_extra *\fIextra\fP,
.ti +5n
.B pcre_jit_callback \fIcallback\fP, void *\fIdata\fP);
.PP
@@ -315,7 +315,7 @@
The compiled form of a regular expression is not altered during matching, so
the same compiled pattern can safely be used by several threads at once.
.P
-If the just-in-time optimization feature is being used, it needs separate
+If the just-in-time optimization feature is being used, it needs separate
memory stack areas for each thread. See the
.\" HREF
\fBpcrejit\fP
@@ -365,8 +365,8 @@
.sp
PCRE_CONFIG_JIT
.sp
-The output is an integer that is set to one if support for just-in-time
-compiling is available; otherwise it is set to zero.
+The output is an integer that is set to one if support for just-in-time
+compiling is available; otherwise it is set to zero.
.sp
PCRE_CONFIG_NEWLINE
.sp
@@ -888,17 +888,17 @@
wants to pass any of the other fields to \fBpcre_exec()\fP or
\fBpcre_dfa_exec()\fP, it must set up its own \fBpcre_extra\fP block.
.P
-The second argument of \fBpcre_study()\fP contains option bits. There is only
-one option: PCRE_STUDY_JIT_COMPILE. If this is set, and the just-in-time
-compiler is available, the pattern is further compiled into machine code that
-executes much faster than the \fBpcre_exec()\fP matching function. If
-the just-in-time compiler is not available, this option is ignored. All other
+The second argument of \fBpcre_study()\fP contains option bits. There is only
+one option: PCRE_STUDY_JIT_COMPILE. If this is set, and the just-in-time
+compiler is available, the pattern is further compiled into machine code that
+executes much faster than the \fBpcre_exec()\fP matching function. If
+the just-in-time compiler is not available, this option is ignored. All other
bits in the \fIoptions\fP argument must be zero.
.P
-JIT compilation is a heavyweight optimization. It can take some time for
+JIT compilation is a heavyweight optimization. It can take some time for
patterns to be analyzed, and for one-off matches and simple patterns the
benefit of faster execution might be offset by a much slower study time.
-Not all patterns can be optimized by the JIT compiler. For those that cannot be
+Not all patterns can be optimized by the JIT compiler. For those that cannot be
handled, matching automatically falls back to the \fBpcre_exec()\fP
interpreter. For more details, see the
.\" HREF
@@ -920,7 +920,7 @@
where PCRE_STUDY_JIT_COMPILE is not used, but it is advisable to change to the
new function when convenient.
.P
-This is a typical way in which \fBpcre_study\fP() is used (except that in a
+This is a typical way in which \fBpcre_study()\fP is used (except that in a
real application there should be tests for errors):
.sp
int rc;
@@ -932,10 +932,10 @@
0, /* no options */
&error); /* set to NULL or points to a message */
rc = pcre_exec( /* see below for details of pcre_exec() options */
- re, sd, "subject", 7, 0, 0, ovector, 30);
+ re, sd, "subject", 7, 0, 0, ovector, 30);
...
pcre_free_study(sd);
- pcre_free(re);
+ pcre_free(re);
.sp
Studying a pattern does two things: first, a lower bound for the length of
subject string that is needed to match the pattern is computed. This does not
@@ -950,8 +950,8 @@
created. This speeds up finding a position in the subject at which to start
matching.
.P
-These two optimizations apply to both \fBpcre_exec()\fP and
-\fBpcre_dfa_exec()\fP. However, they are not used by \fBpcre_exec()\fP if
+These two optimizations apply to both \fBpcre_exec()\fP and
+\fBpcre_dfa_exec()\fP. However, they are not used by \fBpcre_exec()\fP if
\fBpcre_study()\fP is called with the PCRE_STUDY_JIT_COMPILE option, and
just-in-time compiling is successful. The optimizations can be disabled by
setting the PCRE_NO_START_OPTIMIZE option when calling \fBpcre_exec()\fP or
@@ -1122,11 +1122,11 @@
.sp
PCRE_INFO_JIT
.sp
-Return 1 if the pattern was studied with the PCRE_STUDY_JIT_COMPILE option, and
+Return 1 if the pattern was studied with the PCRE_STUDY_JIT_COMPILE option, and
just-in-time compiling was successful. The fourth argument should point to an
\fBint\fP variable. A return value of 0 means that JIT support is not available
-in this version of PCRE, or that the pattern was not studied with the
-PCRE_STUDY_JIT_COMPILE option, or that the JIT compiler could not handle this
+in this version of PCRE, or that the pattern was not studied with the
+PCRE_STUDY_JIT_COMPILE option, or that the JIT compiler could not handle this
particular pattern. See the
.\" HREF
\fBpcrejit\fP
@@ -1377,7 +1377,7 @@
.sp
unsigned long int \fIflags\fP;
void *\fIstudy_data\fP;
- void *\fIexecutable_jit\fP;
+ void *\fIexecutable_jit\fP;
unsigned long int \fImatch_limit\fP;
unsigned long int \fImatch_limit_recursion\fP;
void *\fIcallout_data\fP;
@@ -1388,7 +1388,7 @@
are set. The flag bits are:
.sp
PCRE_EXTRA_STUDY_DATA
- PCRE_EXTRA_EXECUTABLE_JIT
+ PCRE_EXTRA_EXECUTABLE_JIT
PCRE_EXTRA_MATCH_LIMIT
PCRE_EXTRA_MATCH_LIMIT_RECURSION
PCRE_EXTRA_CALLOUT_DATA
@@ -1411,11 +1411,11 @@
imposed on the number of times this function is called during a match, which
has the effect of limiting the amount of backtracking that can take place. For
patterns that are not anchored, the count restarts from zero for each position
-in the subject string.
+in the subject string.
.P
-When \fBpcre_exec()\fP is called with a pattern that was successfully studied
-with the PCRE_STUDY_JIT_COMPILE option, the way that the matching is executed
-is entirely different. However, there is still the possibility of runaway
+When \fBpcre_exec()\fP is called with a pattern that was successfully studied
+with the PCRE_STUDY_JIT_COMPILE option, the way that the matching is executed
+is entirely different. However, there is still the possibility of runaway
matching that goes on for a very long time, and so the \fImatch_limit\fP value
is also used in this case (but in a different way) to limit how long the
matching can continue.
@@ -1431,7 +1431,7 @@
instead of limiting the total number of times that \fBmatch()\fP is called, it
limits the depth of recursion. The recursion depth is a smaller number than the
total number of calls, because not all calls to \fBmatch()\fP are recursive.
-This limit is of use only if it is set smaller than \fImatch_limit\fP.
+This limit is of use only if it is set smaller than \fImatch_limit\fP.
.P
Limiting the recursion depth limits the amount of machine stack that can be
used, or, when PCRE has been compiled to use memory on the heap instead of the
@@ -1500,8 +1500,8 @@
If the pattern was successfully studied with the PCRE_STUDY_JIT_COMPILE option,
the only supported options for JIT execution are PCRE_NO_UTF8_CHECK,
PCRE_NOTBOL, PCRE_NOTEOL, PCRE_NOTEMPTY, and PCRE_NOTEMPTY_ATSTART. Note in
-particular that partial matching is not supported. If an unsupported option is
-used, JIT execution is disabled and the normal interpretive code in
+particular that partial matching is not supported. If an unsupported option is
+used, JIT execution is disabled and the normal interpretive code in
\fBpcre_exec()\fP is run.
.sp
PCRE_ANCHORED
@@ -1825,7 +1825,7 @@
If a vector of 6 elements (allowing for only 1 captured substring) is given
with subject string "abd", \fBpcre_exec()\fP will try to set the second
captured string, thereby recording a vector overflow, before failing to match
-"c" and backing up to try the second alternative. The zero return, however,
+"c" and backing up to try the second alternative. The zero return, however,
does correctly indicate that the maximum number of slots (namely 2) have been
filled. In similar cases where there is temporary overflow, but the final
number of used slots is actually less than the maximum, a non-zero value is
@@ -2009,13 +2009,13 @@
.sp
PCRE_ERROR_JIT_STACKLIMIT (-27)
.sp
-This error is returned when a pattern that was successfully studied using the
+This error is returned when a pattern that was successfully studied using the
PCRE_STUDY_JIT_COMPILE option is being matched, but the memory available for
the just-in-time processing stack is not large enough. See the
.\" HREF
\fBpcrejit\fP
.\"
-documentation for more details.
+documentation for more details.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.
@@ -2456,7 +2456,7 @@
The strings are returned in reverse order of length; that is, the longest
matching string is given first. If there were too many matches to fit into
\fIovector\fP, the yield of the function is zero, and the vector is filled with
-the longest matches. Unlike \fBpcre_exec()\fP, \fBpcre_dfa_exec()\fP can use
+the longest matches. Unlike \fBpcre_exec()\fP, \fBpcre_dfa_exec()\fP can use
the entire \fIovector\fP for returning matched strings.
.
.
Modified: code/trunk/doc/pcrebuild.3
===================================================================
--- code/trunk/doc/pcrebuild.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcrebuild.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -105,18 +105,18 @@
.sp
--enable-jit
.sp
-This support is available only for certain hardware architectures. If this
+This support is available only for certain hardware architectures. If this
option is set for an unsupported architecture, a compile time error occurs.
-See the
+See the
.\" HREF
\fBpcrejit\fP
.\"
documentation for a discussion of JIT usage. When JIT support is enabled,
pcregrep automatically makes use of it, unless you add
.sp
- --disable-pcregrep-jit
-.sp
-to the "configure" command.
+ --disable-pcregrep-jit
+.sp
+to the "configure" command.
.
.
.SH "CODE VALUE OF NEWLINE"
Modified: code/trunk/doc/pcrecallout.3
===================================================================
--- code/trunk/doc/pcrecallout.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcrecallout.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -40,8 +40,8 @@
indicates how the pattern is matched. This is useful information when you are
trying to optimize the performance of a particular pattern.
.P
-The use of callouts in a pattern makes it ineligible for optimization by the
-just-in-time compiler. Studying such a pattern with the PCRE_STUDY_JIT_COMPILE
+The use of callouts in a pattern makes it ineligible for optimization by the
+just-in-time compiler. Studying such a pattern with the PCRE_STUDY_JIT_COMPILE
option always fails.
.
.
Modified: code/trunk/doc/pcregrep.1
===================================================================
--- code/trunk/doc/pcregrep.1 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcregrep.1 2011-09-11 14:31:21 UTC (rev 691)
@@ -389,7 +389,7 @@
\fB--line-offsets\fP is used.
.TP
\fB--no-jit\fP
-If the PCRE library is built with support for just-in-time compiling (which
+If the PCRE library is built with support for just-in-time compiling (which
speeds up matching), \fBpcregrep\fP automatically makes use of this, unless it
was explicitly disabled at build time. This option can be used to disable the
use of JIT at run time. It is provided for testing and working round problems.
Modified: code/trunk/doc/pcregrep.txt
===================================================================
--- code/trunk/doc/pcregrep.txt 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcregrep.txt 2011-09-11 14:31:21 UTC (rev 691)
@@ -440,63 +440,70 @@
lines. If the filename is also being output, it precedes the
line number. This option is forced if --line-offsets is used.
+ --no-jit If the PCRE library is built with support for just-in-time
+ compiling (which speeds up matching), pcregrep automatically
+ makes use of this, unless it was explicitly disabled at build
+ time. This option can be used to disable the use of JIT at
+ run time. It is provided for testing and working round prob-
+ lems. It should never be needed in normal use.
+
-o, --only-matching
Show only the part of the line that matched a pattern instead
- of the whole line. In this mode, no context is shown. That
- is, the -A, -B, and -C options are ignored. If there is more
- than one match in a line, each of them is shown separately.
- If -o is combined with -v (invert the sense of the match to
- find non-matching lines), no output is generated, but the
- return code is set appropriately. If the matched portion of
- the line is empty, nothing is output unless the file name or
- line number are being printed, in which case they are shown
+ of the whole line. In this mode, no context is shown. That
+ is, the -A, -B, and -C options are ignored. If there is more
+ than one match in a line, each of them is shown separately.
+ If -o is combined with -v (invert the sense of the match to
+ find non-matching lines), no output is generated, but the
+ return code is set appropriately. If the matched portion of
+ the line is empty, nothing is output unless the file name or
+ line number are being printed, in which case they are shown
on an otherwise empty line. This option is mutually exclusive
with --file-offsets and --line-offsets.
-onumber, --only-matching=number
- Show only the part of the line that matched the capturing
+ Show only the part of the line that matched the capturing
parentheses of the given number. Up to 32 capturing parenthe-
ses are supported. Because these options can be given without
- an argument (see above), if an argument is present, it must
- be given in the same shell item, for example, -o3 or --only-
- matching=2. The comments given for the non-argument case
- above also apply to this case. If the specified capturing
- parentheses do not exist in the pattern, or were not set in
- the match, nothing is output unless the file name or line
+ an argument (see above), if an argument is present, it must
+ be given in the same shell item, for example, -o3 or --only-
+ matching=2. The comments given for the non-argument case
+ above also apply to this case. If the specified capturing
+ parentheses do not exist in the pattern, or were not set in
+ the match, nothing is output unless the file name or line
number are being printed.
-q, --quiet
Work quietly, that is, display nothing except error messages.
- The exit status indicates whether or not any matches were
+ The exit status indicates whether or not any matches were
found.
-r, --recursive
- If any given path is a directory, recursively scan the files
- it contains, taking note of any --include and --exclude set-
- tings. By default, a directory is read as a normal file; in
- some operating systems this gives an immediate end-of-file.
- This option is a shorthand for setting the -d option to
+ If any given path is a directory, recursively scan the files
+ it contains, taking note of any --include and --exclude set-
+ tings. By default, a directory is read as a normal file; in
+ some operating systems this gives an immediate end-of-file.
+ This option is a shorthand for setting the -d option to
"recurse".
--recursion-limit=number
See --match-limit above.
-s, --no-messages
- Suppress error messages about non-existent or unreadable
- files. Such files are quietly skipped. However, the return
+ Suppress error messages about non-existent or unreadable
+ files. Such files are quietly skipped. However, the return
code is still 2, even if matches were found in other files.
-u, --utf-8
- Operate in UTF-8 mode. This option is available only if PCRE
- has been compiled with UTF-8 support. Both patterns and sub-
+ Operate in UTF-8 mode. This option is available only if PCRE
+ has been compiled with UTF-8 support. Both patterns and sub-
ject lines must be valid strings of UTF-8 characters.
-V, --version
- Write the version numbers of pcregrep and the PCRE library
+ Write the version numbers of pcregrep and the PCRE library
that is being used to the standard error stream.
-v, --invert-match
- Invert the sense of the match, so that lines which do not
+ Invert the sense of the match, so that lines which do not
match any of the patterns are the ones that are found.
-w, --word-regex, --word-regexp
@@ -504,103 +511,103 @@
lent to having \b at the start and end of the pattern.
-x, --line-regex, --line-regexp
- Force the patterns to be anchored (each must start matching
- at the beginning of a line) and in addition, require them to
- match entire lines. This is equivalent to having ^ and $
+ Force the patterns to be anchored (each must start matching
+ at the beginning of a line) and in addition, require them to
+ match entire lines. This is equivalent to having ^ and $
characters at the start and end of each alternative branch in
every pattern.
ENVIRONMENT VARIABLES
- The environment variables LC_ALL and LC_CTYPE are examined, in that
- order, for a locale. The first one that is set is used. This can be
- overridden by the --locale option. If no locale is set, the PCRE
+ The environment variables LC_ALL and LC_CTYPE are examined, in that
+ order, for a locale. The first one that is set is used. This can be
+ overridden by the --locale option. If no locale is set, the PCRE
library's default (usually the "C" locale) is used.
NEWLINES
- The -N (--newline) option allows pcregrep to scan files with different
- newline conventions from the default. However, the setting of this
- option does not affect the way in which pcregrep writes information to
- the standard error and output streams. It uses the string "\n" in C
- printf() calls to indicate newlines, relying on the C I/O library to
- convert this to an appropriate sequence if the output is sent to a
+ The -N (--newline) option allows pcregrep to scan files with different
+ newline conventions from the default. However, the setting of this
+ option does not affect the way in which pcregrep writes information to
+ the standard error and output streams. It uses the string "\n" in C
+ printf() calls to indicate newlines, relying on the C I/O library to
+ convert this to an appropriate sequence if the output is sent to a
file.
OPTIONS COMPATIBILITY
- Many of the short and long forms of pcregrep's options are the same as
- in the GNU grep program (version 2.5.4). Any long option of the form
- --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE
- terminology). However, the --file-offsets, --include-dir, --line-off-
+ Many of the short and long forms of pcregrep's options are the same as
+ in the GNU grep program (version 2.5.4). Any long option of the form
+ --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE
+ terminology). However, the --file-offsets, --include-dir, --line-off-
sets, --locale, --match-limit, -M, --multiline, -N, --newline, --recur-
sion-limit, -u, and --utf-8 options are specific to pcregrep, as is the
use of the --only-matching option with a capturing parentheses number.
- Although most of the common options work the same way, a few are dif-
- ferent in pcregrep. For example, the --include option's argument is a
- glob for GNU grep, but a regular expression for pcregrep. If both the
- -c and -l options are given, GNU grep lists only file names, without
+ Although most of the common options work the same way, a few are dif-
+ ferent in pcregrep. For example, the --include option's argument is a
+ glob for GNU grep, but a regular expression for pcregrep. If both the
+ -c and -l options are given, GNU grep lists only file names, without
counts, but pcregrep gives the counts.
OPTIONS WITH DATA
There are four different ways in which an option with data can be spec-
- ified. If a short form option is used, the data may follow immedi-
+ ified. If a short form option is used, the data may follow immedi-
ately, or (with one exception) in the next command line item. For exam-
ple:
-f/some/file
-f /some/file
- The exception is the -o option, which may appear with or without data.
- Because of this, if data is present, it must follow immediately in the
+ The exception is the -o option, which may appear with or without data.
+ Because of this, if data is present, it must follow immediately in the
same item, for example -o3.
- If a long form option is used, the data may appear in the same command
- line item, separated by an equals character, or (with two exceptions)
+ If a long form option is used, the data may appear in the same command
+ line item, separated by an equals character, or (with two exceptions)
it may appear in the next command line item. For example:
--file=/some/file
--file /some/file
- Note, however, that if you want to supply a file name beginning with ~
- as data in a shell command, and have the shell expand ~ to a home
+ Note, however, that if you want to supply a file name beginning with ~
+ as data in a shell command, and have the shell expand ~ to a home
directory, you must separate the file name from the option, because the
shell does not treat ~ specially unless it is at the start of an item.
- The exceptions to the above are the --colour (or --color) and --only-
- matching options, for which the data is optional. If one of these
- options does have data, it must be given in the first form, using an
+ The exceptions to the above are the --colour (or --color) and --only-
+ matching options, for which the data is optional. If one of these
+ options does have data, it must be given in the first form, using an
equals character. Otherwise pcregrep will assume that it has no data.
MATCHING ERRORS
- It is possible to supply a regular expression that takes a very long
- time to fail to match certain lines. Such patterns normally involve
- nested indefinite repeats, for example: (a+)*\d when matched against a
- line of a's with no final digit. The PCRE matching function has a
- resource limit that causes it to abort in these circumstances. If this
+ It is possible to supply a regular expression that takes a very long
+ time to fail to match certain lines. Such patterns normally involve
+ nested indefinite repeats, for example: (a+)*\d when matched against a
+ line of a's with no final digit. The PCRE matching function has a
+ resource limit that causes it to abort in these circumstances. If this
happens, pcregrep outputs an error message and the line that caused the
- problem to the standard error stream. If there are more than 20 such
+ problem to the standard error stream. If there are more than 20 such
errors, pcregrep gives up.
- The --match-limit option of pcregrep can be used to set the overall
- resource limit; there is a second option called --recursion-limit that
- sets a limit on the amount of memory (usually stack) that is used (see
+ The --match-limit option of pcregrep can be used to set the overall
+ resource limit; there is a second option called --recursion-limit that
+ sets a limit on the amount of memory (usually stack) that is used (see
the discussion of these options above).
DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches were found,
- and 2 for syntax errors, overlong lines, non-existent or inaccessible
- files (even if matches were found in other files) or too many matching
+ and 2 for syntax errors, overlong lines, non-existent or inaccessible
+ files (even if matches were found in other files) or too many matching
errors. Using the -s option to suppress error messages about inaccessi-
ble files does not affect the return code.
@@ -619,5 +626,5 @@
REVISION
- Last updated: 30 July 2011
+ Last updated: 06 September 2011
Copyright (c) 1997-2011 University of Cambridge.
Modified: code/trunk/doc/pcrejit.3
===================================================================
--- code/trunk/doc/pcrejit.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcrejit.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -47,9 +47,9 @@
(1) Call \fBpcre_study()\fP with the PCRE_STUDY_JIT_COMPILE option for
each compiled pattern, and pass the resulting \fBpcre_extra\fP block to
\fBpcre_exec()\fP.
-
+.sp
(2) Use \fBpcre_free_study()\fP to free the \fBpcre_extra\fP block when it is
- no longer needed instead of just freeing it yourself. This
+ no longer needed instead of just freeing it yourself. This
ensures that any JIT data is also freed.
.sp
In some circumstances you may need to call additional functions. These are
@@ -149,13 +149,13 @@
managing blocks of memory for use as JIT stacks.
.P
The \fBpcre_jit_stack_alloc()\fP function creates a JIT stack. Its arguments
-are a starting size and a maximum size, and it returns a pointer to an opaque
+are a starting size and a maximum size, and it returns a pointer to an opaque
structure of type \fBpcre_jit_stack\fP, or NULL if there is an error. The
\fBpcre_jit_stack_free()\fP function can be used to free a stack that is no
longer needed. (For the technically minded: the address space is allocated by
mmap or VirtualAlloc.)
.P
-JIT uses far less memory for recursion than the interpretive code,
+JIT uses far less memory for recursion than the interpretive code,
and a maximum stack size of 512K to 1M should be more than enough for any
pattern.
.P
@@ -197,10 +197,10 @@
.sp
During thread initialization
thread_local_var = pcre_jit_stack_alloc(...)
-
+.sp
During thread exit
pcre_jit_stack_free(thread_local_var)
-
+.sp
Use a one-line callback function
return thread_local_var
.sp
@@ -214,7 +214,7 @@
.rs
.sp
This is a single-threaded example that specifies a JIT stack without using a
-callback.
+callback.
.sp
int rc;
int ovector[30];
@@ -232,7 +232,7 @@
/* Check results */
pcre_free(re);
pcre_free_study(extra);
- pcre_jit_stack_free(jit_stack);
+ pcre_jit_stack_free(jit_stack);
.sp
.
.
Modified: code/trunk/doc/pcreprecompile.3
===================================================================
--- code/trunk/doc/pcreprecompile.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcreprecompile.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -12,8 +12,8 @@
\fBpcre_maketables()\fP
.\"
documentation), this is relatively straightforward. If you are using private
-tables, it is a little bit more complicated. However, if you are using the
-just-in-time optimization feature of \fBpcre_study()\fP, it is not possible to
+tables, it is a little bit more complicated. However, if you are using the
+just-in-time optimization feature of \fBpcre_study()\fP, it is not possible to
save and reload the JIT data.
.P
If you save compiled patterns to a file, you can copy them to a different host
@@ -21,7 +21,7 @@
to the one on which the patterns were compiled. There may be a small
performance penalty, but it should be insignificant. However, compiling regular
expressions with one version of PCRE for use with a different version is not
-guaranteed to work and may cause crashes, and saving and restoring a compiled
+guaranteed to work and may cause crashes, and saving and restoring a compiled
pattern loses any JIT optimization data.
.
.
@@ -62,7 +62,7 @@
them.
.P
If the pattern has been studied, it is also possible to save the normal study
-data in a similar way to the compiled pattern itself. However, if the
+data in a similar way to the compiled pattern itself. However, if the
PCRE_STUDY_JIT_COMPILE option was used, the just-in-time data that is created cannot
be saved because it is too dependent on the current environment. When studying
generates additional information, \fBpcre_study()\fP returns a pointer to a
@@ -116,7 +116,7 @@
reloaded study data. You must also set the PCRE_EXTRA_STUDY_DATA bit in the
\fIflags\fP field to indicate that study data is present. Then pass the
\fBpcre_extra\fP block to \fBpcre_exec()\fP or \fBpcre_dfa_exec()\fP in the
-usual way. If the pattern was studied for just-in-time optimization, that data
+usual way. If the pattern was studied for just-in-time optimization, that data
cannot be saved, and so is lost by a save/restore cycle.
.
.
Modified: code/trunk/doc/pcrestack.3
===================================================================
--- code/trunk/doc/pcrestack.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcrestack.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -19,11 +19,11 @@
the recursive call would immediately be passed back as the result of the
current call (a "tail recursion"), the function is just restarted instead.
.P
-The above comments apply when \fBpcre_exec()\fP is run in its normal
-interpretive manner. If the pattern was studied with the
-PCRE_STUDY_JIT_COMPILE option, and just-in-time compiling was successful, and
-the options passed to \fBpcre_exec()\fP were not incompatible, the matching
-process uses the JIT-compiled code instead of the \fBmatch()\fP function. In
+The above comments apply when \fBpcre_exec()\fP is run in its normal
+interpretive manner. If the pattern was studied with the
+PCRE_STUDY_JIT_COMPILE option, and just-in-time compiling was successful, and
+the options passed to \fBpcre_exec()\fP were not incompatible, the matching
+process uses the JIT-compiled code instead of the \fBmatch()\fP function. In
this case, the memory requirements are handled entirely differently. See the
.\" HREF
\fBpcrejit\fP
Modified: code/trunk/doc/pcretest.1
===================================================================
--- code/trunk/doc/pcretest.1 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcretest.1 2011-09-11 14:31:21 UTC (rev 691)
@@ -262,7 +262,7 @@
contains multiple copies of the same substring. If the \fB+\fP modifier appears
twice, the same action is taken for captured substrings. In each case the
remainder is output on the following line with a plus character following the
-capture number. Note that this modifier must not immediately follow the /S
+capture number. Note that this modifier must not immediately follow the /S
modifier because /S+ has another meaning.
.P
The \fB/=\fP modifier requests that the values of all potential captured
@@ -329,18 +329,18 @@
never studied, independently of \fB-s\fP. This feature is used in the test
files in a few cases where the output is different when the pattern is studied.
.P
-If the \fB/S\fP modifier is immediately followed by a + character, the call to
-\fBpcre_study()\fP is made with the PCRE_STUDY_JIT_COMPILE option, requesting
-just-in-time optimization support if it is available. Note that there is also a
-\fB/+\fP modifier; it must not be given immediately after \fB/S\fP because this
-will be misinterpreted. If JIT studying is successful, it will automatically be
-used when \fBpcre_exec()\fP is run, except when incompatible run-time options
-are specified. These include the partial matching options; a complete list is
+If the \fB/S\fP modifier is immediately followed by a + character, the call to
+\fBpcre_study()\fP is made with the PCRE_STUDY_JIT_COMPILE option, requesting
+just-in-time optimization support if it is available. Note that there is also a
+\fB/+\fP modifier; it must not be given immediately after \fB/S\fP because this
+will be misinterpreted. If JIT studying is successful, it will automatically be
+used when \fBpcre_exec()\fP is run, except when incompatible run-time options
+are specified. These include the partial matching options; a complete list is
given in the
.\" HREF
\fBpcrejit\fP
.\"
-documentation. See also the \fB\eJ\fP escape sequence below for a way of
+documentation. See also the \fB\eJ\fP escape sequence below for a way of
setting the size of the JIT stack.
.P
The \fB/T\fP modifier must be followed by a single digit. It causes a specific
@@ -439,7 +439,7 @@
ated by next non-alphanumeric character)
.\" JOIN
\eJdd set up a JIT stack of dd kilobytes maximum (any
- number of digits)
+ number of digits)
.\" JOIN
\eL call pcre_get_substringlist() after a
successful match
@@ -507,16 +507,16 @@
input.
.P
The \fB\eJ\fP escape provides a way of setting the maximum stack size that is
-used by the just-in-time optimization code. It is ignored if JIT optimization
-is not being used. Providing a stack that is larger than the default 32K is
+used by the just-in-time optimization code. It is ignored if JIT optimization
+is not being used. Providing a stack that is larger than the default 32K is
necessary only for very complicated patterns.
.P
If \eM is present, \fBpcretest\fP calls \fBpcre_exec()\fP several times, with
different values in the \fImatch_limit\fP and \fImatch_limit_recursion\fP
fields of the \fBpcre_extra\fP data structure, until it finds the minimum
-numbers for each parameter that allow \fBpcre_exec()\fP to complete without
-error. Because this is testing a specific feature of the normal interpretive
-\fBpcre_exec()\fP execution, the use of any JIT optimization that might have
+numbers for each parameter that allow \fBpcre_exec()\fP to complete without
+error. Because this is testing a specific feature of the normal interpretive
+\fBpcre_exec()\fP execution, the use of any JIT optimization that might have
been set up by the \fB/S+\fP qualifier or the \fB-s+\fP option is disabled.
.P
The \fImatch_limit\fP number is a measure of the amount of backtracking
@@ -795,7 +795,7 @@
.sp
The facilities described in this section are not available when the POSIX
interface to PCRE is being used, that is, when the \fB/P\fP pattern modifier is
-specified.
+specified.
.P
When the POSIX interface is not in use, you can cause \fBpcretest\fP to write a
compiled pattern to a file, by following the modifiers with > and a file name.
Modified: code/trunk/doc/pcretest.txt
===================================================================
--- code/trunk/doc/pcretest.txt 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcretest.txt 2011-09-11 14:31:21 UTC (rev 691)
@@ -71,25 +71,27 @@
-S size On Unix-like systems, set the size of the run-time stack to
size megabytes.
- -s Behave as if each pattern has the /S modifier; in other
- words, force each pattern to be studied. If the /I or /D
- option is present on a pattern (requesting output about the
- compiled pattern), information about the result of studying
- is not included when studying is caused only by -s and nei-
- ther -i nor -d is present on the command line. This behaviour
- means that the output from tests that are run with and with-
- out -s should be identical, except when options that output
- information about the actual running of a match are set. The
- -M, -t, and -tm options, which give information about
- resources used, are likely to produce different output with
- and without -s. Output may also differ if the /C option is
- present on an individual pattern. This uses callouts to trace
- the the matching process, and this may be different between
- studied and non-studied patterns. If the pattern contains
- (*MARK) items there may also be differences, for the same
- reason. The -s command line option can be overridden for spe-
- cific patterns that should never be studied (see the /S
- option below).
+ -s or -s+ Behave as if each pattern has the /S modifier; in other
+ words, force each pattern to be studied. If -s+ is used, the
+ PCRE_STUDY_JIT_COMPILE flag is passed to pcre_study(), caus-
+ ing just-in-time optimization to be set up if it is avail-
+ able. If the /I or /D option is present on a pattern
+ (requesting output about the compiled pattern), information
+ about the result of studying is not included when studying is
+ caused only by -s and neither -i nor -d is present on the
+ command line. This behaviour means that the output from tests
+ that are run with and without -s should be identical, except
+ when options that output information about the actual running
+ of a match are set. The -M, -t, and -tm options, which give
+ information about resources used, are likely to produce dif-
+ ferent output with and without -s. Output may also differ if
+ the /C option is present on an individual pattern. This uses
+ callouts to trace the matching process, and this may be
+ different between studied and non-studied patterns. If the
+ pattern contains (*MARK) items there may also be differences,
+ for the same reason. The -s command line option can be over-
+ ridden for specific patterns that should never be studied
+ (see the /S pattern modifier below).
-t Run each compile, study, and match many times with a timer,
and output resulting time per compile or match (in millisec-
@@ -245,74 +247,86 @@
subject contains multiple copies of the same substring. If the + modi-
fier appears twice, the same action is taken for captured substrings.
In each case the remainder is output on the following line with a plus
- character following the capture number.
+ character following the capture number. Note that this modifier must
+ not immediately follow the /S modifier because /S+ has another meaning.
- The /= modifier requests that the values of all potential captured
- parentheses be output after a match by pcre_exec(). By default, only
+ The /= modifier requests that the values of all potential captured
+ parentheses be output after a match by pcre_exec(). By default, only
those up to the highest one actually used in the match are output (cor-
- responding to the return code from pcre_exec()). Values in the offsets
- vector corresponding to higher numbers should be set to -1, and these
- are output as "<unset>". This modifier gives a way of checking that
+ responding to the return code from pcre_exec()). Values in the offsets
+ vector corresponding to higher numbers should be set to -1, and these
+ are output as "<unset>". This modifier gives a way of checking that
this is happening.
- The /B modifier is a debugging feature. It requests that pcretest out-
- put a representation of the compiled byte code after compilation. Nor-
- mally this information contains length and offset values; however, if
- /Z is also present, this data is replaced by spaces. This is a special
+ The /B modifier is a debugging feature. It requests that pcretest out-
+ put a representation of the compiled byte code after compilation. Nor-
+ mally this information contains length and offset values; however, if
+ /Z is also present, this data is replaced by spaces. This is a special
feature for use in the automatic test scripts; it ensures that the same
output is generated for different internal link sizes.
- The /D modifier is a PCRE debugging feature, and is equivalent to /BI,
+ The /D modifier is a PCRE debugging feature, and is equivalent to /BI,
that is, both the /B and the /I modifiers.
The /F modifier causes pcretest to flip the byte order of the fields in
- the compiled pattern that contain 2-byte and 4-byte numbers. This
- facility is for testing the feature in PCRE that allows it to execute
+ the compiled pattern that contain 2-byte and 4-byte numbers. This
+ facility is for testing the feature in PCRE that allows it to execute
patterns that were compiled on a host with a different endianness. This
- feature is not available when the POSIX interface to PCRE is being
- used, that is, when the /P pattern modifier is specified. See also the
+ feature is not available when the POSIX interface to PCRE is being
+ used, that is, when the /P pattern modifier is specified. See also the
section about saving and reloading compiled patterns below.
- The /I modifier requests that pcretest output information about the
- compiled pattern (whether it is anchored, has a fixed first character,
- and so on). It does this by calling pcre_fullinfo() after compiling a
- pattern. If the pattern is studied, the results of that are also out-
+ The /I modifier requests that pcretest output information about the
+ compiled pattern (whether it is anchored, has a fixed first character,
+ and so on). It does this by calling pcre_fullinfo() after compiling a
+ pattern. If the pattern is studied, the results of that are also out-
put.
- The /K modifier requests pcretest to show names from backtracking con-
- trol verbs that are returned from calls to pcre_exec(). It causes
- pcretest to create a pcre_extra block if one has not already been cre-
+ The /K modifier requests pcretest to show names from backtracking con-
+ trol verbs that are returned from calls to pcre_exec(). It causes
+ pcretest to create a pcre_extra block if one has not already been cre-
ated by a call to pcre_study(), and to set the PCRE_EXTRA_MARK flag and
the mark field within it, every time that pcre_exec() is called. If the
- variable that the mark field points to is non-NULL for a match, non-
+ variable that the mark field points to is non-NULL for a match, non-
match, or partial match, pcretest prints the string to which it points.
For a match, this is shown on a line by itself, tagged with "MK:". For
a non-match it is added to the message.
- The /L modifier must be followed directly by the name of a locale, for
+ The /L modifier must be followed directly by the name of a locale, for
example,
/pattern/Lfr_FR
For this reason, it must be the last modifier. The given locale is set,
- pcre_maketables() is called to build a set of character tables for the
- locale, and this is then passed to pcre_compile() when compiling the
- regular expression. Without an /L (or /T) modifier, NULL is passed as
+ pcre_maketables() is called to build a set of character tables for the
+ locale, and this is then passed to pcre_compile() when compiling the
+ regular expression. Without an /L (or /T) modifier, NULL is passed as
the tables pointer; that is, /L applies only to the expression on which
it appears.
- The /M modifier causes the size of memory block used to hold the com-
+ The /M modifier causes the size of memory block used to hold the com-
piled pattern to be output.
- If the /S modifier appears once, it causes pcre_study() to be called
- after the expression has been compiled, and the results used when the
- expression is matched. If /S appears twice, it suppresses studying,
+ If the /S modifier appears once, it causes pcre_study() to be called
+ after the expression has been compiled, and the results used when the
+ expression is matched. If /S appears twice, it suppresses studying,
even if it was requested externally by the -s command line option. This
- makes it possible to specify that certain patterns are always studied,
+ makes it possible to specify that certain patterns are always studied,
and others are never studied, independently of -s. This feature is used
in the test files in a few cases where the output is different when the
pattern is studied.
+ If the /S modifier is immediately followed by a + character, the call
+ to pcre_study() is made with the PCRE_STUDY_JIT_COMPILE option,
+ requesting just-in-time optimization support if it is available. Note
+ that there is also a /+ modifier; it must not be given immediately
+ after /S because this will be misinterpreted. If JIT studying is suc-
+ cessful, it will automatically be used when pcre_exec() is run, except
+ when incompatible run-time options are specified. These include the
+ partial matching options; a complete list is given in the pcrejit docu-
+ mentation. See also the \J escape sequence below for a way of setting
+ the size of the JIT stack.
+
The /T modifier must be followed by a single digit. It causes a spe-
cific set of built-in character tables to be passed to pcre_compile().
It is used in the standard PCRE tests to check behaviour with different
@@ -392,6 +406,8 @@
\Gname call pcre_get_named_substring() for substring
"name" after a successful match (name termin-
ated by next non-alphanumeric character)
+ \Jdd set up a JIT stack of dd kilobytes maximum (any
+ number of digits)
\L call pcre_get_substringlist() after a
successful match
\M discover the minimum MATCH_LIMIT and
@@ -444,18 +460,28 @@
way of passing an empty line as data, since a real empty line termi-
nates the data input.
- If \M is present, pcretest calls pcre_exec() several times, with dif-
- ferent values in the match_limit and match_limit_recursion fields of
- the pcre_extra data structure, until it finds the minimum numbers for
- each parameter that allow pcre_exec() to complete. The match_limit num-
- ber is a measure of the amount of backtracking that takes place, and
- checking it out can be instructive. For most simple matches, the number
- is quite small, but for patterns with very large numbers of matching
- possibilities, it can become large very quickly with increasing length
- of subject string. The match_limit_recursion number is a measure of how
- much stack (or, if PCRE is compiled with NO_RECURSE, how much heap)
- memory is needed to complete the match attempt.
+ The \J escape provides a way of setting the maximum stack size that is
+ used by the just-in-time optimization code. It is ignored if JIT opti-
+ mization is not being used. Providing a stack that is larger than the
+ default 32K is necessary only for very complicated patterns.
+ If \M is present, pcretest calls pcre_exec() several times, with dif-
+ ferent values in the match_limit and match_limit_recursion fields of
+ the pcre_extra data structure, until it finds the minimum numbers for
+ each parameter that allow pcre_exec() to complete without error.
+ Because this is testing a specific feature of the normal interpretive
+ pcre_exec() execution, the use of any JIT optimization that might have
+ been set up by the /S+ qualifier or the -s+ option is disabled.
+
+ The match_limit number is a measure of the amount of backtracking that
+ takes place, and checking it out can be instructive. For most simple
+ matches, the number is quite small, but for patterns with very large
+ numbers of matching possibilities, it can become large very quickly
+ with increasing length of subject string. The match_limit_recursion
+ number is a measure of how much stack (or, if PCRE is compiled with
+ NO_RECURSE, how much heap) memory is needed to complete the match
+ attempt.
+
When \O is used, the value specified may be higher or lower than the
size set by the -O command line option (or defaulted to 45); \O applies
only to the call of pcre_exec() for the line in which it appears.
@@ -720,19 +746,20 @@
/pattern/im >/some/file
See the pcreprecompile documentation for a discussion about saving and
- re-using compiled patterns.
+ re-using compiled patterns. Note that if the pattern was successfully
+ studied with JIT optimization, the JIT data cannot be saved.
- The data that is written is binary. The first eight bytes are the
- length of the compiled pattern data followed by the length of the
- optional study data, each written as four bytes in big-endian order
- (most significant byte first). If there is no study data (either the
+ The data that is written is binary. The first eight bytes are the
+ length of the compiled pattern data followed by the length of the
+ optional study data, each written as four bytes in big-endian order
+ (most significant byte first). If there is no study data (either the
pattern was not studied, or studying did not return any data), the sec-
- ond length is zero. The lengths are followed by an exact copy of the
- compiled pattern. If there is additional study data, this follows imme-
- diately after the compiled pattern. After writing the file, pcretest
- expects to read a new pattern.
+ ond length is zero. The lengths are followed by an exact copy of the
+ compiled pattern. If there is additional study data, this (excluding
+ any JIT data) follows immediately after the compiled pattern. After
+ writing the file, pcretest expects to read a new pattern.
- A saved pattern can be reloaded into pcretest by specifying < and a
+ A saved pattern can be reloaded into pcretest by specifying < and a
file name instead of a pattern. The name of the file must not contain a
< character, as otherwise pcretest will interpret the line as a pattern
delimited by < characters. For example:
@@ -741,32 +768,34 @@
Compiled pattern loaded from /some/file
No study data
- When the pattern has been loaded, pcretest proceeds to read data lines
- in the usual way.
+ If the pattern was previously studied with the JIT optimization, the
+ JIT information cannot be saved and restored, and so is lost. When the
+ pattern has been loaded, pcretest proceeds to read data lines in the
+ usual way.
- You can copy a file written by pcretest to a different host and reload
- it there, even if the new host has opposite endianness to the one on
- which the pattern was compiled. For example, you can compile on an i86
+ You can copy a file written by pcretest to a different host and reload
+ it there, even if the new host has opposite endianness to the one on
+ which the pattern was compiled. For example, you can compile on an i86
machine and run on a SPARC machine.
- File names for saving and reloading can be absolute or relative, but
- note that the shell facility of expanding a file name that starts with
+ File names for saving and reloading can be absolute or relative, but
+ note that the shell facility of expanding a file name that starts with
a tilde (~) is not available.
- The ability to save and reload files in pcretest is intended for test-
- ing and experimentation. It is not intended for production use because
- only a single pattern can be written to a file. Furthermore, there is
- no facility for supplying custom character tables for use with a
- reloaded pattern. If the original pattern was compiled with custom
- tables, an attempt to match a subject string using a reloaded pattern
- is likely to cause pcretest to crash. Finally, if you attempt to load
+ The ability to save and reload files in pcretest is intended for test-
+ ing and experimentation. It is not intended for production use because
+ only a single pattern can be written to a file. Furthermore, there is
+ no facility for supplying custom character tables for use with a
+ reloaded pattern. If the original pattern was compiled with custom
+ tables, an attempt to match a subject string using a reloaded pattern
+ is likely to cause pcretest to crash. Finally, if you attempt to load
a file that is not in the correct format, the result is undefined.
SEE ALSO
- pcre(3), pcreapi(3), pcrecallout(3), pcrematching(3), pcrepartial(d),
- pcrepattern(3), pcreprecompile(3).
+ pcre(3), pcreapi(3), pcrecallout(3), pcrejit(3), pcrematching(3), pcrepar-
+ tial(3), pcrepattern(3), pcreprecompile(3).
AUTHOR
@@ -778,5 +807,5 @@
REVISION
- Last updated: 01 August 2011
+ Last updated: 26 August 2011
Copyright (c) 1997-2011 University of Cambridge.
Modified: code/trunk/doc/pcreunicode.3
===================================================================
--- code/trunk/doc/pcreunicode.3 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/pcreunicode.3 2011-09-11 14:31:21 UTC (rev 691)
@@ -12,7 +12,7 @@
with the PCRE_UTF8 option flag, or the pattern must start with the sequence
(*UTF8). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF-8 strings instead of
-strings of 1-byte characters. PCRE does not support any other formats (in
+strings of 1-byte characters. PCRE does not support any other formats (in
particular, it does not support UTF-16).
.P
If you compile PCRE with UTF-8 support, but do not use it at run time, the
@@ -81,7 +81,7 @@
If you want to process strings of values in the full range 0 to 0x7FFFFFFF,
encoded in a UTF-8-like manner as per the old RFC, you can set
PCRE_NO_UTF8_CHECK to bypass the more restrictive test. However, in this
-situation, you will have to apply your own validity check, and avoid the use of
+situation, you will have to apply your own validity check, and avoid the use of
JIT optimization.
.
.
@@ -101,9 +101,9 @@
.P
5. The escape sequence \eC can be used to match a single byte in UTF-8 mode,
but its use can lead to some strange effects. This facility is not available in
-the alternative matching function, \fBpcre_dfa_exec()\fP, nor is it supported
-by the JIT optimization of \fBpcre_exec()\fP. If JIT optimization is requested
-for a pattern that contains \eC, it will not succeed, and so the matching will
+the alternative matching function, \fBpcre_dfa_exec()\fP, nor is it supported
+by the JIT optimization of \fBpcre_exec()\fP. If JIT optimization is requested
+for a pattern that contains \eC, it will not succeed, and so the matching will
be carried out by the normal interpretive function.
.P
6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
Modified: code/trunk/doc/perltest.txt
===================================================================
--- code/trunk/doc/perltest.txt 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/doc/perltest.txt 2011-09-11 14:31:21 UTC (rev 691)
@@ -3,7 +3,7 @@
The perltest.pl script tests Perl's regular expressions; it has the same
specification as pcretest, and so can be given identical input, except that
-input patterns can be followed only by Perl's lower case modifiers and certain
+input patterns can be followed only by Perl's lower case modifiers and certain
other pcretest modifiers that are either handled or ignored:
/+ recognized and handled by perltest
@@ -18,7 +18,7 @@
The data lines are processed as Perl double-quoted strings, so if they contain
" $ or @ characters, these have to be escaped. For this reason, all such
characters in testinput1, testinput4, testinput6, and testinput11 are escaped
-so that they can be used for perltest as well as for pcretest. The pcretest \Y
+so that they can be used for perltest as well as for pcretest. The pcretest \Y
escape in data lines is removed.
The special upper case pattern modifiers such as /A that pcretest recognizes,
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_compile.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -2295,12 +2295,12 @@
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
It seems that the appearance of a nested POSIX class supersedes an apparent
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
-a digit.
+a digit.
In Perl, unescaped square brackets may also appear as part of class names. For
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
-seem right at all. PCRE does not allow closing square brackets in POSIX class
+seem right at all. PCRE does not allow closing square brackets in POSIX class
names.
Arguments:
@@ -2319,7 +2319,7 @@
{
if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
ptr++;
- else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
+ else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
else
{
if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_exec.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -6403,7 +6403,7 @@
/* Set the return code to the number of captured strings, or 0 if there were
too many to fit into the vector. */
-
+
rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
0 : md->end_offset_top/2;
Modified: code/trunk/pcre_fullinfo.c
===================================================================
--- code/trunk/pcre_fullinfo.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_fullinfo.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -130,7 +130,7 @@
break;
case PCRE_INFO_JIT:
- *((int *)where) = extra_data != NULL &&
+ *((int *)where) = extra_data != NULL &&
(extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra_data->executable_jit != NULL;
break;
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_internal.h 2011-09-11 14:31:21 UTC (rev 691)
@@ -1943,7 +1943,7 @@
#ifdef SUPPORT_JIT
extern void _pcre_jit_compile(const real_pcre *, pcre_extra *);
-extern int _pcre_jit_exec(const real_pcre *, void *, PCRE_SPTR,
+extern int _pcre_jit_exec(const real_pcre *, void *, PCRE_SPTR,
int, int, int, int, int *, int);
extern void _pcre_jit_free(void *);
#endif
Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_jit_compile.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -119,7 +119,7 @@
jump to D hot path
C fallback path
A fallback path
-
+
Note that the order of the fallback code paths is the opposite of the fast
code paths. In this way the topmost value on the stack always belongs
to the current fallback code path. The fallback code path must check
@@ -405,7 +405,7 @@
return cc;
}
-/* Functions which might need modification for all new supported opcodes:
+/* Functions which might need modification for all new supported opcodes:
next_opcode
get_localspace
set_localptrs
@@ -2384,7 +2384,7 @@
}
context->byteptr = 0;
}
-
+
#else
/* Unaligned read is unsupported. */
@@ -3232,7 +3232,7 @@
else if (cc[1] >= 0xc0)
size += _pcre_utf8_table4[cc[1] & 0x3f];
}
- else
+ else
#endif
if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0)
size = 0;
@@ -6392,7 +6392,7 @@
{
executable_function *function;
if (extra != NULL &&
- (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
+ (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra->executable_jit != NULL)
{
function = (executable_function*)extra->executable_jit;
@@ -6403,7 +6403,7 @@
#else /* SUPPORT_JIT */
-/* These are dummy functions to avoid linking errors when JIT support is not
+/* These are dummy functions to avoid linking errors when JIT support is not
being compiled. */
PCRE_EXP_DECL pcre_jit_stack *
Modified: code/trunk/pcre_jit_test.c
===================================================================
--- code/trunk/pcre_jit_test.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_jit_test.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -57,7 +57,7 @@
\xe2\x80\xa8 = 0x2028 (Line Separator)
\xc8\xba = 570 \xe2\xb1\xa5 = 11365 (lowercase length != uppercase length)
\xcc\x8d = 781 (Something with Mark property)
-*/
+*/
static void setstack(pcre_extra *extra);
static int regression_tests(void);
@@ -568,7 +568,7 @@
{ MUA, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
{ MUA, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
{ MUA, 0, "((a?)+)+b", "aaaaaaaaaaaaa b" },
-
+
/* Deep recursion: Stack limit reached. */
{ MA, 0, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
{ MA, 0, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
@@ -672,7 +672,7 @@
} else {
printf("\nSuccessful test ratio: %d%%\n", succesful * 100 / total);
return 1;
- }
+ }
}
/* End of pcre_jit_test.c */
Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcre_study.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -1281,11 +1281,11 @@
rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
&compile_block);
bits_set = rc == SSB_DONE;
- if (rc == SSB_UNKNOWN)
+ if (rc == SSB_UNKNOWN)
{
*errorptr = "internal error: opcode not recognized";
return NULL;
- }
+ }
}
/* Find the minimum length of subject string. */
@@ -1306,7 +1306,7 @@
so that if it becomes variable in the future, we don't have to change that
code. */
-if (bits_set || min > 0
+if (bits_set || min > 0
#ifdef SUPPORT_JIT
|| (options & PCRE_STUDY_JIT_COMPILE) != 0
#endif
@@ -1319,32 +1319,32 @@
*errorptr = "failed to get memory";
return NULL;
}
-
+
study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
extra->flags = PCRE_EXTRA_STUDY_DATA;
extra->study_data = study;
-
+
study->size = sizeof(pcre_study_data);
study->flags = 0;
-
+
if (bits_set)
{
study->flags |= PCRE_STUDY_MAPPED;
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
-
- /* Always set the minlength value in the block, because the JIT compiler
- makes use of it. However, don't set the bit unless the length is greater than
- zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
- checking this case. */
-
+
+ /* Always set the minlength value in the block, because the JIT compiler
+ makes use of it. However, don't set the bit unless the length is greater than
+ zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
+ checking this case. */
+
study->minlength = min;
if (min > 0) study->flags |= PCRE_STUDY_MINLEN;
-
+
/* If JIT support was compiled and requested, attempt the JIT compilation.
If no starting bytes were found, and the minimum length is zero, and JIT
compilation fails, abandon the extra block and return NULL. */
-
+
#ifdef SUPPORT_JIT
extra->executable_jit = NULL;
if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
@@ -1352,7 +1352,7 @@
{
pcre_free_study(extra);
extra = NULL;
- }
+ }
#endif
}
@@ -1374,7 +1374,7 @@
pcre_free_study(pcre_extra *extra)
{
#ifdef SUPPORT_JIT
-if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
+if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra->executable_jit != NULL)
_pcre_jit_free(extra->executable_jit);
#endif
Modified: code/trunk/pcregrep.c
===================================================================
--- code/trunk/pcregrep.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcregrep.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -1871,7 +1871,7 @@
case N_HELP: help(); pcregrep_exit(0);
case N_LBUFFER: line_buffered = TRUE; break;
case N_LOFFSETS: line_offsets = number = TRUE; break;
- case N_NOJIT: study_options &= ~PCRE_STUDY_JIT_COMPILE; break;
+ case N_NOJIT: study_options &= ~PCRE_STUDY_JIT_COMPILE; break;
case 'c': count_only = TRUE; break;
case 'F': process_options |= PO_FIXED_STRINGS; break;
case 'H': filenames = FN_FORCE; break;
@@ -2586,14 +2586,14 @@
if (f != stdin) fclose(f);
}
-/* Study the regular expressions, as we will be running them many times. Unless
+/* Study the regular expressions, as we will be running them many times. Unless
JIT has been explicitly disabled, arrange a stack for it to use. */
#ifdef SUPPORT_PCREGREP_JIT
if ((study_options & PCRE_STUDY_JIT_COMPILE) != 0)
jit_stack = pcre_jit_stack_alloc(32*1024, 1024*1024);
-#endif
-
+#endif
+
for (j = 0; j < pattern_count; j++)
{
hints_list[j] = pcre_study(pattern_list[j], study_options, &error);
@@ -2606,7 +2606,7 @@
}
hint_count++;
#ifdef SUPPORT_PCREGREP_JIT
- if (jit_stack != NULL && hints_list[j] != NULL)
+ if (jit_stack != NULL && hints_list[j] != NULL)
pcre_assign_jit_stack(hints_list[j], NULL, jit_stack);
#endif
}
Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c 2011-09-10 16:19:31 UTC (rev 690)
+++ code/trunk/pcretest.c 2011-09-11 14:31:21 UTC (rev 691)
@@ -230,7 +230,7 @@
"bad offset value",
NULL, /* SHORTUTF8 is handled specially */
"nested recursion at the same subject position",
- "JIT stack limit reached"
+ "JIT stack limit reached"
};
@@ -1289,11 +1289,11 @@
if (strcmp(argv[op], "-m") == 0) showstore = 1;
else if (strcmp(argv[op], "-s") == 0) force_study = 0;
- else if (strcmp(argv[op], "-s+") == 0)
+ else if (strcmp(argv[op], "-s+") == 0)
{
force_study = 1;
force_study_options = PCRE_STUDY_JIT_COMPILE;
- }
+ }
else if (strcmp(argv[op], "-q") == 0) quiet = 1;
else if (strcmp(argv[op], "-b") == 0) debug = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
@@ -1665,15 +1665,15 @@
#endif
case 'S':
- if (do_study == 0)
+ if (do_study == 0)
{
- do_study = 1;
+ do_study = 1;
if (*pp == '+')
{
study_options |= PCRE_STUDY_JIT_COMPILE;
- pp++;
- }
- }
+ pp++;
+ }
+ }
else
{
do_study = 0;
@@ -2156,22 +2156,22 @@
fprintf(outfile, "\n");
}
}
-
+
/* Show this only if the JIT was set by /S, not by -s. */
-
+
if ((study_options & PCRE_STUDY_JIT_COMPILE) != 0)
{
- int jit;
+ int jit;
new_info(re, extra, PCRE_INFO_JIT, &jit);
- if (jit)
- fprintf(outfile, "JIT study was successful\n");
- else
-#ifdef SUPPORT_JIT
- fprintf(outfile, "JIT study was not successful\n");
+ if (jit)
+ fprintf(outfile, "JIT study was successful\n");
+ else
+#ifdef SUPPORT_JIT
+ fprintf(outfile, "JIT study was not successful\n");
#else
- fprintf(outfile, "JIT support is not available in this version of PCRE\n");
+ fprintf(outfile, "JIT support is not available in this version of PCRE\n");
#endif
- }
+ }
}
}
@@ -2492,17 +2492,17 @@
getnamesptr = npp;
}
continue;
-
+
case 'J':
while(isdigit(*p)) n = n * 10 + *p++ - '0';
- if (extra != NULL
- && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
+ if (extra != NULL
+ && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
&& extra->executable_jit != NULL)
- {
+ {
if (jit_stack != NULL) pcre_jit_stack_free(jit_stack);
jit_stack = pcre_jit_stack_alloc(1, n * 1024);
pcre_assign_jit_stack(extra, jit_callback, jit_stack);
- }
+ }
continue;
case 'L':
@@ -2728,7 +2728,7 @@
extra->flags = 0;
}
else extra->flags &= ~PCRE_EXTRA_EXECUTABLE_JIT;
-
+
(void)check_match_limit(re, extra, bptr, len, start_offset,
options|g_notempty, use_offsets, use_size_offsets,
PCRE_EXTRA_MATCH_LIMIT, &(extra->match_limit),
@@ -3083,11 +3083,11 @@
setlocale(LC_CTYPE, "C");
locale_set = 0;
}
- if (jit_stack != NULL)
+ if (jit_stack != NULL)
{
pcre_jit_stack_free(jit_stack);
- jit_stack = NULL;
- }
+ jit_stack = NULL;
+ }
}
if (infile == stdin) fprintf(outfile, "\n");