Revision: 1408
http://vcs.pcre.org/viewvc?view=rev&revision=1408
Author: ph10
Date: 2013-12-03 16:27:00 +0000 (Tue, 03 Dec 2013)
Log Message:
-----------
Add support for [[:<:]] and [[:>:]] as a transition aid.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcrepattern.3
code/trunk/pcre_compile.c
code/trunk/pcre_internal.h
code/trunk/testdata/testinput2
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/ChangeLog 2013-12-03 16:27:00 UTC (rev 1408)
@@ -197,6 +197,9 @@
2. Add an option (-DINSTALL_MSVC_PDB) to enable installation of .pdb files.
This allows higher-level build scripts which want .pdb files to avoid
hard-coding the exact files needed."
+
+42. Added support for [[:<:]] and [[:>:]] as used in the BSD POSIX library to
+ mean "start of word" and "end of word", respectively, as a transition aid.
Version 8.33 28-May-2013
Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3 2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/doc/pcrepattern.3 2013-12-03 16:27:00 UTC (rev 1408)
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "25 November 2013" "PCRE 8.34"
+.TH PCREPATTERN 3 "03 December 2013" "PCRE 8.34"
.SH NAME
PCRE - Perl-compatible regular expressions
.SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -1313,9 +1313,9 @@
The only metacharacters that are recognized in character classes are backslash,
hyphen (only where it can be interpreted as specifying a range), circumflex
(only at the start), opening square bracket (only when it can be interpreted as
-introducing a POSIX class name - see the next section), and the terminating
-closing square bracket. However, escaping other non-alphanumeric characters
-does no harm.
+introducing a POSIX class name, or for a special compatibility feature - see
+the next two sections), and the terminating closing square bracket. However,
+escaping other non-alphanumeric characters does no harm.
.
.
.SH "POSIX CHARACTER CLASSES"
@@ -1403,6 +1403,30 @@
points less than 128.
.
.
+.SH "COMPATIBILITY FEATURE FOR WORD BOUNDARIES"
+.rs
+.sp
+In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly
+syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of
+word". PCRE treats these items as follows:
+.sp
+ [[:<:]] is converted to \eb(?=\ew)
+ [[:>:]] is converted to \eb(?<=\ew)
+.sp
+Only these exact character sequences are recognized. A sequence such as
+[a[:<:]b] provokes an error for an unrecognized POSIX class name. This support is
+not compatible with Perl. It is provided to help migrations from other
+environments, and is best not used in any new patterns. Note that \eb matches
+at the start and the end of a word (see
+.\" HTML <a href="#smallassertions">
+.\" </a>
+"Simple assertions"
+.\"
+above), and in a Perl-style pattern the preceding or following character
+normally shows which is wanted, without the need for the assertions that are
+used above in order to give exactly the POSIX behaviour.
+.
+.
.SH "VERTICAL BAR"
.rs
.sp
@@ -3231,6 +3255,6 @@
.rs
.sp
.nf
-Last updated: 25 November 2013
+Last updated: 03 December 2013
Copyright (c) 1997-2013 University of Cambridge.
.fi
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/pcre_compile.c 2013-12-03 16:27:00 UTC (rev 1408)
@@ -260,6 +260,19 @@
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
+/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
+another regex library. */
+
+static const pcre_uchar sub_start_of_word[] = {
+ CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+ CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
+
+static const pcre_uchar sub_end_of_word[] = {
+ CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+ CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
+ CHAR_RIGHT_PARENTHESIS, '\0' };
+
+
/* Tables of names of POSIX character classes and their lengths. The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
@@ -4685,8 +4698,30 @@
goto FAILED;
}
goto NORMAL_CHAR;
+
+ /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
+ used for "start of word" and "end of word". As these are otherwise illegal
+ sequences, we don't break anything by recognizing them. They are replaced
+ by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
+ erroneous and are handled by the normal code below. */
case CHAR_LEFT_SQUARE_BRACKET:
+ if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
+ {
+ nestptr = ptr + 7;
+ ptr = sub_start_of_word - 1;
+ continue;
+ }
+
+ if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
+ {
+ nestptr = ptr + 7;
+ ptr = sub_end_of_word - 1;
+ continue;
+ }
+
+ /* Handle a real character class. */
+
previous = code;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/pcre_internal.h 2013-12-03 16:27:00 UTC (rev 1408)
@@ -1797,6 +1797,8 @@
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
+#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
+#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/testdata/testinput2 2013-12-03 16:27:00 UTC (rev 1408)
@@ -4032,4 +4032,17 @@
/(?(R&6yh)abc)/
+/-- Test the ugly "start or end of word" compatibility syntax --/
+
+/[[:<:]]red[[:>:]]/BZ
+ little red riding hood
+ a /red/ thing
+ red is a colour
+ put it all on red
+ ** Failers
+ no reduction
+ Alfred Winifred
+
+/[a[:<:]] should give error/
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/testdata/testoutput2 2013-12-03 16:27:00 UTC (rev 1408)
@@ -14089,4 +14089,40 @@
/(?(R&6yh)abc)/
Failed: group name must start with a non-digit at offset 5
+/-- Test the ugly "start or end of word" compatibility syntax --/
+
+/[[:<:]]red[[:>:]]/BZ
+------------------------------------------------------------------
+ Bra
+ \b
+ Assert
+ \w
+ Ket
+ red
+ \b
+ AssertB
+ Reverse
+ \w
+ Ket
+ Ket
+ End
+------------------------------------------------------------------
+ little red riding hood
+ 0: red
+ a /red/ thing
+ 0: red
+ red is a colour
+ 0: red
+ put it all on red
+ 0: red
+ ** Failers
+No match
+ no reduction
+No match
+ Alfred Winifred
+No match
+
+/[a[:<:]] should give error/
+Failed: unknown POSIX class name at offset 4
+
/-- End of testinput2 --/