[Pcre-svn] [1408] code/trunk: Add support for [[:<:]] and [[…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1408] code/trunk: Add support for [[:<:]] and [[:>:]] as a transition aid.
Revision: 1408
          http://vcs.pcre.org/viewvc?view=rev&revision=1408
Author:   ph10
Date:     2013-12-03 16:27:00 +0000 (Tue, 03 Dec 2013)


Log Message:
-----------
Add support for [[:<:]] and [[:>:]] as a transition aid.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/doc/pcrepattern.3
    code/trunk/pcre_compile.c
    code/trunk/pcre_internal.h
    code/trunk/testdata/testinput2
    code/trunk/testdata/testoutput2


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/ChangeLog    2013-12-03 16:27:00 UTC (rev 1408)
@@ -197,6 +197,9 @@
     2. Add an option (-DINSTALL_MSVC_PDB) to enable installation of .pdb files.
        This allows higher-level build scripts which want .pdb files to avoid
        hard-coding the exact files needed."
+       
+42. Added support for [[:<:]] and [[:>:]] as used in the BSD POSIX library to
+    mean "start of word" and "end of word", respectively, as a transition aid.



Version 8.33 28-May-2013

Modified: code/trunk/doc/pcrepattern.3
===================================================================
--- code/trunk/doc/pcrepattern.3    2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/doc/pcrepattern.3    2013-12-03 16:27:00 UTC (rev 1408)
@@ -1,4 +1,4 @@
-.TH PCREPATTERN 3 "25 November 2013" "PCRE 8.34"
+.TH PCREPATTERN 3 "03 December 2013" "PCRE 8.34"
 .SH NAME
 PCRE - Perl-compatible regular expressions
 .SH "PCRE REGULAR EXPRESSION DETAILS"
@@ -1313,9 +1313,9 @@
 The only metacharacters that are recognized in character classes are backslash,
 hyphen (only where it can be interpreted as specifying a range), circumflex
 (only at the start), opening square bracket (only when it can be interpreted as
-introducing a POSIX class name - see the next section), and the terminating
-closing square bracket. However, escaping other non-alphanumeric characters
-does no harm.
+introducing a POSIX class name, or for a special compatibility feature - see
+the next two sections), and the terminating closing square bracket. However,
+escaping other non-alphanumeric characters does no harm.
 .
 .
 .SH "POSIX CHARACTER CLASSES"
@@ -1403,6 +1403,30 @@
 points less than 128.
 .
 .
+.SH "COMPATIBILITY FEATURE FOR WORD BOUNDARIES"
+.rs
+.sp
+In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly 
+syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of 
+word". PCRE treats these items as follows:
+.sp
+  [[:<:]]  is converted to  \eb(?=\ew)
+  [[:>:]]  is converted to  \eb(?<=\ew)
+.sp
+Only these exact character sequences are recognized. A sequence such as
+[a[:<:]b] provokes an error for an unrecognized POSIX class name. This support is 
+not compatible with Perl. It is provided to help migrations from other 
+environments, and is best not used in any new patterns. Note that \eb matches 
+at the start and the end of a word (see 
+.\" HTML <a href="#smallassertions">
+.\" </a>
+"Simple assertions"
+.\"
+above), and in a Perl-style pattern the preceding or following character 
+normally shows which is wanted, without the need for the assertions that are 
+used above in order to give exactly the POSIX behaviour.
+.
+.
 .SH "VERTICAL BAR"
 .rs
 .sp
@@ -3231,6 +3255,6 @@
 .rs
 .sp
 .nf
-Last updated: 25 November 2013
+Last updated: 03 December 2013
 Copyright (c) 1997-2013 University of Cambridge.
 .fi


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/pcre_compile.c    2013-12-03 16:27:00 UTC (rev 1408)
@@ -260,6 +260,19 @@
 static const int verbcount = sizeof(verbs)/sizeof(verbitem);



+/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in 
+another regex library. */
+
+static const pcre_uchar sub_start_of_word[] = {
+  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; 
+
+static const pcre_uchar sub_end_of_word[] = {
+  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
+  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
+  CHAR_RIGHT_PARENTHESIS, '\0' }; 
+
+
 /* Tables of names of POSIX character classes and their lengths. The names are
 now all in a single string, to reduce the number of relocations when a shared
 library is dynamically loaded. The list of lengths is terminated by a zero
@@ -4685,8 +4698,30 @@
       goto FAILED;
       }
     goto NORMAL_CHAR;
+    
+    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is 
+    used for "start of word" and "end of word". As these are otherwise illegal
+    sequences, we don't break anything by recognizing them. They are replaced
+    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
+    erroneous and are handled by the normal code below. */


     case CHAR_LEFT_SQUARE_BRACKET:
+    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
+      {
+      nestptr = ptr + 7;
+      ptr = sub_start_of_word - 1;
+      continue;  
+      }  
+
+    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
+      {
+      nestptr = ptr + 7;
+      ptr = sub_end_of_word - 1;
+      continue;  
+      }  
+
+    /* Handle a real character class. */
+ 
     previous = code;


     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if


Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h    2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/pcre_internal.h    2013-12-03 16:27:00 UTC (rev 1408)
@@ -1797,6 +1797,8 @@
 #define STRING_xdigit               STR_x STR_d STR_i STR_g STR_i STR_t


 #define STRING_DEFINE               STR_D STR_E STR_F STR_I STR_N STR_E
+#define STRING_WEIRD_STARTWORD      STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
+#define STRING_WEIRD_ENDWORD        STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET


 #define STRING_CR_RIGHTPAR              STR_C STR_R STR_RIGHT_PARENTHESIS
 #define STRING_LF_RIGHTPAR              STR_L STR_F STR_RIGHT_PARENTHESIS


Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2    2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/testdata/testinput2    2013-12-03 16:27:00 UTC (rev 1408)
@@ -4032,4 +4032,17 @@


/(?(R&6yh)abc)/

+/-- Test the ugly "start or end of word" compatibility syntax --/
+
+/[[:<:]]red[[:>:]]/BZ
+    little red riding hood
+    a /red/ thing 
+    red is a colour
+    put it all on red  
+    ** Failers
+    no reduction
+    Alfred Winifred
+    
+/[a[:<:]] should give error/ 
+
 /-- End of testinput2 --/


Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2    2013-12-01 16:31:40 UTC (rev 1407)
+++ code/trunk/testdata/testoutput2    2013-12-03 16:27:00 UTC (rev 1408)
@@ -14089,4 +14089,40 @@
 /(?(R&6yh)abc)/
 Failed: group name must start with a non-digit at offset 5


+/-- Test the ugly "start or end of word" compatibility syntax --/
+
+/[[:<:]]red[[:>:]]/BZ
+------------------------------------------------------------------
+        Bra
+        \b
+        Assert
+        \w
+        Ket
+        red
+        \b
+        AssertB
+        Reverse
+        \w
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+    little red riding hood
+ 0: red
+    a /red/ thing 
+ 0: red
+    red is a colour
+ 0: red
+    put it all on red  
+ 0: red
+    ** Failers
+No match
+    no reduction
+No match
+    Alfred Winifred
+No match
+    
+/[a[:<:]] should give error/ 
+Failed: unknown POSIX class name at offset 4
+
 /-- End of testinput2 --/