[Pcre-svn] [327] code/trunk: Make EBCDIC [a-z] type ranges P…

トップ ページ
このメッセージを削除
著者: Subversion repository
日付:  
To: pcre-svn
題目: [Pcre-svn] [327] code/trunk: Make EBCDIC [a-z] type ranges Perl compatible.
Revision: 327
          http://www.exim.org/viewvc/pcre2?view=rev&revision=327
Author:   ph10
Date:     2015-07-24 19:18:05 +0100 (Fri, 24 Jul 2015)
Log Message:
-----------
Make EBCDIC [a-z] type ranges Perl compatible.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/doc/pcre2pattern.3
    code/trunk/src/pcre2_compile.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-07-24 13:30:50 UTC (rev 326)
+++ code/trunk/ChangeLog    2015-07-24 18:18:05 UTC (rev 327)
@@ -81,7 +81,11 @@
 very pedantic coding infelicities and a buffer overflow while checking a UTF-8 
 string if the final multi-byte UTF-8 character was truncated.


+22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
+class, where both values are literal letters in the same case, omit the
+non-letter EBCDIC code points within the range.

+
Version 10.20 30-June-2015
--------------------------


Modified: code/trunk/doc/pcre2pattern.3
===================================================================
--- code/trunk/doc/pcre2pattern.3    2015-07-24 13:30:50 UTC (rev 326)
+++ code/trunk/doc/pcre2pattern.3    2015-07-24 18:18:05 UTC (rev 327)
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -1324,10 +1324,19 @@
 where a range ending character is expected. For example, [z-\exff] is valid,
 but [A-\ed] and [A-[:digit:]] are not.
 .P
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. Ranges
-can include any characters that are valid for the current mode.
+Ranges normally include all code points between the start and end characters,
+inclusive. They can also be used for code points specified numerically, for
+example [\e000-\e037]. Ranges can include any characters that are valid for the
+current mode.
 .P
+There is a special case in EBCDIC environments for ranges whose end points are 
+both specified as literal letters in the same case. For compatibility with 
+Perl, EBCDIC code points within the range that are not letters are omitted. For 
+example, [h-k] matches only four characters, even though the codes for h and k 
+are 0x88 and 0x92, a range of 11 code points. However, if the range is 
+specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
+are included.
+.P
 If a range that includes letters is used when caseless matching is set, it
 matches the letters in either case. For example, [W-c] is equivalent to
 [][\e\e^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character
@@ -3367,6 +3376,6 @@
 .rs
 .sp
 .nf
-Last updated: 17 July 2015
+Last updated: 24 July 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi


Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c    2015-07-24 13:30:50 UTC (rev 326)
+++ code/trunk/src/pcre2_compile.c    2015-07-24 18:18:05 UTC (rev 327)
@@ -3323,25 +3323,25 @@
         goto FAILED;
         }
       break;
-      
+
       /* Conditional group */


       case CHAR_LEFT_PARENTHESIS:
       if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
-        {  
+        {
         nest_depth++;
         ptr += 2;
-        break; 
+        break;
         }
-        
+
       /* Must be an assertion or a callout */
- 
+
       switch(ptr[4])
        {
        case CHAR_LESS_THAN_SIGN:
-       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) 
+       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
          goto MISSING_ASSERTION;
-       /* Fall through */       
+       /* Fall through */


        case CHAR_C:
        case CHAR_EXCLAMATION_MARK:
@@ -3348,13 +3348,13 @@
        case CHAR_EQUALS_SIGN:
        ptr++;
        break;
-       
+
        default:
-       MISSING_ASSERTION: 
-       ptr += 3;            /* To improve error message */         
+       MISSING_ASSERTION:
+       ptr += 3;            /* To improve error message */
        errorcode = ERR28;
-       goto FAILED; 
-       }      
+       goto FAILED;
+       }
       break;


       case CHAR_COLON:
@@ -3939,7 +3939,7 @@
       {
       nestptr = ptr + 7;
       ptr = sub_start_of_word;  /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
       continue;
       }


@@ -3947,7 +3947,7 @@
       {
       nestptr = ptr + 7;
       ptr = sub_end_of_word;    /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
       continue;
       }


@@ -4046,6 +4046,9 @@
     for(;;)
       {
       PCRE2_SPTR oldptr;
+#ifdef EBCDIC
+      BOOL range_is_literal = TRUE;
+#endif


       if (c == CHAR_NULL && ptr >= cb->end_pattern)
         {
@@ -4226,7 +4229,13 @@
         {
         escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
         if (*errorcodeptr != 0) goto FAILED;
-        if (escape == 0) c = ec;               /* Escaped single char */
+        if (escape == 0)    /* Escaped single char */
+          {
+          c = ec;
+#ifdef EBCDIC
+          range_is_literal = FALSE;
+#endif
+          }
         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
         else if (escape == ESC_N)          /* \N is not supported in a class */
           {
@@ -4430,7 +4439,9 @@
             int descape;
             descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
             if (*errorcodeptr != 0) goto FAILED;
-
+#ifdef EBCDIC
+            range_is_literal = FALSE;
+#endif
             /* 0 means a character was put into d; \b is backspace; any other
             special causes an error. */


@@ -4476,9 +4487,48 @@

         if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;


+        /* In an EBCDIC environment, Perl treats alphabetic ranges specially
+        because there are holes in the encoding, and simply using the range A-Z
+        (for example) would include the characters in the holes. This applies
+        only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
+
+#ifdef EBCDIC
+        if (range_is_literal &&
+             (cb->ctypes[c] & ctype_letter) != 0 &&
+             (cb->ctypes[d] & ctype_letter) != 0 &&
+             (c <= CHAR_z) == (d <= CHAR_z))
+          {
+          uint32_t uc = (c <= CHAR_z)? 0 : 64;
+          uint32_t C = c - uc;
+          uint32_t D = d - uc;
+
+          if (C <= CHAR_i)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                ((D < CHAR_i)? D : CHAR_i) + uc);
+            C = CHAR_j;
+            }
+
+          if (C <= D && C <= CHAR_r)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                ((D < CHAR_r)? D : CHAR_r) + uc);
+            C = CHAR_s;
+            }
+
+          if (C <= D)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                D + uc);
+            }
+          }
+        else
+#endif
         class_has_8bitchar +=
           add_to_class(classbits, &class_uchardata, options, cb, c, d);
-
         goto CONTINUE_CLASS;   /* Go get the next char in the class */
         }