Revision: 327
http://www.exim.org/viewvc/pcre2?view=rev&revision=327
Author: ph10
Date: 2015-07-24 19:18:05 +0100 (Fri, 24 Jul 2015)
Log Message:
-----------
Make EBCDIC [a-z] type ranges Perl compatible.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcre2pattern.3
code/trunk/src/pcre2_compile.c
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-07-24 13:30:50 UTC (rev 326)
+++ code/trunk/ChangeLog 2015-07-24 18:18:05 UTC (rev 327)
@@ -81,7 +81,11 @@
very pedantic coding infelicities and a buffer overflow while checking a UTF-8
string if the final multi-byte UTF-8 character was truncated.
+22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
+class, where both values are literal letters in the same case, omit the
+non-letter EBCDIC code points within the range.
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/doc/pcre2pattern.3
===================================================================
--- code/trunk/doc/pcre2pattern.3 2015-07-24 13:30:50 UTC (rev 326)
+++ code/trunk/doc/pcre2pattern.3 2015-07-24 18:18:05 UTC (rev 327)
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -1324,10 +1324,19 @@
where a range ending character is expected. For example, [z-\exff] is valid,
but [A-\ed] and [A-[:digit:]] are not.
.P
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. Ranges
-can include any characters that are valid for the current mode.
+Ranges normally include all code points between the start and end characters,
+inclusive. They can also be used for code points specified numerically, for
+example [\e000-\e037]. Ranges can include any characters that are valid for the
+current mode.
.P
+There is a special case in EBCDIC environments for ranges whose end points are
+both specified as literal letters in the same case. For compatibility with
+Perl, EBCDIC code points within the range that are not letters are omitted. For
+example, [h-k] matches only four characters, even though the codes for h and k
+are 0x88 and 0x92, a range of 11 code points. However, if the range is
+specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
+are included.
+.P
If a range that includes letters is used when caseless matching is set, it
matches the letters in either case. For example, [W-c] is equivalent to
[][\e\e^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character
@@ -3367,6 +3376,6 @@
.rs
.sp
.nf
-Last updated: 17 July 2015
+Last updated: 24 July 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-07-24 13:30:50 UTC (rev 326)
+++ code/trunk/src/pcre2_compile.c 2015-07-24 18:18:05 UTC (rev 327)
@@ -3323,25 +3323,25 @@
goto FAILED;
}
break;
-
+
/* Conditional group */
case CHAR_LEFT_PARENTHESIS:
if (ptr[3] != CHAR_QUESTION_MARK) /* Not assertion or callout */
- {
+ {
nest_depth++;
ptr += 2;
- break;
+ break;
}
-
+
/* Must be an assertion or a callout */
-
+
switch(ptr[4])
{
case CHAR_LESS_THAN_SIGN:
- if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
+ if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
goto MISSING_ASSERTION;
- /* Fall through */
+ /* Fall through */
case CHAR_C:
case CHAR_EXCLAMATION_MARK:
@@ -3348,13 +3348,13 @@
case CHAR_EQUALS_SIGN:
ptr++;
break;
-
+
default:
- MISSING_ASSERTION:
- ptr += 3; /* To improve error message */
+ MISSING_ASSERTION:
+ ptr += 3; /* To improve error message */
errorcode = ERR28;
- goto FAILED;
- }
+ goto FAILED;
+ }
break;
case CHAR_COLON:
@@ -3939,7 +3939,7 @@
{
nestptr = ptr + 7;
ptr = sub_start_of_word; /* Do not combine these statements; clang's */
- ptr--; /* sanitizer moans about a negative index. */
+ ptr--; /* sanitizer moans about a negative index. */
continue;
}
@@ -3947,7 +3947,7 @@
{
nestptr = ptr + 7;
ptr = sub_end_of_word; /* Do not combine these statements; clang's */
- ptr--; /* sanitizer moans about a negative index. */
+ ptr--; /* sanitizer moans about a negative index. */
continue;
}
@@ -4046,6 +4046,9 @@
for(;;)
{
PCRE2_SPTR oldptr;
+#ifdef EBCDIC
+ BOOL range_is_literal = TRUE;
+#endif
if (c == CHAR_NULL && ptr >= cb->end_pattern)
{
@@ -4226,7 +4229,13 @@
{
escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
if (*errorcodeptr != 0) goto FAILED;
- if (escape == 0) c = ec; /* Escaped single char */
+ if (escape == 0) /* Escaped single char */
+ {
+ c = ec;
+#ifdef EBCDIC
+ range_is_literal = FALSE;
+#endif
+ }
else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
else if (escape == ESC_N) /* \N is not supported in a class */
{
@@ -4430,7 +4439,9 @@
int descape;
descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
if (*errorcodeptr != 0) goto FAILED;
-
+#ifdef EBCDIC
+ range_is_literal = FALSE;
+#endif
/* 0 means a character was put into d; \b is backspace; any other
special causes an error. */
@@ -4476,9 +4487,48 @@
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
+ /* In an EBCDIC environment, Perl treats alphabetic ranges specially
+ because there are holes in the encoding, and simply using the range A-Z
+ (for example) would include the characters in the holes. This applies
+ only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
+
+#ifdef EBCDIC
+ if (range_is_literal &&
+ (cb->ctypes[c] & ctype_letter) != 0 &&
+ (cb->ctypes[d] & ctype_letter) != 0 &&
+ (c <= CHAR_z) == (d <= CHAR_z))
+ {
+ uint32_t uc = (c <= CHAR_z)? 0 : 64;
+ uint32_t C = c - uc;
+ uint32_t D = d - uc;
+
+ if (C <= CHAR_i)
+ {
+ class_has_8bitchar +=
+ add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+ ((D < CHAR_i)? D : CHAR_i) + uc);
+ C = CHAR_j;
+ }
+
+ if (C <= D && C <= CHAR_r)
+ {
+ class_has_8bitchar +=
+ add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+ ((D < CHAR_r)? D : CHAR_r) + uc);
+ C = CHAR_s;
+ }
+
+ if (C <= D)
+ {
+ class_has_8bitchar +=
+ add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+ D + uc);
+ }
+ }
+ else
+#endif
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, c, d);
-
goto CONTINUE_CLASS; /* Go get the next char in the class */
}