Revision: 1431
http://vcs.pcre.org/viewvc?view=rev&revision=1431
Author: ph10
Date: 2014-01-02 17:41:28 +0000 (Thu, 02 Jan 2014)
Log Message:
-----------
Revert RAWUCHAR macros, renaming them as UCHAR21 and adding an explanatory
comment.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/pcre_string_utils.c
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2014-01-01 17:11:54 UTC (rev 1430)
+++ code/trunk/ChangeLog 2014-01-02 17:41:28 UTC (rev 1431)
@@ -31,10 +31,11 @@
must be bigger than the threshold as well. This function is useful when
the characters above the threshold are handled in the same way.
-7. The macros RAWUCHAR and RAWUCHARTEST were identical (and the latter was not
- testing anything, contrary to its name and comment) and were not really
- fulfilling any useful function, so I have replaced their use by plain code.
- Similarly for RAWUCHARINC and RAWUCHARINCTEST.
+7. The macros whose names start with RAWUCHAR are placeholders for a future
+ mode in which only the bottom 21 bits of 32-bit data items are used. To
+ make this more memorable for those maintaining the code, the names have
+ been changed to start with UCHAR21, and an extensive comment has been added
+ to their definition.
Version 8.34 15-December-2013
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2014-01-01 17:11:54 UTC (rev 1430)
+++ code/trunk/pcre_dfa_exec.c 2014-01-02 17:41:28 UTC (rev 1431)
@@ -7,7 +7,7 @@
below for why this module is different).
Written by Philip Hazel
- Copyright (c) 1997-2013 University of Cambridge
+ Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -1473,7 +1473,7 @@
goto ANYNL01;
case CHAR_CR:
- if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
+ if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
/* Fall through */
ANYNL01:
@@ -1742,7 +1742,7 @@
goto ANYNL02;
case CHAR_CR:
- if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
+ if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
/* Fall through */
ANYNL02:
@@ -2012,7 +2012,7 @@
goto ANYNL03;
case CHAR_CR:
- if (ptr + 1 < end_subject && ptr[1] == CHAR_LF) ncount = 1;
+ if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
/* Fall through */
ANYNL03:
@@ -2210,7 +2210,7 @@
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
reset_could_continue = TRUE;
}
- else if (ptr[1] == CHAR_LF)
+ else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
{
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
}
@@ -3474,12 +3474,12 @@
{
pcre_uchar csc;
while (current_subject < end_subject &&
- (csc = *current_subject) != first_char && csc != first_char2)
+ (csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
current_subject++;
}
else
while (current_subject < end_subject &&
- *current_subject != first_char)
+ UCHAR21TEST(current_subject) != first_char)
current_subject++;
}
@@ -3509,9 +3509,10 @@
ANYCRLF, and we are now at a LF, advance the match position by one
more character. */
- if (current_subject[-1] == CHAR_CR &&
+ if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
- current_subject < end_subject && *current_subject == CHAR_NL)
+ current_subject < end_subject &&
+ UCHAR21TEST(current_subject) == CHAR_NL)
current_subject++;
}
}
@@ -3522,7 +3523,7 @@
{
while (current_subject < end_subject)
{
- register pcre_uint32 c = *current_subject;
+ register pcre_uint32 c = UCHAR21TEST(current_subject);
#ifndef COMPILE_PCRE8
if (c > 255) c = 255;
#endif
@@ -3579,7 +3580,7 @@
{
while (p < end_subject)
{
- register pcre_uint32 pp = *p++;
+ register pcre_uint32 pp = UCHAR21INCTEST(p);
if (pp == req_char || pp == req_char2) { p--; break; }
}
}
@@ -3587,7 +3588,7 @@
{
while (p < end_subject)
{
- if (*p++ == req_char) { p--; break; }
+ if (UCHAR21INCTEST(p) == req_char) { p--; break; }
}
}
@@ -3655,9 +3656,9 @@
not contain any explicit matches for \r or \n, and the newline option is CRLF
or ANY or ANYCRLF, advance the match position by one more character. */
- if (current_subject[-1] == CHAR_CR &&
+ if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
current_subject < end_subject &&
- *current_subject == CHAR_NL &&
+ UCHAR21TEST(current_subject) == CHAR_NL &&
(re->flags & PCRE_HASCRORLF) == 0 &&
(md->nltype == NLTYPE_ANY ||
md->nltype == NLTYPE_ANYCRLF ||
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2014-01-01 17:11:54 UTC (rev 1430)
+++ code/trunk/pcre_exec.c 2014-01-02 17:41:28 UTC (rev 1431)
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2013 University of Cambridge
+ Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -134,7 +134,7 @@
BOOL utf = md->utf;
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
while (length-- > 0)
- if (isprint(c = *p++)) printf("%c", (char)c); else printf("\\x{%02x}", c);
+ if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
}
#endif
@@ -237,8 +237,8 @@
{
pcre_uint32 cc, cp;
if (eptr >= md->end_subject) return -2; /* Partial match */
- cc = *eptr;
- cp = *p;
+ cc = UCHAR21TEST(eptr);
+ cp = UCHAR21TEST(p);
if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
p++;
eptr++;
@@ -254,7 +254,7 @@
while (length-- > 0)
{
if (eptr >= md->end_subject) return -2; /* Partial match */
- if (*p++ != *eptr++) return -1;
+ if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
}
}
@@ -2103,7 +2103,7 @@
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ UCHAR21TEST(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2147,7 +2147,7 @@
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ UCHAR21TEST(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2290,7 +2290,7 @@
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ UCHAR21TEST(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -2444,7 +2444,7 @@
{
SCHECK_PARTIAL();
}
- else if (*eptr == CHAR_LF) eptr++;
+ else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
break;
case CHAR_LF:
@@ -3218,7 +3218,7 @@
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
RRETURN(MATCH_NOMATCH);
}
- while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
+ while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
}
else
#endif
@@ -3258,7 +3258,7 @@
if (fc < 128)
{
- pcre_uint32 cc = *eptr;
+ pcre_uint32 cc = UCHAR21(eptr);
if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
ecode++;
eptr++;
@@ -3527,7 +3527,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21TEST(eptr);
if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
eptr++;
}
@@ -3545,7 +3545,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21TEST(eptr);
if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
eptr++;
}
@@ -3562,7 +3562,7 @@
SCHECK_PARTIAL();
break;
}
- cc = *eptr;
+ cc = UCHAR21TEST(eptr);
if (fc != cc && foc != cc) break;
eptr++;
}
@@ -3589,7 +3589,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
}
if (min == max) continue;
@@ -3606,7 +3606,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
+ if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
}
@@ -3620,7 +3620,7 @@
SCHECK_PARTIAL();
break;
}
- if (fc != *eptr) break;
+ if (fc != UCHAR21TEST(eptr)) break;
eptr++;
}
if (possessive) continue; /* No backtracking */
@@ -4375,7 +4375,7 @@
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ UCHAR21(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -4417,7 +4417,7 @@
default: RRETURN(MATCH_NOMATCH);
case CHAR_CR:
- if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
+ if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
break;
case CHAR_LF:
@@ -4527,7 +4527,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21(eptr);
if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4544,7 +4544,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21(eptr);
if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4561,7 +4561,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21(eptr);
if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4578,7 +4578,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21(eptr);
if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -4595,7 +4595,7 @@
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
- cc = *eptr;
+ cc = UCHAR21(eptr);
if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
RRETURN(MATCH_NOMATCH);
eptr++;
@@ -5156,7 +5156,7 @@
{
default: RRETURN(MATCH_NOMATCH);
case CHAR_CR:
- if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
+ if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
break;
case CHAR_LF:
@@ -5695,7 +5695,7 @@
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ UCHAR21(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -5721,7 +5721,7 @@
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
- *eptr == NLBLOCK->nl[0])
+ UCHAR21(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
@@ -5778,7 +5778,7 @@
if (c == CHAR_CR)
{
if (++eptr >= md->end_subject) break;
- if (*eptr == CHAR_LF) eptr++;
+ if (UCHAR21(eptr) == CHAR_LF) eptr++;
}
else
{
@@ -5941,8 +5941,8 @@
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
BACKCHAR(eptr);
- if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_NL &&
- eptr[-1] == CHAR_CR) eptr--;
+ if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
+ UCHAR21(eptr - 1) == CHAR_CR) eptr--;
}
}
else
@@ -6789,10 +6789,10 @@
if (first_char != first_char2)
while (start_match < end_subject &&
- (smc = *start_match) != first_char && smc != first_char2)
+ (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
start_match++;
else
- while (start_match < end_subject && *start_match != first_char)
+ while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
start_match++;
}
@@ -6824,7 +6824,7 @@
if (start_match[-1] == CHAR_CR &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
- *start_match == CHAR_NL)
+ UCHAR21TEST(start_match) == CHAR_NL)
start_match++;
}
}
@@ -6835,7 +6835,7 @@
{
while (start_match < end_subject)
{
- register pcre_uint32 c = *start_match;
+ register pcre_uint32 c = UCHAR21TEST(start_match);
#ifndef COMPILE_PCRE8
if (c > 255) c = 255;
#endif
@@ -6893,7 +6893,7 @@
{
while (p < end_subject)
{
- register pcre_uint32 pp = *p++;
+ register pcre_uint32 pp = UCHAR21INCTEST(p);
if (pp == req_char || pp == req_char2) { p--; break; }
}
}
@@ -6901,7 +6901,7 @@
{
while (p < end_subject)
{
- if (*p++ == req_char) { p--; break; }
+ if (UCHAR21INCTEST(p) == req_char) { p--; break; }
}
}
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2014-01-01 17:11:54 UTC (rev 1430)
+++ code/trunk/pcre_internal.h 2014-01-02 17:41:28 UTC (rev 1431)
@@ -7,7 +7,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2013 University of Cambridge
+ Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -316,7 +316,8 @@
&(NLBLOCK->nllen), utf)) \
: \
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
- *p == NLBLOCK->nl[0] && (NLBLOCK->nllen == 1 || p[1] == NLBLOCK->nl[1]) \
+ UCHAR21TEST(p) == NLBLOCK->nl[0] && \
+ (NLBLOCK->nllen == 1 || UCHAR21TEST(p+1) == NLBLOCK->nl[1]) \
) \
)
@@ -329,8 +330,8 @@
&(NLBLOCK->nllen), utf)) \
: \
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
- *(p - NLBLOCK->nllen) == NLBLOCK->nl[0] && \
- (NLBLOCK->nllen == 1 || *(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \
+ UCHAR21TEST(p - NLBLOCK->nllen) == NLBLOCK->nl[0] && \
+ (NLBLOCK->nllen == 1 || UCHAR21TEST(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \
) \
)
@@ -581,12 +582,27 @@
#define MAX_MARK ((1u << 8) - 1)
#endif
+/* There is a proposed future special "UTF-21" mode, in which only the lowest
+21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
+high-order bits available to the application for other uses. In preparation for
+the future implementation of this mode, there are macros that load a data item
+and, if in this special mode, mask it to 21 bits. These macros all have names
+starting with UCHAR21. In all other modes, including the normal 32-bit
+library, the macros all have the same simple definitions. When the new mode is
+implemented, it is expected that these definitions will be varied appropriately
+using #ifdef when compiling the library that supports the special mode. */
+
+#define UCHAR21(eptr) (*(eptr))
+#define UCHAR21TEST(eptr) (*(eptr))
+#define UCHAR21INC(eptr) (*(eptr)++)
+#define UCHAR21INCTEST(eptr) (*(eptr)++)
+
/* When UTF encoding is being used, a character is no longer just a single
-byte. The macros for character handling generate simple sequences when used in
-character-mode, and more complicated ones for UTF characters. GETCHARLENTEST
-and other macros are not used when UTF is not supported, so they are not
-defined. To make sure they can never even appear when UTF support is omitted,
-we don't even define them. */
+byte in 8-bit mode or a single short in 16-bit mode. The macros for character
+handling generate simple sequences when used in the basic mode, and more
+complicated ones for UTF characters. GETCHARLENTEST and other macros are not
+used when UTF is not supported. To make sure they can never even appear when
+UTF support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF
Modified: code/trunk/pcre_string_utils.c
===================================================================
--- code/trunk/pcre_string_utils.c 2014-01-01 17:11:54 UTC (rev 1430)
+++ code/trunk/pcre_string_utils.c 2014-01-02 17:41:28 UTC (rev 1431)
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2013 University of Cambridge
+ Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -91,8 +91,8 @@
while (*str1 != '\0' || *str2 != '\0')
{
- c1 = *str1++;
- c2 = *str2++;
+ c1 = UCHAR21INC(str1);
+ c2 = UCHAR21INC(str2);
if (c1 != c2)
return ((c1 > c2) << 1) - 1;
}
@@ -131,7 +131,7 @@
while (*str1 != '\0' || *ustr2 != '\0')
{
- c1 = *str1++;
+ c1 = UCHAR21INC(str1);
c2 = (pcre_uchar)*ustr2++;
if (c1 != c2)
return ((c1 > c2) << 1) - 1;