Revision: 1083
http://vcs.pcre.org/viewvc?view=rev&revision=1083
Author: chpe
Date: 2012-10-16 16:55:24 +0100 (Tue, 16 Oct 2012)
Log Message:
-----------
pcre32: Mask out all bits above the low 21 on 32-bit characters in UTF-32 mode
UTF-32 only uses 21 bits, so the upper bits may be used to store flags
etc. To allow passing the unmodified internal buffers to pcre32, make
pcre32 mask out those upper bits.
TODO: do the same for the JIT compiler, and add tests
Modified Paths:
--------------
code/trunk/pcre32_ord2utf32.c
code/trunk/pcre_internal.h
Modified: code/trunk/pcre32_ord2utf32.c
===================================================================
--- code/trunk/pcre32_ord2utf32.c 2012-10-16 15:55:20 UTC (rev 1082)
+++ code/trunk/pcre32_ord2utf32.c 2012-10-16 15:55:24 UTC (rev 1083)
@@ -50,8 +50,6 @@
#include "pcre_internal.h"
-#define MASK (0x1fffffu)
-
/*************************************************
* Convert character value to UTF-32 *
*************************************************/
@@ -71,7 +69,7 @@
{
#ifdef SUPPORT_UTF
-cvalue &= MASK;
+cvalue &= UTF32_MASK;
/* Checking invalid cvalue character, encoded as invalid UTF-32 character */
if ((cvalue & 0xfffff800u) == 0xd800u || cvalue >= 0x110000u)
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-10-16 15:55:20 UTC (rev 1082)
+++ code/trunk/pcre_internal.h 2012-10-16 15:55:24 UTC (rev 1083)
@@ -864,43 +864,47 @@
#undef GET_EXTRALEN
#undef NOT_FIRSTCHAR
+#define UTF32_MASK (0x1ffffful)
+
/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */
#define GETCHAR(c, eptr) \
- c = *eptr;
+ c = (*eptr) & UTF32_MASK;
/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
- c = *eptr;
+ c = *eptr; \
+ if (utf) c &= UTF32_MASK;
/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */
#define GETCHARINC(c, eptr) \
- c = *eptr++;
+ c = (*eptr++) & UTF32_MASK;
/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */
#define GETCHARINCTEST(c, eptr) \
- c = *eptr++;
+ c = *eptr++; \
+ if (utf) c &= UTF32_MASK;
/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */
#define GETCHARLEN(c, eptr, len) \
- c = *eptr;
+ GETCHAR(c, eptr)
-/* Get the next UTF-832character, testing for UTF-32 mode, not advancing the
+/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */
#define GETCHARLENTEST(c, eptr, len) \
- c = *eptr;
+ GETCHARTEST(c, eptr)
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the