Hi,
What do you think about adding a compile-time option to PCRE that
enables deterministic UTF-8 parsing behaviour for text that
contains some invalid UTF-8 byte sequences?
Idea:
- Every byte sequence that begins with a byte of the form 1xxxxxxx and
is followed by as many bytes of the form 10xxxxxx as possible is
treated as one UTF-8 character.
- Such a byte sequence may be invalid UTF-8. In that case the UTF-8
character evaluates to an INVALID_UTF8 character (e.g. 0xffff).
The above idea would make it possible to parse UTF-8 characters forward
and backward in a consistent way, even if the text contains invalid byte
sequences. Moreover, 7-bit ASCII characters would never be part of a byte
sequence that forms an (invalid) utf8 character. So this is ideal for
the use case that 7bit ascii text is mixed with some valid and some
invalid utf8 bytes.
For this the macros in pcre_internal.h have to be extended. As long as
the text contains mostly 7bit characters the performance should nearly
be the same as in the unchecked version.
To make it clearer, I enclose some extended macros
as examples (untested):
illustration: valid utf-8 byte sequences:
Bits Last code point Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6
7 U+007F 0xxxxxxx
11 U+07FF 110xxxxx 10xxxxxx
16 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
21 U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
26 U+3FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
31 U+7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// 0xc0 = 1100 0000
// 0x80 = 1000 0000
pcre-8.13:
/* As quoted from pcre-8.13: while eptr points at a byte of the form 10xxxxxx
   (a UTF-8 continuation byte), move eptr back by one byte. The loop ends at
   the first byte that does not have that form. */
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
would be:
/* Move eptr back to the first byte of the (possibly invalid) UTF-8 character
   that contains the byte eptr currently points at.

   If *eptr is not a continuation byte (10xxxxxx), eptr is left unchanged.
   Otherwise eptr is moved backwards over the run of continuation bytes:
     - if the run is preceded by a lead byte (11xxxxxx), eptr ends on that
       lead byte;
     - if the run is preceded by a 7-bit byte (0xxxxxxx), that byte is not
       part of the character, so eptr ends on the first continuation byte of
       the run.

   eptr must be a modifiable pointer lvalue; it is evaluated more than once.
   The expansion is a single statement (do { } while (0)), so it can be used
   safely as the body of an if/else.

   NOTE(review): nothing here bounds the backward scan. Presumably the caller
   guarantees that a non-continuation byte exists at or before eptr - confirm
   for subjects that start with 10xxxxxx bytes. */
#define BACKCHAR(eptr) \
  do \
    { \
    if ((*(eptr) & 0xc0) == 0x80) \
      { \
      uschar cc; \
      do { cc = (*(--(eptr)) & 0xc0); } while (cc == 0x80); \
      if (cc < 0x80) (eptr)++; /* stopped on 0xxxxxxx: step forward again */ \
      } \
    } \
  while (0)
pcre-8.13:
/* As quoted from pcre-8.13: load the byte at eptr into c and advance eptr by
   one. If that byte is >= 0xc0 (bit pattern 11xxxxxx), GETUTF8INC(c, eptr) is
   invoked to handle the remainder. Bytes below 0xc0 are returned as they are. */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
if (c >= 0xc0) GETUTF8INC(c, eptr); // 0xc0 = 11000000
would be:
/* Load the byte at eptr into c and advance eptr by one. If the byte has its
   top bit set (>= 0x80, i.e. 1xxxxxxx) GETUTF8INC(c, eptr) is invoked to
   handle the rest of the character; 7-bit bytes are returned unchanged.

   Compared with the 0xc0 threshold, the 0x80 threshold also routes bytes of
   the form 10xxxxxx through GETUTF8INC.

   c and eptr must be modifiable lvalues; both are evaluated more than once.
   The expansion is a single statement (do { } while (0)), so it can be used
   safely as the body of an if/else. */
#define GETCHARINC(c, eptr) \
  do \
    { \
    (c) = *(eptr)++; \
    if ((c) >= 0x80) GETUTF8INC(c, eptr); /* 0x80 = 10000000 */ \
    } \
  while (0)
GETUTF8INC would be:
/* Value stored in c when the bytes at the current position do not form a
   well-shaped UTF-8 sequence. */
#define INVALID_UTF8 0xffff

/* Finish reading one (possibly invalid) UTF-8 character.

   On entry c holds a byte >= 0x80 that has already been read, and eptr points
   at the byte after it. On exit eptr points past the whole run of
   continuation bytes (10xxxxxx) that followed, and c holds either the decoded
   code point or INVALID_UTF8.

   The lead byte determines how many continuation bytes are required:
     110xxxxx -> 1     1110xxxx -> 2     11110xxx -> 3
     111110xx -> 4     anything else >= 0xc0 -> 5
   A lead byte of the form 10xxxxxx (0x80..0xbf) is always invalid.

   The character is valid only if the number of continuation bytes actually
   present equals the number required. The continuation bytes are counted
   before anything is decoded, so no byte beyond the first non-continuation
   byte (for example the string terminator) is ever read.

   Only the shape of the sequence is checked; overlong encodings and other
   out-of-range values are not rejected here.

   c and eptr must be modifiable lvalues; both are evaluated more than once.
   The expansion is a single statement (do { } while (0)). */
#define GETUTF8INC(c, eptr) \
  do \
    { \
    int utf8_need_ = 0;   /* continuation bytes required by the lead byte */ \
    int utf8_have_ = 0;   /* continuation bytes actually present */ \
    if ((c) >= 0xc0) \
      { \
      if (((c) & 0x20) == 0) utf8_need_ = 1; \
      else if (((c) & 0x10) == 0) utf8_need_ = 2; \
      else if (((c) & 0x08) == 0) utf8_need_ = 3; \
      else if (((c) & 0x04) == 0) utf8_need_ = 4; \
      else utf8_need_ = 5; \
      } \
    while (((eptr)[utf8_have_] & 0xc0) == 0x80) utf8_have_++; \
    if (utf8_need_ != 0 && utf8_have_ == utf8_need_) \
      { \
      int utf8_i_; \
      (c) &= 0x3f >> utf8_need_;   /* payload bits of the lead byte */ \
      for (utf8_i_ = 0; utf8_i_ < utf8_need_; utf8_i_++) \
        (c) = ((c) << 6) | ((eptr)[utf8_i_] & 0x3f); \
      } \
    else \
      { \
      (c) = INVALID_UTF8; \
      } \
    (eptr) += utf8_have_;   /* always consume the whole continuation run */ \
    } \
  while (0)
Best regards,
Oliver