[pcre-dev] parsing invalid utf8 by sequences

Autor: Oliver Schmidt
Data:
Dla: pcre-dev
Temat: [pcre-dev] parsing invalid utf8 by sequences

Hi,

what do you think about adding a compile time option to pcre that
will enable a deterministic utf8 parsing behaviour for text that
contains some invalid utf-8 byte sequences?

Idea: 
  - every byte sequence that begins with a byte 1xxxxxxx and is followed
    by as many bytes of the form 10xxxxxx as possible is considered to be
    one utf8 character.
  - such byte sequences could be invalid utf8. In this case the utf8
    character evaluates to an INVALID_UTF8 character (e.g. 0xffff).

The above idea would make it possible to parse utf8 characters forward
and backward in a consistent way, even if invalid byte sequences are
contained. Moreover 7bit ascii characters would never be part of a byte
sequence that forms an (invalid) utf8 character. So this is ideal for
the use case that 7bit ascii text is mixed with some valid and some
invalid utf8 bytes.

For this the macros in pcre_internal.h have to be extended. As long as
the text contains mostly 7bit characters the performance should be
nearly the same as in the unchecked version.

To make it clearer, I enclose some extended macros
as examples (untested):

illustration: valid utf-8 byte sequences:

    Bits    Last code point  Byte 1    Byte 2    Byte 3    Byte 4    Byte 5    Byte 6
      7     U+007F           0xxxxxxx                                 
     11     U+07FF           110xxxxx  10xxxxxx                         
     16     U+FFFF           1110xxxx  10xxxxxx  10xxxxxx                       
     21     U+1FFFFF         11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
     26     U+3FFFFFF        111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
     31     U+7FFFFFFF       1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

          // 0xc0 = 1100 0000
          // 0x80 = 1000 0000

pcre-8.13:
          /* Step eptr backwards for as long as it points at a byte of the form
             10xxxxxx (top two bits == 10), stopping at the first byte that is
             not of that form. The loop has no lower bound on eptr, and the
             expansion carries no trailing semicolon (the caller supplies it). */
          #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--

would be:
          /* Move eptr back to the first byte of the (possibly invalid) utf8
             byte sequence it currently points into.

             If *eptr is not a continuation byte (10xxxxxx) nothing happens.
             Otherwise eptr is stepped back over all preceding continuation
             bytes. If the byte found in front of them is a lead byte
             (11xxxxxx), eptr stays on it; if it is a 7bit byte (0xxxxxxx),
             eptr is moved forward again onto the first continuation byte, so
             7bit bytes never become part of a multi-byte sequence.

             eptr must be a modifiable lvalue without side effects; it is
             evaluated several times. As with the pcre-8.13 version, there is
             no check against the start of the buffer.

             The body is wrapped in do { } while (0) so that "BACKCHAR(p);" is
             exactly one statement, also as the unbraced body of an if/else. */
          #define BACKCHAR(eptr) \
            do \
              { \
              if ((*(eptr) & 0xc0) == 0x80) \
                { \
                int bc_top2; \
                do { bc_top2 = (*(--(eptr)) & 0xc0); } while (bc_top2 == 0x80); \
                if (bc_top2 < 0x80) (eptr)++; \
                } \
              } \
            while (0)

pcre-8.13: 
          /* Read the byte at eptr into c and advance eptr by one. Only when
             that byte is >= 0xc0 (top two bits both set) is GETUTF8INC(c, eptr)
             invoked; bytes of the form 10xxxxxx are returned as they are.
             GETUTF8INC is presumably the multi-byte decoder - its pcre-8.13
             definition is not quoted here. */
          #define GETCHARINC(c, eptr) \
            c = *eptr++; \
            if (c >= 0xc0) GETUTF8INC(c, eptr);  // 0xc0 = 11000000

would be:
          /* Read the next character at eptr into c and advance eptr past it.

             A 7bit byte (< 0x80) is returned directly. Every other byte -
             lead bytes and stray continuation bytes alike - is handed to
             GETUTF8INC(c, eptr), which is defined separately.

             c must be wide enough to hold values >= 0x80 (assumes eptr points
             to unsigned bytes - confirm against the caller). Both arguments
             are evaluated more than once and must be side-effect-free lvalues.

             The body is wrapped in do { } while (0): as two bare statements,
             "if (x) GETCHARINC(c, p);" would guard only the byte fetch and
             still run the >= 0x80 test unconditionally. */
          #define GETCHARINC(c, eptr) \
            do \
              { \
              (c) = *(eptr)++; \
              if ((c) >= 0x80) GETUTF8INC(c, eptr);  /* 0x80 = 10000000 */ \
              } \
            while (0)

GETUTF8INC  would be:
          /* Value stored in c for every byte sequence that is not accepted.
             NOTE(review): the accepted sequence EF BF BF also decodes to
             0xffff, so callers cannot tell the two apart - confirm that this
             is acceptable or pick a value outside the decodable range. */
          #define INVALID_UTF8 0xffff

          // 0x20 = 0010 0000
          // 0xe0 = 1110 0000
          // 0xc0 = 1100 0000
          // 0x80 = 1000 0000

          /* Finish decoding one character whose first byte has already been
             read into c (c >= 0x80), with eptr pointing at the byte after it.

             The lead byte selects the expected number of continuation bytes
             (1 to 5). The sequence is accepted only if exactly that many bytes
             of the form 10xxxxxx follow, i.e. the byte after them is not a
             continuation byte. On success c receives the decoded value and
             eptr is advanced past the sequence. Otherwise c is set to
             INVALID_UTF8 and eptr is advanced past all continuation bytes
             that follow, so the next character starts at the next byte that
             is not of the form 10xxxxxx.

             Rejected lead values: 10xxxxxx (stray continuation byte) and
             0xfe/0xff (no 0 bit within the first seven bits). Overlong forms
             and surrogate values are not checked.

             The sequence is validated before any payload byte is decoded, and
             the && chains stop at the first byte that is not a continuation
             byte, so nothing beyond that byte is read. That byte itself must
             be readable (e.g. a terminating zero byte).

             c and eptr must be side-effect-free lvalues; both are evaluated
             several times. The do { } while (0) wrapper makes
             "GETUTF8INC(c, p);" a single statement, also before an else. */
          #define GETUTF8INC(c, eptr) \
            do \
              { \
              int gu8_ok = 0; \
              if ((c) >= 0xc0) \
                { \
                if (((c) & 0x20) == 0) \
                  { \
                  gu8_ok =  ((eptr)[0] & 0xc0) == 0x80 \
                         && ((eptr)[1] & 0xc0) != 0x80; \
                  if (gu8_ok) \
                    { \
                    (c) = (((c) & 0x1f) << 6) | ((eptr)[0] & 0x3f); \
                    (eptr) += 1; \
                    } \
                  } \
                else if (((c) & 0x10) == 0) \
                  { \
                  gu8_ok =  ((eptr)[0] & 0xc0) == 0x80 \
                         && ((eptr)[1] & 0xc0) == 0x80 \
                         && ((eptr)[2] & 0xc0) != 0x80; \
                  if (gu8_ok) \
                    { \
                    (c) = (((c) & 0x0f) << 12) | (((eptr)[0] & 0x3f) << 6) | \
                          ((eptr)[1] & 0x3f); \
                    (eptr) += 2; \
                    } \
                  } \
                else if (((c) & 0x08) == 0) \
                  { \
                  gu8_ok =  ((eptr)[0] & 0xc0) == 0x80 \
                         && ((eptr)[1] & 0xc0) == 0x80 \
                         && ((eptr)[2] & 0xc0) == 0x80 \
                         && ((eptr)[3] & 0xc0) != 0x80; \
                  if (gu8_ok) \
                    { \
                    (c) = (((c) & 0x07) << 18) | (((eptr)[0] & 0x3f) << 12) | \
                          (((eptr)[1] & 0x3f) << 6) | ((eptr)[2] & 0x3f); \
                    (eptr) += 3; \
                    } \
                  } \
                else if (((c) & 0x04) == 0) \
                  { \
                  gu8_ok =  ((eptr)[0] & 0xc0) == 0x80 \
                         && ((eptr)[1] & 0xc0) == 0x80 \
                         && ((eptr)[2] & 0xc0) == 0x80 \
                         && ((eptr)[3] & 0xc0) == 0x80 \
                         && ((eptr)[4] & 0xc0) != 0x80; \
                  if (gu8_ok) \
                    { \
                    (c) = (((c) & 0x03) << 24) | (((eptr)[0] & 0x3f) << 18) | \
                          (((eptr)[1] & 0x3f) << 12) | (((eptr)[2] & 0x3f) << 6) | \
                          ((eptr)[3] & 0x3f); \
                    (eptr) += 4; \
                    } \
                  } \
                else if (((c) & 0x02) == 0) \
                  { \
                  gu8_ok =  ((eptr)[0] & 0xc0) == 0x80 \
                         && ((eptr)[1] & 0xc0) == 0x80 \
                         && ((eptr)[2] & 0xc0) == 0x80 \
                         && ((eptr)[3] & 0xc0) == 0x80 \
                         && ((eptr)[4] & 0xc0) == 0x80 \
                         && ((eptr)[5] & 0xc0) != 0x80; \
                  if (gu8_ok) \
                    { \
                    (c) = (((c) & 0x01) << 30) | (((eptr)[0] & 0x3f) << 24) | \
                          (((eptr)[1] & 0x3f) << 18) | (((eptr)[2] & 0x3f) << 12) | \
                          (((eptr)[3] & 0x3f) << 6) | ((eptr)[4] & 0x3f); \
                    (eptr) += 5; \
                    } \
                  } \
                /* 0xfe and 0xff match no branch: gu8_ok stays 0 */ \
                } \
              if (!gu8_ok) \
                { \
                (c) = INVALID_UTF8; \
                while ((*(eptr) & 0xc0) == 0x80) ++(eptr); \
                } \
              } \
            while (0)

Best regards,
Oliver