[pcre-dev] [Bug 1040] New: Performance Improvement in UTF-8 …

Page principale
Supprimer ce message
Auteur: Dmitry Ukolov
Date:  
À: pcre-dev
Sujet: [pcre-dev] [Bug 1040] New: Performance Improvement in UTF-8 mode
------- You are receiving this mail because: -------
You are on the CC list for the bug.

http://bugs.exim.org/show_bug.cgi?id=1040
           Summary: Performance Improvement in UTF-8 mode
           Product: PCRE
           Version: 8.10
          Platform: x86
        OS/Version: Windows
            Status: NEW
          Severity: wishlist
          Priority: medium
         Component: Code
        AssignedTo: ph10@???
        ReportedBy: udmitry@???
                CC: pcre-dev@???



Hi, I rewrote the GETCHARxxx macros for UTF-8 mode.
I work mainly with 2-byte characters.
With my changes pcre_exec() runs up to 20% faster on 2-byte UTF-8 input and up
to 10% faster on 3-byte input. I eliminated the loops and hard-coded the
decoding for each UTF-8 sequence length.


This is the changed code (it works correctly only after the changes for bug #1037).

pcre_internal.h, lines 429-

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode.

The lead byte is loaded into c. If it is >= 0xc0, its high bits select the
sequence length (2 to 6 bytes): the first clear bit among 0x20, 0x10, 0x08,
0x04 decides, and the final else handles 6 bytes. The payload bits of the lead
byte are then combined with the low 6 bits of each following byte, most
significant first. No validation of the following bytes is performed.

The expansion is two statements (an assignment and an if), so it must not be
used as the unbraced body of an if/else or a loop. Every physical line of the
definition ends in a backslash; a line break without one would terminate the
macro early. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if (c >= 0xc0) \
    { \
    if ( !(c & 0x20) ) \
      c = ((c & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
    else if ( !(c & 0x10) ) \
      c = ((c & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
    else if ( !(c & 0x08) ) \
      c = ((c & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
    else if ( !(c & 0x04) ) \
      c = ((c & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
    else \
      c = ((c & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
    }


/* Get the next character, testing for UTF-8 mode, and not advancing the
pointer.

The macro refers to a name utf8 that must be in scope at the point of use. The
first byte is loaded into c; multi-byte decoding happens only when utf8 is
non-zero and that byte is >= 0xc0, otherwise c is left as the single byte. The
high bits of the lead byte select the sequence length (2 to 6 bytes), and the
payload bits of the lead byte are combined with the low 6 bits of each
following byte. No validation of the following bytes is performed.

The expansion is two statements, so it must not be used as the unbraced body of
an if/else or a loop. Every physical line of the definition ends in a
backslash; a line break without one would terminate the macro early. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf8 && c >= 0xc0) \
    { \
    if ( !(c & 0x20) ) \
      c = ((c & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
    else if ( !(c & 0x10) ) \
      c = ((c & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
    else if ( !(c & 0x08) ) \
      c = ((c & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
    else if ( !(c & 0x04) ) \
      c = ((c & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
    else \
      c = ((c & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
    }


/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode.

The lead byte is loaded into c and eptr is stepped past it. If that byte is
>= 0xc0, its high bits select the sequence length (2 to 6 bytes); the payload
bits of the lead byte are combined with the low 6 bits of each following byte,
and eptr is advanced past those bytes as well. No validation of the following
bytes is performed.

The following bytes are read by index and the pointer is advanced in a separate
statement. Applying ++ to eptr more than once within a single expression would
leave the modifications unsequenced, which is undefined behaviour in C and
gives no guarantee about the order in which the bytes are read.

eptr must be a modifiable lvalue. The expansion is two statements, so it must
not be used as the unbraced body of an if/else or a loop. Every physical line
of the definition ends in a backslash. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if (c >= 0xc0) \
    { \
    if ( !(c & 0x20) ) \
      { \
      c = ((c & 0x1f) << 6) | ((eptr)[0] & 0x3f); \
      (eptr) += 1; \
      } \
    else if ( !(c & 0x10) ) \
      { \
      c = ((c & 0x0f) << 12) | (((eptr)[0] & 0x3f) << 6) | \
          ((eptr)[1] & 0x3f); \
      (eptr) += 2; \
      } \
    else if ( !(c & 0x08) ) \
      { \
      c = ((c & 0x07) << 18) | (((eptr)[0] & 0x3f) << 12) | \
          (((eptr)[1] & 0x3f) << 6) | ((eptr)[2] & 0x3f); \
      (eptr) += 3; \
      } \
    else if ( !(c & 0x04) ) \
      { \
      c = ((c & 0x03) << 24) | (((eptr)[0] & 0x3f) << 18) | \
          (((eptr)[1] & 0x3f) << 12) | (((eptr)[2] & 0x3f) << 6) | \
          ((eptr)[3] & 0x3f); \
      (eptr) += 4; \
      } \
    else \
      { \
      c = ((c & 0x01) << 30) | (((eptr)[0] & 0x3f) << 24) | \
          (((eptr)[1] & 0x3f) << 18) | (((eptr)[2] & 0x3f) << 12) | \
          (((eptr)[3] & 0x3f) << 6) | ((eptr)[4] & 0x3f); \
      (eptr) += 5; \
      } \
    }


/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode.

The macro refers to a name utf8 that must be in scope at the point of use. The
first byte is loaded into c and eptr is stepped past it. Multi-byte decoding
happens only when utf8 is non-zero and that byte is >= 0xc0; in that case the
high bits of the lead byte select the sequence length (2 to 6 bytes), the
payload bits are combined with the low 6 bits of each following byte, and eptr
is advanced past those bytes too. No validation of the following bytes is
performed.

The following bytes are read by index and the pointer is advanced in a separate
statement. Applying ++ to eptr more than once within a single expression would
leave the modifications unsequenced, which is undefined behaviour in C and
gives no guarantee about the order in which the bytes are read.

eptr must be a modifiable lvalue. The expansion is two statements, so it must
not be used as the unbraced body of an if/else or a loop. Every physical line
of the definition ends in a backslash. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf8 && c >= 0xc0) \
    { \
    if ( !(c & 0x20) ) \
      { \
      c = ((c & 0x1f) << 6) | ((eptr)[0] & 0x3f); \
      (eptr) += 1; \
      } \
    else if ( !(c & 0x10) ) \
      { \
      c = ((c & 0x0f) << 12) | (((eptr)[0] & 0x3f) << 6) | \
          ((eptr)[1] & 0x3f); \
      (eptr) += 2; \
      } \
    else if ( !(c & 0x08) ) \
      { \
      c = ((c & 0x07) << 18) | (((eptr)[0] & 0x3f) << 12) | \
          (((eptr)[1] & 0x3f) << 6) | ((eptr)[2] & 0x3f); \
      (eptr) += 3; \
      } \
    else if ( !(c & 0x04) ) \
      { \
      c = ((c & 0x03) << 24) | (((eptr)[0] & 0x3f) << 18) | \
          (((eptr)[1] & 0x3f) << 12) | (((eptr)[2] & 0x3f) << 6) | \
          ((eptr)[3] & 0x3f); \
      (eptr) += 4; \
      } \
    else \
      { \
      c = ((c & 0x01) << 30) | (((eptr)[0] & 0x3f) << 24) | \
          (((eptr)[1] & 0x3f) << 18) | (((eptr)[2] & 0x3f) << 12) | \
          (((eptr)[3] & 0x3f) << 6) | ((eptr)[4] & 0x3f); \
      (eptr) += 5; \
      } \
    }


/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode.

The lead byte is loaded into c. If it is >= 0xc0, its high bits select the
sequence length (2 to 6 bytes); the payload bits of the lead byte are combined
with the low 6 bits of each following byte, and len is increased by the number
of following bytes (1 to 5). For a lead byte below 0xc0, len is left unchanged.
No validation of the following bytes is performed.

len must be a modifiable lvalue. The expansion is two statements, so it must
not be used as the unbraced body of an if/else or a loop. Every physical line
of the definition ends in a backslash; a line break without one would terminate
the macro early. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if (c >= 0xc0) \
    { \
    if ( !(c & 0x20) ) \
      { \
      c = ((c & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      (len)++; \
      } \
    else if ( !(c & 0x10) ) \
      { \
      c = ((c & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
      (len) += 2; \
      } \
    else if ( !(c & 0x08) ) \
      { \
      c = ((c & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
      (len) += 3; \
      } \
    else if ( !(c & 0x04) ) \
      { \
      c = ((c & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
      (len) += 4; \
      } \
    else \
      { \
      c = ((c & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
      (len) += 5; \
      } \
    }


/* Get the next character, testing for UTF-8 mode, not advancing the pointer,
incrementing length if there are extra bytes. This is called when we do not
know if we are in UTF-8 mode.

The macro refers to a name utf8 that must be in scope at the point of use. The
first byte is loaded into c. Multi-byte decoding happens only when utf8 is
non-zero and that byte is >= 0xc0; in that case the high bits of the lead byte
select the sequence length (2 to 6 bytes), the payload bits are combined with
the low 6 bits of each following byte, and len is increased by the number of
following bytes (1 to 5). Otherwise c is the single byte and len is left
unchanged. No validation of the following bytes is performed.

len must be a modifiable lvalue. The expansion is two statements, so it must
not be used as the unbraced body of an if/else or a loop. Every physical line
of the definition ends in a backslash; a line break without one would terminate
the macro early. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf8 && c >= 0xc0) \
    { \
    if ( !(c & 0x20) ) \
      { \
      c = ((c & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      (len)++; \
      } \
    else if ( !(c & 0x10) ) \
      { \
      c = ((c & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
      (len) += 2; \
      } \
    else if ( !(c & 0x08) ) \
      { \
      c = ((c & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
      (len) += 3; \
      } \
    else if ( !(c & 0x04) ) \
      { \
      c = ((c & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
      (len) += 4; \
      } \
    else \
      { \
      c = ((c & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
      (len) += 5; \
      } \
    }



Sincerely,
Dmitry.


--
Configure bugmail: http://bugs.exim.org/userprefs.cgi?tab=email