------- You are receiving this mail because: -------
You are on the CC list for the bug.
http://bugs.exim.org/show_bug.cgi?id=1040
Summary: Performance Improvement in UTF-8 mode
Product: PCRE
Version: 8.10
Platform: x86
OS/Version: Windows
Status: NEW
Severity: wishlist
Priority: medium
Component: Code
AssignedTo: ph10@???
ReportedBy: udmitry@???
CC: pcre-dev@???
Hi, I rewrote the GETCHARxxx macros for UTF-8 mode.
I work mainly with 2-byte characters.
With my changes pcre_exec() runs up to 20% faster on 2-byte UTF-8 characters and
up to 10% faster on 3-byte ones. I eliminated the loops and hard-coded the
decoding of the UTF-8 character for each sequence length.
This is the changed code (it works correctly only after the changes for bug #1037).
pcre_internal.h, lines 429-
/* Get the next UTF-8 character, not advancing the pointer. No test for UTF-8
mode is made here. The lead byte selects one hard-coded branch per sequence
length (2 to 6 bytes) instead of a loop. Nothing is validated: every following
byte is simply masked with 0x3f, and a lead byte of 0xfe/0xff falls into the
6-byte branch.

Both arguments are evaluated several times, so they must have no side effects.
The expansion is an assignment followed by an if-block, i.e. more than one
statement: do not use it as the unbraced body of an if/else/loop. */

#define GETCHAR(c, eptr) \
  (c) = *(eptr); \
  if ((c) >= 0xc0) \
    { \
    if (((c) & 0x20) == 0)            /* 110xxxxx: 2-byte sequence */ \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      } \
    else if (((c) & 0x10) == 0)       /* 1110xxxx: 3-byte sequence */ \
      { \
      (c) = (((c) & 0x0f) << 12) | \
            (((eptr)[1] & 0x3f) << 6) | \
            ((eptr)[2] & 0x3f); \
      } \
    else if (((c) & 0x08) == 0)       /* 11110xxx: 4-byte sequence */ \
      { \
      (c) = (((c) & 0x07) << 18) | \
            (((eptr)[1] & 0x3f) << 12) | \
            (((eptr)[2] & 0x3f) << 6) | \
            ((eptr)[3] & 0x3f); \
      } \
    else if (((c) & 0x04) == 0)       /* 111110xx: 5-byte sequence */ \
      { \
      (c) = (((c) & 0x03) << 24) | \
            (((eptr)[1] & 0x3f) << 18) | \
            (((eptr)[2] & 0x3f) << 12) | \
            (((eptr)[3] & 0x3f) << 6) | \
            ((eptr)[4] & 0x3f); \
      } \
    else                              /* 1111110x: 6-byte sequence */ \
      { \
      (c) = (((c) & 0x01) << 30) | \
            (((eptr)[1] & 0x3f) << 24) | \
            (((eptr)[2] & 0x3f) << 18) | \
            (((eptr)[3] & 0x3f) << 12) | \
            (((eptr)[4] & 0x3f) << 6) | \
            ((eptr)[5] & 0x3f); \
      } \
    }
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. A variable named utf8 must be in scope at the point of use; when it
is zero the single byte at eptr is returned unchanged. Otherwise the lead byte
selects one hard-coded branch per sequence length (2 to 6 bytes) instead of a
loop. Nothing is validated: following bytes are simply masked with 0x3f.

Both arguments are evaluated several times, so they must have no side effects.
The expansion is an assignment followed by an if-block, i.e. more than one
statement: do not use it as the unbraced body of an if/else/loop. */

#define GETCHARTEST(c, eptr) \
  (c) = *(eptr); \
  if (utf8 && (c) >= 0xc0) \
    { \
    if (((c) & 0x20) == 0)            /* 110xxxxx: 2-byte sequence */ \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      } \
    else if (((c) & 0x10) == 0)       /* 1110xxxx: 3-byte sequence */ \
      { \
      (c) = (((c) & 0x0f) << 12) | \
            (((eptr)[1] & 0x3f) << 6) | \
            ((eptr)[2] & 0x3f); \
      } \
    else if (((c) & 0x08) == 0)       /* 11110xxx: 4-byte sequence */ \
      { \
      (c) = (((c) & 0x07) << 18) | \
            (((eptr)[1] & 0x3f) << 12) | \
            (((eptr)[2] & 0x3f) << 6) | \
            ((eptr)[3] & 0x3f); \
      } \
    else if (((c) & 0x04) == 0)       /* 111110xx: 5-byte sequence */ \
      { \
      (c) = (((c) & 0x03) << 24) | \
            (((eptr)[1] & 0x3f) << 18) | \
            (((eptr)[2] & 0x3f) << 12) | \
            (((eptr)[3] & 0x3f) << 6) | \
            ((eptr)[4] & 0x3f); \
      } \
    else                              /* 1111110x: 6-byte sequence */ \
      { \
      (c) = (((c) & 0x01) << 30) | \
            (((eptr)[1] & 0x3f) << 24) | \
            (((eptr)[2] & 0x3f) << 18) | \
            (((eptr)[3] & 0x3f) << 12) | \
            (((eptr)[4] & 0x3f) << 6) | \
            ((eptr)[5] & 0x3f); \
      } \
    }
/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. The lead byte selects one hard-coded branch per
sequence length (2 to 6 bytes) instead of a loop. Nothing is validated:
following bytes are simply masked with 0x3f.

After the lead byte has been consumed, the following bytes are read by index
and the pointer is then advanced once, as a separate statement. Writing several
"*eptr++" operands inside one "|" expression would modify eptr more than once
without an intervening sequence point, which is undefined behaviour in C and
leaves the order in which the bytes are fetched unspecified.

Both arguments are evaluated several times, so they must have no side effects.
The expansion is an assignment followed by an if-block, i.e. more than one
statement: do not use it as the unbraced body of an if/else/loop. */

#define GETCHARINC(c, eptr) \
  (c) = *(eptr)++; \
  if ((c) >= 0xc0) \
    { \
    if (((c) & 0x20) == 0)            /* 110xxxxx: 1 more byte */ \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[0] & 0x3f); \
      (eptr) += 1; \
      } \
    else if (((c) & 0x10) == 0)       /* 1110xxxx: 2 more bytes */ \
      { \
      (c) = (((c) & 0x0f) << 12) | \
            (((eptr)[0] & 0x3f) << 6) | \
            ((eptr)[1] & 0x3f); \
      (eptr) += 2; \
      } \
    else if (((c) & 0x08) == 0)       /* 11110xxx: 3 more bytes */ \
      { \
      (c) = (((c) & 0x07) << 18) | \
            (((eptr)[0] & 0x3f) << 12) | \
            (((eptr)[1] & 0x3f) << 6) | \
            ((eptr)[2] & 0x3f); \
      (eptr) += 3; \
      } \
    else if (((c) & 0x04) == 0)       /* 111110xx: 4 more bytes */ \
      { \
      (c) = (((c) & 0x03) << 24) | \
            (((eptr)[0] & 0x3f) << 18) | \
            (((eptr)[1] & 0x3f) << 12) | \
            (((eptr)[2] & 0x3f) << 6) | \
            ((eptr)[3] & 0x3f); \
      (eptr) += 4; \
      } \
    else                              /* 1111110x: 5 more bytes */ \
      { \
      (c) = (((c) & 0x01) << 30) | \
            (((eptr)[0] & 0x3f) << 24) | \
            (((eptr)[1] & 0x3f) << 18) | \
            (((eptr)[2] & 0x3f) << 12) | \
            (((eptr)[3] & 0x3f) << 6) | \
            ((eptr)[4] & 0x3f); \
      (eptr) += 5; \
      } \
    }
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. A variable named
utf8 must be in scope at the point of use; when it is zero exactly one byte is
consumed and returned unchanged. Otherwise the lead byte selects one hard-coded
branch per sequence length (2 to 6 bytes) instead of a loop. Nothing is
validated: following bytes are simply masked with 0x3f.

After the lead byte has been consumed, the following bytes are read by index
and the pointer is then advanced once, as a separate statement. Writing several
"*eptr++" operands inside one "|" expression would modify eptr more than once
without an intervening sequence point, which is undefined behaviour in C and
leaves the order in which the bytes are fetched unspecified.

Both arguments are evaluated several times, so they must have no side effects.
The expansion is an assignment followed by an if-block, i.e. more than one
statement: do not use it as the unbraced body of an if/else/loop. */

#define GETCHARINCTEST(c, eptr) \
  (c) = *(eptr)++; \
  if (utf8 && (c) >= 0xc0) \
    { \
    if (((c) & 0x20) == 0)            /* 110xxxxx: 1 more byte */ \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[0] & 0x3f); \
      (eptr) += 1; \
      } \
    else if (((c) & 0x10) == 0)       /* 1110xxxx: 2 more bytes */ \
      { \
      (c) = (((c) & 0x0f) << 12) | \
            (((eptr)[0] & 0x3f) << 6) | \
            ((eptr)[1] & 0x3f); \
      (eptr) += 2; \
      } \
    else if (((c) & 0x08) == 0)       /* 11110xxx: 3 more bytes */ \
      { \
      (c) = (((c) & 0x07) << 18) | \
            (((eptr)[0] & 0x3f) << 12) | \
            (((eptr)[1] & 0x3f) << 6) | \
            ((eptr)[2] & 0x3f); \
      (eptr) += 3; \
      } \
    else if (((c) & 0x04) == 0)       /* 111110xx: 4 more bytes */ \
      { \
      (c) = (((c) & 0x03) << 24) | \
            (((eptr)[0] & 0x3f) << 18) | \
            (((eptr)[1] & 0x3f) << 12) | \
            (((eptr)[2] & 0x3f) << 6) | \
            ((eptr)[3] & 0x3f); \
      (eptr) += 4; \
      } \
    else                              /* 1111110x: 5 more bytes */ \
      { \
      (c) = (((c) & 0x01) << 30) | \
            (((eptr)[0] & 0x3f) << 24) | \
            (((eptr)[1] & 0x3f) << 18) | \
            (((eptr)[2] & 0x3f) << 12) | \
            (((eptr)[3] & 0x3f) << 6) | \
            ((eptr)[4] & 0x3f); \
      (eptr) += 5; \
      } \
    }
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode.
The lead byte selects one hard-coded branch per sequence length (2 to 6 bytes)
instead of a loop; len is increased by the number of bytes that follow the
lead byte (0 to 5). Nothing is validated: following bytes are simply masked
with 0x3f.

In every branch all bytes are read before len is updated, so the macro still
gives the right character when the same lvalue is passed as both eptr and len.

The arguments are evaluated several times, so they must have no side effects.
The expansion is an assignment followed by an if-block, i.e. more than one
statement: do not use it as the unbraced body of an if/else/loop. */

#define GETCHARLEN(c, eptr, len) \
  (c) = *(eptr); \
  if ((c) >= 0xc0) \
    { \
    if (((c) & 0x20) == 0)            /* 110xxxxx: 1 extra byte */ \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      (len) += 1; \
      } \
    else if (((c) & 0x10) == 0)       /* 1110xxxx: 2 extra bytes */ \
      { \
      (c) = (((c) & 0x0f) << 12) | \
            (((eptr)[1] & 0x3f) << 6) | \
            ((eptr)[2] & 0x3f); \
      (len) += 2; \
      } \
    else if (((c) & 0x08) == 0)       /* 11110xxx: 3 extra bytes */ \
      { \
      (c) = (((c) & 0x07) << 18) | \
            (((eptr)[1] & 0x3f) << 12) | \
            (((eptr)[2] & 0x3f) << 6) | \
            ((eptr)[3] & 0x3f); \
      (len) += 3; \
      } \
    else if (((c) & 0x04) == 0)       /* 111110xx: 4 extra bytes */ \
      { \
      (c) = (((c) & 0x03) << 24) | \
            (((eptr)[1] & 0x3f) << 18) | \
            (((eptr)[2] & 0x3f) << 12) | \
            (((eptr)[3] & 0x3f) << 6) | \
            ((eptr)[4] & 0x3f); \
      (len) += 4; \
      } \
    else                              /* 1111110x: 5 extra bytes */ \
      { \
      (c) = (((c) & 0x01) << 30) | \
            (((eptr)[1] & 0x3f) << 24) | \
            (((eptr)[2] & 0x3f) << 18) | \
            (((eptr)[3] & 0x3f) << 12) | \
            (((eptr)[4] & 0x3f) << 6) | \
            ((eptr)[5] & 0x3f); \
      (len) += 5; \
      } \
    }
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. A variable named utf8 must be in scope at
the point of use; when it is zero the single byte at eptr is returned and len
is left alone. Otherwise the lead byte selects one hard-coded branch per
sequence length (2 to 6 bytes) instead of a loop, and len is increased by the
number of bytes that follow the lead byte (0 to 5). Nothing is validated:
following bytes are simply masked with 0x3f.

In every branch all bytes are read before len is updated, so the macro still
gives the right character when the same lvalue is passed as both eptr and len.

The arguments are evaluated several times, so they must have no side effects.
The expansion is an assignment followed by an if-block, i.e. more than one
statement: do not use it as the unbraced body of an if/else/loop. */

#define GETCHARLENTEST(c, eptr, len) \
  (c) = *(eptr); \
  if (utf8 && (c) >= 0xc0) \
    { \
    if (((c) & 0x20) == 0)            /* 110xxxxx: 1 extra byte */ \
      { \
      (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
      (len) += 1; \
      } \
    else if (((c) & 0x10) == 0)       /* 1110xxxx: 2 extra bytes */ \
      { \
      (c) = (((c) & 0x0f) << 12) | \
            (((eptr)[1] & 0x3f) << 6) | \
            ((eptr)[2] & 0x3f); \
      (len) += 2; \
      } \
    else if (((c) & 0x08) == 0)       /* 11110xxx: 3 extra bytes */ \
      { \
      (c) = (((c) & 0x07) << 18) | \
            (((eptr)[1] & 0x3f) << 12) | \
            (((eptr)[2] & 0x3f) << 6) | \
            ((eptr)[3] & 0x3f); \
      (len) += 3; \
      } \
    else if (((c) & 0x04) == 0)       /* 111110xx: 4 extra bytes */ \
      { \
      (c) = (((c) & 0x03) << 24) | \
            (((eptr)[1] & 0x3f) << 18) | \
            (((eptr)[2] & 0x3f) << 12) | \
            (((eptr)[3] & 0x3f) << 6) | \
            ((eptr)[4] & 0x3f); \
      (len) += 4; \
      } \
    else                              /* 1111110x: 5 extra bytes */ \
      { \
      (c) = (((c) & 0x01) << 30) | \
            (((eptr)[1] & 0x3f) << 24) | \
            (((eptr)[2] & 0x3f) << 18) | \
            (((eptr)[3] & 0x3f) << 12) | \
            (((eptr)[4] & 0x3f) << 6) | \
            ((eptr)[5] & 0x3f); \
      (len) += 5; \
      } \
    }
Sincerely,
Dmitry.
--
Configure bugmail:
http://bugs.exim.org/userprefs.cgi?tab=email