Revision: 736
http://vcs.pcre.org/viewvc?view=rev&revision=736
Author: zherczeg
Date: 2011-10-16 16:48:03 +0100 (Sun, 16 Oct 2011)
Log Message:
-----------
Support OP_ANYBYTE in JIT when utf8 is disabled, and optimize the utf8 character length computation
Modified Paths:
--------------
code/trunk/pcre_jit_compile.c
code/trunk/pcre_jit_test.c
code/trunk/pcre_tables.c
Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c 2011-10-13 15:51:27 UTC (rev 735)
+++ code/trunk/pcre_jit_compile.c 2011-10-16 15:48:03 UTC (rev 736)
@@ -467,6 +467,12 @@
case OP_SKIPZERO:
return cc + 1;
+ case OP_ANYBYTE:
+#ifdef SUPPORT_UTF8
+ if (common->utf8) return NULL;
+#endif
+ return cc + 1;
+
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
@@ -1336,8 +1342,7 @@
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
}
@@ -1358,8 +1363,7 @@
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
JUMPHERE(jump);
@@ -1383,8 +1387,7 @@
/* This can be an extra read in some situations, but hopefully
it is a clever early read in most cases. */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
return;
@@ -1444,7 +1447,7 @@
static void do_utf8readchar(compiler_common *common)
{
/* Fast decoding an utf8 character. TMP1 contains the first byte
-of the character (>= 192). Return char value in TMP1, length - 1 in TMP2. */
+of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */
DEFINE_COMPILER;
struct sljit_jump *jump;
@@ -1527,7 +1530,7 @@
static void do_utf8readtype8(compiler_common *common)
{
/* Fast decoding an utf8 character type. TMP2 contains the first byte
-of the character (>= 192) and TMP1 is destroyed. Return value in TMP1. */
+of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_jump *compare;
@@ -1553,8 +1556,7 @@
JUMPHERE(jump);
/* We only have types for characters less than 256. */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -1598,6 +1600,9 @@
struct sljit_jump *start;
struct sljit_jump *end = NULL;
struct sljit_jump *nl = NULL;
+#ifdef SUPPORT_UTF8
+struct sljit_jump *singlebyte;
+#endif
jump_list *newline = NULL;
BOOL newlinecheck = FALSE;
BOOL readbyte = FALSE;
@@ -1668,16 +1673,15 @@
if (newlinecheck)
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(singlebyte);
}
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#endif
JUMPHERE(start);
@@ -1730,16 +1734,14 @@
}
}
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
@@ -1846,7 +1848,7 @@
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
if (common->utf8)
- OP1(SLJIT_MOV_UB, TMP3, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -1857,11 +1859,16 @@
#ifdef SUPPORT_UTF8
if (common->utf8)
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP3, 0);
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
+ OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
+#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+#ifdef SUPPORT_UTF8
+if (common->utf8)
+ {
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ }
#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
@@ -2788,14 +2795,22 @@
if (common->utf8)
{
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(jump[0]);
return cc;
}
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
return cc;
+ case OP_ANYBYTE:
+ check_input_end(common, fallbacks);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ return cc;
+
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UCP
case OP_NOTPROP:
@@ -3042,17 +3057,20 @@
if (c <= 127)
{
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
if (type == OP_NOT || !char_has_othercase(common, cc))
add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c));
else
{
/* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */
- OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x20);
- add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c | 0x20));
+ OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));
}
/* Skip the variable-length character. */
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(jump[0]);
return cc + length;
}
else
@@ -4744,6 +4762,7 @@
case OP_WORDCHAR:
case OP_ANY:
case OP_ALLANY:
+ case OP_ANYBYTE:
case OP_NOTPROP:
case OP_PROP:
case OP_ANYNL:
Modified: code/trunk/pcre_jit_test.c
===================================================================
--- code/trunk/pcre_jit_test.c 2011-10-13 15:51:27 UTC (rev 735)
+++ code/trunk/pcre_jit_test.c 2011-10-16 15:48:03 UTC (rev 736)
@@ -135,6 +135,10 @@
{ PCRE_CASELESS, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
{ PCRE_CASELESS, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
{ PCRE_CASELESS, 0, "a1", "Aa1" },
+ { MA, 0, "\\Ca", "cda" },
+ { CMA, 0, "\\Ca", "CDA" },
+ { MA, 0, "\\Cx", "cda" },
+ { CMA, 0, "\\Cx", "CDA" },
/* Assertions. */
{ MUA, 0, "\\b[^A]", "A_B#" },
Modified: code/trunk/pcre_tables.c
===================================================================
--- code/trunk/pcre_tables.c 2011-10-13 15:51:27 UTC (rev 735)
+++ code/trunk/pcre_tables.c 2011-10-16 15:48:03 UTC (rev 736)
@@ -88,25 +88,15 @@
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
#ifdef SUPPORT_JIT
-/* Full table of the number of extra bytes. See _pcre_utf8_table4 above. */
+/* Full table of the number of extra bytes when the
+character code is greater than or equal to 0xc0.
+See _pcre_utf8_table4 above. */
const uschar _pcre_utf8_char_sizes[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,
+ 3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,
};
#endif