[Pcre-svn] [736] code/trunk: Support OP_ANYBYTE in JIT when …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [736] code/trunk: Support OP_ANYBYTE in JIT when utf8 is disabled and optimizing utf8 character length computation
Revision: 736
          http://vcs.pcre.org/viewvc?view=rev&revision=736
Author:   zherczeg
Date:     2011-10-16 16:48:03 +0100 (Sun, 16 Oct 2011)


Log Message:
-----------
Support OP_ANYBYTE in JIT when utf8 is disabled, and optimize the utf8 character length computation

Modified Paths:
--------------
    code/trunk/pcre_jit_compile.c
    code/trunk/pcre_jit_test.c
    code/trunk/pcre_tables.c


Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2011-10-13 15:51:27 UTC (rev 735)
+++ code/trunk/pcre_jit_compile.c    2011-10-16 15:48:03 UTC (rev 736)
@@ -467,6 +467,12 @@
   case OP_SKIPZERO:
   return cc + 1;


+ case OP_ANYBYTE:
+#ifdef SUPPORT_UTF8
+ if (common->utf8) return NULL;
+#endif
+ return cc + 1;
+
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
@@ -1336,8 +1342,7 @@
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
}
@@ -1358,8 +1363,7 @@
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
JUMPHERE(jump);
@@ -1383,8 +1387,7 @@
/* This can be an extra read in some situations, but hopefully
it is a clever early read in most cases. */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
return;
@@ -1444,7 +1447,7 @@
static void do_utf8readchar(compiler_common *common)
{
/* Fast decoding an utf8 character. TMP1 contains the first byte
-of the character (>= 192). Return char value in TMP1, length - 1 in TMP2. */
+of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */
DEFINE_COMPILER;
struct sljit_jump *jump;

@@ -1527,7 +1530,7 @@
static void do_utf8readtype8(compiler_common *common)
{
/* Fast decoding an utf8 character type. TMP2 contains the first byte
-of the character (>= 192) and TMP1 is destroyed. Return value in TMP1. */
+of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_jump *compare;
@@ -1553,8 +1556,7 @@
JUMPHERE(jump);

/* We only have types for characters less than 256. */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -1598,6 +1600,9 @@
struct sljit_jump *start;
struct sljit_jump *end = NULL;
struct sljit_jump *nl = NULL;
+#ifdef SUPPORT_UTF8
+struct sljit_jump *singlebyte;
+#endif
jump_list *newline = NULL;
BOOL newlinecheck = FALSE;
BOOL readbyte = FALSE;
@@ -1668,16 +1673,15 @@
if (newlinecheck)
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);

+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(singlebyte);
}
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#endif
JUMPHERE(start);

@@ -1730,16 +1734,14 @@
     }
   }


+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
@@ -1846,7 +1848,7 @@
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
if (common->utf8)
- OP1(SLJIT_MOV_UB, TMP3, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -1857,11 +1859,16 @@

 #ifdef SUPPORT_UTF8
 if (common->utf8)
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP3, 0);
-else
-  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
+  OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
+#endif
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+#ifdef SUPPORT_UTF8
+if (common->utf8)
+  {
+  CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
+  OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+  }
 #endif
 JUMPTO(SLJIT_JUMP, start);
 JUMPHERE(found);
@@ -2788,14 +2795,22 @@
   if (common->utf8)
     {
     OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+    jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+    OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+    JUMPHERE(jump[0]);
     return cc;
     }
 #endif
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
   return cc;


+  case OP_ANYBYTE:
+  check_input_end(common, fallbacks);
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+  return cc;
+
 #ifdef SUPPORT_UTF8
 #ifdef SUPPORT_UCP
   case OP_NOTPROP:
@@ -3042,17 +3057,20 @@
     if (c <= 127)
       {
       OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-      OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
       if (type == OP_NOT || !char_has_othercase(common, cc))
         add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c));
       else
         {
         /* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */
-        OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x20);
-        add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c | 0x20));
+        OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);
+        add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));
         }
       /* Skip the variable-length character. */
-      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+      jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+      OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
+      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+      JUMPHERE(jump[0]);
       return cc + length;
       }
     else
@@ -4744,6 +4762,7 @@
     case OP_WORDCHAR:
     case OP_ANY:
     case OP_ALLANY:
+    case OP_ANYBYTE:
     case OP_NOTPROP:
     case OP_PROP:
     case OP_ANYNL:


Modified: code/trunk/pcre_jit_test.c
===================================================================
--- code/trunk/pcre_jit_test.c    2011-10-13 15:51:27 UTC (rev 735)
+++ code/trunk/pcre_jit_test.c    2011-10-16 15:48:03 UTC (rev 736)
@@ -135,6 +135,10 @@
     { PCRE_CASELESS, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
     { PCRE_CASELESS, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
     { PCRE_CASELESS, 0, "a1", "Aa1" },
+    { MA, 0, "\\Ca", "cda" },
+    { CMA, 0, "\\Ca", "CDA" },
+    { MA, 0, "\\Cx", "cda" },
+    { CMA, 0, "\\Cx", "CDA" },


     /* Assertions. */
     { MUA, 0, "\\b[^A]", "A_B#" },


Modified: code/trunk/pcre_tables.c
===================================================================
--- code/trunk/pcre_tables.c    2011-10-13 15:51:27 UTC (rev 735)
+++ code/trunk/pcre_tables.c    2011-10-16 15:48:03 UTC (rev 736)
@@ -88,25 +88,15 @@
   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };


#ifdef SUPPORT_JIT
-/* Full table of the number of extra bytes. See _pcre_utf8_table4 above. */
+/* Full table of the number of extra bytes when the
+character code is greater than or equal to 0xc0.
+See _pcre_utf8_table4 above. */

const uschar _pcre_utf8_char_sizes[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,
+ 3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,
};
#endif