[Pcre-svn] [1427] code/trunk: JIT: Optimize character range …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1427] code/trunk: JIT: Optimize character range read based on minimum value in UTF-8 mode.
Revision: 1427
          http://vcs.pcre.org/viewvc?view=rev&revision=1427
Author:   zherczeg
Date:     2014-01-01 15:15:09 +0000 (Wed, 01 Jan 2014)


Log Message:
-----------
JIT: Optimize character range read based on minimum value in UTF-8 mode. Several tests were added for checking existing character range optimizations.

Modified Paths:
--------------
    code/trunk/pcre_jit_compile.c
    code/trunk/pcre_jit_test.c


Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2014-01-01 13:14:19 UTC (rev 1426)
+++ code/trunk/pcre_jit_compile.c    2014-01-01 15:15:09 UTC (rev 1427)
@@ -2563,13 +2563,16 @@
 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
 struct sljit_jump *jump;
 #endif
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+struct sljit_jump *jump2;
+#endif


SLJIT_UNUSED_ARG(update_str_ptr);
SLJIT_UNUSED_ARG(min);
SLJIT_UNUSED_ARG(max);
SLJIT_ASSERT(min <= max);

-OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

#if defined SUPPORT_UTF && defined COMPILE_PCRE8
@@ -2578,7 +2581,51 @@
if (max < 128 && !update_str_ptr) return;

   jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
-  if (max >= 0x800)
+  if (min >= 0x10000)
+    {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0);
+    if (update_str_ptr)
+      OP1(SLJIT_MOV_UB, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+    jump2 = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0x7);
+    OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
+    OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+    OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+    OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
+    if (!update_str_ptr)
+      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
+    OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+    OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+    JUMPHERE(jump2);
+    if (update_str_ptr)
+      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
+    }
+  else if (min >= 0x800 && max <= 0xffff)
+    {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0);
+    if (update_str_ptr)
+      OP1(SLJIT_MOV_UB, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+    jump2 = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xf);
+    OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
+    OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+    if (!update_str_ptr)
+      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+    OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+    OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+    JUMPHERE(jump2);
+    if (update_str_ptr)
+      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
+    }
+  else if (max >= 0x800)
     add_jump(compiler, (max < 0x10000) ? &common->utfreadchar16 : &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
   else if (max < 128)
     {
@@ -4502,7 +4549,7 @@
 /* We are not necessary in utf mode even in 8 bit mode. */
 cc = ccbegin;
 detect_partial_match(common, backtracks);
-read_char_range(common, min, max, (cc[0] & XCL_NOT) != 0);
+read_char_range(common, min, max, (cc[-1] & XCL_NOT) != 0);


 if ((cc[-1] & XCL_HASPROP) == 0)
   {
@@ -4525,7 +4572,10 @@
     cc += 32 / sizeof(pcre_uchar);
     }
   else
-    add_jump(compiler, (cc[-1] & XCL_NOT) == 0 ? backtracks : &found, CMP(SLJIT_C_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff));
+    {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, min);
+    add_jump(compiler, (cc[-1] & XCL_NOT) == 0 ? backtracks : &found, CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, max - min));
+    }
   }
 else if ((cc[-1] & XCL_MAP) != 0)
   {


Modified: code/trunk/pcre_jit_test.c
===================================================================
--- code/trunk/pcre_jit_test.c    2014-01-01 13:14:19 UTC (rev 1426)
+++ code/trunk/pcre_jit_test.c    2014-01-01 15:15:09 UTC (rev 1427)
@@ -75,9 +75,14 @@
       \xe1\xbf\xb8 = 0x1ff8 = 8184
    \xf0\x90\x90\x80 = 0x10400 = 66560
       \xf0\x90\x90\xa8 = 0x10428 = 66600
+   \xc7\x84 = 0x1c4 = 452
+     \xc7\x85 = 0x1c5 = 453
+     \xc7\x86 = 0x1c6 = 454
+
  Mark property:
    \xcc\x8d = 0x30d = 781
  Special:
+   \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
    \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
    \xe0\xa0\x80 = 0x800 = 2048 (lowest 2 byte character)
    \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
@@ -332,6 +337,16 @@
     { MUA, 0, "x[^bcdl]+", "xlxbxaekmd" },
     { MUA, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
     { MUA, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
+    { CMUA, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
+    { CMUA, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
+    { MUA, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
+    { MUA, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
+    { MUA, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
+    { MUA, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
+    { MUA, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
+    { MUA, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
+    { MUA, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
+    { MUA, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },


     /* Unicode properties. */
     { MUAP, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },