[Pcre-svn] [1420] code/trunk: Improve character range checks…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1420] code/trunk: Improve character range checks in JIT.
Revision: 1420
          http://vcs.pcre.org/viewvc?view=rev&revision=1420
Author:   zherczeg
Date:     2013-12-29 11:43:10 +0000 (Sun, 29 Dec 2013)


Log Message:
-----------
Improve character range checks in JIT.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_jit_compile.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2013-12-29 04:42:14 UTC (rev 1419)
+++ code/trunk/ChangeLog    2013-12-29 11:43:10 UTC (rev 1420)
@@ -25,7 +25,13 @@
     characters, any three characters with fixed position can be searched.
     Search order: first, last, middle.


+6.  Improve character range checks in JIT. Characters are read by an inprecise
+    function now, which returns with an unknown value if the character code is
+    above a certain treshold (e.g: 256). The only limitation is that the value
+    must be bigger than the treshold as well. This function is useful, when
+    the characters above the treshold are handled in the same way.


+
Version 8.34 15-December-2013
-----------------------------


Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2013-12-29 04:42:14 UTC (rev 1419)
+++ code/trunk/pcre_jit_compile.c    2013-12-29 11:43:10 UTC (rev 1420)
@@ -369,7 +369,6 @@
   int endonly;
   /* Tables. */
   sljit_sw ctypes;
-  int digits[2 + MAX_RANGE_SIZE];
   /* Named capturing brackets. */
   pcre_uchar *name_table;
   sljit_sw name_count;
@@ -408,6 +407,7 @@
   jump_list *utfreadchar;
 #endif
 #ifdef COMPILE_PCRE8
+  jump_list *utfreadchar8;
   jump_list *utfreadtype8;
 #endif
 #endif /* SUPPORT_UTF */
@@ -2512,6 +2512,36 @@
 #endif /* SUPPORT_UTF && !COMPILE_PCRE32 */
 }


+static void read_char8(compiler_common *common)
+{
+/* Reads the precise value of a character into TMP1, if the character is
+less than 256. Otherwise it returns with a value greater or equal than 256. */
+DEFINE_COMPILER;
+#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
+struct sljit_jump *jump;
+#endif
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
+if (common->utf)
+ {
+#if defined COMPILE_PCRE8
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ add_jump(compiler, &common->utfreadchar8, JUMP(SLJIT_FAST_CALL));
+ JUMPHERE(jump);
+#elif defined COMPILE_PCRE16
+ /* Skip low surrogate if necessary. */
+ OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ JUMPHERE(jump);
+#endif /* COMPILE_PCRE[8|16] */
+ }
+#endif
+}
+
static void read_char8_type(compiler_common *common)
{
/* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */
@@ -2538,11 +2568,10 @@
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
JUMPHERE(jump);
/* Skip low surrogate if necessary. */
- OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xfc00);
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xd800);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL);
- OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
+ jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ JUMPHERE(jump);
#elif defined COMPILE_PCRE32
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
@@ -2686,6 +2715,33 @@
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}

+static void do_utfreadchar8(compiler_common *common)
+{
+/* Fast decoding a UTF-8 character. TMP1 contains the first byte
+of the character (>= 0xc0). Return value in TMP1. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
+jump = JUMP(SLJIT_C_NOT_ZERO);
+/* Two byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1f);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x800);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
static void do_utfreadtype8(compiler_common *common)
{
/* Fast decoding a UTF-8 character type. TMP2 contains the first byte
@@ -2702,22 +2758,23 @@
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f);
+/* The upper 5 bits are known at this point. */
+compare = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0x3);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0);
-compare = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 255);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);

JUMPHERE(compare);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
-JUMPHERE(jump);

/* We only have types for characters less than 256. */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+JUMPHERE(jump);
+OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}

@@ -3717,6 +3774,7 @@
static BOOL check_ranges(compiler_common *common, int *ranges, jump_list **backtracks, BOOL readch)
{
DEFINE_COMPILER;
+int offset;

if (ranges[0] < 0 || ranges[0] > 4)
return FALSE;
@@ -3726,7 +3784,7 @@
add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));

if (readch)
- read_char(common);
+ read_char8(common);

switch(ranges[0])
{
@@ -3790,23 +3848,23 @@

   if (ranges[1] != 0)
     {
+    offset = 0;
     if (ranges[2] + 1 != ranges[3])
       {
       OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
       add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]));
-      ranges[4] -= ranges[2];
-      ranges[5] -= ranges[2];
+      offset = ranges[2];
       }
     else
       add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));


     if (ranges[4] + 1 != ranges[5])
       {
-      OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
+      OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4] - offset);
       add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
       }
     else
-      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
+      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4] - offset));
     return TRUE;
     }


@@ -3827,36 +3885,6 @@
}
}

-static void get_ctype_ranges(compiler_common *common, int flag, int *ranges)
-{
-int i, bit, length;
-const pcre_uint8 *ctypes = (const pcre_uint8*)common->ctypes;
-
-bit = ctypes[0] & flag;
-ranges[0] = -1;
-ranges[1] = bit != 0 ? 1 : 0;
-length = 0;
-
-for (i = 1; i < 256; i++)
-  if ((ctypes[i] & flag) != bit)
-    {
-    if (length >= MAX_RANGE_SIZE)
-      return;
-    ranges[2 + length] = i;
-    length++;
-    bit ^= flag;
-    }
-
-if (bit != 0)
-  {
-  if (length >= MAX_RANGE_SIZE)
-    return;
-  ranges[2 + length] = 256;
-  length++;
-  }
-ranges[0] = length;
-}
-
 static BOOL check_class_ranges(compiler_common *common, const pcre_uint8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks)
 {
 int ranges[2 + MAX_RANGE_SIZE];
@@ -4825,20 +4853,11 @@
   case OP_NOT_DIGIT:
   case OP_DIGIT:
   /* Digits are usually 0-9, so it is worth to optimize them. */
-  if (common->digits[0] == -2)
-    get_ctype_ranges(common, ctype_digit, common->digits);
   detect_partial_match(common, backtracks);
   /* Flip the starting bit in the negative case. */
-  if (type == OP_NOT_DIGIT)
-    common->digits[1] ^= 1;
-  if (!check_ranges(common, common->digits, backtracks, TRUE))
-    {
-    read_char8_type(common);
-    OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_digit);
-    add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_C_ZERO : SLJIT_C_NOT_ZERO));
-    }
-  if (type == OP_NOT_DIGIT)
-    common->digits[1] ^= 1;
+  read_char8_type(common);
+  OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_digit);
+  add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_C_ZERO : SLJIT_C_NOT_ZERO));
   return cc;


   case OP_NOT_WHITESPACE:
@@ -5273,7 +5292,7 @@
   case OP_CLASS:
   case OP_NCLASS:
   detect_partial_match(common, backtracks);
-  read_char(common);
+  read_char8(common);
   if (check_class_ranges(common, (const pcre_uint8 *)cc, type == OP_NCLASS, FALSE, backtracks))
     return cc + 32 / sizeof(pcre_uchar);


@@ -9335,7 +9354,6 @@
}
common->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
common->ctypes = (sljit_sw)(tables + ctypes_offset);
-common->digits[0] = -2;
common->name_table = ((pcre_uchar *)re) + re->name_table_offset;
common->name_count = re->name_count;
common->name_entry_size = re->name_entry_size;
@@ -9755,6 +9773,11 @@
}
#endif /* !COMPILE_PCRE32 */
#ifdef COMPILE_PCRE8
+if (common->utfreadchar8 != NULL)
+ {
+ set_jumps(common->utfreadchar8, LABEL());
+ do_utfreadchar8(common);
+ }
if (common->utfreadtype8 != NULL)
{
set_jumps(common->utfreadtype8, LABEL());