[Pcre-svn] [350] code/trunk: Improve first character match i…

トップ ページ
このメッセージを削除
著者: Subversion repository
日付:  
To: pcre-svn
題目: [Pcre-svn] [350] code/trunk: Improve first character match in JIT with SSE2 on x86.
Revision: 350
          http://www.exim.org/viewvc/pcre2?view=rev&revision=350
Author:   zherczeg
Date:     2015-08-23 02:54:04 +0100 (Sun, 23 Aug 2015)
Log Message:
-----------
Improve first character match in JIT with SSE2 on x86.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/pcre2_jit_test.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2015-08-18 10:39:59 UTC (rev 349)
+++ code/trunk/ChangeLog    2015-08-23 01:54:04 UTC (rev 350)
@@ -149,7 +149,9 @@
 only at the part of the subject that is relevant when the starting offset is 
 non-zero.


+41. Improve first character match in JIT with SSE2 on x86.

+
Version 10.20 30-June-2015
--------------------------


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2015-08-18 10:39:59 UTC (rev 349)
+++ code/trunk/src/pcre2_jit_compile.c    2015-08-23 01:54:04 UTC (rev 350)
@@ -600,11 +600,6 @@
 return count;
 }


-static int ones_in_half_byte[16] = {
- /* 0 */ 0, 1, 1, 2, /* 4 */ 1, 2, 2, 3,
- /* 8 */ 1, 2, 2, 3, /* 12 */ 2, 3, 3, 4
-};
-
/* Functions whose might need modification for all new supported opcodes:
next_opcode
check_opcode_types
@@ -3423,44 +3418,44 @@
}

#define MAX_N_CHARS 16
-#define MAX_N_BYTES 8
+#define MAX_DIFF_CHARS 6

-static SLJIT_INLINE void add_prefix_byte(sljit_ub byte, sljit_ub *bytes)
+static SLJIT_INLINE void add_prefix_char(PCRE2_UCHAR chr, PCRE2_UCHAR *chars)
{
-sljit_ub len = bytes[0];
-int i;
+PCRE2_UCHAR i, len;

+len = chars[0];
if (len == 255)
return;

if (len == 0)
{
- bytes[0] = 1;
- bytes[1] = byte;
+ chars[0] = 1;
+ chars[1] = chr;
return;
}

 for (i = len; i > 0; i--)
-  if (bytes[i] == byte)
+  if (chars[i] == chr)
     return;


-if (len >= MAX_N_BYTES - 1)
+if (len >= MAX_DIFF_CHARS - 1)
{
- bytes[0] = 255;
+ chars[0] = 255;
return;
}

len++;
-bytes[len] = byte;
-bytes[0] = len;
+chars[len] = chr;
+chars[0] = len;
}

-static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, sljit_ui *chars, sljit_ub *bytes, int max_chars, uint32_t *rec_count)
+static int scan_prefix(compiler_common *common, PCRE2_SPTR cc, PCRE2_UCHAR *chars, int max_chars, uint32_t *rec_count)
 {
 /* Recursive function, which scans prefix literals. */
 BOOL last, any, caseless;
 int len, repeat, len_save, consumed = 0;
-sljit_ui chr, mask;
+sljit_ui chr; /* Any unicode character. */
 PCRE2_SPTR alternative, cc_save, oc;
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
 PCRE2_UCHAR othercase[8];
@@ -3542,7 +3537,7 @@
 #ifdef SUPPORT_UNICODE
     if (common->utf && HAS_EXTRALEN(*cc)) len += GET_EXTRALEN(*cc);
 #endif
-    max_chars = scan_prefix(common, cc + len, chars, bytes, max_chars, rec_count);
+    max_chars = scan_prefix(common, cc + len, chars, max_chars, rec_count);
     if (max_chars == 0)
       return consumed;
     last = FALSE;
@@ -3565,7 +3560,7 @@
     alternative = cc + GET(cc, 1);
     while (*alternative == OP_ALT)
       {
-      max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars, rec_count);
+      max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars, rec_count);
       if (max_chars == 0)
         return consumed;
       alternative += GET(alternative, 1);
@@ -3677,27 +3672,14 @@


   if (any)
     {
-#if PCRE2_CODE_UNIT_WIDTH == 8
-    mask = 0xff;
-#elif PCRE2_CODE_UNIT_WIDTH == 16
-    mask = 0xffff;
-#elif PCRE2_CODE_UNIT_WIDTH == 32
-    mask = 0xffffffff;
-#else
-    SLJIT_ASSERT_STOP();
-#endif
-
     do
       {
-      chars[0] = mask;
-      chars[1] = mask;
-      bytes[0] = 255;
+      chars[0] = 255;


       consumed++;
       if (--max_chars == 0)
         return consumed;
-      chars += 2;
-      bytes += MAX_N_BYTES;
+      chars += MAX_DIFF_CHARS;
       }
     while (--repeat > 0);


@@ -3740,43 +3722,16 @@
     do
       {
       chr = *cc;
-#if PCRE2_CODE_UNIT_WIDTH == 32
-      if (SLJIT_UNLIKELY(chr == NOTACHAR))
-        return consumed;
-#endif
-      add_prefix_byte((sljit_ub)chr, bytes);
+      add_prefix_char(*cc, chars);


-      mask = 0;
       if (caseless)
-        {
-        add_prefix_byte((sljit_ub)*oc, bytes);
-        mask = *cc ^ *oc;
-        chr |= mask;
-        }
+        add_prefix_char(*oc, chars);


-#if PCRE2_CODE_UNIT_WIDTH == 32
-      if (chars[0] == NOTACHAR && chars[1] == 0)
-#else
-      if (chars[0] == NOTACHAR)
-#endif
-        {
-        chars[0] = chr;
-        chars[1] = mask;
-        }
-      else
-        {
-        mask |= chars[0] ^ chr;
-        chr |= mask;
-        chars[0] = chr;
-        chars[1] |= mask;
-        }
-
       len--;
       consumed++;
       if (--max_chars == 0)
         return consumed;
-      chars += 2;
-      bytes += MAX_N_BYTES;
+      chars += MAX_DIFF_CHARS;
       cc++;
       oc++;
       }
@@ -3795,60 +3750,447 @@
   }
 }


+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+
+static sljit_si character_to_int32(PCRE2_UCHAR chr)
+{
+sljit_si value = (sljit_si)chr;
+#if PCRE2_CODE_UNIT_WIDTH == 8
+#define SSE2_COMPARE_TYPE_INDEX 0
+return (value << 24) | (value << 16) | (value << 8) | value;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+#define SSE2_COMPARE_TYPE_INDEX 1
+return (value << 16) | value;
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+#define SSE2_COMPARE_TYPE_INDEX 2
+return value;
+#else
+#error "Unsupported unit width"
+#endif
+}
+
+static SLJIT_INLINE void fast_forward_first_char2_sse2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, jump_list **found)
+{
+DEFINE_COMPILER;
+struct sljit_label *start;
+struct sljit_jump *quit;
+struct sljit_jump *aligned;
+struct sljit_jump *nomatch;
+sljit_ub instruction[8];
+sljit_si tmp1_ind = sljit_get_register_index(TMP1);
+sljit_si tmp2_ind = sljit_get_register_index(TMP2);
+sljit_si str_ptr_ind = sljit_get_register_index(STR_PTR);
+BOOL load_twice = FALSE;
+PCRE2_UCHAR bit;
+
+bit = char1 ^ char2;
+if (!is_powerof2(bit))
+  bit = 0;
+
+if ((char1 != char2) && bit == 0)
+  load_twice = TRUE;
+
+OP2(SLJIT_ADD, TMP1, 0, STR_PTR, 0, SLJIT_IMM, 32 /* bytes */);
+quit = CMP(SLJIT_GREATER, TMP1, 0, STR_END, 0);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
+
+SLJIT_ASSERT(tmp1_ind < 8 && tmp2_ind == 1);
+
+/* MOVD xmm, r/m32 */
+instruction[0] = 0x66;
+instruction[1] = 0x0f;
+instruction[2] = 0x6e;
+instruction[3] = 0xc0 | (2 << 3) | tmp1_ind;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (char1 != char2)
+  {
+  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
+
+  /* MOVD xmm, r/m32 */
+  instruction[3] = 0xc0 | (3 << 3) | tmp1_ind;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+/* PSHUFD xmm1, xmm2/m128, imm8 */
+instruction[2] = 0x70;
+instruction[3] = 0xc0 | (2 << 3) | 2;
+instruction[4] = 0;
+sljit_emit_op_custom(compiler, instruction, 5);
+
+if (char1 != char2)
+  {
+  /* PSHUFD xmm1, xmm2/m128, imm8 */
+  instruction[3] = 0xc0 | (3 << 3) | 3;
+  instruction[4] = 0;
+  sljit_emit_op_custom(compiler, instruction, 5);
+  }
+
+OP2(SLJIT_AND | SLJIT_SET_E, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 0xf);
+aligned = JUMP(SLJIT_ZERO);
+
+OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
+
+/* MOVDQA xmm1, xmm2/m128 */
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+
+if (str_ptr_ind < 8)
+  {
+  instruction[2] = 0x6f;
+  instruction[3] = (0 << 3) | str_ptr_ind;
+  sljit_emit_op_custom(compiler, instruction, 4);
+
+  if (load_twice)
+    {
+    instruction[3] = (1 << 3) | str_ptr_ind;
+    sljit_emit_op_custom(compiler, instruction, 4);
+    }
+  }
+else
+  {
+  instruction[1] = 0x41;
+  instruction[2] = 0x0f;
+  instruction[3] = 0x6f;
+  instruction[4] = (0 << 3) | (str_ptr_ind & 0x7);
+  sljit_emit_op_custom(compiler, instruction, 5);
+
+  if (load_twice)
+    {
+    instruction[4] = (1 << 3) | str_ptr_ind;
+    sljit_emit_op_custom(compiler, instruction, 5);
+    }
+  instruction[1] = 0x0f;
+  }
+
+#else
+
+instruction[2] = 0x6f;
+instruction[3] = (0 << 3) | str_ptr_ind;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (load_twice)
+  {
+  instruction[3] = (1 << 3) | str_ptr_ind;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+#endif
+
+if (bit != 0)
+  {
+  /* POR xmm1, xmm2/m128 */
+  instruction[2] = 0xeb;
+  instruction[3] = 0xc0 | (0 << 3) | 3;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+/* PCMPEQB/W/D xmm1, xmm2/m128 */
+instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
+instruction[3] = 0xc0 | (0 << 3) | 2;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (load_twice)
+  {
+  instruction[3] = 0xc0 | (1 << 3) | 3;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+/* PMOVMSKB reg, xmm */
+instruction[2] = 0xd7;
+instruction[3] = 0xc0 | (tmp1_ind << 3) | 0;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (load_twice)
+  {
+  OP1(SLJIT_MOV, TMP3, 0, TMP2, 0);
+  instruction[3] = 0xc0 | (tmp2_ind << 3) | 1;
+  sljit_emit_op_custom(compiler, instruction, 4);
+
+  OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+  OP1(SLJIT_MOV, TMP2, 0, TMP3, 0);
+  }
+
+OP2(SLJIT_ASHR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+nomatch = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0);
+
+/* BSF r32, r/m32 */
+
+instruction[0] = 0x0f;
+instruction[1] = 0xbc;
+instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind;
+sljit_emit_op_custom(compiler, instruction, 3);
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+add_jump(compiler, found, JUMP(SLJIT_JUMP));
+
+JUMPHERE(nomatch);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+
+JUMPHERE(aligned);
+
+/* SSE2 part */
+
+OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, 16);
+
+start = LABEL();
+
+instruction[0] = 0x66;
+instruction[1] = 0x0f;
+
+/* MOVDQA xmm1, xmm2/m128 */
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+
+if (str_ptr_ind < 8)
+  {
+  instruction[2] = 0x6f;
+  instruction[3] = (0 << 3) | str_ptr_ind;
+  sljit_emit_op_custom(compiler, instruction, 4);
+
+  if (load_twice)
+    {
+    instruction[3] = (1 << 3) | str_ptr_ind;
+    sljit_emit_op_custom(compiler, instruction, 4);
+    }
+  }
+else
+  {
+  instruction[1] = 0x41;
+  instruction[2] = 0x0f;
+  instruction[3] = 0x6f;
+  instruction[4] = (0 << 3) | (str_ptr_ind & 0x7);
+  sljit_emit_op_custom(compiler, instruction, 5);
+
+  if (load_twice)
+    {
+    instruction[4] = (1 << 3) | str_ptr_ind;
+    sljit_emit_op_custom(compiler, instruction, 5);
+    }
+  instruction[1] = 0x0f;
+  }
+
+#else
+
+instruction[2] = 0x6f;
+instruction[3] = (0 << 3) | str_ptr_ind;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (load_twice)
+  {
+  instruction[3] = (1 << 3) | str_ptr_ind;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+#endif
+
+if (bit != 0)
+  {
+  /* POR xmm1, xmm2/m128 */
+  instruction[2] = 0xeb;
+  instruction[3] = 0xc0 | (0 << 3) | 3;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+/* PCMPEQB/W/D xmm1, xmm2/m128 */
+instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
+instruction[3] = 0xc0 | (0 << 3) | 2;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (load_twice)
+  {
+  instruction[3] = 0xc0 | (1 << 3) | 3;
+  sljit_emit_op_custom(compiler, instruction, 4);
+  }
+
+/* PMOVMSKB reg, xmm */
+instruction[2] = 0xd7;
+instruction[3] = 0xc0 | (tmp1_ind << 3) | 0;
+sljit_emit_op_custom(compiler, instruction, 4);
+
+if (load_twice)
+  {
+  instruction[3] = 0xc0 | (tmp2_ind << 3) | 1;
+  sljit_emit_op_custom(compiler, instruction, 4);
+
+  OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+  }
+
+nomatch = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0);
+
+/* BSF r32, r/m32 */
+
+instruction[0] = 0x0f;
+instruction[1] = 0xbc;
+instruction[2] = 0xc0 | (tmp1_ind << 3) | tmp1_ind;
+sljit_emit_op_custom(compiler, instruction, 3);
+
+OP2(SLJIT_ADD, STR_END, 0, STR_END, 0, SLJIT_IMM, 16);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+add_jump(compiler, found, JUMP(SLJIT_JUMP));
+
+JUMPHERE(nomatch);
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
+CMPTO(SLJIT_LESS_EQUAL, STR_PTR, 0, STR_END, 0, start);
+
+OP2(SLJIT_ADD, STR_END, 0, STR_END, 0, SLJIT_IMM, 16);
+JUMPHERE(quit);
+}
+
+#undef SSE2_COMPARE_TYPE_INDEX
+
+#endif
+
+static void fast_forward_first_char2(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_si offset, BOOL firstline)
+{
+DEFINE_COMPILER;
+struct sljit_label *start;
+struct sljit_jump *quit;
+struct sljit_jump *found;
+PCRE2_UCHAR mask;
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+struct sljit_label *utf_start = NULL;
+struct sljit_jump *utf_quit = NULL;
+#endif
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+jump_list *found_list = NULL;
+#endif
+
+if (offset > 0)
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset));
+
+if (firstline)
+  {
+  SLJIT_ASSERT(common->first_line_end != 0);
+  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
+  OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end);
+  }
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+if (common->utf && offset > 0)
+  utf_start = LABEL();
+#endif
+
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+
+/* SSE2 accelerated first character search. */
+
+if (sljit_is_fpu_available())
+  fast_forward_first_char2_sse2(common, char1, char2, &found_list);
+
+#endif
+
+quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+
+start = LABEL();
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+
+if (char1 == char2)
+  found = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, char1);
+else
+  {
+  mask = char1 ^ char2;
+  if (is_powerof2(mask))
+    {
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask);
+    found = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, char1 | mask);
+    }
+  else
+    {
+    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char1);
+    OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
+    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, char2);
+    OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
+    found = JUMP(SLJIT_NOT_ZERO);
+    }
+  }
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, start);
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+if (common->utf && offset > 0)
+  utf_quit = JUMP(SLJIT_JUMP);
+#endif
+
+JUMPHERE(found);
+
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+set_jumps(found_list, LABEL());
+#endif
+
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+if (common->utf && offset > 0)
+  {
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
+  OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+#if PCRE2_CODE_UNIT_WIDTH == 8
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
+  CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, utf_start);
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+  CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00, utf_start);
+#else
+#error "Unknown code width"
+#endif
+  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+  JUMPHERE(utf_quit);
+  }
+#endif
+
+JUMPHERE(quit);
+
+if (offset > 0)
+  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offset));
+
+if (firstline)
+  OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
+}
+
 static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common, BOOL firstline)
 {
 DEFINE_COMPILER;
 struct sljit_label *start;
 struct sljit_jump *quit;
-sljit_ui chars[MAX_N_CHARS * 2];
-sljit_ub bytes[MAX_N_CHARS * MAX_N_BYTES];
-sljit_ub ones[MAX_N_CHARS];
-int offsets[3];
-sljit_ui mask;
-sljit_ub *byte_set, *byte_set_end;
+struct sljit_jump *match;
+/* bytes[0] represent the number of characters between 0
+and MAX_N_BYTES - 1, 255 represents any character. */
+PCRE2_UCHAR chars[MAX_N_CHARS * MAX_DIFF_CHARS];
+sljit_si offset;
+PCRE2_UCHAR mask;
+PCRE2_UCHAR *char_set, *char_set_end;
 int i, max, from;
-int range_right = -1, range_len = 3 - 1;
+int range_right = -1, range_len;
 sljit_ub *update_table = NULL;
 BOOL in_range;
 uint32_t rec_count;


for (i = 0; i < MAX_N_CHARS; i++)
- {
- chars[i << 1] = NOTACHAR;
- chars[(i << 1) + 1] = 0;
- bytes[i * MAX_N_BYTES] = 0;
- }
+ chars[i * MAX_DIFF_CHARS] = 0;

rec_count = 10000;
-max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS, &rec_count);
+max = scan_prefix(common, common->start, chars, MAX_N_CHARS, &rec_count);

if (max <= 1)
return FALSE;

-for (i = 0; i < max; i++)
-  {
-  mask = chars[(i << 1) + 1];
-  ones[i] = ones_in_half_byte[mask & 0xf];
-  mask >>= 4;
-  while (mask != 0)
-    {
-    ones[i] += ones_in_half_byte[mask & 0xf];
-    mask >>= 4;
-    }
-  }
-
 in_range = FALSE;
-from = 0;   /* Prevent compiler "uninitialized" warning */
+/* Prevent compiler "uninitialized" warning */
+from = 0;
+range_len = 4 /* minimum length */ - 1;
 for (i = 0; i <= max; i++)
   {
-  if (in_range && (i - from) > range_len && (bytes[(i - 1) * MAX_N_BYTES] <= 4))
+  if (in_range && (i - from) > range_len && (chars[(i - 1) * MAX_DIFF_CHARS] < 255))
     {
     range_len = i - from;
     range_right = i - 1;
     }


-  if (i < max && bytes[i * MAX_N_BYTES] < 255)
+  if (i < max && chars[i * MAX_DIFF_CHARS] < 255)
     {
+    SLJIT_ASSERT(chars[i * MAX_DIFF_CHARS] > 0);
     if (!in_range)
       {
       in_range = TRUE;
@@ -3855,7 +4197,7 @@
       from = i;
       }
     }
-  else if (in_range)
+  else
     in_range = FALSE;
   }


@@ -3868,86 +4210,61 @@

   for (i = 0; i < range_len; i++)
     {
-    byte_set = bytes + ((range_right - i) * MAX_N_BYTES);
-    SLJIT_ASSERT(byte_set[0] > 0 && byte_set[0] < 255);
-    byte_set_end = byte_set + byte_set[0];
-    byte_set++;
-    while (byte_set <= byte_set_end)
+    char_set = chars + ((range_right - i) * MAX_DIFF_CHARS);
+    SLJIT_ASSERT(char_set[0] > 0 && char_set[0] < 255);
+    char_set_end = char_set + char_set[0];
+    char_set++;
+    while (char_set <= char_set_end)
       {
-      if (update_table[*byte_set] > IN_UCHARS(i))
-        update_table[*byte_set] = IN_UCHARS(i);
-      byte_set++;
+      if (update_table[(*char_set) & 0xff] > IN_UCHARS(i))
+        update_table[(*char_set) & 0xff] = IN_UCHARS(i);
+      char_set++;
       }
     }
   }


-offsets[0] = -1;
-offsets[1] = -1;
-offsets[2] = -1;
+offset = -1;
 /* Scan forward. */
 for (i = 0; i < max; i++)
-  if (ones[i] <= 2) {
-    offsets[0] = i;
-    break;
-  }
-
-if (offsets[0] < 0 && range_right < 0)
-  return FALSE;
-
-if (offsets[0] >= 0)
   {
-  /* Scan backward. */
-  for (i = max - 1; i > offsets[0]; i--)
-    if (ones[i] <= 2 && i != range_right)
+  if (offset == -1)
+    {
+    if (chars[i * MAX_DIFF_CHARS] <= 2)
+      offset = i;
+    }
+  else if (chars[offset * MAX_DIFF_CHARS] == 2 && chars[i * MAX_DIFF_CHARS] <= 2)
+    {
+    if (chars[i * MAX_DIFF_CHARS] == 1)
+      offset = i;
+    else
       {
-      offsets[1] = i;
-      break;
-      }
-
-  /* This case is handled better by fast_forward_first_char. */
-  if (offsets[1] == -1 && offsets[0] == 0 && range_right < 0)
-    return FALSE;
-
-  /* We only search for a middle character if there is no range check. */
-  if (offsets[1] >= 0 && range_right == -1)
-    {
-    /* Scan from middle. */
-    for (i = (offsets[0] + offsets[1]) / 2 + 1; i < offsets[1]; i++)
-      if (ones[i] <= 2)
+      mask = chars[offset * MAX_DIFF_CHARS + 1] ^ chars[offset * MAX_DIFF_CHARS + 2];
+      if (!is_powerof2(mask))
         {
-        offsets[2] = i;
-        break;
+        mask = chars[i * MAX_DIFF_CHARS + 1] ^ chars[i * MAX_DIFF_CHARS + 2];
+        if (is_powerof2(mask))
+          offset = i;
         }
-
-    if (offsets[2] == -1)
-      {
-      for (i = (offsets[0] + offsets[1]) / 2; i > offsets[0]; i--)
-        if (ones[i] <= 2)
-          {
-          offsets[2] = i;
-          break;
-          }
       }
     }
+  }


-  SLJIT_ASSERT(offsets[1] == -1 || (offsets[0] < offsets[1]));
-  SLJIT_ASSERT(offsets[2] == -1 || (offsets[0] < offsets[2] && offsets[1] > offsets[2]));
-
-  chars[0] = chars[offsets[0] << 1];
-  chars[1] = chars[(offsets[0] << 1) + 1];
-  if (offsets[2] >= 0)
-    {
-    chars[2] = chars[offsets[2] << 1];
-    chars[3] = chars[(offsets[2] << 1) + 1];
-    }
-  if (offsets[1] >= 0)
-    {
-    chars[4] = chars[offsets[1] << 1];
-    chars[5] = chars[(offsets[1] << 1) + 1];
-    }
+if (range_right < 0)
+  {
+  if (offset < 0)
+    return FALSE;
+  mask = chars[offset * MAX_DIFF_CHARS + chars[offset * MAX_DIFF_CHARS]];
+  fast_forward_first_char2(common, chars[offset * MAX_DIFF_CHARS + 1], mask, offset, firstline);
+  return TRUE;
   }


+if (range_right == offset)
+ offset = -1;
+
+SLJIT_ASSERT(offset == -1 || (chars[offset * MAX_DIFF_CHARS] >= 1 && chars[offset * MAX_DIFF_CHARS] <= 2));
+
max -= 1;
+SLJIT_ASSERT(max > 0);
if (firstline)
{
SLJIT_ASSERT(common->first_line_end != 0);
@@ -3961,62 +4278,80 @@
else
OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));

+SLJIT_ASSERT(range_right >= 0);
+
#if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-if (range_right >= 0)
- OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table);
+OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table);
#endif

start = LABEL();
quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);

-SLJIT_ASSERT(range_right >= 0 || offsets[0] >= 0);
-
-if (range_right >= 0)
- {
#if PCRE2_CODE_UNIT_WIDTH == 8 || (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right));
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right));
#else
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1);
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1);
#endif

#if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0);
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0);
#else
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)update_table);
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)update_table);
#endif
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
- CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start);
- }
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start);

-if (offsets[0] >= 0)
+if (offset >= 0)
   {
-  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[0]));
-  if (offsets[1] >= 0)
-    OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[1]));
+  OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offset));
   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));


-  if (chars[1] != 0)
-    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[1]);
-  CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[0], start);
-  if (offsets[2] >= 0)
-    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[2] - 1));
-
-  if (offsets[1] >= 0)
+  if (chars[offset * MAX_DIFF_CHARS] == 1)
+    CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 1], start);
+  else
     {
-    if (chars[5] != 0)
-      OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, chars[5]);
-    CMPTO(SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, chars[4], start);
+    mask = chars[offset * MAX_DIFF_CHARS + 1] ^ chars[offset * MAX_DIFF_CHARS + 2];
+    if (is_powerof2(mask))
+      {
+      OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, mask);
+      CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 1] | mask, start);
+      }
+    else
+      {
+      match = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 1]);
+      CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[offset * MAX_DIFF_CHARS + 2], start);
+      JUMPHERE(match);
+      }
     }
+  }


-  if (offsets[2] >= 0)
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+if (common->utf && offset != 0)
+  {
+  if (offset < 0)
     {
-    if (chars[3] != 0)
-      OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[3]);
-    CMPTO(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[2], start);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
     }
-  OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+  else
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+#if PCRE2_CODE_UNIT_WIDTH == 8
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
+  CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, start);
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+  OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
+  CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00, start);
+#else
+#error "Unknown code width"
+#endif
+  if (offset < 0)
+    OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   }
+#endif


+if (offset >= 0)
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
JUMPHERE(quit);

if (firstline)
@@ -4041,23 +4376,8 @@

static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, PCRE2_UCHAR first_char, BOOL caseless, BOOL firstline)
{
-DEFINE_COMPILER;
-struct sljit_label *start;
-struct sljit_jump *quit;
-struct sljit_jump *found;
-PCRE2_UCHAR oc, bit;
+PCRE2_UCHAR oc;

-if (firstline)
-  {
-  SLJIT_ASSERT(common->first_line_end != 0);
-  OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
-  OP1(SLJIT_MOV, STR_END, 0, SLJIT_MEM1(SLJIT_SP), common->first_line_end);
-  }
-
-start = LABEL();
-quit = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
-OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-
 oc = first_char;
 if (caseless)
   {
@@ -4067,33 +4387,8 @@
     oc = UCD_OTHERCASE(first_char);
 #endif
   }
-if (first_char == oc)
-  found = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, first_char);
-else
-  {
-  bit = first_char ^ oc;
-  if (is_powerof2(bit))
-    {
-    OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, bit);
-    found = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, first_char | bit);
-    }
-  else
-    {
-    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, first_char);
-    OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_EQUAL);
-    OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
-    OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_EQUAL);
-    found = JUMP(SLJIT_NOT_ZERO);
-    }
-  }


-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-JUMPTO(SLJIT_JUMP, start);
-JUMPHERE(found);
-JUMPHERE(quit);
-
-if (firstline)
- OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
+fast_forward_first_char2(common, first_char, oc, 0, firstline);
}

static SLJIT_INLINE void fast_forward_newline(compiler_common *common, BOOL firstline)

Modified: code/trunk/src/pcre2_jit_test.c
===================================================================
--- code/trunk/src/pcre2_jit_test.c    2015-08-18 10:39:59 UTC (rev 349)
+++ code/trunk/src/pcre2_jit_test.c    2015-08-23 01:54:04 UTC (rev 350)
@@ -247,7 +247,7 @@
     { M, A, 0, 0, "a\\z", "aaa" },
     { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },


-    /* Brackets. */
+    /* Brackets and alternatives. */
     { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
     { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
     { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
@@ -254,6 +254,10 @@
     { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
     { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
     { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
+    { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
+    { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
+    { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
+    { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },


     /* Greedy and non-greedy ? operators. */
     { MU, A, 0, 0, "(?:a)?a", "laab" },