[Pcre-svn] [1119] code/trunk/src/pcre2_jit_compile.c: Improv…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1119] code/trunk/src/pcre2_jit_compile.c: Improve SSE2 optimiztions in JIT.
Revision: 1119
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1119
Author:   zherczeg
Date:     2019-06-25 07:11:14 +0100 (Tue, 25 Jun 2019)
Log Message:
-----------
Improve SSE2 optimiztions in JIT.


Modified Paths:
--------------
    code/trunk/src/pcre2_jit_compile.c


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2019-06-22 16:36:15 UTC (rev 1118)
+++ code/trunk/src/pcre2_jit_compile.c    2019-06-25 06:11:14 UTC (rev 1119)
@@ -5525,54 +5525,46 @@
 #endif
 }


-static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg)
+static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset)
{
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
sljit_u8 instruction[5];
-#else
-sljit_u8 instruction[4];
-#endif

SLJIT_ASSERT(dst_xmm_reg < 8);
+SLJIT_ASSERT(src_general_reg < 8);

/* MOVDQA xmm1, xmm2/m128 */
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-if (src_general_reg < 8)
+instruction[0] = ((sljit_u8)offset & 0xf) == 0 ? 0x66 : 0xf3;
+instruction[1] = 0x0f;
+instruction[2] = 0x6f;
+
+if (offset == 0)
{
- instruction[0] = 0x66;
- instruction[1] = 0x0f;
- instruction[2] = 0x6f;
instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
sljit_emit_op_custom(compiler, instruction, 4);
+ return;
}
-else
- {
- instruction[0] = 0x66;
- instruction[1] = 0x41;
- instruction[2] = 0x0f;
- instruction[3] = 0x6f;
- instruction[4] = (dst_xmm_reg << 3) | (src_general_reg & 0x7);
- sljit_emit_op_custom(compiler, instruction, 4);
- }
-#else
-instruction[0] = 0x66;
-instruction[1] = 0x0f;
-instruction[2] = 0x6f;
-instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
-sljit_emit_op_custom(compiler, instruction, 4);
-#endif
+
+instruction[3] = 0x40 | (dst_xmm_reg << 3) | src_general_reg;
+instruction[4] = (sljit_u8)offset;
+sljit_emit_op_custom(compiler, instruction, 5);
}

-static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR char2,
-  sljit_u32 bit, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
+typedef enum {
+    sse2_compare_match1,
+    sse2_compare_match1i,
+    sse2_compare_match2,
+} sse2_compare_type;
+
+static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler,
+  sse2_compare_type compare_type, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
 {
 sljit_u8 instruction[4];
 instruction[0] = 0x66;
 instruction[1] = 0x0f;


-if (char1 == char2 || bit != 0)
+if (compare_type != sse2_compare_match2)
   {
-  if (bit != 0)
+  if (compare_type == sse2_compare_match1i)
     {
     /* POR xmm1, xmm2/m128 */
     /* instruction[0] = 0x66; */
@@ -5626,6 +5618,7 @@
 #endif
 struct sljit_jump *quit;
 struct sljit_jump *partial_quit[2];
+sse2_compare_type compare_type = sse2_compare_match1;
 sljit_u8 instruction[8];
 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
@@ -5640,8 +5633,13 @@
 if (char1 != char2)
   {
   bit = char1 ^ char2;
+  compare_type = sse2_compare_match1i;
+
   if (!is_powerof2(bit))
+    {
     bit = 0;
+    compare_type = sse2_compare_match2;
+    }
   }


partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
@@ -5693,8 +5691,8 @@
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);

-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind);
-fast_forward_char_pair_sse2_compare(compiler, char1, char2, bit, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
+fast_forward_char_pair_sse2_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

/* PMOVMSKB reg, xmm */
/* instruction[0] = 0x66; */
@@ -5719,8 +5717,8 @@
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[1]);

-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind);
-fast_forward_char_pair_sse2_compare(compiler, char1, char2, bit, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
+fast_forward_char_pair_sse2_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);

 /* PMOVMSKB reg, xmm */
 /* instruction[0] = 0x66; */
@@ -5789,6 +5787,8 @@
   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
 {
 DEFINE_COMPILER;
+sse2_compare_type compare1_type = sse2_compare_match1;
+sse2_compare_type compare2_type = sse2_compare_match1;
 sljit_u32 bit1 = 0;
 sljit_u32 bit2 = 0;
 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
@@ -5840,11 +5840,13 @@
   bit1 = char1a ^ char1b;
   if (is_powerof2(bit1))
     {
+    compare1_type = sse2_compare_match1i;
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
     }
   else
     {
+    compare1_type = sse2_compare_match2;
     bit1 = 0;
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
@@ -5867,11 +5869,13 @@
   bit2 = char2a ^ char2b;
   if (is_powerof2(bit2))
     {
+    compare2_type = sse2_compare_match1i;
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
     }
   else
     {
+    compare2_type = sse2_compare_match2;
     bit2 = 0;
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
@@ -5915,47 +5919,15 @@
 restart = LABEL();
 #endif


-OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1 - offs2));
+OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, ~0xf);

-load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind);
+load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);

-jump[0] = CMP(SLJIT_EQUAL, STR_PTR, 0, TMP1, 0);
+jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);

-load_from_mem_sse2(compiler, data2_ind, tmp1_reg_ind);
-
-/* MOVDQA xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x6f;
-instruction[3] = 0xc0 | (tmp_ind << 3) | data1_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PSLLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (7 << 3) | tmp_ind;
-instruction[4] = diff;
-sljit_emit_op_custom(compiler, instruction, 5);
-
-/* PSRLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-/* instruction[2] = 0x73; */
-instruction[3] = 0xc0 | (3 << 3) | data2_ind;
-instruction[4] = 16 - diff;
-sljit_emit_op_custom(compiler, instruction, 5);
-
-/* POR xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xeb;
-instruction[3] = 0xc0 | (data2_ind << 3) | tmp_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
+load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
jump[1] = JUMP(SLJIT_JUMP);

JUMPHERE(jump[0]);
@@ -5979,8 +5951,8 @@

OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);

-fast_forward_char_pair_sse2_compare(compiler, char2a, char2b, bit2, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
-fast_forward_char_pair_sse2_compare(compiler, char1a, char1b, bit1, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
+fast_forward_char_pair_sse2_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
+fast_forward_char_pair_sse2_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);

/* PAND xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
@@ -6007,46 +5979,15 @@
/* Main loop. */
start = LABEL();

-load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind);
-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));

-load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind);
+load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
+load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);

-/* PSRLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (3 << 3) | data2_ind;
-instruction[4] = 16 - diff;
-sljit_emit_op_custom(compiler, instruction, 5);
+fast_forward_char_pair_sse2_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
+fast_forward_char_pair_sse2_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);

-/* MOVDQA xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x6f;
-instruction[3] = 0xc0 | (tmp_ind << 3) | data1_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PSLLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (7 << 3) | tmp_ind;
-instruction[4] = diff;
-sljit_emit_op_custom(compiler, instruction, 5);
-
-/* POR xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xeb;
-instruction[3] = 0xc0 | (data2_ind << 3) | tmp_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-fast_forward_char_pair_sse2_compare(compiler, char1a, char1b, bit1, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
-fast_forward_char_pair_sse2_compare(compiler, char2a, char2b, bit2, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
-
/* PAND xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */