Revision: 1119
http://www.exim.org/viewvc/pcre2?view=rev&revision=1119
Author: zherczeg
Date: 2019-06-25 07:11:14 +0100 (Tue, 25 Jun 2019)
Log Message:
-----------
Improve SSE2 optimiztions in JIT.
Modified Paths:
--------------
code/trunk/src/pcre2_jit_compile.c
Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c 2019-06-22 16:36:15 UTC (rev 1118)
+++ code/trunk/src/pcre2_jit_compile.c 2019-06-25 06:11:14 UTC (rev 1119)
@@ -5525,54 +5525,46 @@
#endif
}
-static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg)
+static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset)
{
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
sljit_u8 instruction[5];
-#else
-sljit_u8 instruction[4];
-#endif
SLJIT_ASSERT(dst_xmm_reg < 8);
+SLJIT_ASSERT(src_general_reg < 8);
/* MOVDQA xmm1, xmm2/m128 */
-#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
-if (src_general_reg < 8)
+instruction[0] = ((sljit_u8)offset & 0xf) == 0 ? 0x66 : 0xf3;
+instruction[1] = 0x0f;
+instruction[2] = 0x6f;
+
+if (offset == 0)
{
- instruction[0] = 0x66;
- instruction[1] = 0x0f;
- instruction[2] = 0x6f;
instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
sljit_emit_op_custom(compiler, instruction, 4);
+ return;
}
-else
- {
- instruction[0] = 0x66;
- instruction[1] = 0x41;
- instruction[2] = 0x0f;
- instruction[3] = 0x6f;
- instruction[4] = (dst_xmm_reg << 3) | (src_general_reg & 0x7);
- sljit_emit_op_custom(compiler, instruction, 4);
- }
-#else
-instruction[0] = 0x66;
-instruction[1] = 0x0f;
-instruction[2] = 0x6f;
-instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
-sljit_emit_op_custom(compiler, instruction, 4);
-#endif
+
+instruction[3] = 0x40 | (dst_xmm_reg << 3) | src_general_reg;
+instruction[4] = (sljit_u8)offset;
+sljit_emit_op_custom(compiler, instruction, 5);
}
-static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR char2,
- sljit_u32 bit, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
+typedef enum {
+ sse2_compare_match1,
+ sse2_compare_match1i,
+ sse2_compare_match2,
+} sse2_compare_type;
+
+static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler,
+ sse2_compare_type compare_type, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
{
sljit_u8 instruction[4];
instruction[0] = 0x66;
instruction[1] = 0x0f;
-if (char1 == char2 || bit != 0)
+if (compare_type != sse2_compare_match2)
{
- if (bit != 0)
+ if (compare_type == sse2_compare_match1i)
{
/* POR xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
@@ -5626,6 +5618,7 @@
#endif
struct sljit_jump *quit;
struct sljit_jump *partial_quit[2];
+sse2_compare_type compare_type = sse2_compare_match1;
sljit_u8 instruction[8];
sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
@@ -5640,8 +5633,13 @@
if (char1 != char2)
{
bit = char1 ^ char2;
+ compare_type = sse2_compare_match1i;
+
if (!is_powerof2(bit))
+ {
bit = 0;
+ compare_type = sse2_compare_match2;
+ }
}
partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
@@ -5693,8 +5691,8 @@
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind);
-fast_forward_char_pair_sse2_compare(compiler, char1, char2, bit, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
+fast_forward_char_pair_sse2_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
/* PMOVMSKB reg, xmm */
/* instruction[0] = 0x66; */
@@ -5719,8 +5717,8 @@
if (common->mode == PCRE2_JIT_COMPLETE)
add_jump(compiler, &common->failed_match, partial_quit[1]);
-load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind);
-fast_forward_char_pair_sse2_compare(compiler, char1, char2, bit, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
+load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
+fast_forward_char_pair_sse2_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
/* PMOVMSKB reg, xmm */
/* instruction[0] = 0x66; */
@@ -5789,6 +5787,8 @@
PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
{
DEFINE_COMPILER;
+sse2_compare_type compare1_type = sse2_compare_match1;
+sse2_compare_type compare2_type = sse2_compare_match1;
sljit_u32 bit1 = 0;
sljit_u32 bit2 = 0;
sljit_u32 diff = IN_UCHARS(offs1 - offs2);
@@ -5840,11 +5840,13 @@
bit1 = char1a ^ char1b;
if (is_powerof2(bit1))
{
+ compare1_type = sse2_compare_match1i;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
}
else
{
+ compare1_type = sse2_compare_match2;
bit1 = 0;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
@@ -5867,11 +5869,13 @@
bit2 = char2a ^ char2b;
if (is_powerof2(bit2))
{
+ compare2_type = sse2_compare_match1i;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
}
else
{
+ compare2_type = sse2_compare_match2;
bit2 = 0;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
@@ -5915,47 +5919,15 @@
restart = LABEL();
#endif
-OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1 - offs2));
+OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, ~0xf);
-load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind);
+load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
-jump[0] = CMP(SLJIT_EQUAL, STR_PTR, 0, TMP1, 0);
+jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
-load_from_mem_sse2(compiler, data2_ind, tmp1_reg_ind);
-
-/* MOVDQA xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x6f;
-instruction[3] = 0xc0 | (tmp_ind << 3) | data1_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PSLLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (7 << 3) | tmp_ind;
-instruction[4] = diff;
-sljit_emit_op_custom(compiler, instruction, 5);
-
-/* PSRLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-/* instruction[2] = 0x73; */
-instruction[3] = 0xc0 | (3 << 3) | data2_ind;
-instruction[4] = 16 - diff;
-sljit_emit_op_custom(compiler, instruction, 5);
-
-/* POR xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xeb;
-instruction[3] = 0xc0 | (data2_ind << 3) | tmp_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
+load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
jump[1] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[0]);
@@ -5979,8 +5951,8 @@
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
-fast_forward_char_pair_sse2_compare(compiler, char2a, char2b, bit2, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
-fast_forward_char_pair_sse2_compare(compiler, char1a, char1b, bit1, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
+fast_forward_char_pair_sse2_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
+fast_forward_char_pair_sse2_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
/* PAND xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
@@ -6007,46 +5979,15 @@
/* Main loop. */
start = LABEL();
-load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind);
-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
-load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind);
+load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
+load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
-/* PSRLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (3 << 3) | data2_ind;
-instruction[4] = 16 - diff;
-sljit_emit_op_custom(compiler, instruction, 5);
+fast_forward_char_pair_sse2_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
+fast_forward_char_pair_sse2_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
-/* MOVDQA xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x6f;
-instruction[3] = 0xc0 | (tmp_ind << 3) | data1_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-/* PSLLDQ xmm1, imm8 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0x73;
-instruction[3] = 0xc0 | (7 << 3) | tmp_ind;
-instruction[4] = diff;
-sljit_emit_op_custom(compiler, instruction, 5);
-
-/* POR xmm1, xmm2/m128 */
-/* instruction[0] = 0x66; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xeb;
-instruction[3] = 0xc0 | (data2_ind << 3) | tmp_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
-fast_forward_char_pair_sse2_compare(compiler, char1a, char1b, bit1, data1_ind, cmp1a_ind, cmp1b_ind, tmp_ind);
-fast_forward_char_pair_sse2_compare(compiler, char2a, char2b, bit2, data2_ind, cmp2a_ind, cmp2b_ind, tmp_ind);
-
/* PAND xmm1, xmm2/m128 */
/* instruction[0] = 0x66; */
/* instruction[1] = 0x0f; */