[Pcre-svn] [1008] code/trunk/src: Improve invalid UTF charac…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1008] code/trunk/src: Improve invalid UTF character reading in JIT.
Revision: 1008
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1008
Author:   zherczeg
Date:     2018-09-16 11:35:00 +0100 (Sun, 16 Sep 2018)
Log Message:
-----------
Improve invalid UTF character reading in JIT.


Modified Paths:
--------------
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/pcre2_jit_test.c


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2018-09-15 17:10:39 UTC (rev 1007)
+++ code/trunk/src/pcre2_jit_compile.c    2018-09-16 10:35:00 UTC (rev 1008)
@@ -485,7 +485,7 @@
   jump_list *getucdtype;
 #if PCRE2_CODE_UNIT_WIDTH == 8
   jump_list *utfreadchar;
-  jump_list *utfreadchar_invalid_precise;
+  jump_list *utfreadnewline_invalid;
   jump_list *utfreadtype8;
   jump_list *utfpeakcharback;
 #endif
@@ -3464,8 +3464,8 @@
 }


#define READ_CHAR_UPDATE_STR_PTR 0x1
-#define READ_CHAR_UPDATE_STR_PTR_INVALID 0x2
-#define READ_CHAR_UPDATE_STR_PTR_PRECISE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UPDATE_STR_PTR_INVALID)
+#define READ_CHAR_UTF8_NEWLINE 0x2
+#define READ_CHAR_NEWLINE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UTF8_NEWLINE)
#define READ_CHAR_VALID_UTF 0x4

 static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max,
@@ -3501,8 +3501,8 @@
     {
     jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);


-    if (options & READ_CHAR_UPDATE_STR_PTR_INVALID)
-      add_jump(compiler, &common->utfreadchar_invalid_precise, JUMP(SLJIT_FAST_CALL));
+    if (options & READ_CHAR_UTF8_NEWLINE)
+      add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL));
     else
       add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));


@@ -3999,6 +3999,7 @@
undefined for invalid characters. */
DEFINE_COMPILER;
sljit_s32 i;
+sljit_s32 has_cmove = sljit_has_cpu_feature(SLJIT_HAS_CMOV);
struct sljit_jump *jump;
struct sljit_jump *buffer_end_close;
struct sljit_label *three_byte_entry;
@@ -4045,11 +4046,25 @@
three_byte_entry = LABEL();

OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
-exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+if (has_cmove)
+ {
+ OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+ CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0xd800);
+ exit_invalid[3] = NULL;
+ }
+else
+ exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

-exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+if (has_cmove)
+ {
+ OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+ CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
+ exit_invalid[4] = NULL;
+ }
+else
+ exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);

JUMPHERE(jump);
@@ -4062,7 +4077,14 @@
exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);

OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
-exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
+if (has_cmove)
+ {
+ OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x100000);
+ CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR - 0x10000);
+ exit_invalid[6] = NULL;
+ }
+else
+ exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);

OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -4108,7 +4130,7 @@
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}

-static void do_utfreadchar_invalid_precise(compiler_common *common)
+static void do_utfreadnewline_invalid(compiler_common *common)
{
/* Slow decoding a UTF-8 character. TMP1 contains the first byte
of the character (>= 0xc0). Return char value in TMP1. */
@@ -4118,7 +4140,7 @@
struct sljit_label *two_byte_entry;
struct sljit_label *three_byte_entry;
struct sljit_label *exit_invalid_label;
-struct sljit_jump *exit_invalid[12];
+struct sljit_jump *exit_invalid[10];

sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);

@@ -4163,32 +4185,26 @@

three_byte_entry = LABEL();

-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
-exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
-OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x20000);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

-exit_invalid[5] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+/* Range between 0xd800 - 0xdfff is not checked. */
+exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);

JUMPHERE(jump);

-/* Four-byte sequence. */
+/* No four byte long newline. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
-OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);

-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
-exit_invalid[7] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
-
-OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);

JUMPHERE(buffer_end_close);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
-exit_invalid[8] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+exit_invalid[6] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);

/* Two-byte sequence. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
@@ -4195,19 +4211,19 @@
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+exit_invalid[7] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);

OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
JUMPTO(SLJIT_ZERO, two_byte_entry);

/* Three-byte sequence. */
-exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);

OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-exit_invalid[11] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);

/* One will be substracted from STR_PTR later. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
@@ -4216,19 +4232,17 @@
CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry);

exit_invalid_label = LABEL();
-sljit_set_label(exit_invalid[4], exit_invalid_label);
+sljit_set_label(exit_invalid[5], exit_invalid_label);
sljit_set_label(exit_invalid[6], exit_invalid_label);
-sljit_set_label(exit_invalid[8], exit_invalid_label);
-sljit_set_label(exit_invalid[9], exit_invalid_label);
+sljit_set_label(exit_invalid[7], exit_invalid_label);

OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

exit_invalid_label = LABEL();
sljit_set_label(exit_invalid[2], exit_invalid_label);
-sljit_set_label(exit_invalid[5], exit_invalid_label);
-sljit_set_label(exit_invalid[7], exit_invalid_label);
-sljit_set_label(exit_invalid[10], exit_invalid_label);
-sljit_set_label(exit_invalid[11], exit_invalid_label);
+sljit_set_label(exit_invalid[4], exit_invalid_label);
+sljit_set_label(exit_invalid[8], exit_invalid_label);
+sljit_set_label(exit_invalid[9], exit_invalid_label);

 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -4717,7 +4731,7 @@
     mainloop = LABEL();
     /* Continual stores does not cause data dependency. */
     OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
-    read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
+    read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_NEWLINE);
     check_newlinechar(common, common->nltype, &newline, TRUE);
     CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop);
     JUMPHERE(end);
@@ -6347,7 +6361,7 @@
 loop = LABEL();
 common->ff_newline_shortcut = loop;


-read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
+read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_NEWLINE);
 lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
 if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
   foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@@ -7272,7 +7286,7 @@
     OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
   else
 #endif
-    OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length);
 #elif PCRE2_CODE_UNIT_WIDTH == 16
 #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED
   if (context->length >= 4)
@@ -8633,11 +8647,14 @@
   if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
 #endif


-  if (common->mode == PCRE2_JIT_COMPLETE && check_str_ptr
-      && (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0))
+  if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE)
+    detect_partial_match(common, backtracks);
+
+  if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)
     {
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length));
-    add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));
+    if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE))
+      add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0));


     context.length = IN_UCHARS(length);
     context.sourcereg = -1;
@@ -8647,8 +8664,6 @@
     return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);
     }


-  if (check_str_ptr)
-    detect_partial_match(common, backtracks);
 #ifdef SUPPORT_UNICODE
   if (common->utf)
     {
@@ -8658,26 +8673,28 @@
 #endif
     c = *cc;


-  if (type == OP_CHAR || !char_has_othercase(common, cc))
-    {
-    read_char(common, c, c, NULL, 0);
-    add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
-    return cc + length;
-    }
+  SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc));


+  if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)
+    add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
   oc = char_othercase(common, c);
   read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
-  bit = c ^ oc;
-  if (is_powerof2(bit))
+
+  SLJIT_ASSERT(!is_powerof2(c ^ oc));
+
+  if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
     {
-    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit);
-    add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
-    return cc + length;
+    OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
+    CMOV(SLJIT_EQUAL, TMP1, SLJIT_IMM, c);
+    add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
     }
-
-  jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
-  add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
-  JUMPHERE(jump[0]);
+  else
+    {
+    jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
+    add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
+    JUMPHERE(jump[0]);
+    }
   return cc + length;


case OP_NOT:
@@ -13732,10 +13749,10 @@
set_jumps(common->utfreadchar, LABEL());
do_utfreadchar(common);
}
-if (common->utfreadchar_invalid_precise != NULL)
+if (common->utfreadnewline_invalid != NULL)
{
- set_jumps(common->utfreadchar_invalid_precise, LABEL());
- do_utfreadchar_invalid_precise(common);
+ set_jumps(common->utfreadnewline_invalid, LABEL());
+ do_utfreadnewline_invalid(common);
}
if (common->utfreadtype8 != NULL)
{

Modified: code/trunk/src/pcre2_jit_test.c
===================================================================
--- code/trunk/src/pcre2_jit_test.c    2018-09-15 17:10:39 UTC (rev 1007)
+++ code/trunk/src/pcre2_jit_test.c    2018-09-16 10:35:00 UTC (rev 1008)
@@ -1757,7 +1757,7 @@


#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16)

-static int check_invalid_utf_result(int pattern_index, char *type, int result,
+static int check_invalid_utf_result(int pattern_index, const char *type, int result,
     int match_start, int match_end, PCRE2_SIZE *ovector)
 {
     if (match_start < 0) {
@@ -1926,7 +1926,16 @@
     { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
     { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
     { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},


+    { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
+    { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
+    { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
+    { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
+    { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
+    { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},
+
     { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
 };