Revision: 1006
http://www.exim.org/viewvc/pcre2?view=rev&revision=1006
Author: zherczeg
Date: 2018-09-15 13:35:56 +0100 (Sat, 15 Sep 2018)
Log Message:
-----------
Add option bits for read_char in JIT.
Modified Paths:
--------------
code/trunk/src/pcre2_jit_compile.c
code/trunk/src/pcre2_jit_test.c
Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c 2018-09-14 15:15:51 UTC (rev 1005)
+++ code/trunk/src/pcre2_jit_compile.c 2018-09-15 12:35:56 UTC (rev 1006)
@@ -485,6 +485,7 @@
jump_list *getucdtype;
#if PCRE2_CODE_UNIT_WIDTH == 8
jump_list *utfreadchar;
+ jump_list *utfreadchar_invalid_precise;
jump_list *utfreadtype8;
jump_list *utfpeakcharback;
#endif
@@ -3462,8 +3463,13 @@
#endif /* SUPPORT_UNICODE */
}
-static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max,
- jump_list **backtracks, BOOL update_str_ptr)
+#define READ_CHAR_UPDATE_STR_PTR 0x1
+#define READ_CHAR_UPDATE_STR_PTR_INVALID 0x2
+#define READ_CHAR_UPDATE_STR_PTR_PRECISE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UPDATE_STR_PTR_INVALID)
+#define READ_CHAR_VALID_UTF 0x4
+
+static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max,
+ jump_list **backtracks, sljit_u32 options)
{
/* Reads the precise value of a character into TMP1, if the character is
between min and max (c >= min && c <= max). Otherwise it returns with a value
@@ -3476,24 +3482,30 @@
struct sljit_jump *jump2;
#endif
-SLJIT_UNUSED_ARG(update_str_ptr);
SLJIT_UNUSED_ARG(min);
SLJIT_UNUSED_ARG(max);
SLJIT_UNUSED_ARG(backtracks);
+SLJIT_UNUSED_ARG(options);
SLJIT_ASSERT(min <= max);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf)
{
- if (max < 128 && !update_str_ptr) return;
+ if (max < 128 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;
- if (common->invalid_utf)
+ if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
{
jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
- add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+
+ if (options & READ_CHAR_UPDATE_STR_PTR_INVALID)
+ add_jump(compiler, &common->utfreadchar_invalid_precise, JUMP(SLJIT_FAST_CALL));
+ else
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+
if (backtracks != NULL)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
JUMPHERE(jump);
@@ -3504,7 +3516,7 @@
if (min >= 0x10000)
{
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0);
- if (update_str_ptr)
+ if (options & READ_CHAR_UPDATE_STR_PTR)
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7);
@@ -3516,19 +3528,19 @@
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
- if (!update_str_ptr)
+ if (!(options & READ_CHAR_UPDATE_STR_PTR))
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
JUMPHERE(jump2);
- if (update_str_ptr)
+ if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
}
else if (min >= 0x800 && max <= 0xffff)
{
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0);
- if (update_str_ptr)
+ if (options & READ_CHAR_UPDATE_STR_PTR)
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf);
@@ -3536,13 +3548,13 @@
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
- if (!update_str_ptr)
+ if (!(options & READ_CHAR_UPDATE_STR_PTR))
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
JUMPHERE(jump2);
- if (update_str_ptr)
+ if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
}
else if (max >= 0x800)
@@ -3557,7 +3569,7 @@
else
{
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
- if (!update_str_ptr)
+ if (!(options & READ_CHAR_UPDATE_STR_PTR))
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
else
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
@@ -3565,39 +3577,37 @@
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
- if (update_str_ptr)
+ if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
}
JUMPHERE(jump);
}
-#endif
-
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
+#elif PCRE2_CODE_UNIT_WIDTH == 16
if (common->utf)
{
- if (max < 0xd800 && !update_str_ptr) return;
+ if (max < 0xd800 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;
- if (max >= 0x10000 || common->invalid_utf)
+ if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
{
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+ JUMPHERE(jump);
+ return;
+ }
- if (common->invalid_utf)
- {
- jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
- add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
- if (backtracks != NULL)
- add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
- }
- else
- {
- jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
- /* TMP2 contains the high surrogate. */
- OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
- OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
- OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
- OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
- }
+ if (max >= 0x10000)
+ {
+ OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
+ /* TMP2 contains the high surrogate. */
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
JUMPHERE(jump);
return;
}
@@ -3605,13 +3615,25 @@
/* Skip low surrogate if necessary. */
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
- if (update_str_ptr)
+ if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
if (max >= 0xd800)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
JUMPHERE(jump);
}
-#endif
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+if (common->invalid_utf)
+ {
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+ else
+ {
+ OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
+ CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
+ }
+ }
+#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
+#endif /* SUPPORT_UNICODE */
}
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@@ -3646,6 +3668,7 @@
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+/* All values > 127 are zero in ctypes. */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
if (negated)
@@ -3700,14 +3723,15 @@
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
- OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0);
+ OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
if (common->invalid_utf)
- add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1f));
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe0 - 0xc2));
+ OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
if (common->invalid_utf)
- add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3f));
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40));
OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
@@ -3718,6 +3742,7 @@
else if (common->invalid_utf)
{
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
@@ -3970,6 +3995,122 @@
static void do_utfreadchar_invalid(compiler_common *common)
{
/* Slow decoding a UTF-8 character. TMP1 contains the first byte
+of the character (>= 0xc0). Return char value in TMP1. STR_PTR is
+undefined for invalid characters. */
+DEFINE_COMPILER;
+sljit_s32 i;
+struct sljit_jump *jump;
+struct sljit_jump *buffer_end_close;
+struct sljit_label *three_byte_entry;
+struct sljit_label *exit_invalid_label;
+struct sljit_jump *exit_invalid[11];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc2);
+
+/* Usually more than 3 characters remain in the subject buffer. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
+
+/* Not a valid start of a multi-byte sequence, no more bytes read. */
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf5 - 0xc2);
+
+buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Three-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+three_byte_entry = LABEL();
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
+exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Four-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
+exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
+
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(buffer_end_close);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+/* Three-byte sequence. */
+JUMPHERE(jump);
+exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+/* One will be subtracted from STR_PTR later. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+
+/* Four byte sequences are not possible. */
+CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry);
+
+exit_invalid_label = LABEL();
+for (i = 0; i < 11; i++)
+ sljit_set_label(exit_invalid[i], exit_invalid_label);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfreadchar_invalid_precise(compiler_common *common)
+{
+/* Slow decoding a UTF-8 character. TMP1 contains the first byte
of the character (>= 0xc0). Return char value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
@@ -3987,7 +4128,7 @@
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
/* Not a valid start of a multi-byte sequence, no more bytes read. */
-exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc);
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc0);
buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
@@ -4576,7 +4717,7 @@
mainloop = LABEL();
/* Continual stores does not cause data dependency. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
- read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
+ read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
check_newlinechar(common, common->nltype, &newline, TRUE);
CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop);
JUMPHERE(end);
@@ -6206,7 +6347,7 @@
loop = LABEL();
common->ff_newline_shortcut = loop;
-read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
+read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@@ -6451,7 +6592,8 @@
{
move_back(common, &invalid_utf, FALSE);
check_start_used_ptr(common);
- read_char_range(common, 0, READ_CHAR_MAX, &invalid_utf, TRUE);
+ /* No need for a precise read since the match fails anyway. */
+ read_char(common, 0, READ_CHAR_MAX, &invalid_utf, READ_CHAR_UPDATE_STR_PTR);
}
/* Testing char type. */
@@ -7394,7 +7536,10 @@
/* We are not necessary in utf mode even in 8 bit mode. */
cc = ccbegin;
-read_char_range(common, min, max, ((cc[-1] & XCL_NOT) != 0) ? backtracks : NULL, (cc[-1] & XCL_NOT) != 0);
+if ((cc[-1] & XCL_NOT) != 0)
+ read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);
+else
+ read_char(common, min, max, NULL, 0);
if ((cc[-1] & XCL_HASPROP) == 0)
{
@@ -7920,13 +8065,13 @@
}
else
{
- OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0);
- read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
+ OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
+ read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0));
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));
- OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
+ OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
}
JUMPHERE(jump[2]);
JUMPHERE(jump[3]);
@@ -8325,7 +8470,7 @@
case OP_ANY:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, common->nlmin, common->nlmax, backtracks, TRUE);
+ read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
@@ -8352,7 +8497,7 @@
{
if (common->invalid_utf)
{
- read_char_range(common, 0, READ_CHAR_MAX, backtracks, TRUE);
+ read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);
return cc;
}
@@ -8402,7 +8547,7 @@
case OP_ANYNL:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, NULL, FALSE);
+ read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
/* We don't need to handle soft partial matching case. */
end_list = NULL;
@@ -8425,7 +8570,12 @@
case OP_HSPACE:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, 0x9, 0x3000, NULL, type == OP_NOT_HSPACE);
+
+ if (type == OP_NOT_HSPACE)
+ read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);
+ else
+ read_char(common, 0x9, 0x3000, NULL, 0);
+
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@@ -8435,7 +8585,12 @@
case OP_VSPACE:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, 0xa, 0x2029, NULL, type == OP_NOT_VSPACE);
+
+ if (type == OP_NOT_VSPACE)
+ read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);
+ else
+ read_char(common, 0xa, 0x2029, NULL, 0);
+
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@@ -8477,6 +8632,7 @@
#ifdef SUPPORT_UNICODE
if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
#endif
+
if (common->mode == PCRE2_JIT_COMPLETE && check_str_ptr
&& (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0))
{
@@ -8504,12 +8660,13 @@
if (type == OP_CHAR || !char_has_othercase(common, cc))
{
- read_char_range(common, c, c, NULL, FALSE);
+ read_char(common, c, c, NULL, 0);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
return cc + length;
}
+
oc = char_othercase(common, c);
- read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, FALSE);
+ read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
bit = c ^ oc;
if (is_powerof2(bit))
{
@@ -8517,6 +8674,7 @@
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
return cc + length;
}
+
jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
JUMPHERE(jump[0]);
@@ -8533,7 +8691,7 @@
{
#if PCRE2_CODE_UNIT_WIDTH == 8
c = *cc;
- if (c < 128)
+ if (c < 128 && !common->invalid_utf)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
if (type == OP_NOT || !char_has_othercase(common, cc))
@@ -8564,13 +8722,13 @@
if (type == OP_NOT || !char_has_othercase(common, cc))
{
- read_char_range(common, c, c, NULL, TRUE);
+ read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
}
else
{
oc = char_othercase(common, c);
- read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, TRUE);
+ read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);
bit = c ^ oc;
if (is_powerof2(bit))
{
@@ -8592,9 +8750,15 @@
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
- read_char_range(common, 0, bit, NULL, type == OP_NCLASS);
+ if (type == OP_NCLASS)
+ read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);
+ else
+ read_char(common, 0, bit, NULL, 0);
#else
- read_char_range(common, 0, 255, NULL, type == OP_NCLASS);
+ if (type == OP_NCLASS)
+ read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);
+ else
+ read_char(common, 0, 255, NULL, 0);
#endif
if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
@@ -8788,7 +8952,6 @@
int source_reg = COUNT_MATCH;
int source_end_reg = ARGUMENTS;
int char1_reg = STACK_LIMIT;
-BOOL saved_invalid_utf;
#endif /* SUPPORT_UNICODE */
if (ref)
@@ -8830,10 +8993,7 @@
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
OP1(SLJIT_MOV, STR_PTR, 0, source_reg, 0);
- saved_invalid_utf = common->invalid_utf;
- common->invalid_utf = FALSE;
- read_char_range(common, 0, READ_CHAR_MAX, NULL, TRUE);
- common->invalid_utf = saved_invalid_utf;
+ read_char(common, 0, READ_CHAR_MAX, NULL, READ_CHAR_UPDATE_STR_PTR | READ_CHAR_VALID_UTF);
OP1(SLJIT_MOV, source_reg, 0, STR_PTR, 0);
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
@@ -8840,7 +9000,7 @@
OP1(SLJIT_MOV, char1_reg, 0, TMP1, 0);
/* Read second character. */
- read_char_range(common, 0, READ_CHAR_MAX, &no_match, TRUE);
+ read_char(common, 0, READ_CHAR_MAX, &no_match, READ_CHAR_UPDATE_STR_PTR);
CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);
@@ -13572,6 +13732,11 @@
set_jumps(common->utfreadchar, LABEL());
do_utfreadchar(common);
}
+if (common->utfreadchar_invalid_precise != NULL)
+ {
+ set_jumps(common->utfreadchar_invalid_precise, LABEL());
+ do_utfreadchar_invalid_precise(common);
+ }
if (common->utfreadtype8 != NULL)
{
set_jumps(common->utfreadtype8, LABEL());
Modified: code/trunk/src/pcre2_jit_test.c
===================================================================
--- code/trunk/src/pcre2_jit_test.c 2018-09-14 15:15:51 UTC (rev 1005)
+++ code/trunk/src/pcre2_jit_test.c 2018-09-15 12:35:56 UTC (rev 1006)
@@ -1755,6 +1755,41 @@
}
}
+#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16)
+
+static int check_invalid_utf_result(int pattern_index, char *type, int result,
+ int match_start, int match_end, PCRE2_SIZE *ovector)
+{
+ if (match_start < 0) {
+ if (result != -1) {
+ printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
+ return 1;
+ }
+ return 0;
+ }
+
+ if (result <= 0) {
+ printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
+ return 1;
+ }
+
+ if (ovector[0] != (PCRE2_SIZE)match_start) {
+ printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
+ pattern_index, type, (int)ovector[0], match_start);
+ return 1;
+ }
+
+ if (ovector[1] != (PCRE2_SIZE)match_end) {
+ printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
+ pattern_index, type, (int)ovector[1], match_end);
+ return 1;
+ }
+
+ return 0;
+}
+
+#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16) */
+
#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
@@ -1767,121 +1802,132 @@
int start_offset;
int skip_left;
int skip_right;
- int expected_result;
+ int match_start;
+ int match_end;
const char *pattern[2];
const char *input;
};
static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf0\x90\x80\x80" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
- { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf#" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80#" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80" },
- { UDA, CI, 0, 0, 2, -1, { ".", NULL }, "\xef\xbf\xbf#" },
- { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xef\xbf\xbf" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\x7f#" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\xc0" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xed\x9f\xbf#" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xa0\x80#" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xee\x80\x80#" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xbf\xbf#" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf##" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf#" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80##" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80#" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80##" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0##" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf##" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80###" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8###" },
- { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8" },
- { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
+ { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
+ { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
+ { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
+ { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
+ { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
+ { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
+ { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
+ { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
+ { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
+ { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
+ { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
+ { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
+ { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
+ { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
+ { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
+ { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
- { UDA, CPI, 4, 1, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xef\xbf\xbf#" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xe0\xa0\x80#" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
- { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" },
- { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" },
- { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xc2\x80#" },
- { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xc2\x80#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
- { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
- { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
+ { UDA, CPI, 4, 1, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
+ { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
+ { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
+ { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
+ { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
+ { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
+ { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
- { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xef\xbf\xbf#" },
- { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xe0\xa0\x80#" },
- { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
- { UDA, CPI, 3, 1, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
- { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
- { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
- { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
- { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },
+ { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
+ { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
+ { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
+ { UDA, CPI, 3, 1, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
+ { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
+ { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
+ { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
+ { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },
- { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xdf\xbf#" },
- { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xc2\x80#" },
- { UDA, CPI, 2, 1, 0, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
- { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
- { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x80#" },
- { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xdf\xff#" },
- { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xff\xbf#" },
+ { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
+ { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
+ { UDA, CPI, 2, 1, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
+ { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
+ { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80#" },
+ { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff#" },
+ { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf#" },
- { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x7f#" },
- { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x01#" },
- { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" },
- { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" },
+ { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
+ { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
+ { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
+ { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "a\xff" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
- { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "A" },
- { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xff" },
- { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xc3\xa1" },
- { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xc3\xa1" },
- { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xc3\x7f" },
- { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xe1\xbd\xb8" },
- { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
- { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
- { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
+ { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
+ { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
+ { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
+ { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
+ { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
+ { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
+ { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
+ { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
+ { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
- { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
+ { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
+
+ { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
#undef UDA
@@ -1889,17 +1935,18 @@
#undef CPI
static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current,
- int pattern_index, int i, pcre2_match_data_8 *mdata)
+ int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{
pcre2_code_8 *code;
int result, errorcode;
PCRE2_SIZE length, erroroffset;
+ PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);
if (current->pattern[i] == NULL)
return 1;
code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED,
- current->compile_options, &errorcode, &erroroffset, NULL);
+ current->compile_options, &errorcode, &erroroffset, ccontext);
if (!code) {
printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
@@ -1918,8 +1965,7 @@
result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
length, current->start_offset - current->skip_left, 0, mdata, NULL);
- if (result != current->expected_result) {
- printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
pcre2_code_free_8(code);
return 0;
}
@@ -1929,8 +1975,7 @@
result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
- if (result != current->expected_result) {
- printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
pcre2_code_free_8(code);
return 0;
}
@@ -1943,6 +1988,7 @@
static int invalid_utf8_regression_tests(void)
{
struct invalid_utf8_regression_test_case *current;
+ pcre2_compile_context_8 *ccontext;
pcre2_match_data_8 *mdata;
int total = 0, successful = 0;
int result;
@@ -1949,6 +1995,8 @@
printf("\nRunning invalid-utf8 JIT regression tests\n");
+ ccontext = pcre2_compile_context_create_8(NULL);
+ pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
mdata = pcre2_match_data_create_8(4, NULL);
for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
@@ -1956,9 +2004,9 @@
total++;
result = 1;
- if (!run_invalid_utf8_test(current, total - 1, 0, mdata))
+ if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
result = 0;
- if (!run_invalid_utf8_test(current, total - 1, 1, mdata))
+ if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
result = 0;
if (result) {
@@ -1974,6 +2022,7 @@
printf("\n");
pcre2_match_data_free_8(mdata);
+ pcre2_compile_context_free_8(ccontext);
if (total == successful) {
printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
@@ -2005,7 +2054,8 @@
int start_offset;
int skip_left;
int skip_right;
- int expected_result;
+ int match_start;
+ int match_end;
const PCRE2_UCHAR16 *pattern[2];
const PCRE2_UCHAR16 *input;
};
@@ -2024,41 +2074,41 @@
static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
- { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test1 },
- { UDA, CI, 1, 0, 0, 1, { allany, NULL }, test1 },
- { UDA, CI, 2, 0, 0, 1, { allany, NULL }, test1 },
- { UDA, CI, 3, 0, 0, 1, { allany, NULL }, test1 },
- { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test2 },
- { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test2 },
- { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test2 },
- { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test3 },
- { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test3 },
- { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test3 },
+ { UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 },
+ { UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 },
+ { UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 },
+ { UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 },
+ { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 },
+ { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 },
+ { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 },
+ { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 },
+ { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 },
+ { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 },
- { UDA, CPI, 1, 0, 0, 1, { non_word_boundary, NULL }, test1 },
- { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test1 },
- { UDA, CPI, 3, 0, 0, 1, { non_word_boundary, NULL }, test1 },
- { UDA, CPI, 4, 0, 0, 1, { non_word_boundary, NULL }, test1 },
- { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test2 },
- { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test3 },
- { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test2 },
- { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test3 },
- { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test4 },
- { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test5 },
+ { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test2 },
+ { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 },
+ { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 },
+ { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 },
+ { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 },
+ { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test6 },
- { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, { backreference, NULL }, test6 },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test7 },
- { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { backreference, NULL }, test7 },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 },
+ { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 },
- { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test6 },
- { UDA, CPI, 1, 0, 0, 1, { grapheme, NULL }, test6 },
- { UDA, CPI, 2, 0, 0, -1, { grapheme, NULL }, test6 },
- { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test7 },
- { UDA, CPI, 2, 0, 0, 1, { grapheme, NULL }, test7 },
- { UDA, CPI, 1, 0, 0, -1, { grapheme, NULL }, test7 },
+ { UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 },
+ { UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 },
+ { UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 },
+ { UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 },
+ { UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 },
+ { UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 },
- { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
+ { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
#undef UDA
@@ -2066,18 +2116,19 @@
#undef CPI
static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current,
- int pattern_index, int i, pcre2_match_data_16 *mdata)
+ int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{
pcre2_code_16 *code;
int result, errorcode;
PCRE2_SIZE length, erroroffset;
const PCRE2_UCHAR16 *input;
+ PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);
if (current->pattern[i] == NULL)
return 1;
code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
- current->compile_options, &errorcode, &erroroffset, NULL);
+ current->compile_options, &errorcode, &erroroffset, ccontext);
if (!code) {
printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
@@ -2102,8 +2153,7 @@
result = pcre2_jit_match_16(code, (current->input + current->skip_left),
length, current->start_offset - current->skip_left, 0, mdata, NULL);
- if (result != current->expected_result) {
- printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
pcre2_code_free_16(code);
return 0;
}
@@ -2113,8 +2163,7 @@
result = pcre2_jit_match_16(code, (current->input + current->skip_left),
length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
- if (result != current->expected_result) {
- printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
pcre2_code_free_16(code);
return 0;
}
@@ -2127,6 +2176,7 @@
static int invalid_utf16_regression_tests(void)
{
struct invalid_utf16_regression_test_case *current;
+ pcre2_compile_context_16 *ccontext;
pcre2_match_data_16 *mdata;
int total = 0, successful = 0;
int result;
@@ -2133,6 +2183,8 @@
printf("\nRunning invalid-utf16 JIT regression tests\n");
+ ccontext = pcre2_compile_context_create_16(NULL);
+ pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
mdata = pcre2_match_data_create_16(4, NULL);
for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
@@ -2140,9 +2192,9 @@
total++;
result = 1;
- if (!run_invalid_utf16_test(current, total - 1, 0, mdata))
+ if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
result = 0;
- if (!run_invalid_utf16_test(current, total - 1, 1, mdata))
+ if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
result = 0;
if (result) {
@@ -2158,6 +2210,7 @@
printf("\n");
pcre2_match_data_free_16(mdata);
+ pcre2_compile_context_free_16(ccontext);
if (total == successful) {
printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");