[Pcre-svn] [1006] code/trunk/src: Add option bits for read_char in JIT.

Author: Subversion repository
Date:
To: pcre-svn
Subject: [Pcre-svn] [1006] code/trunk/src: Add option bits for read_char in JIT.

Revision: 1006

          http://www.exim.org/viewvc/pcre2?view=rev&revision=1006
Author:   zherczeg
Date:     2018-09-15 13:35:56 +0100 (Sat, 15 Sep 2018)
Log Message:
-----------
Add option bits for read_char in JIT.

Modified Paths:
--------------
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/pcre2_jit_test.c

Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2018-09-14 15:15:51 UTC (rev 1005)
+++ code/trunk/src/pcre2_jit_compile.c    2018-09-15 12:35:56 UTC (rev 1006)
@@ -485,6 +485,7 @@
   jump_list *getucdtype;
 #if PCRE2_CODE_UNIT_WIDTH == 8
   jump_list *utfreadchar;
+  jump_list *utfreadchar_invalid_precise;
   jump_list *utfreadtype8;
   jump_list *utfpeakcharback;
 #endif
@@ -3462,8 +3463,13 @@
 #endif /* SUPPORT_UNICODE */
 }

-static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max,
- jump_list **backtracks, BOOL update_str_ptr)
+#define READ_CHAR_UPDATE_STR_PTR 0x1
+#define READ_CHAR_UPDATE_STR_PTR_INVALID 0x2
+#define READ_CHAR_UPDATE_STR_PTR_PRECISE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UPDATE_STR_PTR_INVALID)
+#define READ_CHAR_VALID_UTF 0x4
+
+static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max,
+ jump_list **backtracks, sljit_u32 options)
{
/* Reads the precise value of a character into TMP1, if the character is
between min and max (c >= min && c <= max). Otherwise it returns with a value
@@ -3476,24 +3482,30 @@
struct sljit_jump *jump2;
#endif

-SLJIT_UNUSED_ARG(update_str_ptr);
SLJIT_UNUSED_ARG(min);
SLJIT_UNUSED_ARG(max);
SLJIT_UNUSED_ARG(backtracks);
+SLJIT_UNUSED_ARG(options);
SLJIT_ASSERT(min <= max);

OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf)
{
- if (max < 128 && !update_str_ptr) return;
+ if (max < 128 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;

-  if (common->invalid_utf)
+  if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
     {
     jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
-    add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+
+    if (options & READ_CHAR_UPDATE_STR_PTR_INVALID)
+      add_jump(compiler, &common->utfreadchar_invalid_precise, JUMP(SLJIT_FAST_CALL));
+    else
+      add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+
     if (backtracks != NULL)
       add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
     JUMPHERE(jump);
@@ -3504,7 +3516,7 @@
   if (min >= 0x10000)
     {
     OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0);
-    if (update_str_ptr)
+    if (options & READ_CHAR_UPDATE_STR_PTR)
       OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
     OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7);
@@ -3516,19 +3528,19 @@
     OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
     OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
     OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
-    if (!update_str_ptr)
+    if (!(options & READ_CHAR_UPDATE_STR_PTR))
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
     OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
     OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
     OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
     JUMPHERE(jump2);
-    if (update_str_ptr)
+    if (options & READ_CHAR_UPDATE_STR_PTR)
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
     }
   else if (min >= 0x800 && max <= 0xffff)
     {
     OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0);
-    if (update_str_ptr)
+    if (options & READ_CHAR_UPDATE_STR_PTR)
       OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
     OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf);
@@ -3536,13 +3548,13 @@
     OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
     OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
     OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
-    if (!update_str_ptr)
+    if (!(options & READ_CHAR_UPDATE_STR_PTR))
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
     OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
     OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
     OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
     JUMPHERE(jump2);
-    if (update_str_ptr)
+    if (options & READ_CHAR_UPDATE_STR_PTR)
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
     }
   else if (max >= 0x800)
@@ -3557,7 +3569,7 @@
   else
     {
     OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
-    if (!update_str_ptr)
+    if (!(options & READ_CHAR_UPDATE_STR_PTR))
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
     else
       OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
@@ -3565,39 +3577,37 @@
     OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
     OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
     OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-    if (update_str_ptr)
+    if (options & READ_CHAR_UPDATE_STR_PTR)
       OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
     }
   JUMPHERE(jump);
   }
-#endif
-
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
+#elif PCRE2_CODE_UNIT_WIDTH == 16
 if (common->utf)
   {
-  if (max < 0xd800 && !update_str_ptr) return;
+  if (max < 0xd800 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;

-  if (max >= 0x10000 || common->invalid_utf)
+  if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
     {
     OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+    jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+    add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+    if (backtracks != NULL)
+      add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+    JUMPHERE(jump);
+    return;
+    }

-    if (common->invalid_utf)
-      {
-      jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
-      add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
-      if (backtracks != NULL)
-        add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
-      }
-    else
-      {
-      jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
-      /* TMP2 contains the high surrogate. */
-      OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
-      OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
-      OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-      OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
-      OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
-      }
+  if (max >= 0x10000)
+    {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+    jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
+    /* TMP2 contains the high surrogate. */
+    OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+    OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
+    OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+    OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
+    OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
     JUMPHERE(jump);
     return;
     }
@@ -3605,13 +3615,25 @@
   /* Skip low surrogate if necessary. */
   OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
   jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
-  if (update_str_ptr)
+  if (options & READ_CHAR_UPDATE_STR_PTR)
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
   if (max >= 0xd800)
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
   JUMPHERE(jump);
   }
-#endif
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+if (common->invalid_utf)
+  {
+  if (backtracks != NULL)
+    add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+  else
+    {
+    OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
+    CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
+    }
+  }
+#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
+#endif /* SUPPORT_UNICODE */
 }

#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@@ -3646,6 +3668,7 @@
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));

+/* All values > 127 are zero in ctypes. */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);

if (negated)
@@ -3700,14 +3723,15 @@

     OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
     OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-    OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0);
+    OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
     if (common->invalid_utf)
-      add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1f));
+      add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe0 - 0xc2));

+    OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
     OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
     OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
     if (common->invalid_utf)
-      add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3f));
+      add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40));

     OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0);
     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
@@ -3718,6 +3742,7 @@
   else if (common->invalid_utf)
     {
     add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+    OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
     add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));

     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
@@ -3970,6 +3995,122 @@
 static void do_utfreadchar_invalid(compiler_common *common)
 {
 /* Slow decoding a UTF-8 character. TMP1 contains the first byte
+of the character (>= 0xc0). Return char value in TMP1. STR_PTR is
+undefined for invalid characters. */
+DEFINE_COMPILER;
+sljit_s32 i;
+struct sljit_jump *jump;
+struct sljit_jump *buffer_end_close;
+struct sljit_label *three_byte_entry;
+struct sljit_label *exit_invalid_label;
+struct sljit_jump *exit_invalid[11];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc2);
+
+/* Usually more than 3 characters remained in the subject buffer. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
+
+/* Not a valid start of a multi-byte sequence, no more bytes read. */
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf5 - 0xc2);
+
+buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Three-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+three_byte_entry = LABEL();
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
+exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Four-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
+exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
+
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(buffer_end_close);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+/* Three-byte sequence. */
+JUMPHERE(jump);
+exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+/* One will be subtracted from STR_PTR later. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+
+/* Four byte sequences are not possible. */
+CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry);
+
+exit_invalid_label = LABEL();
+for (i = 0; i < 11; i++)
+  sljit_set_label(exit_invalid[i], exit_invalid_label);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfreadchar_invalid_precise(compiler_common *common)
+{
+/* Slow decoding a UTF-8 character. TMP1 contains the first byte
 of the character (>= 0xc0). Return char value in TMP1. */
 DEFINE_COMPILER;
 struct sljit_jump *jump;
@@ -3987,7 +4128,7 @@
 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));

/* Not a valid start of a multi-byte sequence, no more bytes read. */
-exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc);
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc0);

buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);

@@ -4576,7 +4717,7 @@
     mainloop = LABEL();
     /* Continual stores does not cause data dependency. */
     OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
-    read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
+    read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
     check_newlinechar(common, common->nltype, &newline, TRUE);
     CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop);
     JUMPHERE(end);
@@ -6206,7 +6347,7 @@
 loop = LABEL();
 common->ff_newline_shortcut = loop;

-read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
+read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@@ -6451,7 +6592,8 @@
{
move_back(common, &invalid_utf, FALSE);
check_start_used_ptr(common);
- read_char_range(common, 0, READ_CHAR_MAX, &invalid_utf, TRUE);
+ /* No need for a precise read since the match fails anyway. */
+ read_char(common, 0, READ_CHAR_MAX, &invalid_utf, READ_CHAR_UPDATE_STR_PTR);
}

/* Testing char type. */
@@ -7394,7 +7536,10 @@

/* We are not necessary in utf mode even in 8 bit mode. */
cc = ccbegin;
-read_char_range(common, min, max, ((cc[-1] & XCL_NOT) != 0) ? backtracks : NULL, (cc[-1] & XCL_NOT) != 0);
+if ((cc[-1] & XCL_NOT) != 0)
+ read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);
+else
+ read_char(common, min, max, NULL, 0);

 if ((cc[-1] & XCL_HASPROP) == 0)
   {
@@ -7920,13 +8065,13 @@
       }
     else
       {
-      OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0);
-      read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
+      OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
+      read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
       add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0));
       add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
       sljit_set_current_flags(compiler, SLJIT_SET_Z);
       add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));
-      OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
+      OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
       }
     JUMPHERE(jump[2]);
     JUMPHERE(jump[3]);
@@ -8325,7 +8470,7 @@
   case OP_ANY:
   if (check_str_ptr)
     detect_partial_match(common, backtracks);
-  read_char_range(common, common->nlmin, common->nlmax, backtracks, TRUE);
+  read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
   if (common->nltype == NLTYPE_FIXED && common->newline > 255)
     {
     jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
@@ -8352,7 +8497,7 @@
     {
     if (common->invalid_utf)
       {
-      read_char_range(common, 0, READ_CHAR_MAX, backtracks, TRUE);
+      read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);
       return cc;
       }

@@ -8402,7 +8547,7 @@
   case OP_ANYNL:
   if (check_str_ptr)
     detect_partial_match(common, backtracks);
-  read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, NULL, FALSE);
+  read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);
   jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
   /* We don't need to handle soft partial matching case. */
   end_list = NULL;
@@ -8425,7 +8570,12 @@
   case OP_HSPACE:
   if (check_str_ptr)
     detect_partial_match(common, backtracks);
-  read_char_range(common, 0x9, 0x3000, NULL, type == OP_NOT_HSPACE);
+
+  if (type == OP_NOT_HSPACE)
+    read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);
+  else
+    read_char(common, 0x9, 0x3000, NULL, 0);
+
   add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
   sljit_set_current_flags(compiler, SLJIT_SET_Z);
   add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@@ -8435,7 +8585,12 @@
   case OP_VSPACE:
   if (check_str_ptr)
     detect_partial_match(common, backtracks);
-  read_char_range(common, 0xa, 0x2029, NULL, type == OP_NOT_VSPACE);
+
+  if (type == OP_NOT_VSPACE)
+    read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);
+  else
+    read_char(common, 0xa, 0x2029, NULL, 0);
+
   add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
   sljit_set_current_flags(compiler, SLJIT_SET_Z);
   add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@@ -8477,6 +8632,7 @@
 #ifdef SUPPORT_UNICODE
   if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
 #endif
+
   if (common->mode == PCRE2_JIT_COMPLETE && check_str_ptr
       && (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0))
     {
@@ -8504,12 +8660,13 @@

   if (type == OP_CHAR || !char_has_othercase(common, cc))
     {
-    read_char_range(common, c, c, NULL, FALSE);
+    read_char(common, c, c, NULL, 0);
     add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
     return cc + length;
     }
+
   oc = char_othercase(common, c);
-  read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, FALSE);
+  read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
   bit = c ^ oc;
   if (is_powerof2(bit))
     {
@@ -8517,6 +8674,7 @@
     add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
     return cc + length;
     }
+
   jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
   add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
   JUMPHERE(jump[0]);
@@ -8533,7 +8691,7 @@
     {
 #if PCRE2_CODE_UNIT_WIDTH == 8
     c = *cc;
-    if (c < 128)
+    if (c < 128 && !common->invalid_utf)
       {
       OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
       if (type == OP_NOT || !char_has_othercase(common, cc))
@@ -8564,13 +8722,13 @@

   if (type == OP_NOT || !char_has_othercase(common, cc))
     {
-    read_char_range(common, c, c, NULL, TRUE);
+    read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);
     add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
     }
   else
     {
     oc = char_othercase(common, c);
-    read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, TRUE);
+    read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);
     bit = c ^ oc;
     if (is_powerof2(bit))
       {
@@ -8592,9 +8750,15 @@

 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
   bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
-  read_char_range(common, 0, bit, NULL, type == OP_NCLASS);
+  if (type == OP_NCLASS)
+    read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);
+  else
+    read_char(common, 0, bit, NULL, 0);
 #else
-  read_char_range(common, 0, 255, NULL, type == OP_NCLASS);
+  if (type == OP_NCLASS)
+    read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);
+  else
+    read_char(common, 0, 255, NULL, 0);
 #endif

if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
@@ -8788,7 +8952,6 @@
int source_reg = COUNT_MATCH;
int source_end_reg = ARGUMENTS;
int char1_reg = STACK_LIMIT;
-BOOL saved_invalid_utf;
#endif /* SUPPORT_UNICODE */

if (ref)
@@ -8830,10 +8993,7 @@
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
OP1(SLJIT_MOV, STR_PTR, 0, source_reg, 0);

- saved_invalid_utf = common->invalid_utf;
- common->invalid_utf = FALSE;
- read_char_range(common, 0, READ_CHAR_MAX, NULL, TRUE);
- common->invalid_utf = saved_invalid_utf;
+ read_char(common, 0, READ_CHAR_MAX, NULL, READ_CHAR_UPDATE_STR_PTR | READ_CHAR_VALID_UTF);

OP1(SLJIT_MOV, source_reg, 0, STR_PTR, 0);
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
@@ -8840,7 +9000,7 @@
OP1(SLJIT_MOV, char1_reg, 0, TMP1, 0);

/* Read second character. */
- read_char_range(common, 0, READ_CHAR_MAX, &no_match, TRUE);
+ read_char(common, 0, READ_CHAR_MAX, &no_match, READ_CHAR_UPDATE_STR_PTR);

CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);

@@ -13572,6 +13732,11 @@
set_jumps(common->utfreadchar, LABEL());
do_utfreadchar(common);
}
+if (common->utfreadchar_invalid_precise != NULL)
+ {
+ set_jumps(common->utfreadchar_invalid_precise, LABEL());
+ do_utfreadchar_invalid_precise(common);
+ }
if (common->utfreadtype8 != NULL)
{
set_jumps(common->utfreadtype8, LABEL());

Modified: code/trunk/src/pcre2_jit_test.c
===================================================================
--- code/trunk/src/pcre2_jit_test.c    2018-09-14 15:15:51 UTC (rev 1005)
+++ code/trunk/src/pcre2_jit_test.c    2018-09-15 12:35:56 UTC (rev 1006)
@@ -1755,6 +1755,41 @@
     }
 }

+#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16)
+
+static int check_invalid_utf_result(int pattern_index, char *type, int result,
+    int match_start, int match_end, PCRE2_SIZE *ovector)
+{
+    if (match_start < 0) {
+        if (result != -1) {
+            printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
+            return 1;
+        }
+        return 0;
+    }
+
+    if (result <= 0) {
+        printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
+        return 1;
+    }
+
+    if (ovector[0] != (PCRE2_SIZE)match_start) {
+        printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
+            pattern_index, type, (int)ovector[0], match_start);
+        return 1;
+    }
+
+    if (ovector[1] != (PCRE2_SIZE)match_end) {
+        printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
+            pattern_index, type, (int)ovector[1], match_end);
+        return 1;
+    }
+
+    return 0;
+}
+
+#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16) */
+
 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8

 #define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
@@ -1767,121 +1802,132 @@
     int start_offset;
     int skip_left;
     int skip_right;
-    int expected_result;
+    int match_start;
+    int match_end;
     const char *pattern[2];
     const char *input;
 };

 static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf0\x90\x80\x80" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
-    { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf#" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80#" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80" },
-    { UDA, CI, 0, 0, 2, -1, { ".", NULL }, "\xef\xbf\xbf#" },
-    { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xef\xbf\xbf" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\x7f#" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\xc0" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xed\x9f\xbf#" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xa0\x80#" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xee\x80\x80#" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xbf\xbf#" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf##" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf#" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80##" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80#" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80##" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0##" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf##" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80###" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8###" },
-    { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8" },
-    { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
+    { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
+    { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
+    { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
+    { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
+    { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
+    { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
+    { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
+    { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
+    { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
+    { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
+    { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
+    { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
+    { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
+    { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
+    { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
+    { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
+    { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
+    { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
+    { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },

-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
-    { UDA, CPI, 4, 1, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xef\xbf\xbf#" },
-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xe0\xa0\x80#" },
-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
-    { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" },
-    { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" },
-    { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xc2\x80#" },
-    { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xc2\x80#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
-    { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
-    { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
+    { UDA, CPI, 4, 1, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
+    { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
+    { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
+    { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
+    { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
+    { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
+    { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },

-    { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xef\xbf\xbf#" },
-    { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xe0\xa0\x80#" },
-    { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
-    { UDA, CPI, 3, 1, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
-    { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
-    { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
-    { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
-    { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },
+    { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
+    { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
+    { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
+    { UDA, CPI, 3, 1, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
+    { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
+    { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
+    { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
+    { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },

-    { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xdf\xbf#" },
-    { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xc2\x80#" },
-    { UDA, CPI, 2, 1, 0, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
-    { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
-    { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x80#" },
-    { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xdf\xff#" },
-    { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xff\xbf#" },
+    { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
+    { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
+    { UDA, CPI, 2, 1, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
+    { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
+    { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80#" },
+    { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff#" },
+    { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf#" },

-    { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x7f#" },
-    { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x01#" },
-    { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" },
-    { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" },
+    { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
+    { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
+    { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
+    { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },

-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "a\xff" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },

-    { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "A" },
-    { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xff" },
-    { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xc3\xa1" },
-    { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xc3\xa1" },
-    { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xc3\x7f" },
-    { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xe1\xbd\xb8" },
-    { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
-    { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
-    { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
+    { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
+    { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
+    { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
+    { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
+    { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
+    { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
+    { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
+    { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
+    { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },

-    { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
+
+    { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
 };

#undef UDA
@@ -1889,17 +1935,18 @@
#undef CPI

 static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current,
-    int pattern_index, int i, pcre2_match_data_8 *mdata)
+    int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
 {
     pcre2_code_8 *code;
     int result, errorcode;
     PCRE2_SIZE length, erroroffset;
+    PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);

     if (current->pattern[i] == NULL)
         return 1;

     code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED,
-        current->compile_options, &errorcode, &erroroffset, NULL);
+        current->compile_options, &errorcode, &erroroffset, ccontext);

     if (!code) {
         printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
@@ -1918,8 +1965,7 @@
         result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
             length, current->start_offset - current->skip_left, 0, mdata, NULL);

-        if (result != current->expected_result) {
-            printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+        if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
             pcre2_code_free_8(code);
             return 0;
         }
@@ -1929,8 +1975,7 @@
         result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
             length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

-        if (result != current->expected_result) {
-            printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+        if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
             pcre2_code_free_8(code);
             return 0;
         }
@@ -1943,6 +1988,7 @@
 static int invalid_utf8_regression_tests(void)
 {
     struct invalid_utf8_regression_test_case *current;
+    pcre2_compile_context_8 *ccontext;
     pcre2_match_data_8 *mdata;
     int total = 0, successful = 0;
     int result;
@@ -1949,6 +1995,8 @@

     printf("\nRunning invalid-utf8 JIT regression tests\n");

+    ccontext = pcre2_compile_context_create_8(NULL);
+    pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
     mdata = pcre2_match_data_create_8(4, NULL);

     for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
@@ -1956,9 +2004,9 @@
         total++;

         result = 1;
-        if (!run_invalid_utf8_test(current, total - 1, 0, mdata))
+        if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
             result = 0;
-        if (!run_invalid_utf8_test(current, total - 1, 1, mdata))
+        if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
             result = 0;

         if (result) {
@@ -1974,6 +2022,7 @@
         printf("\n");

     pcre2_match_data_free_8(mdata);
+    pcre2_compile_context_free_8(ccontext);

     if (total == successful) {
         printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
@@ -2005,7 +2054,8 @@
     int start_offset;
     int skip_left;
     int skip_right;
-    int expected_result;
+    int match_start;
+    int match_end;
     const PCRE2_UCHAR16 *pattern[2];
     const PCRE2_UCHAR16 *input;
 };
@@ -2024,41 +2074,41 @@
 static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };

 static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
-    { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test1 },
-    { UDA, CI, 1, 0, 0, 1, { allany, NULL }, test1 },
-    { UDA, CI, 2, 0, 0, 1, { allany, NULL }, test1 },
-    { UDA, CI, 3, 0, 0, 1, { allany, NULL }, test1 },
-    { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test2 },
-    { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test2 },
-    { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test2 },
-    { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test3 },
-    { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test3 },
-    { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test3 },
+    { UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 },
+    { UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 },
+    { UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 },
+    { UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 },
+    { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 },
+    { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 },
+    { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 },
+    { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 },
+    { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 },
+    { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 },

-    { UDA, CPI, 1, 0, 0, 1, { non_word_boundary, NULL }, test1 },
-    { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test1 },
-    { UDA, CPI, 3, 0, 0, 1, { non_word_boundary, NULL }, test1 },
-    { UDA, CPI, 4, 0, 0, 1, { non_word_boundary, NULL }, test1 },
-    { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test2 },
-    { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test3 },
-    { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test2 },
-    { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test3 },
-    { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test4 },
-    { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test5 },
+    { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 },
+    { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 },
+    { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 },
+    { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 },
+    { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test2 },
+    { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 },
+    { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 },
+    { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 },

-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test6 },
-    { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, { backreference, NULL }, test6 },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test7 },
-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { backreference, NULL }, test7 },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 },
+    { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 },

-    { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test6 },
-    { UDA, CPI, 1, 0, 0, 1, { grapheme, NULL }, test6 },
-    { UDA, CPI, 2, 0, 0, -1, { grapheme, NULL }, test6 },
-    { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test7 },
-    { UDA, CPI, 2, 0, 0, 1, { grapheme, NULL }, test7 },
-    { UDA, CPI, 1, 0, 0, -1, { grapheme, NULL }, test7 },
+    { UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 },
+    { UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 },
+    { UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 },
+    { UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 },
+    { UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 },

-    { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
+    { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
 };

#undef UDA
@@ -2066,18 +2116,19 @@
#undef CPI

 static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current,
-    int pattern_index, int i, pcre2_match_data_16 *mdata)
+    int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
 {
     pcre2_code_16 *code;
     int result, errorcode;
     PCRE2_SIZE length, erroroffset;
     const PCRE2_UCHAR16 *input;
+    PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);

     if (current->pattern[i] == NULL)
         return 1;

     code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
-        current->compile_options, &errorcode, &erroroffset, NULL);
+        current->compile_options, &errorcode, &erroroffset, ccontext);

     if (!code) {
         printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
@@ -2102,8 +2153,7 @@
         result = pcre2_jit_match_16(code, (current->input + current->skip_left),
             length, current->start_offset - current->skip_left, 0, mdata, NULL);

-        if (result != current->expected_result) {
-            printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+        if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
             pcre2_code_free_16(code);
             return 0;
         }
@@ -2113,8 +2163,7 @@
         result = pcre2_jit_match_16(code, (current->input + current->skip_left),
             length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

-        if (result != current->expected_result) {
-            printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+        if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
             pcre2_code_free_16(code);
             return 0;
         }
@@ -2127,6 +2176,7 @@
 static int invalid_utf16_regression_tests(void)
 {
     struct invalid_utf16_regression_test_case *current;
+    pcre2_compile_context_16 *ccontext;
     pcre2_match_data_16 *mdata;
     int total = 0, successful = 0;
     int result;
@@ -2133,6 +2183,8 @@

     printf("\nRunning invalid-utf16 JIT regression tests\n");

+    ccontext = pcre2_compile_context_create_16(NULL);
+    pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
     mdata = pcre2_match_data_create_16(4, NULL);

     for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
@@ -2140,9 +2192,9 @@
         total++;

         result = 1;
-        if (!run_invalid_utf16_test(current, total - 1, 0, mdata))
+        if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
             result = 0;
-        if (!run_invalid_utf16_test(current, total - 1, 1, mdata))
+        if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
             result = 0;

         if (result) {
@@ -2158,6 +2210,7 @@
         printf("\n");

     pcre2_match_data_free_16(mdata);
+    pcre2_compile_context_free_16(ccontext);

     if (total == successful) {
         printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");

This message is part of the following thread:
	the complete thread tree sorted by date

[Pcre-svn] [1006] code/trunk/src: Add option bits for read_c…