[Pcre-svn] [1089] code/trunk: Improved the invalid utf32 support of the JIT compiler.

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1089] code/trunk: Improved the invalid utf32 support of the JIT compiler.
Revision: 1089
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1089
Author:   zherczeg
Date:     2019-05-10 14:15:20 +0100 (Fri, 10 May 2019)
Log Message:
-----------
Improved the invalid utf32 support of the JIT compiler.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/src/pcre2_jit_compile.c
    code/trunk/src/pcre2_jit_test.c


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-04-22 12:39:38 UTC (rev 1088)
+++ code/trunk/ChangeLog    2019-05-10 13:15:20 UTC (rev 1089)
@@ -9,7 +9,10 @@
 check on this was ever implemented. This omission has been rectified; it fixes
 ClusterFuzz 14376.


+2. Improved the invalid utf32 support of the JIT compiler. Now it correctly
+detects invalid characters in the 0xd800-0xdfff range.

+
Version 10.33 16-April-2019
---------------------------


Modified: code/trunk/src/pcre2_jit_compile.c
===================================================================
--- code/trunk/src/pcre2_jit_compile.c    2019-04-22 12:39:38 UTC (rev 1088)
+++ code/trunk/src/pcre2_jit_compile.c    2019-05-10 13:15:20 UTC (rev 1089)
@@ -696,11 +696,12 @@


 #define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
   { \
-  if (ptr[-1] <= 0x7f) \
-    c = *ptr--; \
+  c = ptr[-1]; \
+  if (c <= 0x7f) \
+    ptr--; \
   else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \
     { \
-    c = ptr[-1] - 0x80; \
+    c -= 0x80; \
     \
     if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \
       { \
@@ -775,11 +776,12 @@


 #define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
   { \
-  if (ptr[-1] < 0xd800 || ptr[-1] >= 0xe000) \
-    c = *ptr--; \
-  else if (ptr[-1] >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \
+  c = ptr[-1]; \
+  if (c < 0xd800 || c >= 0xe000) \
+    ptr--; \
+  else if (c >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \
     { \
-    c = (((ptr[-2] - 0xd800) << 10) | (ptr[-1] - 0xdc00)) + 0x10000; \
+    c = (((ptr[-2] - 0xd800) << 10) | (c - 0xdc00)) + 0x10000; \
     ptr -= 2; \
     } \
   else \
@@ -793,7 +795,7 @@


 #define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
   { \
-  if (ptr[0] < 0x110000) \
+  if (ptr[0] < 0xd800 || (ptr[0] >= 0xe000 && ptr[0] < 0x110000)) \
     c = *ptr++; \
   else \
     { \
@@ -801,6 +803,17 @@
     } \
   }


+#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
+  { \
+  c = ptr[-1]; \
+  if (ptr[-1] < 0xd800 || (ptr[-1] >= 0xe000 && ptr[-1] < 0x110000)) \
+    ptr--; \
+  else \
+    { \
+    invalid_action; \
+    } \
+  }
+
 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
 #endif /* SUPPORT_UNICODE */


@@ -3420,12 +3433,21 @@
 #elif PCRE2_CODE_UNIT_WIDTH == 32
 if (common->invalid_utf)
   {
+  if (max < 0xd800) return;
+
   if (backtracks != NULL)
+    {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
     add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+    add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
+    }
   else
     {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
     OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
     CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
+    OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+    CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
     }
   }
 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
@@ -3490,8 +3512,12 @@
     JUMPHERE(jump);
   }
 #elif PCRE2_CODE_UNIT_WIDTH == 32
-  if (common->invalid_utf)
-    add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+if (common->invalid_utf)
+  {
+  OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+  add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+  add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
+  }
 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
 #endif /* SUPPORT_UNICODE */
 }
@@ -3677,11 +3703,18 @@
 if (common->invalid_utf)
   {
   if (backtracks != NULL)
+    {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
     add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+    add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
+    }
   else
     {
+    OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
     OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
     CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
+    OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+    CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
     }
   }
 #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
@@ -8402,12 +8435,12 @@
 PCRE2_SPTR start_subject = args->begin;
 PCRE2_SPTR end_subject = args->end;
 int lgb, rgb, ricount;
-PCRE2_SPTR prevcc, startcc, bptr;
+PCRE2_SPTR prevcc, endcc, bptr;
 BOOL first = TRUE;
 uint32_t c;


 prevcc = cc;
-startcc = NULL;
+endcc = NULL;
 do
   {
   GETCHARINC(c, cc);
@@ -8416,7 +8449,7 @@
   if (first)
     {
     lgb = rgb;
-    startcc = cc;
+    endcc = cc;
     first = FALSE;
     continue;
     }
@@ -8455,25 +8488,27 @@
        lgb != ucp_gbExtended_Pictographic)
     lgb = rgb;


- prevcc = startcc;
- startcc = cc;
+ prevcc = endcc;
+ endcc = cc;
}
while (cc < end_subject);

-return startcc;
+return endcc;
}

+#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
+
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)
{
PCRE2_SPTR start_subject = args->begin;
PCRE2_SPTR end_subject = args->end;
int lgb, rgb, ricount;
-PCRE2_SPTR prevcc, startcc, bptr;
+PCRE2_SPTR prevcc, endcc, bptr;
BOOL first = TRUE;
uint32_t c;

 prevcc = cc;
-startcc = NULL;
+endcc = NULL;
 do
   {
   GETCHARINC_INVALID(c, cc, end_subject, break);
@@ -8482,7 +8517,7 @@
   if (first)
     {
     lgb = rgb;
-    startcc = cc;
+    endcc = cc;
     first = FALSE;
     continue;
     }
@@ -8520,16 +8555,14 @@
        lgb != ucp_gbExtended_Pictographic)
     lgb = rgb;


- prevcc = startcc;
- startcc = cc;
+ prevcc = endcc;
+ endcc = cc;
}
while (cc < end_subject);

-return startcc;
+return endcc;
}

-#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
-
 static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)
 {
 PCRE2_SPTR start_subject = args->begin;
@@ -8800,8 +8833,10 @@
   if (common->invalid_utf)
     add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
 #else
-  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_extuni_no_utf));
-  add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
+  sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM,
+    common->invalid_utf ? SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_no_utf));
+  if (!common->utf || common->invalid_utf)
+    add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
 #endif


OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);

Modified: code/trunk/src/pcre2_jit_test.c
===================================================================
--- code/trunk/src/pcre2_jit_test.c    2019-04-22 12:39:38 UTC (rev 1088)
+++ code/trunk/src/pcre2_jit_test.c    2019-05-10 13:15:20 UTC (rev 1089)
@@ -1770,7 +1770,7 @@
     }
 }


-#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16 || defined SUPPORT_PCRE2_32)
+#if defined SUPPORT_UNICODE

 static int check_invalid_utf_result(int pattern_index, const char *type, int result,
     int match_start, int match_end, PCRE2_SIZE *ovector)
@@ -1803,7 +1803,7 @@
     return 0;
 }


-#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16 || SUPPORT_PCRE2_32) */
+#endif /* SUPPORT_UNICODE */

#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8

@@ -2314,31 +2314,45 @@
static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 };
-static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 };
-static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 };
-static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 };
-static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 };
+static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0 };
+static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
+static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
+static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
+static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };

 static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
     { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
     { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
+    { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
+    { UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
+    { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
+    { UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },


     { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
     { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
     { UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
+    { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
+    { UDA, CPI, 4, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },


-    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 },
-    { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 },
+    { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
+    { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },


     { UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
     { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
+    { UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
+    { UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
+    { UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },


-    { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
-    { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 },
-    { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
+    { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
+    { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
+    { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
+    { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },


-    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 },
-    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 },
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
+    { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },


     { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
 };