[Pcre-svn] [1415] code/trunk: The auto-possessification of …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1415] code/trunk: The auto-possessification of character sets were improved.
Revision: 1415
          http://vcs.pcre.org/viewvc?view=rev&revision=1415
Author:   zherczeg
Date:     2013-12-22 20:47:08 +0000 (Sun, 22 Dec 2013)


Log Message:
-----------
The auto-possessification of character sets were improved. The JIT compiler also optimizes more character set checks.

Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/pcre_compile.c
    code/trunk/pcre_jit_compile.c
    code/trunk/pcre_jit_test.c
    code/trunk/testdata/testinput5
    code/trunk/testdata/testoutput17
    code/trunk/testdata/testoutput5


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/ChangeLog    2013-12-22 20:47:08 UTC (rev 1415)
@@ -8,7 +8,11 @@
     When this flag is not set, PCRE can perform certain optimizations
     such as studying these XCLASS-es.


+2.  The auto-possessification of character sets were improved: a normal
+    and an extended character set can be compared now. Furthermore
+    the JIT compiler optimizes more character set checks.


+
Version 8.34 15-December-2013
-----------------------------


Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/pcre_compile.c    2013-12-22 20:47:08 UTC (rev 1415)
@@ -3070,8 +3070,11 @@
 const pcre_uint32 *ochr_ptr;
 const pcre_uint32 *list_ptr;
 const pcre_uchar *next_code;
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+const pcre_uchar *xclass_flags;
+#endif
 const pcre_uint8 *class_bitset;
-const pcre_uint32 *set1, *set2, *set_end;
+const pcre_uint8 *set1, *set2, *set_end;
 pcre_uint32 chr;
 BOOL accepted, invert_bits;


@@ -3202,12 +3205,12 @@
     if (base_list[0] == OP_CLASS)
 #endif
       {
-      set1 = (pcre_uint32 *)(base_end - base_list[2]);
+      set1 = (pcre_uint8 *)(base_end - base_list[2]);
       list_ptr = list;
       }
     else
       {
-      set1 = (pcre_uint32 *)(code - list[2]);
+      set1 = (pcre_uint8 *)(code - list[2]);
       list_ptr = base_list;
       }


@@ -3216,41 +3219,53 @@
       {
       case OP_CLASS:
       case OP_NCLASS:
-      set2 = (pcre_uint32 *)
+      set2 = (pcre_uint8 *)
         ((list_ptr == list ? code : base_end) - list_ptr[2]);
       break;


-      /* OP_XCLASS cannot be supported here, because its bitset
-      is not necessarily complete. E.g: [a-\0x{200}] is stored
-      as a character range, and the appropriate bits are not set. */
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+      case OP_XCLASS:
+      xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
+      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
+      if ((*xclass_flags & XCL_MAP) == 0)
+        {
+        /* No bits are set for characters < 256. */
+        if (list[1] == 0) return TRUE;
+        /* Might be an empty repeat. */
+        continue;
+        }
+      set2 = (pcre_uint8 *)(xclass_flags + 1);
+      break;
+#endif


       case OP_NOT_DIGIT:
-        invert_bits = TRUE;
-        /* Fall through */
+      invert_bits = TRUE;
+      /* Fall through */
       case OP_DIGIT:
-        set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
-        break;
+      set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
+      break;


       case OP_NOT_WHITESPACE:
-        invert_bits = TRUE;
-        /* Fall through */
+      invert_bits = TRUE;
+      /* Fall through */
       case OP_WHITESPACE:
-        set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
-        break;
+      set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
+      break;


       case OP_NOT_WORDCHAR:
-        invert_bits = TRUE;
-        /* Fall through */
+      invert_bits = TRUE;
+      /* Fall through */
       case OP_WORDCHAR:
-        set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
-        break;
+      set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
+      break;


       default:
       return FALSE;
       }


-    /* Compare 4 bytes to improve speed. */
-    set_end = set1 + (32 / 4);
+    /* Because the sets are unaligned, we need
+    to perform byte comparison here. */
+    set_end = set1 + 32;
     if (invert_bits)
       {
       do


Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/pcre_jit_compile.c    2013-12-22 20:47:08 UTC (rev 1415)
@@ -306,7 +306,7 @@
   int framesize;
 } then_trap_backtrack;


-#define MAX_RANGE_SIZE 6
+#define MAX_RANGE_SIZE 4

typedef struct compiler_common {
/* The sljit ceneric compiler. */
@@ -3503,22 +3503,28 @@
static BOOL check_ranges(compiler_common *common, int *ranges, jump_list **backtracks, BOOL readch)
{
DEFINE_COMPILER;
-struct sljit_jump *jump;

-if (ranges[0] < 0)
+if (ranges[0] < 0 || ranges[0] > 4)
return FALSE;

+/* No character is accepted. */
+if (ranges[0] == 0 && ranges[1] == 0)
+  add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
+
+if (readch)
+  read_char(common);
+
 switch(ranges[0])
   {
+  case 0:
+  /* When ranges[1] != 0, all characters are accepted. */
+  return TRUE;
+
   case 1:
-  if (readch)
-    read_char(common);
   add_jump(compiler, backtracks, CMP(ranges[1] == 0 ? SLJIT_C_LESS : SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
   return TRUE;


   case 2:
-  if (readch)
-    read_char(common);
   if (ranges[2] + 1 != ranges[3])
     {
     OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
@@ -3529,8 +3535,6 @@
   return TRUE;


   case 3:
-  if (readch)
-    read_char(common);
   if (ranges[1] != 0)
     {
     add_jump(compiler, backtracks, CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
@@ -3541,50 +3545,70 @@
       }
     else
       add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
+    return TRUE;
     }
+
+  add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[2]));
+  if (ranges[3] + 1 != ranges[4])
+    {
+    OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[3]);
+    add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[3]));
+    }
   else
+    add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3]));
+  return TRUE;
+
+  case 4:
+  if ((ranges[3] - ranges[2]) == (ranges[5] - ranges[4])
+      && (ranges[2] | (ranges[4] - ranges[2])) == ranges[4]
+      && is_powerof2(ranges[4] - ranges[2]))
     {
-    add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[2]));
-    if (ranges[3] + 1 != ranges[4])
+    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[2]);
+    if (ranges[4] + 1 != ranges[5])
       {
-      OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[3]);
-      add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[3]));
+      OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
+      add_jump(compiler, backtracks, CMP(ranges[1] != 0 ? SLJIT_C_LESS : SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
       }
     else
-      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3]));
+      add_jump(compiler, backtracks, CMP(ranges[1] != 0 ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
+    return TRUE;
     }
-  return TRUE;


-  case 4:
-  if (ranges[2] + 1 == ranges[3] && ranges[4] + 1 == ranges[5])
+  if (ranges[1] != 0)
     {
-    if (readch)
-      read_char(common);
-    if (ranges[1] != 0)
+    if (ranges[2] + 1 != ranges[3])
       {
-      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
-      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
+      OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
+      add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]));
+      ranges[4] -= ranges[2];
+      ranges[5] -= ranges[2];
       }
     else
+      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
+
+    if (ranges[4] + 1 != ranges[5])
       {
-      jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]);
-      add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
-      JUMPHERE(jump);
+      OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
+      add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
       }
+    else
+      add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
     return TRUE;
     }
-  if ((ranges[3] - ranges[2]) == (ranges[5] - ranges[4]) && is_powerof2(ranges[4] - ranges[2]))
+
+  OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
+  add_jump(compiler, backtracks, CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[2]));
+  if (ranges[3] + 1 != ranges[4])
     {
-    if (readch)
-      read_char(common);
-    OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[2]);
-    OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
-    add_jump(compiler, backtracks, CMP(ranges[1] != 0 ? SLJIT_C_LESS : SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
-    return TRUE;
+    OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]);
+    add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[3]));
     }
-  return FALSE;
+  else
+    add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]));
+  return TRUE;


default:
+ SLJIT_ASSERT_STOP();
return FALSE;
}
}

Modified: code/trunk/pcre_jit_test.c
===================================================================
--- code/trunk/pcre_jit_test.c    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/pcre_jit_test.c    2013-12-22 20:47:08 UTC (rev 1415)
@@ -326,6 +326,12 @@
     { MUA, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
     { MUA, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
     { MUA, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
+    { MUA, 0, "x[bcef]+", "xaxdxecbfg" },
+    { MUA, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
+    { MUA, 0, "x[^befg]+", "xbxexacdhg" },
+    { MUA, 0, "x[^bcdl]+", "xlxbxaekmd" },
+    { MUA, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
+    { MUA, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },


     /* Unicode properties. */
     { MUAP, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },


Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/testdata/testinput5    2013-12-22 20:47:08 UTC (rev 1415)
@@ -790,4 +790,6 @@
 /^a+[a\x{200}]/8BZ
     aa


+/[b-d\x{200}-\x{250}]*[ae-h]?#[\x{200}-\x{250}]{0,8}[\x00-\xff]*#[\x{200}-\x{250}]+[a-z]/8BZ
+
/-- End of testinput5 --/

Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/testdata/testoutput17    2013-12-22 20:47:08 UTC (rev 1415)
@@ -539,10 +539,10 @@
         a*+
         [b-\xff\x{100}-\x{200}]?
         b#
-        [a-f]*
+        [a-f]*+
         [g-\xff\x{100}-\x{200}]*+
         #
-        [g-\xff\x{100}-\x{200}]*
+        [g-\xff\x{100}-\x{200}]*+
         [a-c]*+
         #
         [g-\xff\x{100}-\x{200}]*


Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5    2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/testdata/testoutput5    2013-12-22 20:47:08 UTC (rev 1415)
@@ -1884,4 +1884,19 @@
     aa
  0: aa


+/[b-d\x{200}-\x{250}]*[ae-h]?#[\x{200}-\x{250}]{0,8}[\x00-\xff]*#[\x{200}-\x{250}]+[a-z]/8BZ
+------------------------------------------------------------------
+        Bra
+        [b-d\x{200}-\x{250}]*+
+        [ae-h]?+
+        #
+        [\x{200}-\x{250}]{0,8}+
+        [\x00-\xff]*
+        #
+        [\x{200}-\x{250}]++
+        [a-z]
+        Ket
+        End
+------------------------------------------------------------------
+
 /-- End of testinput5 --/