Revision: 1415
http://vcs.pcre.org/viewvc?view=rev&revision=1415
Author: zherczeg
Date: 2013-12-22 20:47:08 +0000 (Sun, 22 Dec 2013)
Log Message:
-----------
The auto-possessification of character sets were improved. The JIT compiler also optimizes more character set checks.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_compile.c
code/trunk/pcre_jit_compile.c
code/trunk/pcre_jit_test.c
code/trunk/testdata/testinput5
code/trunk/testdata/testoutput17
code/trunk/testdata/testoutput5
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/ChangeLog 2013-12-22 20:47:08 UTC (rev 1415)
@@ -8,7 +8,11 @@
When this flag is not set, PCRE can perform certain optimizations
such as studying these XCLASS-es.
+2. The auto-possessification of character sets were improved: a normal
+ and an extended character set can be compared now. Furthermore
+ the JIT compiler optimizes more character set checks.
+
Version 8.34 15-December-2013
-----------------------------
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/pcre_compile.c 2013-12-22 20:47:08 UTC (rev 1415)
@@ -3070,8 +3070,11 @@
const pcre_uint32 *ochr_ptr;
const pcre_uint32 *list_ptr;
const pcre_uchar *next_code;
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+const pcre_uchar *xclass_flags;
+#endif
const pcre_uint8 *class_bitset;
-const pcre_uint32 *set1, *set2, *set_end;
+const pcre_uint8 *set1, *set2, *set_end;
pcre_uint32 chr;
BOOL accepted, invert_bits;
@@ -3202,12 +3205,12 @@
if (base_list[0] == OP_CLASS)
#endif
{
- set1 = (pcre_uint32 *)(base_end - base_list[2]);
+ set1 = (pcre_uint8 *)(base_end - base_list[2]);
list_ptr = list;
}
else
{
- set1 = (pcre_uint32 *)(code - list[2]);
+ set1 = (pcre_uint8 *)(code - list[2]);
list_ptr = base_list;
}
@@ -3216,41 +3219,53 @@
{
case OP_CLASS:
case OP_NCLASS:
- set2 = (pcre_uint32 *)
+ set2 = (pcre_uint8 *)
((list_ptr == list ? code : base_end) - list_ptr[2]);
break;
- /* OP_XCLASS cannot be supported here, because its bitset
- is not necessarily complete. E.g: [a-\0x{200}] is stored
- as a character range, and the appropriate bits are not set. */
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
+ case OP_XCLASS:
+ xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
+ if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
+ if ((*xclass_flags & XCL_MAP) == 0)
+ {
+ /* No bits are set for characters < 256. */
+ if (list[1] == 0) return TRUE;
+ /* Might be an empty repeat. */
+ continue;
+ }
+ set2 = (pcre_uint8 *)(xclass_flags + 1);
+ break;
+#endif
case OP_NOT_DIGIT:
- invert_bits = TRUE;
- /* Fall through */
+ invert_bits = TRUE;
+ /* Fall through */
case OP_DIGIT:
- set2 = (pcre_uint32 *)(cd->cbits + cbit_digit);
- break;
+ set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
+ break;
case OP_NOT_WHITESPACE:
- invert_bits = TRUE;
- /* Fall through */
+ invert_bits = TRUE;
+ /* Fall through */
case OP_WHITESPACE:
- set2 = (pcre_uint32 *)(cd->cbits + cbit_space);
- break;
+ set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
+ break;
case OP_NOT_WORDCHAR:
- invert_bits = TRUE;
- /* Fall through */
+ invert_bits = TRUE;
+ /* Fall through */
case OP_WORDCHAR:
- set2 = (pcre_uint32 *)(cd->cbits + cbit_word);
- break;
+ set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
+ break;
default:
return FALSE;
}
- /* Compare 4 bytes to improve speed. */
- set_end = set1 + (32 / 4);
+ /* Because the sets are unaligned, we need
+ to perform byte comparison here. */
+ set_end = set1 + 32;
if (invert_bits)
{
do
Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/pcre_jit_compile.c 2013-12-22 20:47:08 UTC (rev 1415)
@@ -306,7 +306,7 @@
int framesize;
} then_trap_backtrack;
-#define MAX_RANGE_SIZE 6
+#define MAX_RANGE_SIZE 4
typedef struct compiler_common {
/* The sljit ceneric compiler. */
@@ -3503,22 +3503,28 @@
static BOOL check_ranges(compiler_common *common, int *ranges, jump_list **backtracks, BOOL readch)
{
DEFINE_COMPILER;
-struct sljit_jump *jump;
-if (ranges[0] < 0)
+if (ranges[0] < 0 || ranges[0] > 4)
return FALSE;
+/* No character is accepted. */
+if (ranges[0] == 0 && ranges[1] == 0)
+ add_jump(compiler, backtracks, JUMP(SLJIT_JUMP));
+
+if (readch)
+ read_char(common);
+
switch(ranges[0])
{
+ case 0:
+ /* When ranges[1] != 0, all characters are accepted. */
+ return TRUE;
+
case 1:
- if (readch)
- read_char(common);
add_jump(compiler, backtracks, CMP(ranges[1] == 0 ? SLJIT_C_LESS : SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
return TRUE;
case 2:
- if (readch)
- read_char(common);
if (ranges[2] + 1 != ranges[3])
{
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
@@ -3529,8 +3535,6 @@
return TRUE;
case 3:
- if (readch)
- read_char(common);
if (ranges[1] != 0)
{
add_jump(compiler, backtracks, CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
@@ -3541,50 +3545,70 @@
}
else
add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
+ return TRUE;
}
+
+ add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[2]));
+ if (ranges[3] + 1 != ranges[4])
+ {
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[3]);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[3]));
+ }
else
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3]));
+ return TRUE;
+
+ case 4:
+ if ((ranges[3] - ranges[2]) == (ranges[5] - ranges[4])
+ && (ranges[2] | (ranges[4] - ranges[2])) == ranges[4]
+ && is_powerof2(ranges[4] - ranges[2]))
{
- add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[2]));
- if (ranges[3] + 1 != ranges[4])
+ OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[2]);
+ if (ranges[4] + 1 != ranges[5])
{
- OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[3]);
- add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[3]));
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
+ add_jump(compiler, backtracks, CMP(ranges[1] != 0 ? SLJIT_C_LESS : SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
}
else
- add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3]));
+ add_jump(compiler, backtracks, CMP(ranges[1] != 0 ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
+ return TRUE;
}
- return TRUE;
- case 4:
- if (ranges[2] + 1 == ranges[3] && ranges[4] + 1 == ranges[5])
+ if (ranges[1] != 0)
{
- if (readch)
- read_char(common);
- if (ranges[1] != 0)
+ if (ranges[2] + 1 != ranges[3])
{
- add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
- add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]));
+ ranges[4] -= ranges[2];
+ ranges[5] -= ranges[2];
}
else
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]));
+
+ if (ranges[4] + 1 != ranges[5])
{
- jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[2]);
- add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
- JUMPHERE(jump);
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
}
+ else
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[4]));
return TRUE;
}
- if ((ranges[3] - ranges[2]) == (ranges[5] - ranges[4]) && is_powerof2(ranges[4] - ranges[2]))
+
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[2]);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[2]));
+ if (ranges[3] + 1 != ranges[4])
{
- if (readch)
- read_char(common);
- OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[2]);
- OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[4]);
- add_jump(compiler, backtracks, CMP(ranges[1] != 0 ? SLJIT_C_LESS : SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, ranges[5] - ranges[4]));
- return TRUE;
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, ranges[4] - ranges[3]));
}
- return FALSE;
+ else
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, ranges[3] - ranges[2]));
+ return TRUE;
default:
+ SLJIT_ASSERT_STOP();
return FALSE;
}
}
Modified: code/trunk/pcre_jit_test.c
===================================================================
--- code/trunk/pcre_jit_test.c 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/pcre_jit_test.c 2013-12-22 20:47:08 UTC (rev 1415)
@@ -326,6 +326,12 @@
{ MUA, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
{ MUA, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
{ MUA, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
+ { MUA, 0, "x[bcef]+", "xaxdxecbfg" },
+ { MUA, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
+ { MUA, 0, "x[^befg]+", "xbxexacdhg" },
+ { MUA, 0, "x[^bcdl]+", "xlxbxaekmd" },
+ { MUA, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
+ { MUA, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
/* Unicode properties. */
{ MUAP, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/testdata/testinput5 2013-12-22 20:47:08 UTC (rev 1415)
@@ -790,4 +790,6 @@
/^a+[a\x{200}]/8BZ
aa
+/[b-d\x{200}-\x{250}]*[ae-h]?#[\x{200}-\x{250}]{0,8}[\x00-\xff]*#[\x{200}-\x{250}]+[a-z]/8BZ
+
/-- End of testinput5 --/
Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/testdata/testoutput17 2013-12-22 20:47:08 UTC (rev 1415)
@@ -539,10 +539,10 @@
a*+
[b-\xff\x{100}-\x{200}]?
b#
- [a-f]*
+ [a-f]*+
[g-\xff\x{100}-\x{200}]*+
#
- [g-\xff\x{100}-\x{200}]*
+ [g-\xff\x{100}-\x{200}]*+
[a-c]*+
#
[g-\xff\x{100}-\x{200}]*
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2013-12-22 16:27:35 UTC (rev 1414)
+++ code/trunk/testdata/testoutput5 2013-12-22 20:47:08 UTC (rev 1415)
@@ -1884,4 +1884,19 @@
aa
0: aa
+/[b-d\x{200}-\x{250}]*[ae-h]?#[\x{200}-\x{250}]{0,8}[\x00-\xff]*#[\x{200}-\x{250}]+[a-z]/8BZ
+------------------------------------------------------------------
+ Bra
+ [b-d\x{200}-\x{250}]*+
+ [ae-h]?+
+ #
+ [\x{200}-\x{250}]{0,8}+
+ [\x00-\xff]*
+ #
+ [\x{200}-\x{250}]++
+ [a-z]
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput5 --/