Revision: 1423
http://vcs.pcre.org/viewvc?view=rev&revision=1423
Author: zherczeg
Date: 2013-12-31 07:57:56 +0000 (Tue, 31 Dec 2013)
Log Message:
-----------
JIT: Improved matching of newlines.
Modified Paths:
--------------
code/trunk/pcre_jit_compile.c
code/trunk/testdata/testinput4
code/trunk/testdata/testoutput4
Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c 2013-12-30 19:05:36 UTC (rev 1422)
+++ code/trunk/pcre_jit_compile.c 2013-12-31 07:57:56 UTC (rev 1423)
@@ -363,8 +363,10 @@
BOOL positive_assert;
/* Newline control. */
int nltype;
+ pcre_uint32 nlmax;
int newline;
int bsr_nltype;
+ pcre_uint32 bsr_nlmax;
/* Dollar endonly. */
int endonly;
/* Tables. */
@@ -522,6 +524,8 @@
#define GET_LOCAL_BASE(dst, dstw, offset) \
sljit_get_local_base(compiler, (dst), (dstw), (offset))
+#define READ_CHAR_ANY 0x7fffffff
+
static pcre_uchar* bracketend(pcre_uchar* cc)
{
SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND));
@@ -2626,7 +2630,7 @@
static SLJIT_INLINE void read_char(compiler_common *common)
{
-read_char_max(common, 0x7fffffff, TRUE);
+read_char_max(common, READ_CHAR_ANY, TRUE);
}
static void read_char8_type(compiler_common *common, BOOL full_read)
@@ -2730,28 +2734,35 @@
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
}
-static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpiftrue)
+static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpifmatch)
{
/* Character comes in TMP1. Checks if it is a newline. TMP2 may be destroyed. */
DEFINE_COMPILER;
+struct sljit_jump *jump;
if (nltype == NLTYPE_ANY)
{
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
- add_jump(compiler, backtracks, JUMP(jumpiftrue ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
+ add_jump(compiler, backtracks, JUMP(jumpifmatch ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
}
else if (nltype == NLTYPE_ANYCRLF)
{
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_CR);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL);
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
- OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_C_EQUAL);
- add_jump(compiler, backtracks, JUMP(jumpiftrue ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
+ if (jumpifmatch)
+ {
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR));
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
+ }
+ else
+ {
+ jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
+ JUMPHERE(jump);
+ }
}
else
{
SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
- add_jump(compiler, backtracks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
+ add_jump(compiler, backtracks, CMP(jumpifmatch ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
}
@@ -2828,6 +2839,10 @@
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
+OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400);
+OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_NOT_ZERO);
+/* This code runs only in 8 bit mode. No need to shift the value. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
@@ -2949,7 +2964,7 @@
mainloop = LABEL();
/* Continual stores does not cause data dependency. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), common->first_line_end, STR_PTR, 0);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
check_newlinechar(common, common->nltype, &newline, TRUE);
CMPTO(SLJIT_C_LESS, STR_PTR, 0, STR_END, 0, mainloop);
JUMPHERE(end);
@@ -3517,7 +3532,7 @@
skip_char_back(common);
loop = LABEL();
-read_char(common);
+read_char_max(common, common->nlmax, TRUE);
lastchar = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
foundcr = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@@ -4945,7 +4960,7 @@
case OP_ANY:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
@@ -5013,7 +5028,7 @@
case OP_ANYNL:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, common->bsr_nlmax, FALSE);
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
/* We don't need to handle soft partial matching case. */
end_list = NULL;
@@ -5035,7 +5050,7 @@
case OP_NOT_HSPACE:
case OP_HSPACE:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, 0x3000, type == OP_NOT_HSPACE);
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
return cc;
@@ -5043,7 +5058,7 @@
case OP_NOT_VSPACE:
case OP_VSPACE:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, 0x2029, type == OP_NOT_VSPACE);
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
return cc;
@@ -5142,7 +5157,7 @@
else
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, STR_PTR, 0);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, STR_PTR, 0, STR_END, 0));
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, backtracks, JUMP(SLJIT_C_ZERO));
@@ -5190,7 +5205,7 @@
else
{
skip_char_back(common);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
check_newlinechar(common, common->nltype, backtracks, FALSE);
}
JUMPHERE(jump[0]);
@@ -5265,8 +5280,8 @@
#endif
return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);
}
+
detect_partial_match(common, backtracks);
- read_char(common);
#ifdef SUPPORT_UTF
if (common->utf)
{
@@ -5275,12 +5290,15 @@
else
#endif
c = *cc;
+
if (type == OP_CHAR || !char_has_othercase(common, cc))
{
+ read_char_max(common, c, FALSE);
add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
return cc + length;
}
oc = char_othercase(common, c);
+ read_char_max(common, c > oc ? c : oc, FALSE);
bit = c ^ oc;
if (is_powerof2(bit))
{
@@ -5288,11 +5306,9 @@
add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
return cc + length;
}
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, c);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL);
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
- OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_C_EQUAL);
- add_jump(compiler, backtracks, JUMP(SLJIT_C_ZERO));
+ jump[0] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
+ JUMPHERE(jump[0]);
return cc + length;
case OP_NOT:
@@ -9420,6 +9436,7 @@
case PCRE_NEWLINE_ANYCRLF: common->newline = (CHAR_CR << 8) | CHAR_NL; common->nltype = NLTYPE_ANYCRLF; break;
default: return;
}
+common->nlmax = READ_CHAR_ANY;
if ((re->options & PCRE_BSR_ANYCRLF) != 0)
common->bsr_nltype = NLTYPE_ANYCRLF;
else if ((re->options & PCRE_BSR_UNICODE) != 0)
@@ -9432,6 +9449,7 @@
common->bsr_nltype = NLTYPE_ANY;
#endif
}
+common->bsr_nlmax = READ_CHAR_ANY;
common->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
common->ctypes = (sljit_sw)(tables + ctypes_offset);
common->name_table = ((pcre_uchar *)re) + re->name_table_offset;
@@ -9444,6 +9462,23 @@
#ifdef SUPPORT_UCP
common->use_ucp = (re->options & PCRE_UCP) != 0;
#endif
+if (common->utf)
+ {
+ if (common->nltype == NLTYPE_ANY)
+ common->nlmax = 0x2029;
+ else if (common->nltype == NLTYPE_ANYCRLF)
+ common->nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL;
+ else
+ {
+ /* We only care about the first newline character. */
+ common->nlmax = common->newline & 0xff;
+ }
+
+ if (common->bsr_nltype == NLTYPE_ANY)
+ common->bsr_nlmax = 0x2029;
+ else
+ common->bsr_nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL;
+ }
#endif /* SUPPORT_UTF */
ccend = bracketend(rootbacktrack.cc);
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2013-12-30 19:05:36 UTC (rev 1422)
+++ code/trunk/testdata/testinput4 2013-12-31 07:57:56 UTC (rev 1423)
@@ -716,7 +716,10 @@
/^a+[a\x{200}]/8
aa
-/.\B.\B./8
+/^.\B.\B./8
\x{10123}\x{10124}\x{10125}
+/^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/8
+ #\x{10000}#\x{100}#\x{10ffff}#
+
/-- End of testinput4 --/
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2013-12-30 19:05:36 UTC (rev 1422)
+++ code/trunk/testdata/testoutput4 2013-12-31 07:57:56 UTC (rev 1423)
@@ -1263,8 +1263,12 @@
aa
0: aa
-/.\B.\B./8
+/^.\B.\B./8
\x{10123}\x{10124}\x{10125}
0: \x{10123}\x{10124}\x{10125}
+/^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/8
+ #\x{10000}#\x{100}#\x{10ffff}#
+ 0: #\x{10000}#\x{100}#\x{10ffff}#
+
/-- End of testinput4 --/