Revision: 915
http://vcs.pcre.org/viewvc?view=rev&revision=915
Author: zherczeg
Date: 2012-02-14 13:05:39 +0000 (Tue, 14 Feb 2012)
Log Message:
-----------
Improved \X and back reference partial matching
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/pcre_internal.h
code/trunk/pcre_jit_compile.c
code/trunk/testdata/testinput2
code/trunk/testdata/testinput5
code/trunk/testdata/testinput6
code/trunk/testdata/testoutput2
code/trunk/testdata/testoutput5
code/trunk/testdata/testoutput6
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/ChangeLog 2012-02-14 13:05:39 UTC (rev 915)
@@ -14,7 +14,9 @@
4. Partial matching support is added to the JIT compiler.
+5. Improved \X and back reference partial matching.
+
Version 8.30 04-February-2012
-----------------------------
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/pcre_exec.c 2012-02-14 13:05:39 UTC (rev 915)
@@ -147,7 +147,7 @@
match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
BOOL caseless)
{
-PCRE_PUCHAR eptr_start = eptr;
+int matched_length = length;
register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
#ifdef PCRE_DEBUG
@@ -186,14 +186,16 @@
reference, not along the subject (earlier code did this wrong). */
PCRE_PUCHAR endptr = p + length;
+ PCRE_PUCHAR eptr_start = eptr;
while (p < endptr)
{
int c, d;
- if (eptr >= md->end_subject) return -1;
+ if (eptr >= md->end_subject) return -((int)(eptr - eptr_start) + 1);
GETCHARINC(c, eptr);
GETCHARINC(d, p);
if (c != d && c != UCD_OTHERCASE(d)) return -1;
}
+ matched_length = (int)(eptr - eptr_start);
}
else
#endif
@@ -202,7 +204,13 @@
/* The same code works when not in UTF-8 mode and in UTF-8 mode when there
is no UCP support. */
{
- if (eptr + length > md->end_subject) return -1;
+ if (eptr + length > md->end_subject)
+ {
+ if (md->partial == 0)
+ return -1;
+ length = (int)(md->end_subject - eptr);
+ matched_length = -(length + 1);
+ }
while (length-- > 0)
{
if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
@@ -217,11 +225,17 @@
else
{
- if (eptr + length > md->end_subject) return -1;
+ if (eptr + length > md->end_subject)
+ {
+ if (md->partial == 0)
+ return -1;
+ length = (int)(md->end_subject - eptr);
+ matched_length = -(length + 1);
+ }
while (length-- > 0) if (*p++ != *eptr++) return -1;
}
-return (int)(eptr - eptr_start);
+return matched_length;
}
@@ -2595,6 +2609,10 @@
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
+ if (md->partial != 0 && eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ }
ecode++;
break;
#endif
@@ -2660,6 +2678,7 @@
default: /* No repeat follows */
if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
{
+ eptr += -(length + 1);
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
@@ -2685,6 +2704,7 @@
int slength;
if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
{
+ eptr += -(slength + 1);
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
@@ -2708,6 +2728,7 @@
if (fi >= max) RRETURN(MATCH_NOMATCH);
if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
{
+ eptr += -(slength + 1);
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
@@ -2726,7 +2747,10 @@
int slength;
if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
{
+ /* Restore the eptr after the check. */
+ eptr += -(slength + 1);
CHECK_PARTIAL();
+ eptr -= -(slength + 1);
break;
}
eptr += slength;
@@ -4165,6 +4189,10 @@
eptr += len;
}
}
+ if (md->partial != 0 && eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ }
}
else
@@ -4948,6 +4976,10 @@
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
+ if (md->partial != 0 && eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ }
}
}
else
@@ -5491,6 +5523,10 @@
if (UCD_CATEGORY(c) != ucp_M) break;
eptr += len;
}
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ }
}
/* eptr is now past the end of the maximum run */
Modified: code/trunk/pcre_internal.h
===================================================================
--- code/trunk/pcre_internal.h 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/pcre_internal.h 2012-02-14 13:05:39 UTC (rev 915)
@@ -1944,7 +1944,7 @@
/* JIT compiling modes. The function list is indexed by them. */
enum { JIT_COMPILE, JIT_PARTIAL_SOFT_COMPILE, JIT_PARTIAL_HARD_COMPILE,
- JIT_NUMBER_OF_COMPILE_TYPES };
+ JIT_NUMBER_OF_COMPILE_MODES };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/pcre_jit_compile.c 2012-02-14 13:05:39 UTC (rev 915)
@@ -163,10 +163,10 @@
} jit_arguments;
typedef struct executable_functions {
- void *executable_funcs[JIT_NUMBER_OF_COMPILE_TYPES];
+ void *executable_funcs[JIT_NUMBER_OF_COMPILE_MODES];
PUBL(jit_callback) callback;
void *userdata;
- sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_TYPES];
+ sljit_uw executable_sizes[JIT_NUMBER_OF_COMPILE_MODES];
} executable_functions;
typedef struct jump_list {
@@ -2590,10 +2590,10 @@
while (src1 < end1)
{
if (src2 >= end2)
- return 0;
+ return (pcre_uchar*)1;
GETCHARINC(c1, src1);
GETCHARINC(c2, src2);
- if (c1 != c2 && c1 != UCD_OTHERCASE(c2)) return 0;
+ if (c1 != c2 && c1 != UCD_OTHERCASE(c2)) return NULL;
}
return src2;
}
@@ -3288,6 +3288,12 @@
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
JUMPHERE(jump[0]);
+ if (common->mode == JIT_PARTIAL_HARD_COMPILE)
+ {
+ jump[0] = CMP(SLJIT_C_LESS, STR_PTR, 0, STR_END, 0);
+ check_partial(common);
+ JUMPHERE(jump[0]);
+ }
return cc;
#endif
@@ -3727,6 +3733,8 @@
DEFINE_COMPILER;
int offset = GET2(cc, 1) << 1;
struct sljit_jump *jump = NULL;
+struct sljit_jump *partial;
+struct sljit_jump *nopartial;
OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), OVECTOR(offset));
/* OVECTOR(1) contains the "string begin - 1" constant. */
@@ -3741,16 +3749,22 @@
if (withchecks)
jump = CMP(SLJIT_C_EQUAL, TMP1, 0, TMP2, 0);
- if (common->mode != JIT_COMPILE)
- fallback_at_str_end(common, fallbacks);
-
/* Needed to save important temporary registers. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0, STACK_TOP, 0);
OP1(SLJIT_MOV, SLJIT_TEMPORARY_REG2, 0, ARGUMENTS, 0);
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_TEMPORARY_REG2), SLJIT_OFFSETOF(jit_arguments, ptr), STR_PTR, 0);
sljit_emit_ijump(compiler, SLJIT_CALL3, SLJIT_IMM, SLJIT_FUNC_OFFSET(do_utf_caselesscmp));
OP1(SLJIT_MOV, STACK_TOP, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS0);
- add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
+ if (common->mode == JIT_COMPILE)
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_LESS_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1));
+ else
+ {
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
+ nopartial = CMP(SLJIT_C_NOT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 1);
+ check_partial(common);
+ add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
+ JUMPHERE(nopartial);
+ }
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
}
else
@@ -3760,14 +3774,30 @@
if (withchecks)
jump = JUMP(SLJIT_C_ZERO);
- if (common->mode != JIT_COMPILE)
- fallback_at_str_end(common, fallbacks);
-
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ partial = CMP(SLJIT_C_GREATER, STR_PTR, 0, STR_END, 0);
+ if (common->mode == JIT_COMPILE)
+ add_jump(compiler, fallbacks, partial);
- add_jump(compiler, fallbacks, CMP(SLJIT_C_GREATER, STR_PTR, 0, STR_END, 0));
add_jump(compiler, *cc == OP_REF ? &common->casefulcmp : &common->caselesscmp, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0));
+
+ if (common->mode != JIT_COMPILE)
+ {
+ nopartial = JUMP(SLJIT_JUMP);
+ JUMPHERE(partial);
+ /* TMP2 -= STR_PTR - STR_END */
+ OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, STR_PTR, 0);
+ OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, STR_END, 0);
+ partial = CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, 0);
+ OP1(SLJIT_MOV, STR_PTR, 0, STR_END, 0);
+ add_jump(compiler, *cc == OP_REF ? &common->casefulcmp : &common->caselesscmp, JUMP(SLJIT_FAST_CALL));
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0));
+ JUMPHERE(partial);
+ check_partial(common);
+ add_jump(compiler, fallbacks, JUMP(SLJIT_JUMP));
+ JUMPHERE(nopartial);
+ }
}
if (jump != NULL)
@@ -7027,7 +7057,7 @@
{
int i;
executable_functions *functions = (executable_functions *)executable_funcs;
-for (i = 0; i < JIT_NUMBER_OF_COMPILE_TYPES; i++)
+for (i = 0; i < JIT_NUMBER_OF_COMPILE_MODES; i++)
{
if (functions->executable_funcs[i] != NULL)
sljit_free_code(functions->executable_funcs[i]);
Modified: code/trunk/testdata/testinput2
===================================================================
--- code/trunk/testdata/testinput2 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testinput2 2012-02-14 13:05:39 UTC (rev 915)
@@ -3601,4 +3601,43 @@
/(?=a(*:x))(?=a(*:y)c|)/K+
ab
+/(..)\1/
+ ab\P
+ aba\P
+ abab\P
+
+/(..)\1/i
+ ab\P
+ abA\P
+ aBAb\P
+
+/(..)\1{2,}/
+ ab\P
+ aba\P
+ abab\P
+ ababa\P
+ ababab\P
+ ababab\P\P
+ abababa\P
+ abababa\P\P
+
+/(..)\1{2,}/i
+ ab\P
+ aBa\P
+ aBAb\P
+ AbaBA\P
+ abABAb\P
+ aBAbaB\P\P
+ abABabA\P
+ abaBABa\P\P
+
+/(..)\1{2,}?x/i
+ ab\P
+ abA\P
+ aBAb\P
+ abaBA\P
+ abAbaB\P
+ abaBabA\P
+ abAbABaBx\P
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testinput5
===================================================================
--- code/trunk/testdata/testinput5 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testinput5 2012-02-14 13:05:39 UTC (rev 915)
@@ -693,4 +693,43 @@
\x{2027}\x{2030}\x{2028}\x{2029}
\x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
+/(..)\1/8
+ ab\P
+ aba\P
+ abab\P
+
+/(..)\1/8i
+ ab\P
+ abA\P
+ aBAb\P
+
+/(..)\1{2,}/8
+ ab\P
+ aba\P
+ abab\P
+ ababa\P
+ ababab\P
+ ababab\P\P
+ abababa\P
+ abababa\P\P
+
+/(..)\1{2,}/8i
+ ab\P
+ aBa\P
+ aBAb\P
+ AbaBA\P
+ abABAb\P
+ aBAbaB\P\P
+ abABabA\P
+ abaBABa\P\P
+
+/(..)\1{2,}?x/8i
+ ab\P
+ abA\P
+ aBAb\P
+ abaBA\P
+ abAbaB\P
+ abaBabA\P
+ abAbABaBx\P
+
/-- End of testinput5 --/
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testinput6 2012-02-14 13:05:39 UTC (rev 915)
@@ -816,4 +816,28 @@
Ⱥ
ⱥ
+/\X/
+ a\P
+ a\P\P
+
+/\Xa/
+ aa\P
+ aa\P\P
+
+/\X{2}/
+ aa\P
+ aa\P\P
+
+/\X+a/
+ a\P
+ aa\P
+ aa\P\P
+
+/\X+?a/
+ a\P
+ ab\P
+ aa\P
+ aa\P\P
+ aba\P
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testoutput2 2012-02-14 13:05:39 UTC (rev 915)
@@ -12011,4 +12011,79 @@
0+ ab
MK: x
+/(..)\1/
+ ab\P
+Partial match: ab
+ aba\P
+Partial match: aba
+ abab\P
+ 0: abab
+ 1: ab
+
+/(..)\1/i
+ ab\P
+Partial match: ab
+ abA\P
+Partial match: abA
+ aBAb\P
+ 0: aBAb
+ 1: aB
+
+/(..)\1{2,}/
+ ab\P
+Partial match: ab
+ aba\P
+Partial match: aba
+ abab\P
+Partial match: abab
+ ababa\P
+Partial match: ababa
+ ababab\P
+ 0: ababab
+ 1: ab
+ ababab\P\P
+Partial match: ababab
+ abababa\P
+ 0: ababab
+ 1: ab
+ abababa\P\P
+Partial match: abababa
+
+/(..)\1{2,}/i
+ ab\P
+Partial match: ab
+ aBa\P
+Partial match: aBa
+ aBAb\P
+Partial match: aBAb
+ AbaBA\P
+Partial match: AbaBA
+ abABAb\P
+ 0: abABAb
+ 1: ab
+ aBAbaB\P\P
+Partial match: aBAbaB
+ abABabA\P
+ 0: abABab
+ 1: ab
+ abaBABa\P\P
+Partial match: abaBABa
+
+/(..)\1{2,}?x/i
+ ab\P
+Partial match: ab
+ abA\P
+Partial match: abA
+ aBAb\P
+Partial match: aBAb
+ abaBA\P
+Partial match: abaBA
+ abAbaB\P
+Partial match: abAbaB
+ abaBabA\P
+Partial match: abaBabA
+ abAbABaBx\P
+ 0: abAbABaBx
+ 1: ab
+
/-- End of testinput2 --/
Modified: code/trunk/testdata/testoutput5
===================================================================
--- code/trunk/testdata/testoutput5 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testoutput5 2012-02-14 13:05:39 UTC (rev 915)
@@ -1651,4 +1651,79 @@
\x09\x0e\x{84}\x{86}\x{85}\x0a\x0b\x0c\x0d
0: \x{85}\x{0a}\x{0b}\x{0c}\x{0d}
+/(..)\1/8
+ ab\P
+Partial match: ab
+ aba\P
+Partial match: aba
+ abab\P
+ 0: abab
+ 1: ab
+
+/(..)\1/8i
+ ab\P
+Partial match: ab
+ abA\P
+Partial match: abA
+ aBAb\P
+ 0: aBAb
+ 1: aB
+
+/(..)\1{2,}/8
+ ab\P
+Partial match: ab
+ aba\P
+Partial match: aba
+ abab\P
+Partial match: abab
+ ababa\P
+Partial match: ababa
+ ababab\P
+ 0: ababab
+ 1: ab
+ ababab\P\P
+Partial match: ababab
+ abababa\P
+ 0: ababab
+ 1: ab
+ abababa\P\P
+Partial match: abababa
+
+/(..)\1{2,}/8i
+ ab\P
+Partial match: ab
+ aBa\P
+Partial match: aBa
+ aBAb\P
+Partial match: aBAb
+ AbaBA\P
+Partial match: AbaBA
+ abABAb\P
+ 0: abABAb
+ 1: ab
+ aBAbaB\P\P
+Partial match: aBAbaB
+ abABabA\P
+ 0: abABab
+ 1: ab
+ abaBABa\P\P
+Partial match: abaBABa
+
+/(..)\1{2,}?x/8i
+ ab\P
+Partial match: ab
+ abA\P
+Partial match: abA
+ aBAb\P
+Partial match: aBAb
+ abaBA\P
+Partial match: abaBA
+ abAbaB\P
+Partial match: abAbaB
+ abaBabA\P
+Partial match: abaBabA
+ abAbABaBx\P
+ 0: abAbABaBx
+ 1: ab
+
/-- End of testinput5 --/
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2012-02-13 06:04:50 UTC (rev 914)
+++ code/trunk/testdata/testoutput6 2012-02-14 13:05:39 UTC (rev 915)
@@ -1375,4 +1375,42 @@
ⱥ
0: \x{2c65}
+/\X/
+ a\P
+ 0: a
+ a\P\P
+Partial match: a
+
+/\Xa/
+ aa\P
+ 0: aa
+ aa\P\P
+ 0: aa
+
+/\X{2}/
+ aa\P
+ 0: aa
+ aa\P\P
+Partial match: aa
+
+/\X+a/
+ a\P
+Partial match: a
+ aa\P
+ 0: aa
+ aa\P\P
+Partial match: aa
+
+/\X+?a/
+ a\P
+Partial match: a
+ ab\P
+Partial match: ab
+ aa\P
+ 0: aa
+ aa\P\P
+ 0: aa
+ aba\P
+ 0: aba
+
/-- End of testinput6 --/