Revision: 1047
http://vcs.pcre.org/viewvc?view=rev&revision=1047
Author: zherczeg
Date: 2012-09-28 16:06:38 +0100 (Fri, 28 Sep 2012)
Log Message:
-----------
Case folding in JIT and removing unnecessary spaces
Modified Paths:
--------------
code/trunk/pcre_compile.c
code/trunk/pcre_exec.c
code/trunk/pcre_jit_compile.c
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2012-09-25 16:27:58 UTC (rev 1046)
+++ code/trunk/pcre_compile.c 2012-09-28 15:06:38 UTC (rev 1047)
@@ -2996,18 +2996,18 @@
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE) == negated;
-#ifdef SUPPORT_UCP
+#ifdef SUPPORT_UCP
case PT_CLIST:
p = PRIV(ucd_caseless_sets) + prop->caseset;
for (;;)
{
if ((unsigned int)c < *p) return !negated;
if ((unsigned int)c == *p++) return negated;
- }
+ }
break; /* Control never reaches here */
-#endif
+#endif
}
-
+
return FALSE;
}
#endif /* SUPPORT_UCP */
@@ -3109,12 +3109,12 @@
if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
return FALSE;
-
+
/* If the previous item is a character, get its value. */
if (op_code == OP_CHAR || op_code == OP_CHARI ||
op_code == OP_NOT || op_code == OP_NOTI)
- {
+ {
#ifdef SUPPORT_UTF
GETCHARTEST(c, previous);
#else
@@ -3133,19 +3133,19 @@
{
int ocs = UCD_CASESET(next);
if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, FALSE);
- }
+ }
#endif
switch(op_code)
{
case OP_CHAR:
return c != next;
-
+
/* For CHARI (caseless character) we must check the other case. If we have
Unicode property support, we can use it to test the other case of
high-valued characters. We know that next can have only one other case,
because multi-other-case characters are dealt with above. */
-
+
case OP_CHARI:
if (c == next) return FALSE;
#ifdef SUPPORT_UTF
@@ -3184,39 +3184,39 @@
else
#endif /* SUPPORT_UTF */
return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Not UTF */
-
+
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
-
+
case OP_DIGIT:
return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
-
+
case OP_NOT_DIGIT:
return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
-
+
case OP_WHITESPACE:
return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
-
+
case OP_NOT_WHITESPACE:
return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
-
+
case OP_WORDCHAR:
return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
-
+
case OP_NOT_WORDCHAR:
return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
-
+
case OP_HSPACE:
case OP_NOT_HSPACE:
switch(next)
{
HSPACE_CASES:
return op_code == OP_NOT_HSPACE;
-
+
default:
return op_code != OP_NOT_HSPACE;
}
-
+
case OP_ANYNL:
case OP_VSPACE:
case OP_NOT_VSPACE:
@@ -3224,23 +3224,23 @@
{
VSPACE_CASES:
return op_code == OP_NOT_VSPACE;
-
+
default:
return op_code != OP_NOT_VSPACE;
}
-
+
#ifdef SUPPORT_UCP
case OP_PROP:
return check_char_prop(next, previous[0], previous[1], FALSE);
-
+
case OP_NOTPROP:
return check_char_prop(next, previous[0], previous[1], TRUE);
#endif
-
+
default:
return FALSE;
}
- }
+ }
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
@@ -3278,7 +3278,7 @@
{
HSPACE_CASES:
return -next != ESC_h;
-
+
default:
return -next == ESC_h;
}
@@ -3289,7 +3289,7 @@
{
VSPACE_CASES:
return -next != ESC_v;
-
+
default:
return -next == ESC_v;
}
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2012-09-25 16:27:58 UTC (rev 1046)
+++ code/trunk/pcre_exec.c 2012-09-28 15:06:38 UTC (rev 1047)
@@ -200,12 +200,12 @@
if (c != d && c != d + ur->other_case)
{
const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
- for (;;)
- {
- if (c < *pp) return -1;
- if (c == *pp++) break;
+ for (;;)
+ {
+ if (c < *pp) return -1;
+ if (c == *pp++) break;
}
- }
+ }
}
}
else
@@ -2583,17 +2583,17 @@
c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
break;
-
+
case PT_CLIST:
cp = PRIV(ucd_caseless_sets) + prop->caseset;
for (;;)
{
if (c < *cp)
- { if (op == OP_PROP) RRETURN(MATCH_NOMATCH); else break; }
- if (c == *cp++)
- { if (op == OP_PROP) break; else RRETURN(MATCH_NOMATCH); }
- }
- break;
+ { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
+ if (c == *cp++)
+ { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
+ }
+ break;
/* This should never occur */
@@ -4200,12 +4200,12 @@
for (;;)
{
if (c < *cp)
- { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
+ { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
if (c == *cp++)
- { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
- }
+ { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
+ }
}
- break;
+ break;
/* This should not occur */
@@ -4935,11 +4935,11 @@
cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
for (;;)
{
- if (c < *cp)
- { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
+ if (c < *cp)
+ { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
if (c == *cp++)
- { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
- }
+ { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
+ }
}
/* Control never gets here */
Modified: code/trunk/pcre_jit_compile.c
===================================================================
--- code/trunk/pcre_jit_compile.c 2012-09-25 16:27:58 UTC (rev 1046)
+++ code/trunk/pcre_jit_compile.c 2012-09-28 15:06:38 UTC (rev 1047)
@@ -3481,9 +3481,11 @@
static const pcre_uchar *SLJIT_CALL do_utf_caselesscmp(pcre_uchar *src1, jit_arguments *args, pcre_uchar *end1)
{
/* This function would be ineffective to do in JIT level. */
-int c1, c2;
+pcre_uint32 c1, c2;
const pcre_uchar *src2 = args->uchar_ptr;
const pcre_uchar *end2 = args->end;
+const ucd_record *ur;
+const pcre_uint32 *pp;
while (src1 < end1)
{
@@ -3491,7 +3493,16 @@
return (pcre_uchar*)1;
GETCHARINC(c1, src1);
GETCHARINC(c2, src2);
- if (c1 != c2 && c1 != UCD_OTHERCASE(c2)) return NULL;
+ ur = GET_UCD(c2);
+ if (c1 != c2 && c1 != c2 + ur->other_case)
+ {
+ pp = PRIV(ucd_caseless_sets) + ur->caseset;
+ for (;;)
+ {
+ if (c1 < *pp) return NULL;
+ if (c1 == *pp++) break;
+ }
+ }
}
return src2;
}
@@ -3683,18 +3694,17 @@
DEFINE_COMPILER;
jump_list *found = NULL;
jump_list **list = (*cc & XCL_NOT) == 0 ? &found : backtracks;
-unsigned int c;
-int compares;
+pcre_int32 c, charoffset;
+const pcre_uint32 *other_cases;
struct sljit_jump *jump = NULL;
pcre_uchar *ccbegin;
+int compares, invertcmp, numberofcmps;
#ifdef SUPPORT_UCP
BOOL needstype = FALSE, needsscript = FALSE, needschar = FALSE;
BOOL charsaved = FALSE;
int typereg = TMP1, scriptreg = TMP1;
unsigned int typeoffset;
#endif
-int invertcmp, numberofcmps;
-unsigned int charoffset;
/* Although SUPPORT_UTF must be defined, we are
not necessary in utf mode even in 8 bit mode. */
@@ -3792,6 +3802,10 @@
needschar = TRUE;
break;
+ case PT_CLIST:
+ needschar = TRUE;
+ break;
+
default:
SLJIT_ASSERT_STOP();
break;
@@ -4001,6 +4015,20 @@
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_LESS_EQUAL);
jump = JUMP(SLJIT_C_NOT_ZERO ^ invertcmp);
break;
+
+ case PT_CLIST:
+ other_cases = PRIV(ucd_caseless_sets) + cc[1];
+
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, *other_cases++ - charoffset);
+ COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
+
+ while (*other_cases < NOTACHAR)
+ {
+ OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, *other_cases++ - charoffset);
+ COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
+ }
+ jump = JUMP(SLJIT_C_NOT_ZERO ^ invertcmp);
+ break;
}
cc += 2;
}