Revision: 342
http://vcs.pcre.org/viewvc?view=rev&revision=342
Author: ph10
Date: 2008-04-20 18:10:13 +0100 (Sun, 20 Apr 2008)
Log Message:
-----------
Slight performance improvement by compiling the new OP_ALLANY opcode for the
metacharacter "." when DOTALL is set. Also, some tidying up that follows from
the introduction of this opcode.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/HACKING
code/trunk/pcre_compile.c
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/pcre_study.c
code/trunk/testdata/testoutput10
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/ChangeLog 2008-04-20 17:10:13 UTC (rev 342)
@@ -86,6 +86,13 @@
it was being rejected as not supported by pcre_dfa_exec(), even though
other assertions are supported. I have made pcre_dfa_exec() support
(*FAIL).
+
+16. The implementation of 13c above involved the invention of a new opcode,
+ OP_ALLANY, which is like OP_ANY but doesn't check the /s flag. Since /s
+ cannot be changed at match time, I realized I could make a small
+ improvement to matching performance by compiling OP_ALLANY instead of
+ OP_ANY for "." when DOTALL was set, and then removing the runtime tests
+ on the OP_ANY path.
Version 7.6 28-Jan-08
Modified: code/trunk/HACKING
===================================================================
--- code/trunk/HACKING 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/HACKING 2008-04-20 17:10:13 UTC (rev 342)
@@ -125,7 +125,8 @@
These items are all just one byte long
OP_END end of pattern
- OP_ANY match any character
+ OP_ANY match any one character other than newline
+ OP_ALLANY match any one character, including newline
OP_ANYBYTE match any single byte, even in UTF-8 mode
OP_SOD match start of data: \A
OP_SOM, start of match (subject + offset): \G
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_compile.c 2008-04-20 17:10:13 UTC (rev 342)
@@ -1301,6 +1301,7 @@
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_ANY:
+ case OP_ALLANY:
branchlength++;
cc++;
break;
@@ -1679,6 +1680,7 @@
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_ANY:
+ case OP_ALLANY:
case OP_ANYBYTE:
case OP_CHAR:
case OP_CHARNC:
@@ -2665,7 +2667,7 @@
zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte;
previous = code;
- *code++ = OP_ANY;
+ *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
break;
@@ -5753,14 +5755,14 @@
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
}
- /* .* is not anchored unless DOTALL is set and it isn't in brackets that
- are or may be referenced. */
+ /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
+ it isn't in brackets that are or may be referenced. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
- op == OP_TYPEPOSSTAR) &&
- (*options & PCRE_DOTALL) != 0)
+ op == OP_TYPEPOSSTAR))
{
- if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
+ if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
+ return FALSE;
}
/* Check for explicit anchoring */
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_dfa_exec.c 2008-04-20 17:10:13 UTC (rev 342)
@@ -739,7 +739,7 @@
/*-----------------------------------------------------------------*/
case OP_ANY:
- if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
+ if (clen > 0 && !IS_NEWLINE(ptr))
{ ADD_NEW(state_offset + 1, 0); }
break;
@@ -877,10 +877,7 @@
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
@@ -903,10 +900,7 @@
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSQUERY)
@@ -928,10 +922,7 @@
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSSTAR)
@@ -951,10 +942,7 @@
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (++count >= GET2(code, 1))
@@ -975,10 +963,7 @@
{
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 &&
- (d != OP_ANY ||
- (ims & PCRE_DOTALL) != 0 ||
- !IS_NEWLINE(ptr)
- ) &&
+ (d != OP_ANY || !IS_NEWLINE(ptr)) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{
if (codevalue == OP_TYPEPOSUPTO)
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_exec.c 2008-04-20 17:10:13 UTC (rev 342)
@@ -1429,16 +1429,12 @@
/* Match a single character type; inline for speed */
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- }
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ALLANY:
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++;
break;
@@ -2955,8 +2951,7 @@
case OP_ANY:
for (i = 1; i <= min; i++)
{
- if (eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr))
RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -3180,15 +3175,11 @@
switch(ctype)
{
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = 1; i <= min; i++)
{
- for (i = 1; i <= min; i++)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
+ eptr++;
}
- else eptr += min;
break;
case OP_ALLANY:
@@ -3449,14 +3440,13 @@
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
- IS_NEWLINE(eptr)))
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
switch(ctype)
{
- case OP_ANY: /* This is the DOTALL case */
+ case OP_ANY: /* This is the non-NL case */
case OP_ALLANY:
case OP_ANYBYTE:
break;
@@ -3609,13 +3599,13 @@
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
+ (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
c = *eptr++;
switch(ctype)
{
- case OP_ANY: /* This is the DOTALL case */
+ case OP_ANY: /* This is the non-NL case */
case OP_ALLANY:
case OP_ANYBYTE:
break;
@@ -3870,43 +3860,24 @@
case OP_ANY:
if (max < INT_MAX)
{
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
- else
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
}
/* Handle unlimited UTF-8 repeat */
else
{
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
+ while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
- else
- {
- eptr = md->end_subject;
- }
}
break;
@@ -4108,16 +4079,12 @@
switch(ctype)
{
case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
+ for (i = min; i < max; i++)
{
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- }
- break;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
+ eptr++;
}
- /* For DOTALL case, fall through */
+ break;
case OP_ALLANY:
case OP_ANYBYTE:
Modified: code/trunk/pcre_study.c
===================================================================
--- code/trunk/pcre_study.c 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/pcre_study.c 2008-04-20 17:10:13 UTC (rev 342)
@@ -348,6 +348,7 @@
switch(tcode[1])
{
case OP_ANY:
+ case OP_ALLANY:
return SSB_FAIL;
case OP_NOT_DIGIT:
Modified: code/trunk/testdata/testoutput10
===================================================================
--- code/trunk/testdata/testoutput10 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/testdata/testoutput10 2008-04-20 17:10:13 UTC (rev 342)
@@ -21,7 +21,7 @@
------------------------------------------------------------------
0 21 Bra
3 9 CBra 1
- 8 Any*
+ 8 AllAny*
10 X
12 6 Alt
15 ^
@@ -37,7 +37,7 @@
0 25 Bra
3 9 Bra
6 04 Opt
- 8 Any*
+ 8 AllAny*
10 X
12 8 Alt
15 04 Opt
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2008-04-19 16:41:04 UTC (rev 341)
+++ code/trunk/testdata/testoutput2 2008-04-20 17:10:13 UTC (rev 342)
@@ -1126,7 +1126,7 @@
/.*X/IDZs
------------------------------------------------------------------
Bra
- Any*
+ AllAny*
X
Ket
End
@@ -1160,7 +1160,7 @@
------------------------------------------------------------------
Bra
CBra 1
- Any*
+ AllAny*
X
Alt
^
@@ -1179,7 +1179,7 @@
------------------------------------------------------------------
Bra
CBra 1
- Any*
+ AllAny*
X
Alt
^
@@ -1199,7 +1199,7 @@
Bra
Bra
04 Opt
- Any*
+ AllAny*
X
Alt
04 Opt
@@ -1212,8 +1212,8 @@
------------------------------------------------------------------
Capturing subpattern count = 0
Partial matching not supported
-No options
-First char at start or follows newline
+Options: anchored
+No first char
No need char
/\Biss\B/I+