Revision: 1311
http://vcs.pcre.org/viewvc?view=rev&revision=1311
Author: ph10
Date: 2013-04-22 18:35:23 +0100 (Mon, 22 Apr 2013)
Log Message:
-----------
Use tail recursion in maximizing character and character type repetitions, to
reduce stack usage.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/testdata/testinput1
code/trunk/testdata/testinput4
code/trunk/testdata/testoutput1
code/trunk/testdata/testoutput2
code/trunk/testdata/testoutput4
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/ChangeLog 2013-04-22 17:35:23 UTC (rev 1311)
@@ -134,6 +134,9 @@
35. Implement PCRE_NEVER_UTF to lock out the use of UTF, in particular,
blocking (*UTF) etc.
+
+36. In the interpreter, maximizing pattern repetitions for characters and
+ character types now use tail recursion, which reduces stack usage.
Version 8.32 30-November-2012
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/pcre_exec.c 2013-04-22 17:35:23 UTC (rev 1311)
@@ -3387,7 +3387,22 @@
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. */
+ /* Common code for all repeated single-character matches. We first check
+ for the minimum number of characters. If the minimum equals the maximum, we
+ are done. Otherwise, if minimizing, check the rest of the pattern for a
+ match; if there isn't one, advance up to the maximum, one character at a
+ time.
+
+ If maximizing, advance up to the maximum number of matching characters,
+ until eptr is past the end of the maximum run. If possessive, we are
+ then done (no backing up). Otherwise, match at this position; anything
+ other than no match is immediately returned. For nomatch, back up one
+ character, unless we are matching \R and the last thing matched was
+ \r\n, in which case, back up two bytes. When we reach the first optional
+ character position, we can save stack by doing a tail recurse.
+
+ The various UTF/non-UTF and caseful/caseless cases are handled separately,
+ for speed. */
REPEATCHAR:
#ifdef SUPPORT_UTF
@@ -3471,13 +3486,13 @@
}
}
- if (possessive) continue;
-
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
+ /* if (eptr == pp) { RRETURN(MATCH_NOMATCH); } */
#ifdef SUPPORT_UCP
eptr--;
BACKCHAR(eptr);
@@ -3577,10 +3592,10 @@
eptr++;
}
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3635,10 +3650,10 @@
if (fc != RAWUCHARTEST(eptr)) break;
eptr++;
}
- if (possessive) continue;
-
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
eptr--;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3815,7 +3830,7 @@
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = 1; i <= min; i++)
@@ -3853,7 +3868,7 @@
}
}
else
-#endif
+#endif /*SUPPORT_UTF */
/* Not UTF mode */
{
for (fi = min;; fi++)
@@ -3895,17 +3910,18 @@
if (fc == d || (unsigned int)foc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
else
-#endif
+#endif /* SUPPORT_UTF */
/* Not UTF mode */
{
for (i = min; i < max; i++)
@@ -3918,9 +3934,10 @@
if (fc == *eptr || foc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@@ -4030,12 +4047,13 @@
if (fc == d) break;
eptr += len;
}
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
}
}
@@ -4053,9 +4071,10 @@
if (fc == *eptr) break;
eptr++;
}
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@@ -5613,12 +5632,13 @@
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
if (utf) BACKCHAR(eptr);
}
}
@@ -5655,13 +5675,13 @@
/* eptr is now past the end of the maximum run */
- if (possessive) continue;
-
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
for (;;) /* Move back over one extended */
{
if (!utf) c = *eptr; else
@@ -5936,18 +5956,13 @@
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character, unless we are matching \R and the last thing matched was
- \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
+ if (possessive) continue; /* No backtracking */
for(;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
+ eptr--;
BACKCHAR(eptr);
if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
@@ -6185,15 +6200,10 @@
RRETURN(PCRE_ERROR_INTERNAL);
}
- /* eptr is now past the end of the maximum run. If possessive, we are
- done (no backing up). Otherwise, match at this position; anything other
- than no match is immediately returned. For nomatch, back up one
- character (byte), unless we are matching \R and the last thing matched
- was \r\n, in which case, back up two bytes. */
-
- if (possessive) continue;
- while (eptr >= pp)
+ if (possessive) continue; /* No backtracking */
+ for (;;)
{
+ if (eptr == pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
Modified: code/trunk/testdata/testinput1
===================================================================
--- code/trunk/testdata/testinput1 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testinput1 2013-04-22 17:35:23 UTC (rev 1311)
@@ -5588,4 +5588,24 @@
/(*THEN:m(m)(?&y)(?(DEFINE)(?<y>b))/K
abc
+/^\d*\w{4}/
+ 1234
+ 123
+
+/^[^b]*\w{4}/
+ aaaa
+ aaa
+
+/^[^b]*\w{4}/i
+ aaaa
+ aaa
+
+/^a*\w{4}/
+ aaaa
+ aaa
+
+/^a*\w{4}/i
+ aaaa
+ aaa
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testinput4
===================================================================
--- code/trunk/testdata/testinput4 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testinput4 2013-04-22 17:35:23 UTC (rev 1311)
@@ -691,4 +691,32 @@
\x{fdee}
\x{fdef}
+/^\d*\w{4}/8
+ 1234
+ 123
+
+/^\p{Any}*\d{4}/8
+ 1234
+ 123
+
+/^\X*\w{4}/8
+ 1234
+ 123
+
+/^[^b]*\w{4}/8
+ aaaa
+ aaa
+
+/^[^b]*\w{4}/8i
+ aaaa
+ aaa
+
+/^\x{100}*.{4}/8
+ \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+
+/^\x{100}*.{4}/8i
+ \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+
/-- End of testinput4 --/
Modified: code/trunk/testdata/testoutput1
===================================================================
--- code/trunk/testdata/testoutput1 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testoutput1 2013-04-22 17:35:23 UTC (rev 1311)
@@ -9172,4 +9172,34 @@
0: b
MK: m(m
+/^\d*\w{4}/
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^[^b]*\w{4}/
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^[^b]*\w{4}/i
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^a*\w{4}/
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^a*\w{4}/i
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
/-- End of testinput1 --/
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testoutput2 2013-04-22 17:35:23 UTC (rev 1311)
@@ -4348,7 +4348,7 @@
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\M
Minimum match() limit = 32768
-Minimum match() recursion limit = 42
+Minimum match() recursion limit = 29
No match
/(aaa(?C1)bbb|ab)/I
Modified: code/trunk/testdata/testoutput4
===================================================================
--- code/trunk/testdata/testoutput4 2013-04-06 06:51:09 UTC (rev 1310)
+++ code/trunk/testdata/testoutput4 2013-04-22 17:35:23 UTC (rev 1311)
@@ -1227,4 +1227,46 @@
\x{fdef}
0: \x{fdef}
+/^\d*\w{4}/8
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^\p{Any}*\d{4}/8
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^\X*\w{4}/8
+ 1234
+ 0: 1234
+ 123
+No match
+
+/^[^b]*\w{4}/8
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^[^b]*\w{4}/8i
+ aaaa
+ 0: aaaa
+ aaa
+No match
+
+/^\x{100}*.{4}/8
+ \x{100}\x{100}\x{100}\x{100}
+ 0: \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+No match
+
+/^\x{100}*.{4}/8i
+ \x{100}\x{100}\x{100}\x{100}
+ 0: \x{100}\x{100}\x{100}\x{100}
+ \x{100}\x{100}\x{100}
+No match
+
/-- End of testinput4 --/