Revision: 424
http://www.exim.org/viewvc/pcre2?view=rev&revision=424
Author: ph10
Date: 2015-11-11 09:42:26 +0000 (Wed, 11 Nov 2015)
Log Message:
-----------
Small optimizations in pcre2_study.c
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcre2api.3
code/trunk/src/pcre2_study.c
code/trunk/testdata/testoutput15
code/trunk/testdata/testoutput17
code/trunk/testdata/testoutput2
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-11-10 14:50:51 UTC (rev 423)
+++ code/trunk/ChangeLog 2015-11-11 09:42:26 UTC (rev 424)
@@ -282,7 +282,9 @@
81. Check for integer overflow in minimum length calculation and cap it at
65535.
+82. Small optimizations in code for finding the minimum matching length.
+
Version 10.20 30-June-2015
--------------------------
Modified: code/trunk/doc/pcre2api.3
===================================================================
--- code/trunk/doc/pcre2api.3 2015-11-10 14:50:51 UTC (rev 423)
+++ code/trunk/doc/pcre2api.3 2015-11-11 09:42:26 UTC (rev 424)
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "05 November 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "10 November 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@@ -1684,8 +1684,11 @@
.sp
PCRE2_INFO_MATCHEMPTY
.sp
-Return 1 if the pattern can match an empty string, otherwise 0. The third
-argument should point to an \fBuint32_t\fP variable.
+Return 1 if the pattern might match an empty string, otherwise 0. The third
+argument should point to a \fBuint32_t\fP variable. When a pattern contains
+recursive subroutine calls it is not always possible to determine whether or
+not it can match an empty string. PCRE2 takes a cautious approach and returns 1
+in such cases.
.sp
PCRE2_INFO_MATCHLIMIT
.sp
@@ -3084,6 +3087,6 @@
.rs
.sp
.nf
-Last updated: 05 November 2015
+Last updated: 10 November 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi
Modified: code/trunk/src/pcre2_study.c
===================================================================
--- code/trunk/src/pcre2_study.c 2015-11-10 14:50:51 UTC (rev 423)
+++ code/trunk/src/pcre2_study.c 2015-11-11 09:42:26 UTC (rev 424)
@@ -104,19 +104,22 @@
register int branchlength = 0;
register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
-/* A large and/or complex regex can take too long to process. */
+/* If this is a "could be empty" group, its minimum length is 0. */
-if ((*countptr)++ > 1000) return -1;
+if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
/* Skip over capturing bracket number */
-if (*code == OP_CBRA || *code == OP_SCBRA ||
- *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
+if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
-/* Scan along the opcodes for this branch. If we get to the end of the
-branch, check the length against that of the other branches. If the accumulated
-length passes 16-bits, stop and return it. */
+/* A large and/or complex regex can take too long to process. */
+if ((*countptr)++ > 1000) return -1;
+
+/* Scan along the opcodes for this branch. If we get to the end of the branch,
+check the length against that of the other branches. If the accumulated length
+exceeds 16 bits, stop. */
+
for (;;)
{
int d, min, recno;
@@ -1543,25 +1546,29 @@
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
}
-/* Find the minimum length of subject string. */
+/* Find the minimum length of the subject string. If the pattern can match an
+empty string, the minimum length is already known to be zero. */
-switch(min = find_minlength(re, code, code, utf, NULL, &count))
+if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
{
- case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
- break; /* Leave minlength unchanged (will be zero) */
+ switch(min = find_minlength(re, code, code, utf, NULL, &count))
+ {
+ case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
+ break; /* Leave minlength unchanged (will be zero) */
+
+ case -2:
+ return 2; /* missing capturing bracket */
+
+ case -3:
+ return 3; /* unrecognized opcode */
+
+ default:
+ if (min > UINT16_MAX) min = UINT16_MAX;
+ re->minlength = min;
+ break;
+ }
+ }
- case -2:
- return 2; /* missing capturing bracket */
-
- case -3:
- return 3; /* unrecognized opcode */
-
- default:
- if (min > UINT16_MAX) min = UINT16_MAX;
- re->minlength = min;
- break;
- }
-
return 0;
}
Modified: code/trunk/testdata/testoutput15
===================================================================
--- code/trunk/testdata/testoutput15 2015-11-10 14:50:51 UTC (rev 423)
+++ code/trunk/testdata/testoutput15 2015-11-11 09:42:26 UTC (rev 424)
@@ -252,7 +252,7 @@
/(a|(?R))/I
Capturing subpattern count = 1
May match empty string
-Subject length lower bound = 1
+Subject length lower bound = 0
abcd
0: a
1: a
@@ -262,7 +262,7 @@
/(ab|(bc|(de|(?R))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
abcd
0: ab
1: ab
@@ -272,7 +272,7 @@
/(ab|(bc|(de|(?1))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
abcd
0: ab
1: ab
Modified: code/trunk/testdata/testoutput17
===================================================================
--- code/trunk/testdata/testoutput17 2015-11-10 14:50:51 UTC (rev 423)
+++ code/trunk/testdata/testoutput17 2015-11-11 09:42:26 UTC (rev 424)
@@ -416,7 +416,7 @@
/(a|(?R))/I
Capturing subpattern count = 1
May match empty string
-Subject length lower bound = 1
+Subject length lower bound = 0
JIT compilation was successful
abcd
0: a (JIT)
@@ -427,7 +427,7 @@
/(ab|(bc|(de|(?R))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
JIT compilation was successful
abcd
0: ab (JIT)
@@ -438,7 +438,7 @@
/(ab|(bc|(de|(?1))))/I
Capturing subpattern count = 3
May match empty string
-Subject length lower bound = 2
+Subject length lower bound = 0
JIT compilation was successful
abcd
0: ab (JIT)
Modified: code/trunk/testdata/testoutput2
===================================================================
--- code/trunk/testdata/testoutput2 2015-11-10 14:50:51 UTC (rev 423)
+++ code/trunk/testdata/testoutput2 2015-11-11 09:42:26 UTC (rev 424)
@@ -3960,7 +3960,7 @@
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
-Subject length lower bound = 3
+Subject length lower bound = 2
a=a
0: a=a
1: a