Revision: 606
http://vcs.pcre.org/viewvc?view=rev&revision=606
Author: ph10
Date: 2011-06-06 18:46:22 +0100 (Mon, 06 Jun 2011)
Log Message:
-----------
Tidy the API for _pcre_valid_utf8() to a more suitable form for a future public
release. Also make -s in pcretest force a study for every regex.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/doc/pcretest.1
code/trunk/pcre_compile.c
code/trunk/pcre_dfa_exec.c
code/trunk/pcre_exec.c
code/trunk/pcre_valid_utf8.c
code/trunk/pcretest.c
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/ChangeLog 2011-06-06 17:46:22 UTC (rev 606)
@@ -74,6 +74,13 @@
opcodes that mean there is no starting character; this means that when new
ones are added and accidentally left out of pcre_study(), testing should
pick them up.
+
+14. The -s option of pcretest has been documented for ages as being an old
+ synonym of -m (show memory usage). I have changed it to mean "force study
+ for every regex", that is, assume /S for every regex. This is similar to -i
+ and -d etc. It's slightly incompatible, but I'm hoping nobody is still
+ using it. It makes it easier to run collections of tests with study enabled,
+ and thereby test pcre_study() more easily.
Version 8.12 15-Jan-2011
Modified: code/trunk/doc/pcretest.1
===================================================================
--- code/trunk/doc/pcretest.1 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/doc/pcretest.1 2011-06-06 17:46:22 UTC (rev 606)
@@ -56,8 +56,7 @@
.TP 10
\fB-m\fP
Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding \fB/M\fP to each regular expression. For compatibility
-with earlier versions of pcretest, \fB-s\fP is a synonym for \fB-m\fP.
+equivalent to adding \fB/M\fP to each regular expression.
.TP 10
\fB-o\fP \fIosize\fP
Set the number of elements in the output vector that is used when calling
@@ -79,6 +78,10 @@
On Unix-like systems, set the size of the run-time stack to \fIsize\fP
megabytes.
.TP 10
+\fB-s\fP
+Behave as if each regex has the \fB/S\fP modifier; in other words, force each
+regex to be studied.
+.TP 10
\fB-t\fP
Run each compile, study, and match many times with a timer, and output
resulting time per compile or match (in milliseconds). Do not set \fB-m\fP with
@@ -789,6 +792,6 @@
.rs
.sp
.nf
-Last updated: 07 May 2011
+Last updated: 06 June 2011
Copyright (c) 1997-2011 University of Cambridge.
.fi
Modified: code/trunk/pcre_compile.c
===================================================================
--- code/trunk/pcre_compile.c 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_compile.c 2011-06-06 17:46:22 UTC (rev 606)
@@ -6976,13 +6976,12 @@
/* Can't support UTF8 unless PCRE has been compiled to include the code. The
return of an error code from _pcre_valid_utf8() is a new feature, introduced in
-release 8.13. The only use we make of it here is to adjust the offset value to
-the end of the string for a short string error, for compatibility with previous
-versions. */
+release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
+not used here. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
- (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1, &errorcode)) >= 0)
+ (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0)
{
errorcode = ERR44;
goto PCRE_EARLY_ERROR_RETURN2;
Modified: code/trunk/pcre_dfa_exec.c
===================================================================
--- code/trunk/pcre_dfa_exec.c 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_dfa_exec.c 2011-06-06 17:46:22 UTC (rev 606)
@@ -3123,23 +3123,21 @@
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- int errorcode;
- int tb = _pcre_valid_utf8((uschar *)subject, length, &errorcode);
- if (tb >= 0)
+ int erroroffset;
+ int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
+ if (errorcode != 0)
{
if (offsetcount >= 2)
{
- offsets[0] = tb;
+ offsets[0] = erroroffset;
offsets[1] = errorcode;
}
return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
}
- if (start_offset > 0 && start_offset < length)
- {
- tb = ((USPTR)subject)[start_offset] & 0xc0;
- if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ if (start_offset > 0 && start_offset < length &&
+ (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
+ return PCRE_ERROR_BADUTF8_OFFSET;
}
#endif
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_exec.c 2011-06-06 17:46:22 UTC (rev 606)
@@ -5999,28 +5999,29 @@
return PCRE_ERROR_BADPARTIAL;
/* Check a UTF-8 string if required. Pass back the character offset and error
-code if a results vector is available. */
+code for an invalid string if a results vector is available. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
- int errorcode;
- int tb = _pcre_valid_utf8((USPTR)subject, length, &errorcode);
- if (tb >= 0)
+ int erroroffset;
+ int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
+ if (errorcode != 0)
{
if (offsetcount >= 2)
{
- offsets[0] = tb;
+ offsets[0] = erroroffset;
offsets[1] = errorcode;
}
return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
- }
- if (start_offset > 0 && start_offset < length)
- {
- tb = ((USPTR)subject)[start_offset] & 0xc0;
- if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
- }
+ }
+
+ /* Check that a start_offset points to the start of a UTF-8 character. */
+
+ if (start_offset > 0 && start_offset < length &&
+ (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
+ return PCRE_ERROR_BADUTF8_OFFSET;
}
#endif
Modified: code/trunk/pcre_valid_utf8.c
===================================================================
--- code/trunk/pcre_valid_utf8.c 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcre_valid_utf8.c 2011-06-06 17:46:22 UTC (rev 606)
@@ -68,7 +68,7 @@
characters is still checked.
From release 8.13 more information about the details of the error are passed
-back in the error code:
+back in the returned value:
PCRE_UTF8_ERR0 No error
PCRE_UTF8_ERR1 Missing 1 byte at the end of the string
@@ -96,14 +96,14 @@
Arguments:
string points to the string
length length of string, or -1 if the string is zero-terminated
- errp pointer to an error code variable
+ erroroffset pointer to an error position offset variable
-Returns: < 0 if the string is a valid UTF-8 string
- >= 0 otherwise; the value is the offset of the bad character
+Returns: = 0 if the string is a valid UTF-8 string
+ > 0 otherwise (a PCRE_UTF8_ERRn code); the offset of the bad character is stored in *erroroffset
*/
int
-_pcre_valid_utf8(USPTR string, int length, int *errorcode)
+_pcre_valid_utf8(USPTR string, int length, int *erroroffset)
{
#ifdef SUPPORT_UTF8
register USPTR p;
@@ -114,8 +114,6 @@
length = p - string;
}
-*errorcode = PCRE_UTF8_ERR0;
-
for (p = string; length-- > 0; p++)
{
register int ab, c, d;
@@ -125,21 +123,21 @@
if (c < 0xc0) /* Isolated 10xx xxxx byte */
{
- *errorcode = PCRE_UTF8_ERR20;
- return p - string;
+ *erroroffset = p - string;
+ return PCRE_UTF8_ERR20;
}
if (c >= 0xfe) /* Invalid 0xfe or 0xff bytes */
{
- *errorcode = PCRE_UTF8_ERR21;
- return p - string;
+ *erroroffset = p - string;
+ return PCRE_UTF8_ERR21;
}
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
if (length < ab)
{
- *errorcode = ab - length; /* Codes ERR1 to ERR5 */
- return p - string; /* Missing bytes */
+ *erroroffset = p - string; /* Missing bytes */
+ return ab - length; /* Codes ERR1 to ERR5 */
}
length -= ab; /* Length remaining */
@@ -147,8 +145,8 @@
if (((d = *(++p)) & 0xc0) != 0x80)
{
- *errorcode = PCRE_UTF8_ERR6;
- return p - string - 1;
+ *erroroffset = p - string - 1;
+ return PCRE_UTF8_ERR6;
}
/* For each length, check that the remaining bytes start with the 0x80 bit
@@ -162,8 +160,8 @@
case 1: if ((c & 0x3e) == 0)
{
- *errorcode = PCRE_UTF8_ERR15;
- return p - string - 1;
+ *erroroffset = p - string - 1;
+ return PCRE_UTF8_ERR15;
}
break;
@@ -174,18 +172,18 @@
case 2:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *errorcode = PCRE_UTF8_ERR7;
- return p - string - 2;
+ *erroroffset = p - string - 2;
+ return PCRE_UTF8_ERR7;
}
if (c == 0xe0 && (d & 0x20) == 0)
{
- *errorcode = PCRE_UTF8_ERR16;
- return p - string - 2;
+ *erroroffset = p - string - 2;
+ return PCRE_UTF8_ERR16;
}
if (c == 0xed && d >= 0xa0)
{
- *errorcode = PCRE_UTF8_ERR14;
- return p - string - 2;
+ *erroroffset = p - string - 2;
+ return PCRE_UTF8_ERR14;
}
break;
@@ -196,23 +194,23 @@
case 3:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *errorcode = PCRE_UTF8_ERR7;
- return p - string - 2;
+ *erroroffset = p - string - 2;
+ return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
- *errorcode = PCRE_UTF8_ERR8;
- return p - string - 3;
+ *erroroffset = p - string - 3;
+ return PCRE_UTF8_ERR8;
}
if (c == 0xf0 && (d & 0x30) == 0)
{
- *errorcode = PCRE_UTF8_ERR17;
- return p - string - 3;
+ *erroroffset = p - string - 3;
+ return PCRE_UTF8_ERR17;
}
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
{
- *errorcode = PCRE_UTF8_ERR13;
- return p - string - 3;
+ *erroroffset = p - string - 3;
+ return PCRE_UTF8_ERR13;
}
break;
@@ -227,23 +225,23 @@
case 4:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *errorcode = PCRE_UTF8_ERR7;
- return p - string - 2;
+ *erroroffset = p - string - 2;
+ return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
- *errorcode = PCRE_UTF8_ERR8;
- return p - string - 3;
+ *erroroffset = p - string - 3;
+ return PCRE_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
- *errorcode = PCRE_UTF8_ERR9;
- return p - string - 4;
+ *erroroffset = p - string - 4;
+ return PCRE_UTF8_ERR9;
}
if (c == 0xf8 && (d & 0x38) == 0)
{
- *errorcode = PCRE_UTF8_ERR18;
- return p - string - 4;
+ *erroroffset = p - string - 4;
+ return PCRE_UTF8_ERR18;
}
break;
@@ -253,28 +251,28 @@
case 5:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
- *errorcode = PCRE_UTF8_ERR7;
- return p - string - 2;
+ *erroroffset = p - string - 2;
+ return PCRE_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
- *errorcode = PCRE_UTF8_ERR8;
- return p - string - 3;
+ *erroroffset = p - string - 3;
+ return PCRE_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
- *errorcode = PCRE_UTF8_ERR9;
- return p - string - 4;
+ *erroroffset = p - string - 4;
+ return PCRE_UTF8_ERR9;
}
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
{
- *errorcode = PCRE_UTF8_ERR10;
- return p - string - 5;
+ *erroroffset = p - string - 5;
+ return PCRE_UTF8_ERR10;
}
if (c == 0xfc && (d & 0x3c) == 0)
{
- *errorcode = PCRE_UTF8_ERR19;
- return p - string - 5;
+ *erroroffset = p - string - 5;
+ return PCRE_UTF8_ERR19;
}
break;
}
@@ -285,8 +283,8 @@
if (ab > 3)
{
- *errorcode = (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
- return p - string - ab;
+ *erroroffset = p - string - ab;
+ return (ab == 4)? PCRE_UTF8_ERR11 : PCRE_UTF8_ERR12;
}
}
@@ -295,7 +293,7 @@
(void)(length);
#endif
-return -1; /* This indicates success */
+return PCRE_UTF8_ERR0; /* This indicates success */
}
/* End of pcre_valid_utf8.c */
Modified: code/trunk/pcretest.c
===================================================================
--- code/trunk/pcretest.c 2011-06-03 18:18:30 UTC (rev 605)
+++ code/trunk/pcretest.c 2011-06-06 17:46:22 UTC (rev 606)
@@ -1186,7 +1186,7 @@
#endif
printf(" -q quiet: do not output PCRE version number at start\n");
printf(" -S <n> set stack size to <n> megabytes\n");
-printf(" -s output store (memory) used information\n"
+printf(" -s force each pattern to be studied\n"
" -t time compilation and execution\n");
printf(" -t <n> time compilation and execution, repeating <n> times\n");
printf(" -tm time execution (matching) only\n");
@@ -1214,6 +1214,7 @@
int timeitm = 0;
int showinfo = 0;
int showstore = 0;
+int force_study = 0;
int quiet = 0;
int size_offsets = 45;
int size_offsets_max;
@@ -1262,8 +1263,8 @@
{
unsigned char *endptr;
- if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
- showstore = 1;
+ if (strcmp(argv[op], "-m") == 0) showstore = 1;
+ else if (strcmp(argv[op], "-s") == 0) force_study = 1;
else if (strcmp(argv[op], "-q") == 0) quiet = 1;
else if (strcmp(argv[op], "-b") == 0) debug = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
@@ -1807,10 +1808,10 @@
true_size = ((real_pcre *)re)->size;
regex_gotten_store = gotten_store;
- /* If /S was present, study the regexp to generate additional info to
+ /* If -s or /S was present, study the regexp to generate additional info to
help with the matching. */
- if (do_study)
+ if (do_study || force_study)
{
if (timeit > 0)
{
@@ -2050,7 +2051,7 @@
so messes up the test suite. (And with the /F option, it might be
flipped.) */
- if (do_study)
+ if (do_study || force_study)
{
if (extra == NULL)
fprintf(outfile, "Study returned NULL\n");