Revision: 192
http://www.exim.org/viewvc/pcre2?view=rev&revision=192
Author: ph10
Date: 2015-02-06 16:47:15 +0000 (Fri, 06 Feb 2015)
Log Message:
-----------
Give an internal error for a bad opcode during auto-possessification. This can
stop a loop when compiling an invalid UTF string with PCRE2_NO_UTF_CHECK.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/src/pcre2_auto_possess.c
code/trunk/src/pcre2_compile.c
code/trunk/src/pcre2_error.c
code/trunk/src/pcre2_internal.h
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2015-02-06 09:29:31 UTC (rev 191)
+++ code/trunk/ChangeLog 2015-02-06 16:47:15 UTC (rev 192)
@@ -55,7 +55,14 @@
10. The error message for an invalid quantifier has been changed from "nothing
to repeat" to "quantifier does not follow a repeatable item".
+11. If a bad UTF string is compiled with NO_UTF_CHECK, it may succeed, but
+scanning the compiled pattern in subsequent auto-possessification can get out
+of step and lead to an unknown opcode. Previously this could have caused an
+infinite loop. Now it generates an "internal error" error. This is a tidy-up,
+not a bug fix; passing bad UTF with NO_UTF_CHECK is documented as having an
+undefined outcome.
+
Version 10.00 05-January-2015
-----------------------------
Modified: code/trunk/src/pcre2_auto_possess.c
===================================================================
--- code/trunk/src/pcre2_auto_possess.c 2015-02-06 09:29:31 UTC (rev 191)
+++ code/trunk/src/pcre2_auto_possess.c 2015-02-06 16:47:15 UTC (rev 192)
@@ -1090,17 +1090,20 @@
*************************************************/
/* Replaces single character iterations with their possessive alternatives
-if appropriate. This function modifies the compiled opcode!
+if appropriate. This function modifies the compiled opcode! Hitting a
+non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
+bad UTF string was compiled with PCRE2_NO_UTF_CHECK.
Arguments:
code points to start of the byte code
utf TRUE in UTF mode
cb compile data block
-Returns: nothing
+Returns: 0 for success
+ -1 if a non-existent opcode is encountered
*/
-void
+int
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
{
register PCRE2_UCHAR c;
@@ -1111,7 +1114,9 @@
for (;;)
{
c = *code;
-
+
+ if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */
+
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
{
c -= get_repeat_base(c) - OP_STAR;
@@ -1207,7 +1212,7 @@
switch(c)
{
case OP_END:
- return;
+ return 0;
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
Modified: code/trunk/src/pcre2_compile.c
===================================================================
--- code/trunk/src/pcre2_compile.c 2015-02-06 09:29:31 UTC (rev 191)
+++ code/trunk/src/pcre2_compile.c 2015-02-06 16:47:15 UTC (rev 192)
@@ -573,7 +573,7 @@
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
- ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79 };
+ ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@@ -7802,7 +7802,9 @@
}
/* Fill in any forward references that are required. There may be repeated
-references; optimize for them, as searching a large regex takes time. */
+references; optimize for them, as searching a large regex takes time. The
+test of errorcode inside the loop means that nothing is done if it is already
+non-zero. */
if (cb.hwm > cb.start_workspace)
{
@@ -7832,23 +7834,23 @@
ccontext->memctl.memory_data);
cb.start_workspace = NULL;
-/* Give an error if there's back reference to a non-existent capturing
-subpattern. */
+/* After a successful compile, give an error if there's a back reference to a
+non-existent capturing subpattern. Then, unless disabled, check whether any
+single character iterators can be auto-possessified. The function overwrites
+the appropriate opcode values, so the type of the pointer must be cast. NOTE:
+the intermediate variable "temp" is used in this code because at least one
+compiler gives a warning about loss of "const" attribute if the cast
+(PCRE2_UCHAR *)codestart is used directly in the function call. */
-if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
-
-/* Unless disabled, check whether any single character iterators can be
-auto-possessified. The function overwrites the appropriate opcode values, so
-the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
-used in this code because at least one compiler gives a warning about loss of
-"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
-function call. */
-
-if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
+if (errorcode == 0)
{
- PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
- PRIV(auto_possessify)(temp, utf, &cb);
- }
+ if (re->top_backref > re->top_bracket) errorcode = ERR15;
+ else if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
+ {
+ PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
+ if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+ }
+ }
/* If there were any lookbehind assertions that contained OP_RECURSE
(recursions or subroutine calls), a flag is set for them to be checked here,
@@ -7858,7 +7860,7 @@
exceptional ones forgo this. We scan the pattern to check that they are fixed
length, and set their lengths. */
-if (cb.check_lookbehind)
+if (errorcode == 0 && cb.check_lookbehind)
{
PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart;
Modified: code/trunk/src/pcre2_error.c
===================================================================
--- code/trunk/src/pcre2_error.c 2015-02-06 09:29:31 UTC (rev 191)
+++ code/trunk/src/pcre2_error.c 2015-02-06 16:47:15 UTC (rev 192)
@@ -159,6 +159,8 @@
"character code point value in \\u.... sequence is too large\0"
"digits missing in \\x{} or \\o{}\0"
"syntax error in (?(VERSION condition\0"
+ /* 80 */
+ "internal error: unknown opcode in auto_possessify()\0"
;
/* Match-time and UTF error texts are in the same format. */
Modified: code/trunk/src/pcre2_internal.h
===================================================================
--- code/trunk/src/pcre2_internal.h 2015-02-06 09:29:31 UTC (rev 191)
+++ code/trunk/src/pcre2_internal.h 2015-02-06 16:47:15 UTC (rev 192)
@@ -1882,7 +1882,7 @@
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
-extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
const compile_block *);
extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int);
extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,