Revision: 778
http://vcs.pcre.org/viewvc?view=rev&revision=778
Author: ph10
Date: 2011-12-01 17:38:47 +0000 (Thu, 01 Dec 2011)
Log Message:
-----------
Fix bug in caseless matching of UTF-8 characters whose two cases have different
byte lengths, when the shorter one is right at the end of the subject.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/testdata/testinput6
code/trunk/testdata/testoutput6
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2011-12-01 15:15:31 UTC (rev 777)
+++ code/trunk/ChangeLog 2011-12-01 17:38:47 UTC (rev 778)
@@ -89,6 +89,10 @@
21. Retrieve executable code size support for the JIT compiler and fixing
some warnings.
+
+22. A caseless match of a UTF-8 character whose other case uses fewer bytes did
+ not work when the shorter character appeared right at the end of the
+ subject string.
Version 8.20 21-Oct-2011
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2011-12-01 15:15:31 UTC (rev 777)
+++ code/trunk/pcre_exec.c 2011-12-01 17:38:47 UTC (rev 778)
@@ -417,7 +417,7 @@
same response. */
/* These macros pack up tests that are used for partial matching, and which
-appears several times in the code. We set the "hit end" flag if the pointer is
+appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching, we then return
immediately. The second one is used when we already know we are past the end of
@@ -3037,31 +3037,36 @@
}
break;
- /* Match a single character, caselessly */
+ /* Match a single character, caselessly. If we are at the end of the
+ subject, give up immediately. */
case OP_CHARI:
+ if (eptr >= md->end_subject)
+ {
+ SCHECK_PARTIAL();
+ RRETURN(MATCH_NOMATCH);
+ }
+
#ifdef SUPPORT_UTF8
if (utf8)
{
length = 1;
ecode++;
GETCHARLEN(fc, ecode, length);
-
- if (length > md->end_subject - eptr)
- {
- CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
- }
-
+
/* If the pattern character's value is < 128, we have only one byte, and
- can use the fast lookup table. */
+ we know that its other case must also be one byte long, so we can use the
+ fast lookup table. We know that there is at least one byte left in the
+ subject. */
if (fc < 128)
{
if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
}
- /* Otherwise we must pick up the subject character */
+ /* Otherwise we must pick up the subject character. Note that we cannot
+ use the value of "length" to check for sufficient bytes left, because the
+ other case of the character may have more or fewer bytes. */
else
{
@@ -3086,11 +3091,6 @@
/* Non-UTF-8 mode */
{
- if (md->end_subject - eptr < 1)
- {
- SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
- RRETURN(MATCH_NOMATCH);
- }
if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
ecode += 2;
}
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2011-12-01 15:15:31 UTC (rev 777)
+++ code/trunk/testdata/testinput6 2011-12-01 17:38:47 UTC (rev 778)
@@ -802,4 +802,18 @@
** Failers
a\xFCb
+/ⱥ/8i
+ ⱥ
+ Ⱥx
+ Ⱥ
+
+/[ⱥ]/8i
+ ⱥ
+ Ⱥx
+ Ⱥ
+
+/Ⱥ/8i
+ Ⱥ
+ ⱥ
+
/-- End of testinput6 --/
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2011-12-01 15:15:31 UTC (rev 777)
+++ code/trunk/testdata/testoutput6 2011-12-01 17:38:47 UTC (rev 778)
@@ -1353,4 +1353,26 @@
a\xFCb
No match
+/ⱥ/8i
+ ⱥ
+ 0: \x{2c65}
+ Ⱥx
+ 0: \x{23a}
+ Ⱥ
+ 0: \x{23a}
+
+/[ⱥ]/8i
+ ⱥ
+ 0: \x{2c65}
+ Ⱥx
+ 0: \x{23a}
+ Ⱥ
+ 0: \x{23a}
+
+/Ⱥ/8i
+ Ⱥ
+ 0: \x{23a}
+ ⱥ
+ 0: \x{2c65}
+
/-- End of testinput6 --/