Revision: 1350
http://vcs.pcre.org/viewvc?view=rev&revision=1350
Author: ph10
Date: 2013-07-26 11:03:38 +0100 (Fri, 26 Jul 2013)
Log Message:
-----------
Fix backup bugs with \X repeat matches.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/pcre_exec.c
code/trunk/testdata/testinput6
code/trunk/testdata/testoutput6
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2013-07-08 16:36:43 UTC (rev 1349)
+++ code/trunk/ChangeLog 2013-07-26 10:03:38 UTC (rev 1350)
@@ -37,6 +37,20 @@
8. Implemented PCRE_INFO_MATCH_EMPTY, which yields 1 if the pattern can match
an empty string. If it can, pcretest shows this in its information output.
+
+9. Fixed two related bugs that applied to Unicode extended grapheme clusters
+ that were repeated with a maximizing quantifier (e.g. \X* or \X{2,5}) when
+ matched by pcre_exec() without using JIT:
+
+ (a) If the rest of the pattern did not match after a maximal run of
+ grapheme clusters, the code for backing up to try with fewer of them
+ did not always back up over a full grapheme when characters that do not
+ have the modifier quality were involved, e.g. Hangul syllables.
+
+ (b) If the match point in a subject started with a modifier character, and
+ there was no match, the code could incorrectly back up beyond the match
+ point, and potentially beyond the first character in the subject,
+ leading to a segfault or an incorrect match result.
Version 8.33 28-May-2013
Modified: code/trunk/pcre_exec.c
===================================================================
--- code/trunk/pcre_exec.c 2013-07-08 16:36:43 UTC (rev 1349)
+++ code/trunk/pcre_exec.c 2013-07-26 10:03:38 UTC (rev 1350)
@@ -5637,7 +5637,7 @@
}
}
- /* Match extended Unicode sequences. We will get here only if the
+ /* Match extended Unicode grapheme clusters. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
@@ -5670,21 +5670,41 @@
/* eptr is now past the end of the maximum run */
if (possessive) continue; /* No backtracking */
+
for(;;)
{
- if (eptr == pp) goto TAIL_RECURSE;
+ int lgb, rgb;
+ PCRE_PUCHAR fptr;
+
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+
+ /* Backtracking over an extended grapheme cluster involves inspecting
+ the previous two characters (if present) to see if a break is
+ permitted between them. */
+
eptr--;
- for (;;) /* Move back over one extended */
+ if (!utf) c = *eptr; else
{
- if (!utf) c = *eptr; else
+ BACKCHAR(eptr);
+ GETCHAR(c, eptr);
+ }
+ rgb = UCD_GRAPHBREAK(c);
+
+ for (;;)
+ {
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
+ fptr = eptr - 1;
+ if (!utf) c = *fptr; else
{
- BACKCHAR(eptr);
- GETCHAR(c, eptr);
+ BACKCHAR(fptr);
+ GETCHAR(c, fptr);
}
- if (UCD_CATEGORY(c) != ucp_M) break;
- eptr--;
+ lgb = UCD_GRAPHBREAK(c);
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
+ eptr = fptr;
+ rgb = lgb;
}
}
}
Modified: code/trunk/testdata/testinput6
===================================================================
--- code/trunk/testdata/testinput6 2013-07-08 16:36:43 UTC (rev 1349)
+++ code/trunk/testdata/testinput6 2013-07-26 10:03:38 UTC (rev 1350)
@@ -882,6 +882,15 @@
\x{1111}\x{ae4c}\x{1111}\x{ae4c}\x{1111}\x{ae4c}X
\x{1111}\x{ae4c}\x{1111}\x{ae4c}\x{1111}\x{ae4c}\x{1111}\x{ae4c}X
+/\X*Z/8Y
+ A\x{300}
+
+/\X*(.)/8Y
+ A\x{1111}\x{ae4c}\x{1169}
+
+/\X?abc/8Y
+\xff\x7f\x00\x00\x03\x00\x41\xcc\x80\x41\x{300}\x61\x62\x63\x00\>06\?
+
/-- --/
/\x{1e9e}+/8i
Modified: code/trunk/testdata/testoutput6
===================================================================
--- code/trunk/testdata/testoutput6 2013-07-08 16:36:43 UTC (rev 1349)
+++ code/trunk/testdata/testoutput6 2013-07-26 10:03:38 UTC (rev 1350)
@@ -1548,6 +1548,19 @@
0: \x{1111}\x{ae4c}\x{1111}\x{ae4c}\x{1111}\x{ae4c}\x{1111}\x{ae4c}X
0+
+/\X*Z/8Y
+ A\x{300}
+No match
+
+/\X*(.)/8Y
+ A\x{1111}\x{ae4c}\x{1169}
+ 0: A\x{1111}
+ 1: \x{1111}
+
+/\X?abc/8Y
+\xff\x7f\x00\x00\x03\x00\x41\xcc\x80\x41\x{300}\x61\x62\x63\x00\>06\?
+ 0: A\x{300}abc
+
/-- --/
/\x{1e9e}+/8i