OK, here's the patch to fix up GlobalReplace for the C++ wrapper.
craig
--cut here--
Index: pcrecpp.h
===================================================================
--- pcrecpp.h (revision 470)
+++ pcrecpp.h (working copy)
@@ -674,6 +674,7 @@
int TryMatch(const StringPiece& text,
int startpos,
Anchor anchor,
+ bool empty_ok,
int *vec,
int vecsize) const;
Index: pcrecpp.cc
===================================================================
--- pcrecpp.cc (revision 470)
+++ pcrecpp.cc (working copy)
@@ -331,7 +331,7 @@
bool RE::Replace(const StringPiece& rewrite,
string *str) const {
int vec[kVecSize];
- int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);
+ int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
if (matches == 0)
return false;
@@ -384,49 +384,64 @@
string out;
int start = 0;
int lastend = -1;
+ bool last_match_was_empty_string = false;
while (start <= static_cast<int>(str->length())) {
- int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
- if (matches <= 0)
- break;
- int matchstart = vec[0], matchend = vec[1];
- assert(matchstart >= start);
- assert(matchend >= matchstart);
- if (matchstart == matchend && matchstart == lastend) {
- // advance one character if we matched an empty string at the same
- // place as the last match occurred
- matchend = start + 1;
- // If the current char is CR and we're in CRLF mode, skip LF too.
- // Note it's better to call pcre_fullinfo() than to examine
- // all_options(), since options_ could have changed bewteen
- // compile-time and now, but this is simpler and safe enough.
- // Modified by PH to add ANY and ANYCRLF.
- if (start+1 < static_cast<int>(str->length()) &&
- (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
- (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
- NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
- NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
- ) {
- matchend++;
- }
- // We also need to advance more than one char if we're in utf8 mode.
+ // If the previous match was for the empty string, we shouldn't
+ // just match again: we'll match in the same way and get an
+ // infinite loop. Instead, we do the match in a special way:
+ // anchored -- to force another try at the same position --
+ // and with a flag saying that this time, ignore empty matches.
+ // If this special match succeeds, that means there's a non-empty
+ // match at this position as well, and we can continue. If not,
+ // we do what perl does, and just advance by one.
+ // Notice that perl prints '@@@' for this:
+ // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
+ int matches;
+ if (last_match_was_empty_string) {
+ matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
+ if (matches <= 0) {
+ int matchend = start + 1; // advance one character.
+ // If the current char is CR and we're in CRLF mode, skip LF too.
+ // Note it's better to call pcre_fullinfo() than to examine
+ // all_options(), since options_ could have changed between
+ // compile-time and now, but this is simpler and safe enough.
+ // Modified by PH to add ANY and ANYCRLF.
+ if (matchend < static_cast<int>(str->length()) &&
+ (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
+ (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
+ NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
+ NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
+ matchend++;
+ }
+ // We also need to advance more than one char if we're in utf8 mode.
#ifdef SUPPORT_UTF8
- if (options_.utf8()) {
- while (matchend < static_cast<int>(str->length()) &&
- ((*str)[matchend] & 0xc0) == 0x80)
- matchend++;
+ if (options_.utf8()) {
+ while (matchend < static_cast<int>(str->length()) &&
+ ((*str)[matchend] & 0xc0) == 0x80)
+ matchend++;
+ }
+#endif
+ if (start < static_cast<int>(str->length()))
+ out.append(*str, start, matchend - start);
+ start = matchend;
+ last_match_was_empty_string = false;
+ continue;
}
-#endif
- if (matchend <= static_cast<int>(str->length()))
- out.append(*str, start, matchend - start);
- start = matchend;
} else {
- out.append(*str, start, matchstart - start);
- Rewrite(&out, rewrite, *str, vec, matches);
- start = matchend;
- lastend = matchend;
- count++;
+ matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
+ if (matches <= 0)
+ break;
}
+ int matchstart = vec[0], matchend = vec[1];
+ assert(matchstart >= start);
+ assert(matchend >= matchstart);
+ out.append(*str, start, matchstart - start);
+ Rewrite(&out, rewrite, *str, vec, matches);
+ start = matchend;
+ lastend = matchend;
+ count++;
+ last_match_was_empty_string = (matchstart == matchend);
}
if (count == 0)
@@ -442,7 +457,7 @@
const StringPiece& text,
string *out) const {
int vec[kVecSize];
- int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
+ int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
if (matches == 0)
return false;
out->erase();
@@ -488,6 +503,7 @@
int RE::TryMatch(const StringPiece& text,
int startpos,
Anchor anchor,
+ bool empty_ok,
int *vec,
int vecsize) const {
pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
@@ -505,12 +521,19 @@
extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
extra.match_limit_recursion = options_.match_limit_recursion();
}
+
+ int options = 0;
+ if (anchor != UNANCHORED)
+ options |= PCRE_ANCHORED;
+ if (!empty_ok)
+ options |= PCRE_NOTEMPTY;
+
int rc = pcre_exec(re, // The regular expression object
&extra,
(text.data() == NULL) ? "" : text.data(),
text.size(),
startpos,
- (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
+ options,
vec,
vecsize);
@@ -540,7 +563,7 @@
int* vec,
int vecsize) const {
assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
- int matches = TryMatch(text, 0, anchor, vec, vecsize);
+ int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
assert(matches >= 0); // TryMatch never returns negatives
if (matches == 0)
return false;
Index: pcrecpp_unittest.cc
===================================================================
--- pcrecpp_unittest.cc (revision 470)
+++ pcrecpp_unittest.cc (working copy)
@@ -268,8 +268,8 @@
"bb",
"bbbbbb",
"bb",
- "bb",
- 1 },
+ "bbbb",
+ 2 },
{ "b*",
"bb",
"aaaaa",
@@ -294,6 +294,19 @@
"bbaa\r\naa\r\n",
"bbabbabb\r\nbbabbabb\r\nbb",
7 },
+ // Check empty-string matching (it's tricky!)
+ { "aa|b*",
+ "@",
+ "aa",
+ "@",
+ "@@",
+ 2 },
+ { "b*|aa",
+ "@",
+ "aa",
+ "@aa",
+ "@@@",
+ 3 },
#ifdef SUPPORT_UTF8
{ "b*",
"bb",