Re: [pcre-dev] match point reset bug?

Top Page
Delete this message
Author: Craig Silverstein
Date:  
To: ph10
CC: pcre-dev
Subject: Re: [pcre-dev] match point reset bug?
OK, here's the patch to fix up GlobalReplace for the C++ wrapper.

craig

--cut here--

Index: pcrecpp.h
===================================================================
--- pcrecpp.h    (revision 470)
+++ pcrecpp.h    (working copy)
@@ -674,6 +674,7 @@
   int TryMatch(const StringPiece& text,
                int startpos,
                Anchor anchor,
+               bool empty_ok,
                int *vec,
                int vecsize) const;


Index: pcrecpp.cc
===================================================================
--- pcrecpp.cc    (revision 470)
+++ pcrecpp.cc    (working copy)
@@ -331,7 +331,7 @@
 bool RE::Replace(const StringPiece& rewrite,
                  string *str) const {
   int vec[kVecSize];
-  int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);
+  int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
   if (matches == 0)
     return false;


@@ -384,49 +384,64 @@
   string out;
   int start = 0;
   int lastend = -1;
+  bool last_match_was_empty_string = false;

   while (start <= static_cast<int>(str->length())) {
-    int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
-    if (matches <= 0)
-      break;
-    int matchstart = vec[0], matchend = vec[1];
-    assert(matchstart >= start);
-    assert(matchend >= matchstart);
-    if (matchstart == matchend && matchstart == lastend) {
-      // advance one character if we matched an empty string at the same
-      // place as the last match occurred
-      matchend = start + 1;
-      // If the current char is CR and we're in CRLF mode, skip LF too.
-      // Note it's better to call pcre_fullinfo() than to examine
-      // all_options(), since options_ could have changed bewteen
-      // compile-time and now, but this is simpler and safe enough.
-      // Modified by PH to add ANY and ANYCRLF.
-      if (start+1 < static_cast<int>(str->length()) &&
-          (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
-          (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
-           NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
-           NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
-          ) {
-        matchend++;
-      }
-      // We also need to advance more than one char if we're in utf8 mode.
+    // If the previous match was for the empty string, we shouldn't
+    // just match again: we'll match in the same way and get an
+    // infinite loop.  Instead, we do the match in a special way:
+    // anchored -- to force another try at the same position --
+    // and with a flag saying that this time, ignore empty matches.
+    // If this special match returns, that means there's a non-empty
+    // match at this position as well, and we can continue.  If not,
+    // we do what perl does, and just advance by one.
+    // Notice that perl prints '@@@' for this:
+    //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
+    int matches;
+    if (last_match_was_empty_string) {
+      matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
+      if (matches <= 0) {
+        int matchend = start + 1;     // advance one character.
+        // If the current char is CR and we're in CRLF mode, skip LF too.
+        // Note it's better to call pcre_fullinfo() than to examine
+        // all_options(), since options_ could have changed between
+        // compile-time and now, but this is simpler and safe enough.
+        // Modified by PH to add ANY and ANYCRLF.
+        if (matchend < static_cast<int>(str->length()) &&
+            (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
+            (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
+             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
+             NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
+          matchend++;
+        }
+        // We also need to advance more than one char if we're in utf8 mode.
 #ifdef SUPPORT_UTF8
-      if (options_.utf8()) {
-        while (matchend < static_cast<int>(str->length()) &&
-               ((*str)[matchend] & 0xc0) == 0x80)
-          matchend++;
+        if (options_.utf8()) {
+          while (matchend < static_cast<int>(str->length()) &&
+                 ((*str)[matchend] & 0xc0) == 0x80)
+            matchend++;
+        }
+#endif
+        if (start < static_cast<int>(str->length()))
+          out.append(*str, start, matchend - start);
+        start = matchend;
+        last_match_was_empty_string = false;
+        continue;
       }
-#endif
-      if (matchend <= static_cast<int>(str->length()))
-        out.append(*str, start, matchend - start);
-      start = matchend;
     } else {
-      out.append(*str, start, matchstart - start);
-      Rewrite(&out, rewrite, *str, vec, matches);
-      start = matchend;
-      lastend = matchend;
-      count++;
+      matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
+      if (matches <= 0)
+        break;
     }
+    int matchstart = vec[0], matchend = vec[1];
+    assert(matchstart >= start);
+    assert(matchend >= matchstart);
+    out.append(*str, start, matchstart - start);
+    Rewrite(&out, rewrite, *str, vec, matches);
+    start = matchend;
+    lastend = matchend;
+    count++;
+    last_match_was_empty_string = (matchstart == matchend);
   }


   if (count == 0)
@@ -442,7 +457,7 @@
                  const StringPiece& text,
                  string *out) const {
   int vec[kVecSize];
-  int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
+  int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
   if (matches == 0)
     return false;
   out->erase();
@@ -488,6 +503,7 @@
 int RE::TryMatch(const StringPiece& text,
                  int startpos,
                  Anchor anchor,
+                 bool empty_ok,
                  int *vec,
                  int vecsize) const {
   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
@@ -505,12 +521,19 @@
     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
     extra.match_limit_recursion = options_.match_limit_recursion();
   }
+
+  int options = 0;
+  if (anchor != UNANCHORED)
+    options |= PCRE_ANCHORED;
+  if (!empty_ok)
+    options |= PCRE_NOTEMPTY;
+
   int rc = pcre_exec(re,              // The regular expression object
                      &extra,
                      (text.data() == NULL) ? "" : text.data(),
                      text.size(),
                      startpos,
-                     (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
+                     options,
                      vec,
                      vecsize);


@@ -540,7 +563,7 @@
                      int* vec,
                      int vecsize) const {
   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
-  int matches = TryMatch(text, 0, anchor, vec, vecsize);
+  int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
   assert(matches >= 0);  // TryMatch never returns negatives
   if (matches == 0)
     return false;
Index: pcrecpp_unittest.cc
===================================================================
--- pcrecpp_unittest.cc    (revision 470)
+++ pcrecpp_unittest.cc    (working copy)
@@ -268,8 +268,8 @@
       "bb",
       "bbbbbb",
       "bb",
-      "bb",
-      1 },
+      "bbbb",
+      2 },
     { "b*",
       "bb",
       "aaaaa",
@@ -294,6 +294,19 @@
       "bbaa\r\naa\r\n",
       "bbabbabb\r\nbbabbabb\r\nbb",
       7 },
+    // Check empty-string matching (it's tricky!)
+    { "aa|b*",
+      "@",
+      "aa",
+      "@",
+      "@@",
+      2 },
+    { "b*|aa",
+      "@",
+      "aa",
+      "@aa",
+      "@@@",
+      3 },
 #ifdef SUPPORT_UTF8
     { "b*",
       "bb",