[Pcre-svn] [1735] code/trunk: Fix two C++ wrapper bugs, unn…

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1735] code/trunk: Fix two C++ wrapper bugs, unnoticed for years.
Revision: 1735
          http://vcs.pcre.org/viewvc?view=rev&revision=1735
Author:   ph10
Date:     2018-06-26 17:51:43 +0100 (Tue, 26 Jun 2018)
Log Message:
-----------
Fix two C++ wrapper bugs, unnoticed for years. 


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/configure.ac
    code/trunk/pcrecpp.cc
    code/trunk/pcrecpp_unittest.cc


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2018-05-30 15:42:40 UTC (rev 1734)
+++ code/trunk/ChangeLog    2018-06-26 16:51:43 UTC (rev 1735)
@@ -5,6 +5,24 @@
 development is happening in the PCRE2 10.xx series.



+Version 8.43 25-June-2018
+-------------------------
+
+1. Some time ago the config macro SUPPORT_UTF8 was changed to SUPPORT_UTF
+because it also applies to UTF-16 and UTF-32. However, this change was not made
+in the pcrecpp files; consequently the C++ wrapper has since then been compiled
+with a bug in it, which would have been picked up by the unit test except that
+it also had its UTF8 code cut out. The bug was in a global replace when moving
+forward after matching an empty string.
+
+2. The C++ wrapper got broken a long time ago (version 7.3, August 2007) when
+(*CR) was invented (assuming it was the first such start-of-pattern option).
+The wrapper could never handle such patterns because it wraps patterns in
+(?:...)\z in order to support end anchoring. I have hacked in some code to fix
+this, that is, to apply the wrapping only after any existing start-of-pattern
+special settings.
+
+
Version 8.42 20-March-2018
--------------------------


Modified: code/trunk/configure.ac
===================================================================
--- code/trunk/configure.ac    2018-05-30 15:42:40 UTC (rev 1734)
+++ code/trunk/configure.ac    2018-06-26 16:51:43 UTC (rev 1735)
@@ -9,9 +9,9 @@
 dnl be defined as -RC2, for example. For real releases, it should be empty.


m4_define(pcre_major, [8])
-m4_define(pcre_minor, [42])
-m4_define(pcre_prerelease, [])
-m4_define(pcre_date, [2018-03-20])
+m4_define(pcre_minor, [43])
+m4_define(pcre_prerelease, [-RC1])
+m4_define(pcre_date, [2018-06-25])

# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.

Modified: code/trunk/pcrecpp.cc
===================================================================
--- code/trunk/pcrecpp.cc    2018-05-30 15:42:40 UTC (rev 1734)
+++ code/trunk/pcrecpp.cc    2018-06-26 16:51:43 UTC (rev 1735)
@@ -80,6 +80,24 @@
 // If the user doesn't ask for any options, we just use this one
 static RE_Options default_options;


+// Specials for the start of patterns. See comments where start_options is used
+// below. (PH June 2018)
+static const char *start_options[] = {
+  "(*UTF8)",
+  "(*UTF)",
+  "(*UCP)",
+  "(*NO_START_OPT)",
+  "(*NO_AUTO_POSSESS)",
+  "(*LIMIT_RECURSION=",
+  "(*LIMIT_MATCH=",
+  "(*CRLF)",
+  "(*CR)",
+  "(*BSR_UNICODE)",
+  "(*BSR_ANYCRLF)",
+  "(*ANYCRLF)",
+  "(*ANY)",
+  "" };
+
 void RE::Init(const string& pat, const RE_Options* options) {
   pattern_ = pat;
   if (options == NULL) {
@@ -135,7 +153,49 @@
   } else {
     // Tack a '\z' at the end of RE.  Parenthesize it first so that
     // the '\z' applies to all top-level alternatives in the regexp.
-    string wrapped = "(?:";  // A non-counting grouping operator
+
+    /* When this code was written (for PCRE 6.0) it was enough just to
+    parenthesize the entire pattern. Unfortunately, when the feature of
+    starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
+    this code was never updated. This bug was not noticed till 2018, long after
+    PCRE became obsolescent and its maintainer no longer around. Since PCRE is
+    frozen, I have added a hack to check for all the existing "start of
+    pattern" specials - knowing that no new ones will ever be added. I am not a
+    C++ programmer, so the code style is no doubt crude. It is also
+    inefficient, but is only run when the pattern starts with "(*".
+    PH June 2018. */
+
+    string wrapped = "";
+
+    if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
+      int kk, klen, kmat;
+      for (;;) {   // Loop for any number of leading items
+
+        for (kk = 0; start_options[kk][0] != 0; kk++) {
+          klen = strlen(start_options[kk]);
+          kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
+          if (kmat >= 0) break;
+        }
+        if (kmat != 0) break;  // Not found
+
+        // If the item ended in "=" we must copy digits up to ")".
+
+        if (start_options[kk][klen-1] == '=') {
+          while (isdigit(pattern_.c_str()[klen])) klen++;
+          if (pattern_.c_str()[klen] != ')') break;  // Syntax error
+          klen++;
+        }
+
+        // Move the item from the pattern to the start of the wrapped string.
+
+        wrapped += pattern_.substr(0, klen);
+        pattern_.erase(0, klen);
+      }
+    }
+
+    // Wrap the rest of the pattern.
+
+    wrapped += "(?:";  // A non-counting grouping operator
     wrapped += pattern_;
     wrapped += ")\\z";
     re = pcre_compile(wrapped.c_str(), pcre_options,
@@ -415,7 +475,7 @@
           matchend++;
         }
         // We also need to advance more than one char if we're in utf8 mode.
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
         if (options_.utf8()) {
           while (matchend < static_cast<int>(str->length()) &&
                  ((*str)[matchend] & 0xc0) == 0x80)


Modified: code/trunk/pcrecpp_unittest.cc
===================================================================
--- code/trunk/pcrecpp_unittest.cc    2018-05-30 15:42:40 UTC (rev 1734)
+++ code/trunk/pcrecpp_unittest.cc    2018-06-26 16:51:43 UTC (rev 1735)
@@ -309,7 +309,7 @@
       "@aa",
       "@@@",
       3 },
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
     { "b*",
       "bb",
       "\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8",   // utf8
@@ -327,7 +327,7 @@
     { "", NULL, NULL, NULL, NULL, 0 }
   };


-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
const bool support_utf8 = true;
#else
const bool support_utf8 = false;
@@ -535,7 +535,7 @@
}

 static void TestQuoteMetaUtf8() {
-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8());
   TestQuoteMeta("xyz", pcrecpp::UTF8());            // No fancy utf8
   TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8());       // 2-byte utf8 (degree symbol)
@@ -1178,7 +1178,7 @@
     CHECK(re.error().empty());  // Must have no error
   }


-#ifdef SUPPORT_UTF8
+#ifdef SUPPORT_UTF
   // Check UTF-8 handling
   {
     printf("Testing UTF-8 handling\n");
@@ -1202,7 +1202,25 @@
     CHECK(re_test1.FullMatch(utf8_string));
     RE re_test2("...", pcrecpp::UTF8());
     CHECK(re_test2.FullMatch(utf8_string));
+    
+    // PH added these tests for leading option settings
+    
+    RE re_testZ1("(*UTF8)...");
+    CHECK(re_testZ1.FullMatch(utf8_string));


+    RE re_testZ2("(*UTF)...");
+    CHECK(re_testZ2.FullMatch(utf8_string));
+
+    RE re_testZ3("(*UCP)(*UTF)...");
+    CHECK(re_testZ3.FullMatch(utf8_string));
+
+    RE re_testZ4("(*UCP)(*LIMIT_MATCH=1000)(*UTF)...");
+    CHECK(re_testZ4.FullMatch(utf8_string));
+ 
+    RE re_testZ5("(*UCP)(*LIMIT_MATCH=1000)(*ANY)(*UTF)...");
+    CHECK(re_testZ5.FullMatch(utf8_string));
+ 
+
     // Check that '.' matches one byte or UTF-8 character
     // according to the mode.
     string ss;
@@ -1248,7 +1266,7 @@
     CHECK(!match_sentence.FullMatch(target));
     CHECK(!match_sentence_re.FullMatch(target));
   }
-#endif  /* def SUPPORT_UTF8 */
+#endif  /* def SUPPORT_UTF */


printf("Testing error reporting\n");