Actually, I think I prefer solution 2:
} 2) The pcrecpp interface should magically translate embedded
} null to \x00 so that libpcre matches on them as expected.
It's not been a goal of QuoteMeta to behave exactly like perl's
quotemeta (Philip: should it be?), so I have no problem special-casing
our handling of NUL. It's also a much simpler diff (which I include
below).
Philip, what do you think?
craig
--cut here--
Index: pcrecpp.h
===================================================================
--- pcrecpp.h (revision 320)
+++ pcrecpp.h (working copy)
@@ -620,6 +620,9 @@
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
+ // Note QuoteMeta behaves the same as perl's quotemeta function,
+ // *except* that it escapes the NUL character (\0) as backslash + '0',
+ // rather than backslash + NUL.
static string QuoteMeta(const StringPiece& unquoted);
Index: pcrecpp.cc
===================================================================
--- pcrecpp.cc (revision 320)
+++ pcrecpp.cc (working copy)
@@ -441,21 +441,27 @@
// Note that it's legal to escape a character even if it has no
// special meaning in a regular expression -- so this function does
// that. (This also makes it identical to the perl function of the
- // same name; see `perldoc -f quotemeta`.)
+ // same name; see `perldoc -f quotemeta`.) The one exception is
+ // escaping NUL: rather than doing backslash + NUL, like perl does,
+ // we do backslash + '0', because pcre itself doesn't take embedded NUL chars.
for (int ii = 0; ii < unquoted.size(); ++ii) {
// Note that using 'isalnum' here raises the benchmark time from
// 32ns to 58ns:
- if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
- (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
- (unquoted[ii] < '0' || unquoted[ii] > '9') &&
- unquoted[ii] != '_' &&
- // If this is the part of a UTF8 or Latin1 character, we need
- // to copy this byte without escaping. Experimentally this is
- // what works correctly with the regexp library.
- !(unquoted[ii] & 128)) {
+ if (unquoted[ii] == '\0') {
+ result += "\\0";
+ } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
+ (unquoted[ii] < '0' || unquoted[ii] > '9') &&
+ unquoted[ii] != '_' &&
+ // If this is the part of a UTF8 or Latin1 character, we need
+ // to copy this byte without escaping. Experimentally this is
+ // what works correctly with the regexp library.
+ !(unquoted[ii] & 128)) {
result += '\\';
+ result += unquoted[ii];
+ } else {
+ result += unquoted[ii];
}
- result += unquoted[ii];
}
return result;
Index: pcrecpp_unittest.cc
===================================================================
--- pcrecpp_unittest.cc (revision 320)
+++ pcrecpp_unittest.cc (working copy)
@@ -497,6 +497,7 @@
TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
TestQuoteMeta("((?!)xxx).*yyy");
TestQuoteMeta("([");
+ TestQuoteMeta(string("foo\0bar", 7));
}
static void TestQuoteMetaSimpleNegative() {