[Pcre-svn] [1096] code/trunk: Add support for invalid UTF-8 …

Top Page
Delete this message
Author: Subversion repository
Date:  
To: pcre-svn
Subject: [Pcre-svn] [1096] code/trunk: Add support for invalid UTF-8 matching to pcre2grep.
Revision: 1096
          http://www.exim.org/viewvc/pcre2?view=rev&revision=1096
Author:   ph10
Date:     2019-05-28 15:14:22 +0100 (Tue, 28 May 2019)
Log Message:
-----------
Add support for invalid UTF-8 matching to pcre2grep.


Modified Paths:
--------------
    code/trunk/ChangeLog
    code/trunk/RunGrepTest
    code/trunk/doc/pcre2grep.1
    code/trunk/src/pcre2grep.c
    code/trunk/testdata/grepoutput8


Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog    2019-05-25 16:31:38 UTC (rev 1095)
+++ code/trunk/ChangeLog    2019-05-28 14:14:22 UTC (rev 1096)
@@ -18,7 +18,11 @@
 interpreter, and integrate with the existing JIT support via the new
 PCRE2_MATCH_INVALID_UTF compile-time option.


+5. Give more error detail for invalid UTF-8 when detected in pcre2grep.

+6. Add support for invalid UTF-8 to pcre2grep.
+
+
Version 10.33 16-April-2019
---------------------------


Modified: code/trunk/RunGrepTest
===================================================================
--- code/trunk/RunGrepTest    2019-05-25 16:31:38 UTC (rev 1095)
+++ code/trunk/RunGrepTest    2019-05-28 14:14:22 UTC (rev 1096)
@@ -8,7 +8,7 @@
 # * Put printf arguments in single, not double quotes to avoid unwanted
 #     escaping.
 # * Use \0 for binary zero in printf, not \x0, for the benefit of older
-#     versions.
+#     versions (and use octal for other special values).


# Set the C locale, so that sort(1) behaves predictably.

@@ -676,7 +676,17 @@
echo "---------------------------- Test U3 ------------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
echo "RC=$?" >>testtrygrep
+
+ echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
+ printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
+ (cd $srcdir; $valgrind $vjs $pcre2grep -u -o '....' testtemp1grep) >>testtrygrep 2>&1
+ echo "RC=$?" >>testtrygrep

+ echo "---------------------------- Test U5 ------------------------------" >>testtrygrep
+ printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
+ (cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' testtemp1grep) >>testtrygrep
+ echo "RC=$?" >>testtrygrep
+
$cf $srcdir/testdata/grepoutput8 testtrygrep
if [ $? != 0 ] ; then exit 1; fi


Modified: code/trunk/doc/pcre2grep.1
===================================================================
--- code/trunk/doc/pcre2grep.1    2019-05-25 16:31:38 UTC (rev 1095)
+++ code/trunk/doc/pcre2grep.1    2019-05-28 14:14:22 UTC (rev 1096)
@@ -1,4 +1,4 @@
-.TH PCRE2GREP 1 "24 November 2018" "PCRE2 10.33"
+.TH PCRE2GREP 1 "28 May 2019" "PCRE2 10.34"
 .SH NAME
 pcre2grep - a grep with Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -644,12 +644,22 @@
 ignored when used with \fB-L\fP (list files without matches), because the grand
 total would always be zero.
 .TP
-\fB-u\fP, \fB--utf-8\fP
+\fB-u\fP, \fB--utf\fP
 Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
 with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
 \fB--include\fP options) and all subject lines that are scanned must be valid
 strings of UTF-8 characters.
 .TP
+\fB-U\fP, \fB--utf-allow-invalid\fP
+As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
+unit sequences. These can never form part of any pattern match. This facility
+allows valid UTF-8 strings to be sought in executable or other binary files.
+For more details about matching in non-valid UTF-8 strings, see the
+.\" HREF
+\fBpcre2unicode\fP(3)
+.\"
+documentation.
+.TP
 \fB-V\fP, \fB--version\fP
 Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
 standard output and then exit. Anything else on the command line is
@@ -711,9 +721,9 @@
 \fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
 \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
 \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
-\fB--output\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to
-\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
-capturing parentheses number.
+\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
+options are specific to \fBpcre2grep\fP, as is the use of the
+\fB--only-matching\fP option with a capturing parentheses number.
 .P
 Although most of the common options work the same way, a few are different in
 \fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
@@ -884,6 +894,6 @@
 .rs
 .sp
 .nf
-Last updated: 24 November 2018
-Copyright (c) 1997-2018 University of Cambridge.
+Last updated: 28 May 2019
+Copyright (c) 1997-2019 University of Cambridge.
 .fi


Modified: code/trunk/src/pcre2grep.c
===================================================================
--- code/trunk/src/pcre2grep.c    2019-05-25 16:31:38 UTC (rev 1095)
+++ code/trunk/src/pcre2grep.c    2019-05-28 14:14:22 UTC (rev 1096)
@@ -13,7 +13,7 @@
 The header can be found in the special z/OS distribution, which is available
 from www.zaconsultants.net or from www.cbttape.org.


-           Copyright (c) 1997-2018 University of Cambridge
+           Copyright (c) 1997-2019 University of Cambridge


-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -110,6 +110,19 @@
#define snprintf _snprintf
#endif

+/* VC and older compilers don't support %td or %zu, and even some that claim to
+be C99 don't support them (hence DISABLE_PERCENT_ZT). */
+
+#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(DISABLE_PERCENT_ZT)
+#define PTR_FORM "lu"
+#define SIZ_FORM "lu"
+#define SIZ_CAST (unsigned long int)
+#else
+#define PTR_FORM "td"
+#define SIZ_FORM "zu"
+#define SIZ_CAST
+#endif
+
#define FALSE 0
#define TRUE 1

@@ -451,6 +464,7 @@
   { OP_NODATA,    's',      NULL,              "no-messages",   "suppress error messages" },
   { OP_NODATA,    't',      NULL,              "total-count",   "print total count of matching lines" },
   { OP_NODATA,    'u',      NULL,              "utf",           "use UTF mode" },
+  { OP_NODATA,    'U',      NULL,              "utf-allow-invalid", "use UTF mode, allow for invalid code units" },
   { OP_NODATA,    'V',      NULL,              "version",       "print version information and exit" },
   { OP_NODATA,    'v',      NULL,              "invert-match",  "select non-matching lines" },
   { OP_NODATA,    'w',      NULL,              "word-regex(p)", "force patterns to match only as words"  },
@@ -1733,6 +1747,15 @@
   fprintf(stderr, "%s", msg);
   FWRITE_IGNORE(matchptr, 1, slen, stderr);   /* In case binary zero included */
   fprintf(stderr, "\n\n");
+  if (*mrc <= PCRE2_ERROR_UTF8_ERR1 &&
+      *mrc >= PCRE2_ERROR_UTF8_ERR21)
+    {
+    unsigned char mbuffer[256];
+    PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
+    (void)pcre2_get_error_message(*mrc, mbuffer, sizeof(mbuffer));
+    fprintf(stderr, "%s at offset %" SIZ_FORM "\n\n", mbuffer, 
+      SIZ_CAST startchar);
+    }
   if (*mrc == PCRE2_ERROR_MATCHLIMIT || *mrc == PCRE2_ERROR_DEPTHLIMIT ||
       *mrc == PCRE2_ERROR_HEAPLIMIT || *mrc == PCRE2_ERROR_JIT_STACKLIMIT)
     resource_error = TRUE;
@@ -3401,6 +3424,7 @@
   case 's': silent = TRUE; break;
   case 't': show_total_count = TRUE; break;
   case 'u': options |= PCRE2_UTF; utf = TRUE; break;
+  case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
   case 'v': invert = TRUE; break;
   case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
   case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;


Modified: code/trunk/testdata/grepoutput8
===================================================================
(Binary files differ)