Revision: 947
http://vcs.pcre.org/viewvc?view=rev&revision=947
Author: ph10
Date: 2012-03-04 16:51:13 +0000 (Sun, 04 Mar 2012)
Log Message:
-----------
Add support for binary files to pcregrep.
Modified Paths:
--------------
code/trunk/ChangeLog
code/trunk/RunGrepTest
code/trunk/doc/pcregrep.1
code/trunk/pcregrep.c
Added Paths:
-----------
code/trunk/testdata/grepbinary
Modified: code/trunk/ChangeLog
===================================================================
--- code/trunk/ChangeLog 2012-02-29 18:00:55 UTC (rev 946)
+++ code/trunk/ChangeLog 2012-03-04 16:51:13 UTC (rev 947)
@@ -69,7 +69,10 @@
18. Added --file-list option to pcregrep.
+19. Added binary file support to pcregrep, including the -a, --binary-files,
+ -I, and --text options.
+
Version 8.30 04-February-2012
-----------------------------
Modified: code/trunk/RunGrepTest
===================================================================
--- code/trunk/RunGrepTest 2012-02-29 18:00:55 UTC (rev 946)
+++ code/trunk/RunGrepTest 2012-03-04 16:51:13 UTC (rev 947)
@@ -415,6 +415,38 @@
(cd $srcdir; $valgrind $pcregrep --file-list=./testdata/grepfilelist "dolor" ./testdata/grepinput3) >>testtry 2>&1
echo "RC=$?" >>testtry
+echo "---------------------------- Test 86 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 87 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep "cat" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 88 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -v "cat" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 89 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -I "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 90 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep --binary-files=without-match "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 91 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep -a "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 92 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep --binary-files=text "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
+echo "---------------------------- Test 93 -----------------------------" >>testtry
+(cd $srcdir; $valgrind $pcregrep --text "dog" ./testdata/grepbinary) >>testtry 2>&1
+echo "RC=$?" >>testtry
+
# Now compare the results.
$cf $srcdir/testdata/grepoutput testtry
Modified: code/trunk/doc/pcregrep.1
===================================================================
--- code/trunk/doc/pcregrep.1 2012-02-29 18:00:55 UTC (rev 946)
+++ code/trunk/doc/pcregrep.1 2012-03-04 16:51:13 UTC (rev 947)
@@ -95,6 +95,15 @@
standard input is always so treated.
.
.
+.SH "BINARY FILES"
+.rs
+.sp
+By default, a file that contains a binary zero byte within the first 1024 bytes
+is identified as a binary file, and is processed specially. (GNU grep also
+identifies binary files in this manner.) See the \fB--binary-files\fP option
+for a means of changing the way binary files are handled.
+.
+.
.SH OPTIONS
.rs
.sp
@@ -117,6 +126,10 @@
of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
guarantees to have up to 8K of following text available for context output.
.TP
+\fB-a\fP, \fB--text\fP
+Treat binary files as text. This is equivalent to
+\fB--binary-files\fP=\fItext\fP.
+.TP
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
Output \fInumber\fP lines of context before each matching line. If filenames
and/or line numbers are being output, a hyphen separator is used instead of a
@@ -125,6 +138,17 @@
of \fInumber\fP is expected to be relatively small. However, \fBpcregrep\fP
guarantees to have up to 8K of preceding text available for context output.
.TP
+\fB--binary-files=\fP\fIword\fP
+Specify how binary files are to be processed. If the word is "binary" (the
+default), pattern matching is performed on binary files, but the only output is
+"Binary file <name> matches" when a match succeeds. If the word is "text",
+which is equivalent to the \fB-a\fP or \fB--text\fP option, binary files are
+processed in the same way as any other file. In this case, when a match
+succeeds, the output may be binary garbage, which can have nasty effects if
+sent to a terminal. If the word is "without-match", which is equivalent to the
+\fB-I\fP option, binary files are not processed at all; they are assumed not to
+be of interest.
+.TP
\fB--buffer-size=\fP\fInumber\fP
Set the parameter that controls how much memory is used for buffering files
that are being scanned.
@@ -265,6 +289,10 @@
Output a help message, giving brief details of the command options and file
type support, and then exit.
.TP
+\fB-I\fP
+Treat binary files as never matching. This is equivalent to
+\fB--binary-files\fP=\fIwithout-match\fP.
+.TP
\fB-i\fP, \fB--ignore-case\fP
Ignore upper/lower case distinctions during comparisons.
.TP
@@ -493,7 +521,7 @@
.rs
.sp
Many of the short and long forms of \fBpcregrep\fP's options are the same
-as in the GNU \fBgrep\fP program (version 2.5.4). Any long option of the form
+as in the GNU \fBgrep\fP program. Any long option of the form
\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
(PCRE terminology). However, the \fB--file-list\fP, \fB--file-offsets\fP,
\fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP,
@@ -588,6 +616,6 @@
.rs
.sp
.nf
-Last updated: 28 February 2012
+Last updated: 04 March 2012
Copyright (c) 1997-2012 University of Cambridge.
.fi
Modified: code/trunk/pcregrep.c
===================================================================
--- code/trunk/pcregrep.c 2012-02-29 18:00:55 UTC (rev 946)
+++ code/trunk/pcregrep.c 2012-03-04 16:51:13 UTC (rev 947)
@@ -104,6 +104,10 @@
enum { EL_LF, EL_CR, EL_CRLF, EL_ANY, EL_ANYCRLF };
+/* Binary file options */
+
+enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
+
/* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
environments), a warning is issued if the value of fwrite() is ignored.
Unfortunately, casting to (void) does not suppress the warning. To get round
@@ -160,6 +164,7 @@
static int after_context = 0;
static int before_context = 0;
+static int binary_files = BIN_BINARY;
static int both_context = 0;
static int bufthird = PCREGREP_BUFSIZE;
static int bufsize = 3*PCREGREP_BUFSIZE;
@@ -197,7 +202,7 @@
/* Structure for options and list of them */
enum { OP_NODATA, OP_STRING, OP_OP_STRING, OP_NUMBER, OP_LONGNUMBER,
- OP_OP_NUMBER, OP_PATLIST };
+ OP_OP_NUMBER, OP_PATLIST, OP_BINFILES };
typedef struct option_item {
int type;
@@ -227,12 +232,15 @@
#define N_BUFSIZE (-15)
#define N_NOJIT (-16)
#define N_FILE_LIST (-17)
+#define N_BINARY_FILES (-18)
static option_item optionlist[] = {
- { OP_NODATA, N_NULL, NULL, "", " terminate options" },
+ { OP_NODATA, N_NULL, NULL, "", "terminate options" },
{ OP_NODATA, N_HELP, NULL, "help", "display this help and exit" },
{ OP_NUMBER, 'A', &after_context, "after-context=number", "set number of following context lines" },
+ { OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
{ OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
+ { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
@@ -247,6 +255,7 @@
{ OP_NODATA, N_FOFFSETS, NULL, "file-offsets", "output file offsets, not text" },
{ OP_NODATA, 'H', NULL, "with-filename", "force the prefixing filename on output" },
{ OP_NODATA, 'h', NULL, "no-filename", "suppress the prefixing filename on output" },
+ { OP_NODATA, 'I', NULL, "", "treat binary files as not matching (ignore)" },
{ OP_NODATA, 'i', NULL, "ignore-case", "ignore case distinctions" },
#ifdef SUPPORT_PCREGREP_JIT
{ OP_NODATA, N_NOJIT, NULL, "no-jit", "do not use just-in-time compiler optimization" },
@@ -1047,6 +1056,7 @@
char *ptr = main_buffer;
char *endptr;
size_t bufflength;
+BOOL binary = FALSE;
BOOL endhyphenpending = FALSE;
BOOL input_line_buffered = line_buffered;
FILE *in = NULL; /* Ensure initialized */
@@ -1094,6 +1104,17 @@
endptr = main_buffer + bufflength;
+/* Unless binary-files=text, see if we have a binary file. This uses the same
+rule as GNU grep, namely, a search for a binary zero byte near the start of the
+file. */
+
+if (binary_files != BIN_TEXT)
+ {
+ binary =
+ memchr(main_buffer, 0, (bufflength > 1024)? 1024 : bufflength) != NULL;
+ if (binary && binary_files == BIN_NOMATCH) return 1;
+ }
+
/* Loop while the current pointer is not at the end of the file. For large
files, endptr will be at the end of the buffer when we are in the middle of the
file, but ptr will never get there, because as soon as it gets over 2/3 of the
@@ -1209,6 +1230,16 @@
/* Just count if just counting is wanted. */
if (count_only) count++;
+
+ /* When handling a binary file and binary-files==binary, the "binary"
+ variable will be set true (it's false in all other cases). In this
+ situation we just want to output the file name. No need to scan further. */
+
+ else if (binary)
+ {
+ fprintf(stdout, "Binary file %s matches\n", filename);
+ return 0;
+ }
/* If all we want is a file name, there is no need to scan any more lines
in the file. */
@@ -1845,11 +1876,18 @@
contains an underscore. */
if (strchr(op->long_name, '_') != NULL) continue;
+
+ if (op->one_char > 0 && (op->long_name)[0] == 0)
+ n = 31 - printf(" -%c", op->one_char);
+ else
+ {
+ if (op->one_char > 0) sprintf(s, "-%c,", op->one_char);
+ else strcpy(s, " ");
+ n = 31 - printf(" %s --%s", s, op->long_name);
+ }
- if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " ");
- n = 31 - printf(" %s --%s", s, op->long_name);
if (n < 1) n = 1;
- printf("%.*s%s\n", n, " ", op->help_text);
+ printf("%.*s%s\n", n, " ", op->help_text);
}
printf("\nNumbers may be followed by K or M, e.g. --buffer-size=100K.\n");
@@ -1880,9 +1918,11 @@
case N_LBUFFER: line_buffered = TRUE; break;
case N_LOFFSETS: line_offsets = number = TRUE; break;
case N_NOJIT: study_options &= ~PCRE_STUDY_JIT_COMPILE; break;
+ case 'a': binary_files = BIN_TEXT; break;
case 'c': count_only = TRUE; break;
case 'F': process_options |= PO_FIXED_STRINGS; break;
case 'H': filenames = FN_FORCE; break;
+ case 'I': binary_files = BIN_NOMATCH; break;
case 'h': filenames = FN_NONE; break;
case 'i': options |= PCRE_CASELESS; break;
case 'l': omit_zero_count = TRUE; filenames = FN_MATCH_ONLY; break;
@@ -2316,7 +2356,7 @@
/* If the option type is OP_PATLIST, it's the -e option, which can be called
multiple times to create a list of patterns. */
-
+
if (op->type == OP_PATLIST)
{
if (cmd_pattern_count >= MAX_PATTERN_COUNT)
@@ -2327,6 +2367,24 @@
}
patterns[cmd_pattern_count++] = option_data;
}
+
+ /* Handle OP_BINFILES */
+
+ else if (op->type == OP_BINFILES)
+ {
+ if (strcmp(option_data, "binary") == 0)
+ binary_files = BIN_BINARY;
+ else if (strcmp(option_data, "without-match") == 0)
+ binary_files = BIN_NOMATCH;
+ else if (strcmp(option_data, "text") == 0)
+ binary_files = BIN_TEXT;
+ else
+ {
+ fprintf(stderr, "pcregrep: unknown value \"%s\" for binary-files\n",
+ option_data);
+ pcregrep_exit(usage(2));
+ }
+ }
/* Otherwise, deal with single string or numeric data values. */
Added: code/trunk/testdata/grepbinary
===================================================================
(Binary files differ)
Property changes on: code/trunk/testdata/grepbinary
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream