Re: [exim] Pipe message from filter to return header

Top Page
Delete this message
Reply to this message
Author: Marc Perkel
Date:  
To: Edgar Lovecraft, exim-users
CC: 
Subject: Re: [exim] Pipe message from filter to return header
OK - I got your code and grafted it into my system and it's working. The
new bayesian filter is going to have to learn for a few days to take
advantage of the phone numbers - but I believe that it will make a
significant difference - especially with identifying nonspam. People
using their phone number in the sig line is a great white rule.

That you very much for this code.

Edgar Lovecraft wrote:

>Marc Perkel wrote:
>
>
>>OK - here's what I want to do - I want to run a program from the system
>>filter and return the results in a header - something like this:
>>headers add "X-Spamprobe: ${run {/etc/exim/scripts/spamprobe-test
>>$message_id}{$value}{}}"
>>
>>However - I need to pipe the message into the program that I'm trying to
>>get the results from. How do I do the piping part?
>>
>>
>
>Last I knew, you can't, you have to go get the spooled files.
>Any way, here is the latest rendition of what started as a simple
>script for you a couple messages back, as I figure that you are just
>wrapping what I have given you this should do all that you need, plus
>US phone numbers. Let me know what all you come up with out of this.
>
>Cheers!
>
><PERL_SCRIPT>
>#!/usr/bin/perl -w
>
># This can be called from the command line with a pipe as in:
>#       './script <mbox_file' or 'cat mbox_file |./script'
>#
># This can also be called with two command arguments as in:
>#       ./script <Exim_message-ID> <sender_address>
>#
># If called with a Exim_message-ID, the message-ID MUST come first
>#    as we use the message-ID parameter to grab -H and -D files
>#    from the Exim spool directories, or from the Exiscan 'scan'
>#    directory.

>
>use strict;
>use IO::File;
>
># Define two constants for the Exim and Exiscan directories.
>use constant EXIM_SPOOL_DIR => '/var/spool/exim-data/input/';
>use constant EXIM_XSCAN_DIR => '/var/spool/exim-data/scan/';
>
># Define the File Handle as STDIN and mbox as ''
>my $FH = 'STDIN';
>
># If we are given an Exim Message-ID and NOT an MBOX file through STDIN
>#    then we need to make a temporary MBOX file to parse.
>#    just wrap in an eval{} to help with fatal errors
>if ( $ARGV[0] )
>{
>  eval{ ($FH) = make_mbox($ARGV[0],$ARGV[1]) };
>  exit 1 if !$FH;
>}

>
># Parse the temporary MBOX file or STDIN and wrap in an eval{} to help
>#    with fatal errors.
>eval{ parse_mbox_file($FH) };

>
># Exit the script with 0 unless $@ else exit 1
>exit 1 if $@;
>exit 0;
>
># Used to create a temporary MBOX file
>sub make_mbox
>{
>#    $msgID  : Exim Message-ID
>#    $sender : Message sender as defined by Exim $sender_address
>#    $dta[12]: Exim message data file location   (1 is for split_spool)
>#    $hdr[12]: Exim message header file location (1 is for split_spool)
>#    $eml    : This can be used to grab an Exiscan eml file
>#    $dir    : This is the dir location ONLY FOR split spool instalations
>#    $msgfh  : The File Handle for the message data
>#    $hdrfh  : The File Handle for the message header
>#    $mbxfh  : The File Handle for the temporary mbox File
>#    $from   : Fake a MBOX From line as 'From $sender localtime()'
>#
># Exit from the sub if we cannot open a message data File Handle.
>#    Set autoflush on for the temporary mbox file.
>  my $msgID   = $_[0];
>  my $sender  = $_[1] || '-NUL-';
>  my $dir     = $1 if $msgID =~ /^.{5}(.)-.{6}-.{2}/;
>  my $dta1    = EXIM_SPOOL_DIR.$dir."/".$msgID."-D";
>  my $dta2    = EXIM_SPOOL_DIR."/".$msgID."-D";
>  my $hdr1    = EXIM_SPOOL_DIR.$dir."/".$msgID."-H";
>  my $hdr2    = EXIM_SPOOL_DIR."/".$msgID."-H";
>  my $eml     = EXIM_XSCAN_DIR.$msgID."/".$msgID.".eml";
>  my $msgfh   = IO::File->new($dta1,O_RDONLY) ||
>                IO::File->new($dta2,O_RDONLY) ||
>                IO::File->new($eml,O_RDONLY)  || return('');
>  my $hdrfh   = IO::File->new($hdr1,O_RDONLY) ||
>                IO::File->new($hdr2,O_RDONLY) || '';
>  my $mbxfh   = IO::File->new_tmpfile; $mbxfh->autoflush(1);
>  my $from    = "From ".$sender." ".scalar localtime();

>
># Now we need to create a fake MBOX file from the two seperate
>#    files, or we just use the already made eml file.  We check
>#    to see if we opened an Exim data file, it has the Message-ID
>#    as the very first line.  #Exit from the sub if we have an
>#    Exim data file but no Exim header file.
>#
># redundant yes, but ensure we have the top line.
>  $msgfh->seek(0,0); my $line = $msgfh->getline();
>  my $_is_exim_data = '';
>     $_is_exim_data = 1 if $line =~ /^$msgID-D$/;
>  if ( !$_is_exim_data )
>  {
>    # We have an Exiscan file, these are easy just add the
>    # fake 'From ' line to the top and then dump the Exiscan
>    # eml file after that.  Close the $msgfh and $hdrfh then
>    # return the MBOX File Handle.
>    $mbxfh->seek(0,0);
>    $mbxfh->print($from);
>    # We need to rewind the eml file so that we get the very first line.
>    $msgfh->seek(0,0);
>    while (defined($_ = $msgfh->getline()))
>    {
>      $mbxfh->print($_);
>    }
>    undef $msgfh; undef $hdrfh;
>    $mbxfh->seek(0,0);
>    return($mbxfh);
>  }

>
># We have an Exim message data file, so we need an Exim header file too
>#    exit from the sub if we do not have one.
>  return('') if !$hdrfh;

>
># We can write the fake 'From ' line, then we need to get only the
>#    message headers from the header file and write those, then we can
>#    get the Exim data file remove the first line and write the rest.
>#
># Exit from the sub if the header file does not have the message-ID as
>#    the first line.
>#
># redundant yes, but ensure we have the top line.
>  $hdrfh->seek(0,0); $line = $hdrfh->getline();
>  return('') if $line !~ /^$msgID-H$/;
>  $mbxfh->seek(0,0);
>  $mbxfh->print($from);
>  my $_firstHeader = 0;
>  while (defined($_ = $hdrfh->getline()))
>  {
>    $_firstHeader = 1 if /^\s*$/;
>    next if !$_firstHeader;
>    s/^[0-9]{3,}[BCFIPRST* ]{1} //;
>    $mbxfh->print($_);
>  }
>  $mbxfh->print("\n");
>  while (defined($_ = $msgfh->getline()))
>  {
>    $mbxfh->print($_);
>  }
>  undef $msgfh; undef $hdrfh;
>  $mbxfh->seek(0,0);
>  return($mbxfh);
>}

>
># Used to parse the temporary MBOX file of STDIN
>sub parse_mbox_file
>{
># Grab the File Handle
> my $fh = shift;
>
># Setup the variables that we are going to use
>#    $_firstHeader: Keep track of the start of a message
>#    $_lastHeader : The header//body seperator for a message
>#    $_isText     : Skip any non text MIME parts
>#    %http        : http://<domains>/ found in a message
>#    %addr        : any email\@<domain> found in a message
>#    %phon        : any number sequence that MAY be a phone number
>  my ($_firstHeader,$_lastHeader,$_isText,%http,%addr,%phon) = (0,0,1,(),
>(),());

>
># Grap the information from STDIN, such as "script <file"
>#    or "cat file|script" etc.  The output is in the form
>#
># <NEW LINE>
># From ....
># header: ....
># header: ....
># header: ....
># <NEW LINE>
># http://domain ...
># http://domain ...
># email://address ...
># email://address ...
># phone://number ...
># <NEW LINE>
>  while (defined($_ = $fh->getline()))
>  {

>
># MBOX messages start with "From " and check for End of File
>#    as we need to print the http domains and email links
>#    if we found any before exiting.
>    if ( /^From / || eof )
>    {
>      ($_firstHeader,$_lastHeader)           = (1,0) if !eof;
>      print STDOUT sort keys %http and %http = ()    if %http;
>      print STDOUT sort keys %addr and %addr = ()    if %addr;
>      print STDOUT sort keys %phon and %phon = ()    if %phon;
>      s/^/\n/;
>    }

>
># If we are in the headers section of a message print the line.
>#    Set that we are done with the headers section at the first
>#    blank line we find, as that should be the message
>#    header//body seperator.
>    if ( $_firstHeader )
>    {
>      ($_firstHeader,$_lastHeader) = (0,1) if /^\s*$/;
>      print STDOUT;
>    }

>
># We only want the headers of a message and then only check
>#    none 'binary' MIME parts, like text/html, text/plain,
>#    that type of thing.  Or just comment these lines if you
>#    want the http and email stuff run on every part of a message
>#    no matter what the MIME part type is.
>    $_isText = 0 if /^Content-Type: /;
>    $_isText = 1 if /^Content-Type: text.*/i;
>    next if !$_isText;

>
># While a line contains http://<something> in it, strip out
>#    everything upto the first / (or space, etc) after http://
>#    this is not perfect but should do the job in most cases.
>#    If you only want http:// domains in the body parts only,
>#    then add '$_lastHeader && ' before the 'm{'.  Only add
>#    unique domain strings.
>    while ( m{(http://[A-Za-z0-9.-]+)}g )
>    {
>      $http{"$1\n"}++ if !$http{"$1\n"};
>    }

>
># While a line contains anything@something then grab the stuff
>#    that looks like it is an email address.  This is less
>#    perfect than the http stuff, but then again, checking for
>#    'valid' email addressess in something is near impossible ;)
>#    We do try and make the email address thing as clean as
>#    possible.  If you only want the email address looking strings
>#    from the body of a message then add '$_lastHeader && ' before
>#    the 'm{'.  Only add unique email strings.
>    while ( $_lastHeader && m{\s+([^[:space:]]+\@[A-Za-z0-9.-]+)}g )
>    {
>      my $email =  $1;
>      next if $email !~ /\@[A-Za-z0-9.-]+\..*/ || $email =~ /\@.*\@/;
>      $email =~ s/[)<\["'\]>(]|(?i:mailto:|href=)|\.$//g;
>      $addr{"email://$email\n"}++ if !$addr{"email://$email\n"};
>    }
># While a line contains anything that may be a phone number.
>#    this is rather loose and mainly for US numbers, I do not
>#    know how well, if at all, it will catch non-US numbers.
>    while ( $_lastHeader && m/(\d\d\d)[ ).-](\d\d\d)[ .-](\d\d\d\d)/g)
>    {
>      $phon{"phone://$1.$2.$3\n"}++ if !$phon{"phone://$1.$2.$3\n"};
>    }
>  }
>  return 0;
>}

>
>__END__
>
></PERL_SCRIPT>
>
>
>