Re: [exim] I've modified Erik Mugele's exim_surbl.pl utility…

Pàgina inicial
Delete this message
Reply to this message
Autor: Marc Perkel
Data:  
A: exim-users
Assumptes vells: [exim] I've modified Erik Mugele's exim_surbl.pl utility - NEW VERSION
Assumpte: Re: [exim] I've modified Erik Mugele's exim_surbl.pl utility - PREFORMAT
Maybe this will be better to avoid line wrap.

#
# Copyright (c) 2006-2007 Erik Mugele.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTES
# -----
#
# 1. This script makes use of the Country Code Top Level 
# Domains (ccTLD) provided by the SURBL group at
# http://spamcheck.freeapp.net/two-level-tlds  
# THE VARIABLE $cctld_file MUST BE SET TO THE FULL PATH AND 
# NAME OF THE FILE CONTAINING THE CCTLD LIST!  (see below)
#
# 2. This script makes use of whitelisting of popular domains.  The 
# source of the list can be found here: 
# http://spamassassin.apache.org/full/3.1.x/dist/rules/25_uribl.cf
# These are domains that are whitelisted by the SURBL group so it
# doesn't make sense to waste resources doing lookups on them.
# THE VARIABLE $whitelist_file MUST BE SET TO THE FULL PATH AND
# NAME OF THE FILE CONTAINING THE WHITE LIST!  (see below)          
# 
# 3. Per the guidelines at http://www.surbl.org, if your site processes
# more than 100,000 messages per day, you should NOT be using the 
# public SURBL name servers but should be rsync-ing from them and 
# running your own.  See http://www3.surbl.org/rsync-signup.html
#


use strict;
use warnings;

sub surblspamcheck
{
# Designed and written by Erik Mugele, 2004-2006
# http://www.teuton.org/~ejm
# Version 2.0
# Modified by Marc Perkel - marc@??? - http://www.junkemailfilter.com

    # The following variable is the full path to the file containing the 
    # list of Country Code Top Level Domains (ccTLD).
    # ---------------------------------------------------------------------
    # THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE 
    # CONTAINING THE CCTLD LIST!
    # ---------------------------------------------------------------------
    my $cctld_file = "/etc/exim/ccTLD.txt";    


    # The following variable is the full path to the file containing
    # whitelist entries.  
    # ---------------------------------------------------------------------
    # THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE 
    # CONTAINING THE WHITELIST DOMAINS!
    # ---------------------------------------------------------------------
    my $whitelist_file = "/etc/exim/surbl_whitelist.txt";


    # This variable defines the maximum MIME file size that will be checked
    # if this script is called by the MIME ACL.  This is primarily to
    # keep the load down on the server.  Size is in bytes.
    my $max_file_size = 50000;


    # The following two variables enable or disable the SURBL and URIBL
    # lookups.  Set to 1 to enable and 0 to disable.
    my $surbl_enable = 1;
    my $uribl_enable = 1;
    my $dob_enable = 1;
    my $karma_enable = 1;


    # Check to see if a decode MIME attachment is being checked or 
    # just a plain old text message with no attachments
    my $exim_body = "";
    my $mime_filename = Exim::expand_string('$mime_decoded_filename');
    if ($mime_filename) {
        # DEBUG Statement
        #warn ("MIME FILENAME: $mime_filename\n");
        # If the MIME file is too large, skip it.
        if (-s $mime_filename <= $max_file_size) {
            open(INFILE,"<$mime_filename");
            binmode(INFILE);
            while (read(INFILE,my $buff,1024)) {
                $exim_body .= $buff;
            }
            close (INFILE);
        } else {
            $exim_body = "";
        }
    } else {
        $exim_body = Exim::expand_string('$message_body');
    }


    sub surbllookup {
        # This subroutine does the actual DNS lookup and builds and returns
        # the return message for the SURBL lookup.
        my @params = @_;
        my($return_string);
        my $surbldomain = ".multi.surbl.org";
        my @dnsbladdr=gethostbyname($params[0].$surbldomain);
        # If gethostbyname() returned anything, build a return message.
        $return_string = "";
        if (scalar(@dnsbladdr) != 0) {
            $return_string = "Blacklisted URL in message. (".$params[0].") in";
            my @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
            if ($surblipaddr[3] & 64) {
                $return_string .= " [jp]";
            }
            if ($surblipaddr[3] & 32) {
                $return_string .= " [ab]";
            }
            if ($surblipaddr[3] & 16) {
                $return_string .= " [ob]";
            }
            if ($surblipaddr[3] & 8) {
                $return_string .= " [ph]";
            }
            if ($surblipaddr[3] & 4) {
                $return_string .= " [ws]";
            }
            if ($surblipaddr[3] & 2) {
                $return_string .= " [sc]";
            }
            $return_string .= ". See http://www.surbl.org/lists.html.";
        }
        return $return_string;
    }


    sub uribllookup {
        # This subroutine does the actual DNS lookup and builds and returns
        # the return message for the URIBL check.
        my @params = @_;
        my($return_string);
        my $surbldomain = ".black.uribl.com";
        my @dnsbladdr=gethostbyname($params[0].$surbldomain);
        # If gethostbyname() returned anything, build a return message.
        $return_string = "";
        if (scalar(@dnsbladdr) != 0) {
            $return_string = "Blacklisted URL in message. (".$params[0].") in";
            my @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
            if ($surblipaddr[3] & 8) {
                $return_string .= " [red]";
            }
            if ($surblipaddr[3] & 4) {
                $return_string .= " [grey]";
            }
            if ($surblipaddr[3] & 2) {
                $return_string .= " [black]";
            }
            $return_string .= ". See http://lookup.uribl.com.";
        }
        return $return_string;
    }


    # This list returns domain name under 5 days old
    sub doblookup {
        # This subroutine does the actual DNS lookup and builds and returns
        # the return message for the URIBL check.
        my @params = @_;
        my($return_string);
        my $dobdomain = ".dob.sibl.support-intelligence.net";
        my @dnsbladdr=gethostbyname($params[0].$dobdomain);
        # If gethostbyname() returned anything, build a return message.
        $return_string = "";
        if (scalar(@dnsbladdr) != 0) {
            $return_string = "Blacklisted URL in message. (".$params[0].") in DOB.";
        }
        return $return_string;
    }


    # http://wiki.junkemailfilter.com/index.php/Spam_DNS_Lists
    sub karmalookup {
        # If white listed - skip other lookups
        my @params = @_;
        my($return_string);
        my $karmadomain = ".hostkarma.junkemailfilter.com";
        my @dnsbladdr=gethostbyname($params[0].$karmadomain);
        # If gethostbyname() returned anything, build a return message.
        $return_string = "";
        if (scalar(@dnsbladdr) != 0) {
            my @karmaipaddr = unpack('C4',($dnsbladdr[4])[0]);


            # Junk Email Filter now has a Day old Bread List
            if ($karmaipaddr[3] eq 6) {
                $return_string = "Blacklisted URL in message. (".$params[0].") in HOSTKARMA DOB.";
            }


            # This is mostly for caching purposes
            if ($karmaipaddr[3] eq 2) {
                $return_string = "Blacklisted URL in message. (".$params[0].") in HOSTKARMA.";
            }


            # Exclude HostHarma White List
            if ($karmaipaddr[3] eq 1) {
                $return_string = "whitelisted";
            }


            # Exclude HostHarma Yellow List
            if ($karmaipaddr[3] eq 3) {
                $return_string = "whitelisted";
            }


            # Exclude HostHarma NoBl List
            if ($karmaipaddr[3] eq 5) {
                $return_string = "whitelisted";
            }
        }
        return $return_string;
    }


    sub converthex {
        # This subroutin converts two hex characters to an ASCII character.
        # It is called when ASCII obfuscation or Printed-Quatable characters
        # are found (i.e. %AE or =AE).
        # It should return a converted/plain address after splitting off
        # everything that isn't part of the address portion of the URL.
        my @ob_parts = @_;
        my $address = $ob_parts[0];
        for (my $j=1; $j < scalar(@ob_parts); $j++) {
            $address .= chr(hex(substr($ob_parts[$j],0,2)));
            $address .= substr($ob_parts[$j],2,);
        }
        $address = (split(/[^A-Za-z0-9._\-]/,$address))[0];
        return $address
    }


    ################
    # Main Program #
    ################


    if ($exim_body) {
        # Find all the URLs in the message by finding the HTTP string
        my @parts = split /[hH][tT][tT][pP]:\/\//,$exim_body;
        if (scalar(@parts) > 1) {
            my(@lookupdomains,@whitelist,@cctlds);
            # Read the entries from the ccTLD file.
            open (cctld_handle,$cctld_file) or die "Can't open $cctld_file.\n";
            while (<cctld_handle>) {
                next if (/^#/ || /^$/ || /^\s$/);
                push(@cctlds,$_);
            }
            close (cctld_handle) or die "Close: $!\n";
            my($address);


            # Read the entries from the whitelist file.
            open (whitelist_handle,$whitelist_file) or die "Can't open $whitelist_file.\n";
            while (<whitelist_handle>) {
                next if (/^#/ || /^$/ || /^\s$/);
                push(@whitelist,$_);
            }
            close (whitelist_handle) or die "Close: $!\n";


            # Go through each of the HTTP parts that were found in the message
            for (my $i=1; $i < scalar(@parts); $i++) {
                # Special case of Quoted Printable EOL marker
                my ($return_result,@domain);
                $parts[$i] =~ s/=\n//g;
                    # Split the parts and find the address portion of the URL.
                # Address SHOULD be either a FQDN, IP address, or encoded address.
                $address = (split(/[^A-Za-z0-9\._\-%=]/,$parts[$i]))[0];
                # Check for an =.  If it exists, we assume the URL is doing 
                # Quoted-Printable.  Decode it and redine $address
                if ($address =~ /=/) {
                    my @ob_parts = split /=/,$address;
                    $address = converthex(@ob_parts);
                }
                # Check for a %.  If it exists the URL is using % ASCII
                # obfuscation.  Decode it and redefine $address.
                if ($address =~ /%/) {
                    my @ob_parts = split /%/,$address;
                    $address = converthex(@ob_parts);
                }
                # Split the the address into the elements separated by periods.
                @domain = split /\./,$address;
                # Check the length of the domain name.  If less then two elements
                # at this point it is probably bogus or there is a bug in one of 
                # the decoding/converting routines above.
                if (scalar(@domain) >= 2) {
                    my($spamcheckdomain);
                    $return_result="";
                    # By default, assume that the domain check is on a 
                    # "standard" two level domain
                    $spamcheckdomain=$domain[-2].".".$domain[-1];
                    # Check for a two level domain
                    if (((scalar(@domain) == 2) || (scalar(@domain) >= 5))  && 
                        (grep(/^$spamcheckdomain$/i,@cctlds))) {
                        $return_result="cctld";
                    }
                    # Check for a three level domain
                    if (scalar(@domain) == 3) {
                        if (grep(/^$spamcheckdomain$/i,@cctlds)) {
                            $spamcheckdomain=$domain[-3].".".$spamcheckdomain;
                            if (grep(/^$spamcheckdomain$/,@cctlds)) {
                                $return_result="cctld";
                            }
                        }
                    }
                    # Check for a four level domain
                    if (scalar(@domain) == 4) {
                        # Check to see if the domain is an IP address
                        if ($domain[-1] =~ /[a-zA-Z]/) {
                            if (grep(/^$spamcheckdomain$/i,@cctlds)) {
                                $spamcheckdomain=$domain[-3].".".$spamcheckdomain;
                                if (grep(/^$spamcheckdomain$/i,@cctlds)) {
                                    $spamcheckdomain=$domain[-4].".".$spamcheckdomain;
                                }
                            }
                        } else {
                            # Domain is an IP address
                            $spamcheckdomain=$domain[3].".".$domain[2].
                                ".".$domain[1].".".$domain[0];
                        }
                    }
                    # DEBUG statement
                    #warn ("FOUND DOMAIN ($mime_filename): $spamcheckdomain\n");
                    # If whitelisting is enabled check domain against the 
                    # whitelist.
                    if ($whitelist_file ne "") {
                        foreach my $whitelist_entry (@whitelist) {
                            chomp($whitelist_entry);
                            if ($spamcheckdomain =~ m/^$whitelist_entry$/i) {
                                $return_result="whitelisted";
                                last;
                            }
                        }
                    }
                    # If the domain is whitelisted or in the cctld skip adding
                    # it to the lookup list.
                    if ($return_result eq "") {
                        if (scalar(@lookupdomains) > 0) {
                            # Check so see if the domain already is in the list.
                            if (not grep(/^$spamcheckdomain$/i,@lookupdomains)) {
                                    push(@lookupdomains,$spamcheckdomain);
                            }
                        } else {
                            push(@lookupdomains,$spamcheckdomain);
                        }
                    }
                }
            }
            # If there are items in the lookupdomains list then
            # perform lookups on them.  If there are not, something is wrong
            # and just return false.  There should always be something in the list.
            if (scalar(@lookupdomains) > 0) {
                foreach my $i (@lookupdomains) {
                    my $return_result;
                    # DEBUG statement.
                    #warn ("CHECKING DOMAIN ($mime_filename): $i\n");


                    # do a HOSTKARMA lookup - contains whitelist information
                    # http://wiki.junkemailfilter.com/index.php/Spam_DNS_Lists
                    if (($karma_enable == 1) && ($return_result eq "")) {
                        $return_result = karmalookup($i);
                    }


                    # do an SURBL lookup
                    if (($surbl_enable == 1) && ($return_result eq "")) {
                        $return_result = surbllookup($i);
                    }


                    # do a URIBL lookup
                    if (($uribl_enable == 1) && ($return_result eq "")) {
                        $return_result = uribllookup($i);
                    }


                    # do a Day Old Bread lookup - domains under 5 days old
                    if (($dob_enable == 1) && ($return_result eq "")) {
                        $return_result = doblookup($i);
                    }


                    # If we got a hit return the result to Exim
                    if ($return_result ne "") {
                        return $return_result;
                    }
                }
            }
        }
    }
    # We didn't find any URLs or the URLs we did find were not
    # listed so return false.
    return -1;
}