OK - here's a new version of my modification. And - I have a new data
source that is allowing me to generate my own DOB list. So for those of
you who noticed problems with the other one - test this.
This version uses strict and warnings. Thanks to Todd Lyons for his
assistance on helping me with this.
#
# Copyright (c) 2006-2007 Erik Mugele. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTES
# -----
#
# 1. This script makes use of the Country Code Top Level # Domains
(ccTLD) provided by the SURBL group at
# http://spamcheck.freeapp.net/two-level-tlds # THE VARIABLE
$cctld_file MUST BE SET TO THE FULL PATH AND # NAME OF THE FILE
CONTAINING THE CCTLD LIST! (see below)
#
# 2. This script makes use of whitelisting of popular domains. The #
source of the list can be found here: #
http://spamassassin.apache.org/full/3.1.x/dist/rules/25_uribl.cf
# These are domains that are whitelisted by the SURBL group so it
# doesn't make sense to waste resources doing lookups on them.
# THE VARIABLE $whitelist_file MUST BE SET TO THE FULL PATH AND
# NAME OF THE FILE CONTAINING THE WHITE LIST! (see below) # #
3. Per the guidelines at http://www.surbl.org, if your site processes
# more than 100,000 messages per day, you should NOT be using the #
public SURBL name servers but should be rsync-ing from them and #
running your own. See http://www3.surbl.org/rsync-signup.html
#
use strict;
use warnings;
sub surblspamcheck
{
# Designed and written by Erik Mugele, 2004-2006
#
http://www.teuton.org/~ejm
# Version 2.0
# Modified by Marc Perkel - marc@??? -
http://www.junkemailfilter.com
# The following variable is the full path to the file containing the
# list of Country Code Top Level Domains (ccTLD).
# ---------------------------------------------------------------------
# THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE
# CONTAINING THE CCTLD LIST!
# ---------------------------------------------------------------------
my $cctld_file = "/etc/exim/ccTLD.txt"; # The following
variable is the full path to the file containing
# whitelist entries. #
---------------------------------------------------------------------
# THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE
# CONTAINING THE WHITELIST DOMAINS!
# ---------------------------------------------------------------------
my $whitelist_file = "/etc/exim/surbl_whitelist.txt";
# This variable defines the maximum MIME file size that will be
checked
# if this script is called by the MIME ACL. This is primarily to
# keep the load down on the server. Size is in bytes.
my $max_file_size = 50000;
# The following two variables enable or disable the SURBL and URIBL
# lookups. Set to 1 to enable and 0 to disable.
my $surbl_enable = 1;
my $uribl_enable = 1;
my $dob_enable = 1;
my $karma_enable = 1;
# Check to see if a decode MIME attachment is being checked or
# just a plain old text message with no attachments
my $exim_body = "";
my $mime_filename = Exim::expand_string('$mime_decoded_filename');
if ($mime_filename) {
# DEBUG Statement
#warn ("MIME FILENAME: $mime_filename\n");
# If the MIME file is too large, skip it.
if (-s $mime_filename <= $max_file_size) {
open(INFILE,"<$mime_filename");
binmode(INFILE);
while (read(INFILE,my $buff,1024)) {
$exim_body .= $buff;
}
close (INFILE);
} else {
$exim_body = "";
}
} else {
$exim_body = Exim::expand_string('$message_body');
}
sub surbllookup {
# This subroutine does the actual DNS lookup and builds and returns
# the return message for the SURBL lookup.
my @params = @_;
my($return_string);
my $surbldomain = ".multi.surbl.org";
my @dnsbladdr=gethostbyname($params[0].$surbldomain);
# If gethostbyname() returned anything, build a return message.
$return_string = "";
if (scalar(@dnsbladdr) != 0) {
$return_string = "Blacklisted URL in message.
(".$params[0].") in";
my @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
if ($surblipaddr[3] & 64) {
$return_string .= " [jp]";
}
if ($surblipaddr[3] & 32) {
$return_string .= " [ab]";
}
if ($surblipaddr[3] & 16) {
$return_string .= " [ob]";
}
if ($surblipaddr[3] & 8) {
$return_string .= " [ph]";
}
if ($surblipaddr[3] & 4) {
$return_string .= " [ws]";
}
if ($surblipaddr[3] & 2) {
$return_string .= " [sc]";
}
$return_string .= ". See http://www.surbl.org/lists.html.";
}
return $return_string;
}
sub uribllookup {
# This subroutine does the actual DNS lookup and builds and returns
# the return message for the URIBL check.
my @params = @_;
my($return_string);
my $surbldomain = ".black.uribl.com";
my @dnsbladdr=gethostbyname($params[0].$surbldomain);
# If gethostbyname() returned anything, build a return message.
$return_string = "";
if (scalar(@dnsbladdr) != 0) {
$return_string = "Blacklisted URL in message.
(".$params[0].") in";
my @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
if ($surblipaddr[3] & 8) {
$return_string .= " [red]";
}
if ($surblipaddr[3] & 4) {
$return_string .= " [grey]";
}
if ($surblipaddr[3] & 2) {
$return_string .= " [black]";
}
$return_string .= ". See http://lookup.uribl.com.";
}
return $return_string;
}
# This list returns domain name under 5 days old
sub doblookup {
# This subroutine does the actual DNS lookup and builds and returns
# the return message for the URIBL check.
my @params = @_;
my($return_string);
my $dobdomain = ".dob.sibl.support-intelligence.net";
my @dnsbladdr=gethostbyname($params[0].$dobdomain);
# If gethostbyname() returned anything, build a return message.
$return_string = "";
if (scalar(@dnsbladdr) != 0) {
$return_string = "Blacklisted URL in message.
(".$params[0].") in DOB.";
}
return $return_string;
}
# http://wiki.junkemailfilter.com/index.php/Spam_DNS_Lists
sub karmalookup {
# If white listed - skip other lookups
my @params = @_;
my($return_string);
my $karmadomain = ".hostkarma.junkemailfilter.com";
my @dnsbladdr=gethostbyname($params[0].$karmadomain);
# If gethostbyname() returned anything, build a return message.
$return_string = "";
if (scalar(@dnsbladdr) != 0) {
my @karmaipaddr = unpack('C4',($dnsbladdr[4])[0]);
# Junk Email Filter now has a Day old Bread List
if ($karmaipaddr[3] eq 6) {
$return_string = "Blacklisted URL in message.
(".$params[0].") in HOSTKARMA DOB.";
}
# This is mostly for caching purposes
if ($karmaipaddr[3] eq 2) {
$return_string = "Blacklisted URL in message.
(".$params[0].") in HOSTKARMA.";
}
# Exclude HostHarma White List
if ($karmaipaddr[3] eq 1) {
$return_string = "whitelisted";
}
# Exclude HostHarma Yellow List
if ($karmaipaddr[3] eq 3) {
$return_string = "whitelisted";
}
# Exclude HostHarma NoBl List
if ($karmaipaddr[3] eq 5) {
$return_string = "whitelisted";
}
}
return $return_string;
}
sub converthex {
# This subroutin converts two hex characters to an ASCII character.
# It is called when ASCII obfuscation or Printed-Quatable characters
# are found (i.e. %AE or =AE).
# It should return a converted/plain address after splitting off
# everything that isn't part of the address portion of the URL.
my @ob_parts = @_;
my $address = $ob_parts[0];
for (my $j=1; $j < scalar(@ob_parts); $j++) {
$address .= chr(hex(substr($ob_parts[$j],0,2)));
$address .= substr($ob_parts[$j],2,);
}
$address = (split(/[^A-Za-z0-9._\-]/,$address))[0];
return $address
}
################
# Main Program #
################
if ($exim_body) {
# Find all the URLs in the message by finding the HTTP string
my @parts = split /[hH][tT][tT][pP]:\/\//,$exim_body;
if (scalar(@parts) > 1) {
my(@lookupdomains,@whitelist,@cctlds);
# Read the entries from the ccTLD file.
open (cctld_handle,$cctld_file) or die "Can't open
$cctld_file.\n";
while (<cctld_handle>) {
next if (/^#/ || /^$/ || /^\s$/);
push(@cctlds,$_);
}
close (cctld_handle) or die "Close: $!\n";
my($address);
# Read the entries from the whitelist file.
open (whitelist_handle,$whitelist_file) or die "Can't open
$whitelist_file.\n";
while (<whitelist_handle>) {
next if (/^#/ || /^$/ || /^\s$/);
push(@whitelist,$_);
}
close (whitelist_handle) or die "Close: $!\n";
# Go through each of the HTTP parts that were found in the
message
for (my $i=1; $i < scalar(@parts); $i++) {
# Special case of Quoted Printable EOL marker
my ($return_result,@domain);
$parts[$i] =~ s/=\n//g;
# Split the parts and find the address portion of the
URL.
# Address SHOULD be either a FQDN, IP address, or encoded
address.
$address = (split(/[^A-Za-z0-9\._\-%=]/,$parts[$i]))[0];
# Check for an =. If it exists, we assume the URL is
doing # Quoted-Printable. Decode it and redine $address
if ($address =~ /=/) {
my @ob_parts = split /=/,$address;
$address = converthex(@ob_parts);
}
# Check for a %. If it exists the URL is using % ASCII
# obfuscation. Decode it and redefine $address.
if ($address =~ /%/) {
my @ob_parts = split /%/,$address;
$address = converthex(@ob_parts);
}
# Split the the address into the elements separated by
periods.
@domain = split /\./,$address;
# Check the length of the domain name. If less then two
elements
# at this point it is probably bogus or there is a bug in
one of # the decoding/converting routines above.
if (scalar(@domain) >= 2) {
my($spamcheckdomain);
$return_result="";
# By default, assume that the domain check is on a
# "standard" two level domain
$spamcheckdomain=$domain[-2].".".$domain[-1];
# Check for a two level domain
if (((scalar(@domain) == 2) || (scalar(@domain) >=
5)) && (grep(/^$spamcheckdomain$/i,@cctlds))) {
$return_result="cctld";
}
# Check for a three level domain
if (scalar(@domain) == 3) {
if (grep(/^$spamcheckdomain$/i,@cctlds)) {
$spamcheckdomain=$domain[-3].".".$spamcheckdomain;
if (grep(/^$spamcheckdomain$/,@cctlds)) {
$return_result="cctld";
}
}
}
# Check for a four level domain
if (scalar(@domain) == 4) {
# Check to see if the domain is an IP address
if ($domain[-1] =~ /[a-zA-Z]/) {
if (grep(/^$spamcheckdomain$/i,@cctlds)) {
$spamcheckdomain=$domain[-3].".".$spamcheckdomain;
if (grep(/^$spamcheckdomain$/i,@cctlds)) {
$spamcheckdomain=$domain[-4].".".$spamcheckdomain;
}
}
} else {
# Domain is an IP address
$spamcheckdomain=$domain[3].".".$domain[2].
".".$domain[1].".".$domain[0];
}
}
# DEBUG statement
#warn ("FOUND DOMAIN ($mime_filename):
$spamcheckdomain\n");
# If whitelisting is enabled check domain against the
# whitelist.
if ($whitelist_file ne "") {
foreach my $whitelist_entry (@whitelist) {
chomp($whitelist_entry);
if ($spamcheckdomain =~
m/^$whitelist_entry$/i) {
$return_result="whitelisted";
last;
}
}
}
# If the domain is whitelisted or in the cctld skip
adding
# it to the lookup list.
if ($return_result eq "") {
if (scalar(@lookupdomains) > 0) {
# Check so see if the domain already is in
the list.
if (not
grep(/^$spamcheckdomain$/i,@lookupdomains)) {
push(@lookupdomains,$spamcheckdomain);
}
} else {
push(@lookupdomains,$spamcheckdomain);
}
}
}
}
# If there are items in the lookupdomains list then
# perform lookups on them. If there are not, something is wrong
# and just return false. There should always be something in
the list.
if (scalar(@lookupdomains) > 0) {
foreach my $i (@lookupdomains) {
my $return_result;
# DEBUG statement.
#warn ("CHECKING DOMAIN ($mime_filename): $i\n");
# do a HOSTKARMA lookup - contains whitelist information
#
http://wiki.junkemailfilter.com/index.php/Spam_DNS_Lists
if (($karma_enable == 1) && ($return_result eq "")) {
$return_result = karmalookup($i);
}
# do an SURBL lookup
if (($surbl_enable == 1) && ($return_result eq "")) {
$return_result = surbllookup($i);
}
# do a URIBL lookup
if (($uribl_enable == 1) && ($return_result eq "")) {
$return_result = uribllookup($i);
}
# do a Day Old Bread lookup - domains under 5 days old
if (($dob_enable == 1) && ($return_result eq "")) {
$return_result = doblookup($i);
}
# If we got a hit return the result to Exim
if ($return_result ne "") {
return $return_result;
}
}
}
}
}
# We didn't find any URLs or the URLs we did find were not
# listed so return false.
return -1;
}