#!/usr/local/bin/perl -w
###########################################################################
# mime_strip.html_bodies.pl #
###########################################################################
# Used to strip the "HTML-alternative" MIME attachments that some Mozilla #
# and MS Outlook users like to add to every email. Used in conjunction #
# with procmail. #
# #
# Originally coded 12/17/2002 by Lester Hightower #
# Based on info from this URL: #
###########################################################################
use strict;
use MIME::Parser;
use MIME::Entity;
use IPC::Open2;
use FileHandle;
use Getopt::Long;
$Getopt::Long::ignorecase = 0;
$Getopt::Long::debug = 0;
my $DoSpamCheck=0;
my $MsgDupThreshold=-1;
my $help=0;
my $DEBUG=0;
if ( !GetOptions(
'spamcheck=i' => \$DoSpamCheck,
'msg-dup-threshold=i' => \$MsgDupThreshold,
'help' => \$help,
'debug' => \$DEBUG,
) or $help )
{
app_usage(2);
}
my $VERSION = 1.6;
$|++;
my $envelope = ;
my $parser = MIME::Parser->new;
$parser->output_to_core(1);
$parser->tmp_to_core(1);
my $ent = eval { $parser->parse(\*STDIN) }; die "$@" if $@;
if ($ent->effective_type eq "multipart/alternative"
and $ent->parts == 2
and $ent->parts(0)->effective_type eq "text/plain"
and $ent->parts(1)->effective_type eq "text/html") {
my $body=$ent->parts(0)->bodyhandle->as_string();
# If the body is short (so a regex makes sense) and it's all white-space,
# then dropping the HTML body makes the message a little useless, and
# likely it is spam... However, if we have lynx available, we'll
# try to convert it to plain text.
if ( (length($body) < 500 && $body =~ m/^\s+$/) || $DoSpamCheck) {
my $html_body=$ent->parts(1)->bodyhandle->as_string();
my $new_body = &HTMLtoTXTviaLynx(\$html_body);
if (length($new_body)) {
if ($DoSpamCheck) {
$body = &DoSpamCheck($ent, \$body, \$new_body, $MsgDupThreshold);
} else {
$body = $new_body;
}
}
} else {
$body.="\n\n[HTML alternate version deleted]\n";
}
my $newent = MIME::Entity->build(Data => $body);
$ent->parts([$newent]);
$ent->make_singlepart;
$ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
} elsif ($ent->effective_type eq "multipart/mixed"
and $ent->parts(0)->effective_type eq "multipart/alternative"
and $ent->parts(0)->parts == 2
and $ent->parts(0)->parts(0)->effective_type eq "text/plain"
and $ent->parts(0)->parts(1)->effective_type eq "text/html") {
my $body=$ent->parts(0)->parts(0)->bodyhandle->as_string();
# If the body is short (so a regex makes sense) and it's all white-space,
# then dropping the HTML body makes the message a little useless, and
# likely it is spam... However, if we have lynx available, we'll
# try to convert it to plain text.
if (length($body) < 500 && $body =~ m/^\s+$/) {
my $html_body=$ent->parts(0)->parts(1)->bodyhandle->as_string();
my $new_body = &HTMLtoTXTviaLynx(\$html_body);
if (length($new_body)) {
if ($DoSpamCheck) {
$body = &DoSpamCheck($ent, \$body, \$new_body, $MsgDupThreshold);
} else {
$body = $new_body;
}
}
} else {
$body.="\n\n[HTML alternate version deleted]\n";
}
my $newent = MIME::Entity->build(Data => $body);
$ent->parts(0)->parts([$newent]);
$ent->parts(0)->make_singlepart;
$ent->parts(0)->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
$ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
#
# Mozilla Mail (added 12/17/2002 by LHH)
#
# Emails from mozilla with "multipart/related" attachments.
} elsif ($ent->effective_type eq "multipart/alternative"
and $ent->parts == 2
and $ent->parts(0)->effective_type eq "text/plain"
and $ent->parts(1)->effective_type eq "multipart/related") {
my $newent = MIME::Entity->build(Data =>
$ent->parts(0)->bodyhandle->as_string() .
"\n\n[HTML alternate version deleted]\n");
$ent->parts([$newent]);
$ent->make_singlepart;
$ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
#
# Mozilla Mail (added 03/27/2003 by LHH)
#
# Emails from mozilla with "multipart/related" attachments inside of
# an "multipart/alternative" in a "multipart/mixed" message.
# These come when a document is attached to an email and the
# client is also configured to send HTML mail.
} elsif ($ent->effective_type eq "multipart/mixed"
and $ent->parts(0)->effective_type eq "multipart/alternative"
and $ent->parts(0)->parts == 2
and $ent->parts(0)->parts(0)->effective_type eq "text/plain"
and $ent->parts(0)->parts(1)->effective_type eq "multipart/related") {
my $newent = MIME::Entity->build(Data =>
$ent->parts(0)->parts(0)->bodyhandle->as_string() .
"\n\n[HTML alternate version deleted]\n");
$ent->parts(0)->parts([$newent]);
$ent->parts(0)->make_singlepart;
$ent->parts(0)->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
$ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
}
print $envelope;
$ent->print;
exit;
###############
# Subroutines #
###############
sub HTMLtoTXTviaLynx {
my $rhtml=shift @_;
if (ref($rhtml) ne 'SCALAR') { return ''; }
my $LYNX_EXE=`which lynx 2>/dev/null`;
chomp($LYNX_EXE);
if (! (-x $LYNX_EXE)) { return ''; }
my $body='';
my $tmp_file = "/tmp/mime_strip.html_bodies.$$.lynx.tmp";
my $h_out=new FileHandle;
my $h_in=new FileHandle;
my $new_enough_lynx=0;
my $lynx_ver="UNKNOWN";
if (open($h_in, "'$LYNX_EXE' --version 2>&1 |")) {
my @ver_data=<$h_in>;
my ($ver_info,@crap)=grep(/version/i, @ver_data);
chomp($ver_info);
close($h_in);
if ($ver_info =~ m/Version (([0-9]+)\.([0-9]+)\.([0-9]+)[^\s]+)/i) {
$lynx_ver=$1;
if ($2 >= 2 && $3 >= 8 && $4 >= 4) {
$new_enough_lynx=1;
}
}
}
if ($new_enough_lynx && open($h_out, "| '$LYNX_EXE' -stdin -dump -nolog -noredir -localhost -nolist -width=76 1>'$tmp_file' 2>/dev/null")) {
print $h_out $$rhtml;
close $h_out;
if (-r $tmp_file && open($h_in, "< $tmp_file")) {
my @lynx_text_arr=<$h_in>;
close($h_in);
unlink $tmp_file;
my $lynx_text=join('', @lynx_text_arr);
if (length($lynx_text)) {
$body = $lynx_text . "\n[HTML converted to text by lynx $lynx_ver]\n";
}
}
} elsif (! $new_enough_lynx) {
$body="NOTICE from mime_strip.html_bodies.pl:\n" .
"\n" .
"This message had an empty text/plain attachment, so I wanted\n" .
"to use lynx to convert the HTML attachment to plain text,\n" .
"and I found a lynx binary at $LYNX_EXE, but the lynx\n" .
"version is too old for me to use (ver. $lynx_ver).\n" .
"\n" .
"You might consider upgrading lynx to v2.8.4 or higher.\n";
}
return $body;
}
sub DoSpamCheck {
my $ent=shift @_;
my $rTxtBody=shift @_;
my $rLnxBody=shift @_;
my $MsgDupThreshold=shift @_;
# Get the "[HTML converted to text by lynx...]" line out of $rLnxBody
my @tmp=grep(!/^\[HTML converted to text by /,split(/[\r\n]+/,$$rLnxBody));
my $htm_spam_body=join("\n", @tmp);
# Grab all the words, and word count, into %words_txt and %words_htm
my %words_txt; my %words_htm;
%words_txt = map { lc($_) => (iif($words_txt{lc($_)}) + 1) } uniq([split(/\W+/, $$rTxtBody)] );
%words_htm = map { lc($_) => (iif($words_htm{lc($_)}) + 1) } uniq([split(/\W+/, $htm_spam_body)]);
# If we matched on empty string, drop those
if (defined($words_txt{''})) { delete $words_txt{''}; }
if (defined($words_htm{''})) { delete $words_htm{''}; }
# Go through each word and count the difference in the number of
# occurrances in that word in the text vs. the HTML parts, and then
# Set "X-MultiPart-SpamRating" and "X-MultiPart-IsSpam" headers for
# this message, as appropriate.
#my %diff_words;
my $total_diff=0;
foreach my $word ( uniq([keys %words_txt, keys %words_htm]) ) {
my $diff = abs(iif($words_txt{$word}) - iif($words_htm{$word}));
#$diff_words{$word} = $diff;
$total_diff += $diff;
#print "LHHD: \"$word\"\n";
}
#print "LHHD:\n" . $htm_spam_body . "\n---\n" . $$rTxtBody . "\nEND LHHD\n";
my $head=$ent->head();
$head->replace('X-MultiPart-SpamRating', $total_diff);
if ($DoSpamCheck > 0 && $total_diff > $DoSpamCheck) {
$head->replace('X-MultiPart-IsSpam', 'YES');
}
my $new_body='';
if ($total_diff > $MsgDupThreshold) {
my $lsp = 15; # Leading space on section headers
$new_body= (" " x $lsp) . "+" . ("-" x 39) . "+\n" .
(" " x $lsp) . "| Plain-text part of multi-part message |\n" .
(" " x $lsp) . "+" . ("-" x 39) . "+\n" .
"\n" .
$$rTxtBody .
"\n" .
(" " x $lsp) . "+" . ("-" x 33) . "+\n" .
(" " x $lsp) . "| HTML part of multi-part message |\n" .
(" " x $lsp) . "+" . ("-" x 33) . "+\n" .
$$rLnxBody;
} else {
$new_body = $$rTxtBody;
}
return $new_body;
}
sub app_usage {
my ($exit) = shift @_;
print STDERR <
Usage: $0
-help This message.
-spamcheck=n Do spam check between html and text message parts.
"n" is one of three types of values:
0 - do no spam checking
<0 - do spam checking but only apply the
X-MultiPart-SpamRating header.
>0 - do spam checking and apply the
X-MultiPart-SpamRating header and also
the X-MultiPart-IsSpam: YES header if
the SpamRating is > than "n"
-msg-dup-threshold=n Threshold above which to show both the plain-text
and the lynx-converted text in the msg body when
using the -spamcheck option.
-debug Not yet implemented.
EndOfUsage
exit $exit if $exit != 0;
return $exit;
}
sub uniq {
my($rarray) = shift(@_);
my(%tmphash, $item);
foreach $item (@$rarray) {
$tmphash{$item}++;
}
return(sort keys %tmphash);
}
sub iif {
my $val=shift @_;
if (! defined($val)) {
return 0;
} else {
return int($val);
}
}
###############
## Begin POD ##
###############
=head1 NAME
mime_strip.html_bodies.pl
=head1 README
Used to strip the alternative "HTML body" attachments that some Mozilla and
MS Outlook users like to add to every email. Version 1.5 also introduced a
feature to catch most multipart/alternative SPAM, which has become very
popular recently and that gets past most SPAM filters. This script is most
often used in conjunction with procmail.
=head1 DESCRIPTION
Below is a snippet from my .procmailrc to illustrate the use of this script.
Note that perldoc wraps some of the lines when it should not, so if you
intend to copy/paste please open the script itself and copy/paste from
there, not from a "perldoc" or "man" view.
#############################################################################
:0
* ^Content-Type: (multipart/alternative|multipart/mixed)
{
# OK, before we just blindly file this in filtered.multipart_alternative
# or deliver it, let's give spamassassin a chance at it.
:0fw
* < 100000
| /usr/bin/spamc
# Store a copy, just in case. (paranoia, may kill this later)
:0cHBE
* ^Content-Type: multipart/alternative
{
:0
/home/hightowe/mail/junk_mail/filtered.multipart_alternative.$THIS_MONTH
}
#############################################
# Run the message through the strip filters #
#############################################
# Note: on 08/12/2003 I added the "a" flags here to only run this
# recipe if the "multipart/alternative" matched right above.
# Strip HTML-alternative bodies
:0afw
| /home/hightowe/bin/mime_strip.html_bodies.pl -spamcheck=50 -msg-dup-threshold=5
# Rename possibly dangerous attachments (.exe/.vbs/.pif/etc.)
:0fw
| /home/hightowe/bin/mime_rename.dangerous_windows_exts.pl
}
# OK, we're going to file all "X-MultiPart-IsSpam: YES" messages
# into /home/hightowe/mail/junk_mail/filtered.MultiPartSpam.$THIS_MONTH.
# Note that this header is applied by ~/bin/mime_strip.html_bodies.pl.
:0H:
* ^X-MultiPart-IsSpam: YES
/home/hightowe/mail/junk_mail/filtered.MultiPartSpam.$THIS_MONTH
#############################################################################
=head1 AUTHORSHIP
Lester Hightower
=head1 CHANGE LOG
Dec-17-2002: Originally created by Lester Hightower
Mar-25-2003: Ver. 1.0, first release to CPAN
Mar-26-2003: Ver. 1.1, added "README" to POD for better CPAN behavior
Mar-27-2003: Ver. 1.2, added another Mozilla "multipart/related" catch
Apr-22-2003: Ver. 1.3, bugfix: changed to $ent->bodyhandle->as_string()
Jan-06-2004: Ver. 1.4: added lynx-based HTML->plain text conversions
for multi-part alternatives that do not have a
text/plain component.
Jan-11-2004: Ver. 1.5: added -spamcheck=n option to try to catch the
new multipart/alternative SPAM that has become
very popular lately -- where the plain text part
is something that SpamAssassin, et. al., allows
through (random words, paragraphs from books),
but the HTML part is an advertisement.
Jan-16-2004: Ver. 1.6: added -msg-dup-threshold=n option to allow one to
eliminate that msg body duplication caused by
-spamcheck. This is particularly useful for low
rated messages (which are normally not spam).
=head1 PREREQUISITES
This script requires the C module. It also requires
C. If you want to use the HTML->text feature
(not required, and not very important), or the -spamcheck feature,
you will need version 2.4.8 or higher of lynx, the text-based
web-browser, installed.
=pod OSNAMES
any
=pod SCRIPT CATEGORIES
Mail
Mail/Converters
Mail/Filters
=cut
阅读(761) | 评论(0) | 转发(0) |