Chinaunix首页 | 论坛 | 博客
  • 博客访问: 5393178
  • 博文数量: 1144
  • 博客积分: 11974
  • 博客等级: 上将
  • 技术积分: 12312
  • 用 户 组: 普通用户
  • 注册时间: 2005-04-13 20:06
文章存档

2017年(2)

2016年(14)

2015年(10)

2014年(28)

2013年(23)

2012年(29)

2011年(53)

2010年(86)

2009年(83)

2008年(43)

2007年(153)

2006年(575)

2005年(45)

分类: PERL

2014-12-03 23:26:32


#!/usr/local/bin/perl -w

###########################################################################
# mime_strip.html_bodies.pl                                               #
###########################################################################
# Used to strip the "HTML-alternative" MIME attachments that some Mozilla #
# and MS Outlook users like to add to every email.  Used in conjunction   #
# with procmail.                                                          #
#                                                                         #
# Originally coded 12/17/2002 by Lester Hightower    #
# Based on info from this URL: #
###########################################################################

use strict;
use MIME::Parser;
use MIME::Entity;
use IPC::Open2;
use FileHandle;
use Getopt::Long;
$Getopt::Long::ignorecase = 0;
$Getopt::Long::debug = 0;

my $DoSpamCheck=0;
my $MsgDupThreshold=-1;
my $help=0;
my $DEBUG=0;
if ( !GetOptions(
                     'spamcheck=i'  => \$DoSpamCheck,
                     'msg-dup-threshold=i'  => \$MsgDupThreshold,
                     'help'     => \$help,
                     'debug'    => \$DEBUG,
                    ) or $help )
    {
        app_usage(2);
    }

my $VERSION = 1.6;

$|++;

my $envelope = ;

my $parser = MIME::Parser->new;
$parser->output_to_core(1);
$parser->tmp_to_core(1);

my $ent = eval { $parser->parse(\*STDIN) }; die "$@" if $@;

if ($ent->effective_type eq "multipart/alternative"
    and $ent->parts == 2
    and $ent->parts(0)->effective_type eq "text/plain"
    and $ent->parts(1)->effective_type eq "text/html") {
 
  my $body=$ent->parts(0)->bodyhandle->as_string();
  # If the body is short (so a regex makes sense) and it's all white-space,
  # then dropping the HTML body makes the message a little useless, and
  # likely it is spam...  However, if we have lynx available, we'll
  # try to convert it to plain text.
  if ( (length($body) < 500 && $body =~ m/^\s+$/) || $DoSpamCheck) {
    my $html_body=$ent->parts(1)->bodyhandle->as_string();
    my $new_body = &HTMLtoTXTviaLynx(\$html_body);
    if (length($new_body)) {
      if ($DoSpamCheck) {
        $body = &DoSpamCheck($ent, \$body, \$new_body, $MsgDupThreshold);
      } else {
        $body = $new_body;
      }
    }
  } else {
    $body.="\n\n[HTML alternate version deleted]\n";
  }
  my $newent = MIME::Entity->build(Data => $body);
  $ent->parts([$newent]);
  $ent->make_singlepart;
  $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
} elsif ($ent->effective_type eq "multipart/mixed"
    and $ent->parts(0)->effective_type eq "multipart/alternative"
    and $ent->parts(0)->parts == 2
    and $ent->parts(0)->parts(0)->effective_type eq "text/plain"
    and $ent->parts(0)->parts(1)->effective_type eq "text/html") {

  my $body=$ent->parts(0)->parts(0)->bodyhandle->as_string();
  # If the body is short (so a regex makes sense) and it's all white-space,
  # then dropping the HTML body makes the message a little useless, and
  # likely it is spam...  However, if we have lynx available, we'll
  # try to convert it to plain text.
  if (length($body) < 500 && $body =~ m/^\s+$/) {
    my $html_body=$ent->parts(0)->parts(1)->bodyhandle->as_string();
    my $new_body = &HTMLtoTXTviaLynx(\$html_body);
    if (length($new_body)) {
      if ($DoSpamCheck) {
        $body = &DoSpamCheck($ent, \$body, \$new_body, $MsgDupThreshold);
      } else {
        $body = $new_body;
      }
    }
  } else {
    $body.="\n\n[HTML alternate version deleted]\n";
  }
  my $newent = MIME::Entity->build(Data => $body);
  $ent->parts(0)->parts([$newent]);
  $ent->parts(0)->make_singlepart;
  $ent->parts(0)->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
  $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
#
# Mozilla Mail (added 12/17/2002 by LHH)
#
# Emails from mozilla with "multipart/related" attachments.
} elsif ($ent->effective_type eq "multipart/alternative"
    and $ent->parts == 2
    and $ent->parts(0)->effective_type eq "text/plain"
    and $ent->parts(1)->effective_type eq "multipart/related") {
 
  my $newent = MIME::Entity->build(Data =>
                                   $ent->parts(0)->bodyhandle->as_string() .
                                   "\n\n[HTML alternate version deleted]\n");
  $ent->parts([$newent]);
  $ent->make_singlepart;
  $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
#
# Mozilla Mail (added 03/27/2003 by LHH)
#
# Emails from mozilla with "multipart/related" attachments inside of
# an "multipart/alternative" in a "multipart/mixed" message.
# These come when a document is attached to an email and the
# client is also configured to send HTML mail.
} elsif ($ent->effective_type eq "multipart/mixed"
    and $ent->parts(0)->effective_type eq "multipart/alternative"
    and $ent->parts(0)->parts == 2
    and $ent->parts(0)->parts(0)->effective_type eq "text/plain"
    and $ent->parts(0)->parts(1)->effective_type eq "multipart/related") {

    my $newent = MIME::Entity->build(Data =>
                 $ent->parts(0)->parts(0)->bodyhandle->as_string() .
                          "\n\n[HTML alternate version deleted]\n");
    $ent->parts(0)->parts([$newent]);
    $ent->parts(0)->make_singlepart;
    $ent->parts(0)->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
    $ent->sync_headers(Length => 'COMPUTE', Nonstandard => 'ERASE');
}

print $envelope;
$ent->print;

exit;

###############
# Subroutines #
###############
sub HTMLtoTXTviaLynx {
  my $rhtml=shift @_;
  if (ref($rhtml) ne 'SCALAR') { return ''; }

  my $LYNX_EXE=`which lynx 2>/dev/null`;
  chomp($LYNX_EXE);
  if (! (-x $LYNX_EXE)) { return ''; }

  my $body='';
  my $tmp_file = "/tmp/mime_strip.html_bodies.$$.lynx.tmp";
  my $h_out=new FileHandle;
  my $h_in=new FileHandle;
  my $new_enough_lynx=0;
  my $lynx_ver="UNKNOWN";
  if (open($h_in, "'$LYNX_EXE' --version 2>&1 |")) {
    my @ver_data=<$h_in>;
    my ($ver_info,@crap)=grep(/version/i, @ver_data);
    chomp($ver_info);
    close($h_in);
    if ($ver_info =~ m/Version (([0-9]+)\.([0-9]+)\.([0-9]+)[^\s]+)/i) {
      $lynx_ver=$1;
      if ($2 >= 2 && $3 >= 8 && $4 >= 4) {
        $new_enough_lynx=1;
      }
    }
  }
  if ($new_enough_lynx && open($h_out, "| '$LYNX_EXE' -stdin -dump -nolog -noredir -localhost -nolist -width=76 1>'$tmp_file' 2>/dev/null")) {
    print $h_out $$rhtml;
    close $h_out;
    if (-r $tmp_file && open($h_in, "< $tmp_file")) {
      my @lynx_text_arr=<$h_in>;
      close($h_in);
      unlink $tmp_file;
      my $lynx_text=join('', @lynx_text_arr);
      if (length($lynx_text)) {
        $body = $lynx_text . "\n[HTML converted to text by lynx $lynx_ver]\n";
      }
    }
  } elsif (! $new_enough_lynx) {
    $body="NOTICE from mime_strip.html_bodies.pl:\n" .
        "\n" .
        "This message had an empty text/plain attachment, so I wanted\n" .
        "to use lynx to convert the HTML attachment to plain text,\n" .
        "and I found a lynx binary at $LYNX_EXE, but the lynx\n" .
        "version is too old for me to use (ver. $lynx_ver).\n" .
        "\n" .
        "You might consider upgrading lynx to v2.8.4 or higher.\n";
  }
return $body;
}

sub DoSpamCheck {
  my $ent=shift @_;
  my $rTxtBody=shift @_;
  my $rLnxBody=shift @_;
  my $MsgDupThreshold=shift @_;

  # Get the "[HTML converted to text by lynx...]" line out of $rLnxBody
  my @tmp=grep(!/^\[HTML converted to text by /,split(/[\r\n]+/,$$rLnxBody));
  my $htm_spam_body=join("\n", @tmp);

  # Grab all the words, and word count, into %words_txt and %words_htm
  my %words_txt;  my %words_htm;
  %words_txt = map { lc($_) => (iif($words_txt{lc($_)}) + 1) } uniq([split(/\W+/, $$rTxtBody)] );
  %words_htm = map { lc($_) => (iif($words_htm{lc($_)}) + 1) } uniq([split(/\W+/, $htm_spam_body)]);

  # If we matched on empty string, drop those
  if (defined($words_txt{''})) { delete $words_txt{''}; }
  if (defined($words_htm{''})) { delete $words_htm{''}; }

  # Go through each word and count the difference in the number of
  # occurrances in that word in the text vs. the HTML parts, and then
  # Set "X-MultiPart-SpamRating" and "X-MultiPart-IsSpam" headers for
  # this message, as appropriate.
  #my %diff_words;
  my $total_diff=0;
  foreach my $word ( uniq([keys %words_txt, keys %words_htm]) ) {
    my $diff = abs(iif($words_txt{$word}) - iif($words_htm{$word}));
    #$diff_words{$word} = $diff;
    $total_diff += $diff;
    #print "LHHD: \"$word\"\n";
  }
  #print "LHHD:\n" . $htm_spam_body . "\n---\n" . $$rTxtBody . "\nEND LHHD\n";
  my $head=$ent->head();
  $head->replace('X-MultiPart-SpamRating', $total_diff);
  if ($DoSpamCheck > 0 && $total_diff > $DoSpamCheck) {
    $head->replace('X-MultiPart-IsSpam', 'YES');
  }

  my $new_body='';
  if ($total_diff > $MsgDupThreshold) {
    my $lsp = 15;       # Leading space on section headers
    $new_body=  (" " x $lsp) . "+" . ("-" x 39) . "+\n" .
                (" " x $lsp) . "| Plain-text part of multi-part message |\n" .
                (" " x $lsp) . "+" . ("-" x 39) . "+\n" .
                "\n" .
                        $$rTxtBody .
                "\n" .
                (" " x $lsp) . "+" . ("-" x 33) . "+\n" .
                (" " x $lsp) . "| HTML part of multi-part message |\n" .
                (" " x $lsp) . "+" . ("-" x 33) . "+\n" .
                        $$rLnxBody;
  } else {
    $new_body = $$rTxtBody;
  }

return $new_body;
}


sub app_usage {
    my ($exit) = shift @_;

    print STDERR < Usage: $0
    -help                This message.

    -spamcheck=n         Do spam check between html and text message parts.
                         "n" is one of three types of values:
                                0 - do no spam checking
                               <0 - do spam checking but only apply the
                                    X-MultiPart-SpamRating header.
                               >0 - do spam checking and apply the
                                    X-MultiPart-SpamRating header and also
                                    the X-MultiPart-IsSpam: YES header if
                                    the SpamRating is > than "n"

    -msg-dup-threshold=n Threshold above which to show both the plain-text
                         and the lynx-converted text in the msg body when
                         using the -spamcheck option.

    -debug               Not yet implemented.
EndOfUsage
    exit $exit if $exit != 0;
return $exit;
}

sub uniq {
  my($rarray) = shift(@_);
  my(%tmphash, $item);
  foreach $item (@$rarray) {
    $tmphash{$item}++;
  }
return(sort keys %tmphash);
}

sub iif {
  my $val=shift @_;
  if (! defined($val)) {
   return 0;
  } else {
    return int($val);
  }
}


###############
## Begin POD ##
###############

=head1 NAME

mime_strip.html_bodies.pl

=head1 README

Used to strip the alternative "HTML body" attachments that some Mozilla and
MS Outlook users like to add to every email.  Version 1.5 also introduced a
feature to catch most multipart/alternative SPAM, which has become very
popular recently and that gets past most SPAM filters.  This script is most
often used in conjunction with procmail.

=head1 DESCRIPTION

Below is a snippet from my .procmailrc to illustrate the use of this script.
Note that perldoc wraps some of the lines when it should not, so if you
intend to copy/paste please open the script itself and copy/paste from
there, not from a "perldoc" or "man" view.

#############################################################################
:0
* ^Content-Type: (multipart/alternative|multipart/mixed)
{
  # OK, before we just blindly file this in filtered.multipart_alternative
  # or deliver it, let's give spamassassin a chance at it.
  :0fw
  * < 100000
  | /usr/bin/spamc

  # Store a copy, just in case. (paranoia, may kill this later)
  :0cHBE
  * ^Content-Type: multipart/alternative
  {
    :0
    /home/hightowe/mail/junk_mail/filtered.multipart_alternative.$THIS_MONTH
  }
                                                                                
  #############################################
  # Run the message through the strip filters #
  #############################################
  # Note: on 08/12/2003 I added the "a" flags here to only run this
  # recipe if the "multipart/alternative" matched right above.
  # Strip HTML-alternative bodies
  :0afw
  | /home/hightowe/bin/mime_strip.html_bodies.pl -spamcheck=50 -msg-dup-threshold=5
                                                                                
  # Rename possibly dangerous attachments (.exe/.vbs/.pif/etc.)
  :0fw
  | /home/hightowe/bin/mime_rename.dangerous_windows_exts.pl
}

# OK, we're going to file all "X-MultiPart-IsSpam: YES" messages
# into /home/hightowe/mail/junk_mail/filtered.MultiPartSpam.$THIS_MONTH.
# Note that this header is applied by ~/bin/mime_strip.html_bodies.pl.
:0H:
* ^X-MultiPart-IsSpam: YES
/home/hightowe/mail/junk_mail/filtered.MultiPartSpam.$THIS_MONTH

#############################################################################

=head1 AUTHORSHIP

Lester Hightower

=head1 CHANGE LOG

Dec-17-2002: Originally created by Lester Hightower
Mar-25-2003: Ver. 1.0, first release to CPAN
Mar-26-2003: Ver. 1.1, added "README" to POD for better CPAN behavior
Mar-27-2003: Ver. 1.2, added another Mozilla "multipart/related" catch
Apr-22-2003: Ver. 1.3, bugfix: changed to $ent->bodyhandle->as_string()
Jan-06-2004: Ver. 1.4: added lynx-based HTML->plain text conversions
                       for multi-part alternatives that do not have a
                       text/plain component.
Jan-11-2004: Ver. 1.5: added -spamcheck=n option to try to catch the
                       new multipart/alternative SPAM that has become
                       very popular lately -- where the plain text part
                       is something that SpamAssassin, et. al., allows
                       through (random words, paragraphs from books),
                       but the HTML part is an advertisement.
Jan-16-2004: Ver. 1.6: added -msg-dup-threshold=n option to allow one to
                       eliminate that msg body duplication caused by
                       -spamcheck.  This is particularly useful for low
                       rated messages (which are normally not spam).

=head1 PREREQUISITES

This script requires the C module.  It also requires
C.  If you want to use the HTML->text feature
(not required, and not very important), or the -spamcheck feature,
you will need version 2.4.8 or higher of lynx, the text-based
web-browser, installed.

=pod OSNAMES

any

=pod SCRIPT CATEGORIES

Mail
Mail/Converters
Mail/Filters

=cut

阅读(761) | 评论(0) | 转发(0) |
0

上一篇:MIME::Parser邮件解析

下一篇:PERL写的WEBMAIL

给主人留下些什么吧!~~