Perl web crawler — 2010-05-08 14:38 | Source:
Improvements made on top of yunshu's original code:
1) The crawler now sends cookies while crawling, so it can reach back-end pages that require authentication.
2) When de-duplicating URLs, the values in the query string are stripped (only the keys are kept).
3) Deciding whether a new URL should join the crawl queue uses fuzzy matching: URLs that differ only by digits (e.g. article/page IDs) are ignored.
[ccsafe@sec-host10 crawl]$ cat crawler_minix.pl
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
use Bloom::Filter;
use URI;
use URI::URL;
use Web::Scraper;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Cookies::Guess;
use String::Diff;
use String::Diff qw(diff_fully diff diff_merge diff_regexp);
use URI::Split qw(uri_split uri_join);
#crawling with signed cookie
# --- Shared crawler configuration ---------------------------------------
# Persist cookies to disk so the crawler can reach pages behind a login;
# the cookie jar is autosaved after every request.
my $cookie_jar = './cookie.lwp';
my $tmp_ua = LWP::UserAgent->new;
$tmp_ua->timeout(15);
$tmp_ua->protocols_allowed(['http','https']);
# Masquerade as IE6 on Windows XP to avoid trivial bot blocking.
$tmp_ua->agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)");
$tmp_ua->cookie_jar(HTTP::Cookies->new('file'=>'./cookie.lwp','autosave'=>1));
# Also follow redirects for POST requests (LWP default covers GET/HEAD only).
push @{$tmp_ua->requests_redirectable}, 'POST';
my $max_threads = 15;
#my $base_url = $ARGV[0] || '';
# Seed URL from the command line; the crawl is restricted to this URL's
# host (sub-domain matching happens inside ProcessUrl).
my $base_url = $ARGV[0] || '';
my $host = URI::URL->new($base_url)->host;
print $host."\n";
# Shared work queue of URLs waiting to be fetched.
my $queue = Thread::Queue->new( );
# Counting semaphore capping the number of concurrent worker threads.
my $semaphore = Thread::Semaphore->new( $max_threads );
# Binary semaphore guarding the Bloom filter and @tmp_url.
my $mutex = Thread::Semaphore->new( 1 );
#my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;
#my $logfile = "crawler".($year+1900).($mon+1).$mday.$hour.$min.$sec.".log";
#open(BANLOG,">>$logfile") or die("can't open logfile:$!\n");
# Bloom filter used for URL de-duplication (checked against the
# query-value-stripped form of each URL); shared_clone makes it visible
# to every thread.
my $filter = shared_clone( Bloom::Filter->new(capacity => 1000000, error_rate => 0.001) );
$queue->enqueue( $base_url );
$filter->add( $base_url );
# Plain list of accepted URLs, scanned by the fuzzy digit-diff check.
my @tmp_url = ();
push(@tmp_url,$base_url);
# Main scheduler loop: reap finished workers, spawn new workers while
# URLs remain queued, and exit once the queue is empty and no worker
# thread is still running.
while( 1 )
{
# join all threads which can be joined
#my $joined = 0;
foreach ( threads->list(threads::joinable) )
{
#$joined ++;
$_->join( );
}
#print $joined, " joinedn";
# if there are no url need process.
my $item = $queue->pending();
if( $item == 0 )
{
# threads->list in scalar context yields the thread count.
my $active = threads->list(threads::running);
# there are no active thread, we finish the job
if( $active == 0 )
{
print "All done!\n";
last;
}
# we will get some more url if there are some active threads, just wait for them
else
{
#print "[MAIN] 0 URL, but $active active threadn";
sleep 1;
next;
}
}
# if there are some url need process
#print "[MAIN] $item URLn";
# Reserve a worker slot (blocks while $max_threads workers are running);
# the worker releases the slot via $semaphore->up at the end of ProcessUrl.
$semaphore->down;
#print "[MAIN]Create thread.n";
threads->create( \&ProcessUrl );
}
# join all threads which can be joined
# Final sweep: wait for any workers still running after the loop exits.
foreach ( threads->list() )
{
$_->join( );
}
# Worker thread body: pull URLs off the shared queue, scrape each page
# for anchor links, and enqueue new in-scope links that pass both the
# Bloom-filter check and the fuzzy digit-only duplicate check.
# Releases one $semaphore slot on exit.
sub ProcessUrl
{
my $scraper = scraper
{
# Collect the href attribute of every <a> element.
# NOTE(review): the published source was truncated to "=> ;" (the value
# was eaten by HTML escaping); '@href' is the standard Web::Scraper form.
process '//a', 'links[]' => '@href';
};
my $res;
my $link;
while( my $url = $queue->dequeue_nb() )
{
eval
{
$scraper->user_agent($tmp_ua);
$res = $scraper->scrape( URI->new($url) )->{'links'};
};
if( $@ )
{
# Report the failure (the original warned an empty string) and move on.
warn "scrape failed for $url: $@";
next;
}
next if (! defined $res );
foreach( @{$res} )
{
$link = $_->as_string;
# Resolve relative links against the page URL up front, so the
# scheme/host checks below always see absolute values.
$link = URI::URL->new($link, $url)->abs;
# not http and not https?
next if( !defined $link->scheme || ($link->scheme ne 'http' && $link->scheme ne 'https') );
# another domain?
#next if( $link->host ne $host );
# stay on this host or its sub-domains; \Q...\E quotes regex
# metacharacters (the dots in the hostname) so they match literally
next if( $link->host !~ /\Q$host\E/ );
$link = $link->as_string;
# strip any fragment (#...) part
if( $link =~ /(.*?)#(.*)/ )
{
$link = $1;
}
# skip binary/static resources; the dot must be escaped, otherwise
# any character before the extension would match (e.g. "fooxjpg")
next if( $link =~ /\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)$/i );
print "test:$link\n";
#EscapeUrl,skip query form values
my $tmp_link = EscapeUrl($link);
#print "Escape:".$tmp_link."\n";
$mutex->down();
my $tmp_mark = 0;
if( ! $filter->check($tmp_link) )
{
# Fuzzy duplicate check: skip URLs that differ from an already
# accepted URL only by digits (pagination / article IDs).
foreach(@tmp_url)
{
if(DiffUrl($_,$link))
{
$tmp_mark = 2;
last;
}
}
if( $tmp_mark != 2 )
{
$queue->enqueue($link);
#print "add queue:$link\n";
$filter->add($tmp_link);
print "add filter:$tmp_link\n";
#print BANLOG $filter->key_count(), " ", $link, "\n";
push(@tmp_url,$link);
}
else
{
print "pass:$link\n";
}
}
$mutex->up();
undef $link;
}
undef $res;
}
undef $scraper;
# Free the worker slot reserved by the scheduler.
$semaphore->up( );
}
#close(BANLOG);
# Reached only after the scheduler loop and the final join complete.
print "ALL DONE.\n";
#skip arg
# Normalize a URL for duplicate detection: keep the scheme, authority,
# path and the query *keys*, but drop every query value, e.g.
#   http://h/p?a=1&b=2  ->  http://h/p?a=&b=
# Returns the normalized URL string.
sub EscapeUrl
{
my $original = shift;
# uri_split also peels off query and fragment; only the first three
# components are rejoined, which discards both.
my ($scheme, $auth, $path, $query, $frag) = uri_split($original);
my $stripped = uri_join($scheme, $auth, $path);
# query_form returns a flat key/value list; walk it pairwise and keep
# each key followed by a bare '='.
my @pairs = URI->new($original)->query_form();
my @empty_params;
while (@pairs)
{
my ($key, $value) = splice(@pairs, 0, 2);
push @empty_params, $key . '=';
}
if (@empty_params)
{
$stripped .= '?' . join('&', @empty_params);
}
#print $stripped."\n";
return $stripped;
}
# Return 1 when the two URLs (after query-value stripping) differ only by
# a run of digits, 0 otherwise. Used to treat e.g. /news/123 and /news/456
# as the same page family so only one representative is crawled.
sub DiffUrl
{
my ($url_a, $url_b) = @_;
my $norm_a = EscapeUrl($url_a);
my $norm_b = EscapeUrl($url_b);
# String::Diff::diff marks text removed from the first string as [..]
# and text inserted into the second as {..}; a numeric removal paired
# with a numeric insertion means "same URL, different number".
my ($marked_old, $marked_new) = String::Diff::diff($norm_a, $norm_b);
#my($old,$new) = String::Diff::diff($urlold,$urlnew);
return ($marked_old =~ /\[\d+\]/ && $marked_new =~ /\{\d+\}/) ? 1 : 0;
}