Category: LINUX

2010-09-13 15:19:41

Perl Crawler (originally posted 2010-05-08 14:38)
Based on yunshu's code, I made a few improvements:
1) The crawler now sends cookies while crawling, so back-end pages that require a login can be reached.
2) When de-duplicating URLs, the values in the query part of the URL are dropped, so URLs that differ only in parameter values count as the same page (see the sketch after this list).
3) Before a new URL is added to the crawl queue, a fuzzy match is performed: depending on the application, URLs whose only change is a number are ignored.
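Points 2) and 3) come down to the EscapeUrl and DiffUrl helpers at the bottom of the script. Here is a minimal standalone sketch of the same idea, using the URI, URI::Split and String::Diff modules the script already loads; the example URLs are made up:

#!/usr/bin/perl
#minimal sketch of the de-dup key and the numeric fuzzy match
use strict;
use warnings;
use URI;
use URI::Split qw(uri_split uri_join);
use String::Diff;

#de-dup key: keep the query parameter names, drop their values,
#so view.php?id=1 and view.php?id=9 collapse into one Bloom-filter key
sub dedup_key
{
        my $url = shift;
        my ($scheme,$auth,$path,$query,$frag) = uri_split($url);
        my $key = uri_join($scheme,$auth,$path);
        my @pairs = URI->new($url)->query_form();   #(name, value, name, value, ...)
        my @names;
        for(my $i=0;$i<@pairs;$i+=2)
        {
                push(@names,$pairs[$i]."=");
        }
        $key .= "?".join("&",@names) if @pairs;
        return $key;
}
print dedup_key('http://www.example.com/view.php?id=1&page=2')."\n";   #view.php?id=&page=
print dedup_key('http://www.example.com/view.php?id=9&page=5')."\n";   #same key as above

#fuzzy match: String::Diff marks removed text with [..] and added text with {..};
#when the marked chunks are pure digits, the crawler treats the URLs as the same page
my ($old,$new) = String::Diff::diff('http://www.example.com/news/123.html',
                                    'http://www.example.com/news/456.html');
print "$old\n$new\n";   #expected: .../news/[123].html and .../news/{456}.html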
[ccsafe@sec-host10 crawl]$ cat crawler_minix.pl
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
use Bloom::Filter;
use URI;
use URI::URL;
use Web::Scraper;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Cookies::Guess;
use String::Diff;
use String::Diff qw(diff_fully diff diff_merge diff_regexp);
use URI::Split qw(uri_split uri_join);
#crawling with signed cookie
my $cookie_jar = './cookie.lwp';
my $tmp_ua = LWP::UserAgent->new;
$tmp_ua->timeout(15);
$tmp_ua->protocols_allowed(['http','https']);
$tmp_ua->agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)");
$tmp_ua->cookie_jar(HTTP::Cookies->new('file'=>'./cookie.lwp','autosave'=>1));
push @{$tmp_ua->requests_redirectable}, 'POST';
my $max_threads = 15;
#my $base_url = $ARGV[0] || '';
my $base_url = $ARGV[0] || '';
my $host = URI::URL->new($base_url)->host;
print $host."\n";
my $queue = Thread::Queue->new( );
my $semaphore = Thread::Semaphore->new( $max_threads );
my $mutex = Thread::Semaphore->new( 1 );

#my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;
#my $logfile = "crawler".($year+1900).($mon+1).$mday.$hour.$min.$sec.".log";
#open(BANLOG,">>$logfile") or die("can't open logfile:$!\n");

my $filter = shared_clone( Bloom::Filter->new(capacity => 1000000, error_rate => 0.001) );
$queue->enqueue( $base_url );
$filter->add( $base_url );
my @tmp_url = ();
push(@tmp_url,$base_url);
while( 1 )
{
        # join all threads which can be joined
        #my $joined = 0;
        foreach ( threads->list(threads::joinable) )
        {
                #$joined ++;
                $_->join( );
        }
        #print $joined, " joined\n";
        # if there are no URLs waiting to be processed
        my $item = $queue->pending();
        if( $item == 0 )
        {
                my $active = threads->list(threads::running);
                # no threads are still running, so the job is finished
                if( $active == 0 )
                {
                        print "All done!\n";
                        last;
                }
                # active threads may still produce more URLs, so just wait for them
                else
                {
                        #print "[MAIN] 0 URL, but $active active threadn";
                        sleep 1;
                        next;
                }
        }
        # there are URLs waiting, so spawn a worker for them
        #print "[MAIN] $item URL\n";
        $semaphore->down;
        #print "[MAIN]Create thread.\n";
        threads->create( \&ProcessUrl );
}
# join all threads which can be joined
foreach ( threads->list() )
{
        $_->join( );
}
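#worker thread: pop URLs off the queue, scrape every link on the page and
#enqueue new in-scope URLs that pass the de-dup and fuzzy-match checks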
sub ProcessUrl
{
        my $scraper = scraper
        {
                process '//a', 'links[]' => '@href';
        };
        my $res;
        my $link;
        while( my $url = $queue->dequeue_nb() )
        {
                eval
                {
                        $scraper->user_agent($tmp_ua);
                        $res = $scraper->scrape( URI->new($url) )->{'links'};
                };
                if( $@ )
                {
                        warn "scrape failed for $url: $@";
                        next;
                }
                next if (! defined $res );
                #print "there are ".scalar(threads->list(threads::running))." threads, ", $queue->pending(), " urls need process.n";
                foreach( @{$res} )
                {
                        $link = $_->as_string;
                        $link = URI::URL->new($link, $url);
                        # not http and not https?
                        next if( $link->scheme ne 'http' && $link->scheme ne 'https' );
                        # another domain?
                        #next if( $link->host ne $host );
                        #search for the sub domain
                        next if(!($link->host =~ /$host/));
                        $link = $link->abs->as_string;
                        if( $link =~ /(.*?)#(.*)/ )
                        {
                                $link = $1;
                        }
                        next if( $link =~ /\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)$/i );

                        print "test:$link\n";
                        #EscapeUrl,skip query form values
                        my $tmp_link = &EscapeUrl($link);
                        #print "Escape:".$tmp_link."\n";
                        $mutex->down();
                        my $tmp_mark = 0;
                        #print "test start:$link\n";
                        if( ! $filter->check($tmp_link) )
                        {
                                #print "Test filter ok:$tmp_link\n";
                                #DiffUrl: compare $link against already-queued URLs for numeric-only differences
                                foreach(@tmp_url)
                                {
                                        #print "Test Queue:".$tmpurl."\n";
                                        #print "test-1:$_\ntest-2:$tmp_link\n";
                                        if(&DiffUrl($_,$link))
                                        {
                                                $tmp_mark = 2;
                                                last;
                                        }
                                }
                                if( $tmp_mark != 2 )
                                {
                                        $queue->enqueue($link);
                                        #print "add queue:$link\n";
                                        $filter->add($tmp_link);
                                        print "add filter:$tmp_link\n";
                                        #print BANLOG $filter->key_count(), " ", $link, "\n";
                                        #print $filter->key_count(), " ", $link, "\n";
                                        push(@tmp_url,$link);
                                }
                                else
                                {
                                        print "pass:$link\n";
                                }
                        }
                        #print "pass:$link\n";
                        $mutex->up();
                        undef $link;
                }
                undef $res;
        }
        undef $scraper;
        $semaphore->up( );
}
#close(BANLOG);
print "ALL DONE.\n";
#EscapeUrl: keep the query parameter names but drop their values
sub EscapeUrl
{
        my $urlold = shift;
        my ($scheme,$auth,$path,$query,$frag) = uri_split($urlold);
        my $urlnew = uri_join($scheme,$auth,$path);
        my $u = URI->new($urlold);
        my @tmp_array = $u->query_form();
        my $tmp = '';
        my $i = 0;
        for($i=0;$i<@tmp_array;$i+=2)
        {
                $tmp .=$tmp_array[$i]."=&";
        }
        if(@tmp_array != 0)
        {
                $tmp =~ s/&$//;
                $urlnew .= "?".$tmp;
        }
        undef $u;
        #print $urlnew."\n";
        return $urlnew;
}
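#DiffUrl: diff two URLs (after EscapeUrl); when the removed and added chunks are
#pure digits (e.g. an article or page number changed), report a match so the
#caller treats the new URL as a near-duplicate and skips it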
sub DiffUrl
{
        my $urlold = shift;
        my $urlnew = shift;
        my $urloldx = &EscapeUrl($urlold);
        my $urlnewx = &EscapeUrl($urlnew);
        my($old,$new) = String::Diff::diff($urloldx,$urlnewx);
        #my($old,$new) = String::Diff::diff($urlold,$urlnew);
        if (($old =~ m/(\[\d+\])/i) && ($new =~ m/{\d+}/i))
        #if ($new =~ m/{\d+}/i)
        {
                #print "test num success.\n";
                return 1;
        }
        else
        {
                #print "test num failed.\n";
                return 0;
        }

}
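To run the crawler, pass the start URL as the only argument. If the target requires a login, fill ./cookie.lwp with the session cookies first (the Set-Cookie3 format that HTTP::Cookies saves); every request will then carry them. The host below is just a placeholder:

[ccsafe@sec-host10 crawl]$ perl crawler_minix.pl http://www.example.com/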
 