Perl web crawler — 2010-05-08 14:38 | Source:
Improvements made on top of yunshu's original code:
1) The crawler now sends cookies while crawling, so it can reach back-end pages that require authentication.
2) When de-duplicating URLs, the values in the query string are stripped (only the keys are kept).
3) Deciding whether a new URL should join the crawl queue uses fuzzy matching: URLs that differ only by digits (e.g. article/page IDs) are ignored.
[ccsafe@sec-host10 crawl]$ cat crawler_minix.pl
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
use Bloom::Filter;
use URI;
use URI::URL;
use Web::Scraper;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Cookies::Guess;
use String::Diff;
use String::Diff qw(diff_fully diff diff_merge diff_regexp);
use URI::Split qw(uri_split uri_join);
#crawling with signed cookie
# --- Shared crawler configuration ---------------------------------------
# Persist cookies to disk so the crawler can reach pages behind a login;
# the cookie jar is autosaved after every request.
my $cookie_jar = './cookie.lwp';
my $tmp_ua = LWP::UserAgent->new;
$tmp_ua->timeout(15);
$tmp_ua->protocols_allowed(['http','https']);
# Masquerade as IE6 on Windows XP to avoid trivial bot blocking.
$tmp_ua->agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;.NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)");
$tmp_ua->cookie_jar(HTTP::Cookies->new('file'=>'./cookie.lwp','autosave'=>1));
# Also follow redirects for POST requests (LWP default covers GET/HEAD only).
push @{$tmp_ua->requests_redirectable}, 'POST';
my $max_threads = 15;
#my $base_url = $ARGV[0] || '';
# Seed URL from the command line; the crawl is restricted to this URL's
# host (sub-domain matching happens inside ProcessUrl).
my $base_url = $ARGV[0] || '';
my $host = URI::URL->new($base_url)->host;
print $host."\n";
# Shared work queue of URLs waiting to be fetched.
my $queue = Thread::Queue->new( );
# Counting semaphore capping the number of concurrent worker threads.
my $semaphore = Thread::Semaphore->new( $max_threads );
# Binary semaphore guarding the Bloom filter and @tmp_url.
my $mutex = Thread::Semaphore->new( 1 );
#my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;
#my $logfile = "crawler".($year+1900).($mon+1).$mday.$hour.$min.$sec.".log";
#open(BANLOG,">>$logfile") or die("can't open logfile:$!\n");
# Bloom filter used for URL de-duplication (checked against the
# query-value-stripped form of each URL); shared_clone makes it visible
# to every thread.
my $filter = shared_clone( Bloom::Filter->new(capacity => 1000000, error_rate => 0.001) );
$queue->enqueue( $base_url );
$filter->add( $base_url );
# Plain list of accepted URLs, scanned by the fuzzy digit-diff check.
my @tmp_url = ();
push(@tmp_url,$base_url);
# Main scheduler loop: reap finished workers, spawn new workers while
# URLs remain queued, and exit once the queue is empty and no worker
# thread is still running.
while( 1 )
{
# join all threads which can be joined
#my $joined = 0;
foreach ( threads->list(threads::joinable) )
{
#$joined ++;
$_->join( );
}
#print $joined, " joinedn";
# if there are no url need process.
my $item = $queue->pending();
if( $item == 0 )
{
# threads->list in scalar context yields the thread count.
my $active = threads->list(threads::running);
# there are no active thread, we finish the job
if( $active == 0 )
{
print "All done!\n";
last;
}
# we will get some more url if there are some active threads, just wait for them
else
{
#print "[MAIN] 0 URL, but $active active threadn";
sleep 1;
next;
}
}
# if there are some url need process
#print "[MAIN] $item URLn";
# Reserve a worker slot (blocks while $max_threads workers are running);
# the worker releases the slot via $semaphore->up at the end of ProcessUrl.
$semaphore->down;
#print "[MAIN]Create thread.n";
threads->create( \&ProcessUrl );
}
# join all threads which can be joined
# Final sweep: wait for any workers still running after the loop exits.
foreach ( threads->list() )
{
$_->join( );
}
# Worker thread body: pull URLs off the shared queue, scrape each page
# for anchor links, and enqueue new in-scope links that pass both the
# Bloom-filter check and the fuzzy digit-only duplicate check.
# Releases one $semaphore slot on exit.
sub ProcessUrl
{
my $scraper = scraper
{
# Collect the href attribute of every <a> element.
# NOTE(review): the published source was truncated to "=> ;" (the value
# was eaten by HTML escaping); '@href' is the standard Web::Scraper form.
process '//a', 'links[]' => '@href';
};
my $res;
my $link;
while( my $url = $queue->dequeue_nb() )
{
eval
{
$scraper->user_agent($tmp_ua);
$res = $scraper->scrape( URI->new($url) )->{'links'};
};
if( $@ )
{
# Report the failure (the original warned an empty string) and move on.
warn "scrape failed for $url: $@";
next;
}
next if (! defined $res );
foreach( @{$res} )
{
$link = $_->as_string;
# Resolve relative links against the page URL up front, so the
# scheme/host checks below always see absolute values.
$link = URI::URL->new($link, $url)->abs;
# not http and not https?
next if( !defined $link->scheme || ($link->scheme ne 'http' && $link->scheme ne 'https') );
# another domain?
#next if( $link->host ne $host );
# stay on this host or its sub-domains; \Q...\E quotes regex
# metacharacters (the dots in the hostname) so they match literally
next if( $link->host !~ /\Q$host\E/ );
$link = $link->as_string;
# strip any fragment (#...) part
if( $link =~ /(.*?)#(.*)/ )
{
$link = $1;
}
# skip binary/static resources; the dot must be escaped, otherwise
# any character before the extension would match (e.g. "fooxjpg")
next if( $link =~ /\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)$/i );
print "test:$link\n";
#EscapeUrl,skip query form values
my $tmp_link = EscapeUrl($link);
#print "Escape:".$tmp_link."\n";
$mutex->down();
my $tmp_mark = 0;
if( ! $filter->check($tmp_link) )
{
# Fuzzy duplicate check: skip URLs that differ from an already
# accepted URL only by digits (pagination / article IDs).
foreach(@tmp_url)
{
if(DiffUrl($_,$link))
{
$tmp_mark = 2;
last;
}
}
if( $tmp_mark != 2 )
{
$queue->enqueue($link);
#print "add queue:$link\n";
$filter->add($tmp_link);
print "add filter:$tmp_link\n";
#print BANLOG $filter->key_count(), " ", $link, "\n";
push(@tmp_url,$link);
}
else
{
print "pass:$link\n";
}
}
$mutex->up();
undef $link;
}
undef $res;
}
undef $scraper;
# Free the worker slot reserved by the scheduler.
$semaphore->up( );
}
#close(BANLOG);
# Reached only after the scheduler loop and the final join complete.
print "ALL DONE.\n";
#skip arg
# Normalize a URL for duplicate detection: keep the scheme, authority,
# path and the query *keys*, but drop every query value, e.g.
#   http://h/p?a=1&b=2  ->  http://h/p?a=&b=
# Returns the normalized URL string.
sub EscapeUrl
{
my $original = shift;
# uri_split also peels off query and fragment; only the first three
# components are rejoined, which discards both.
my ($scheme, $auth, $path, $query, $frag) = uri_split($original);
my $stripped = uri_join($scheme, $auth, $path);
# query_form returns a flat key/value list; walk it pairwise and keep
# each key followed by a bare '='.
my @pairs = URI->new($original)->query_form();
my @empty_params;
while (@pairs)
{
my ($key, $value) = splice(@pairs, 0, 2);
push @empty_params, $key . '=';
}
if (@empty_params)
{
$stripped .= '?' . join('&', @empty_params);
}
#print $stripped."\n";
return $stripped;
}
# Return 1 when the two URLs (after query-value stripping) differ only by
# a run of digits, 0 otherwise. Used to treat e.g. /news/123 and /news/456
# as the same page family so only one representative is crawled.
sub DiffUrl
{
my ($url_a, $url_b) = @_;
my $norm_a = EscapeUrl($url_a);
my $norm_b = EscapeUrl($url_b);
# String::Diff::diff marks text removed from the first string as [..]
# and text inserted into the second as {..}; a numeric removal paired
# with a numeric insertion means "same URL, different number".
my ($marked_old, $marked_new) = String::Diff::diff($norm_a, $norm_b);
#my($old,$new) = String::Diff::diff($urlold,$urlnew);
return ($marked_old =~ /\[\d+\]/ && $marked_new =~ /\{\d+\}/) ? 1 : 0;
}