#!/usr/bin/
perluse strict;
use utf8;
use Encode;
use LWP::UserAgent;
use threads;
use Thread::Semaphore;
use threads::shared;
binmode(STDOUT,":encoding(utf8");
my %mail_list;
#注:从mail_list.txt
文件中读取已获取到邮件地址的用户列表
my %user_list;
#注:从user_list.txt文件中读取所有用户列表(可能包含邮件地址已经获取到的用户)
my $max_thread = 4;
my $semaphore=Thread::Semaphore->new();
share(%mail_list);
share(%user_list);
#将所有用户列表放入%user_list散列
open(UL,"user_list.txt");
while(
){
chomp($_);
my ($user,$id)=split("\t",$_);
$user_list{$id}=$user;
}
close(UL);
#将所有已经获取了邮件地址的用户放入%mail_list散列
open(FH,"while(){
chomp($_);
my ($user,$id,$mail)=split("\t",$_);
$mail_list{$id}=$user;
}
close(FH);
open(ML,">>mail_list.txt");
sub ths_submit{
my($start,$stop)=@_;
my $i=0;
while(my ($id,$user)=each %user_list){
$i++;
if($i>$start and $i<=$stop){
if(defined $mail_list{$id}){ #注:如果当前用户已经在%mail_list散列中,表示该用户邮件已获取过,不用再重复获取
{
lock(%user_list);
delete $user_list{$id};
}
}else{
my $rst=&get_mail($user,$id);
if($rst==0){ #注:返回0表示获取邮件失败,重新放回数组
{
lock(%user_list);
$user_list{$id}=$user;
next;
}
}
}
}else{
next;
}
}
# 释放一个信号量,表示一个线程运行结束
$semaphore->up( );
}
foreach my $i(1..$max_thread){
my $start=$i*10000-10000;
my $stop=$start+10000;
my $i = threads->create( \&ths_submit, $start,$stop );
$i->detach();
}
Wait2Quit( );
sub Wait2Quit{
print "Waiting to quit...\n";
my $num = 0;
while( $num < $max_thread ){
# 尝试获取信号量,当能够获取到最大线程数个信号量时,表示所有线程都结束了
$semaphore->down( );
$num ++;
print "$num thread quit.\n";
}
print "All $max_thread thread quit.\n";
}
sub get_mail{
chomp(my $user=shift);
chomp(my $id=shift);
$user=decode('utf8',$user);
my $u=$user;
print "开始抓取用户: $user 的邮件 ...... \n";
$user=encode('gbk',$user);
my $browser=LWP::UserAgent->new();
$browser->proxy('http','');
$browser->timeout(10000);
my $url="";
my $response=$browser->post($url,
[
'username1'=>$user,
],
'User-Agent'=>'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.12) Gecko/2009072711 CentOS/3.0.12-1.el5.centos Firefox/3.0.12',
'Accept'=>'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language'=>'en-us,en;q=0.5',
'Accept-Encoding'=>'gzip,deflate',
'Accept-Charset'=>'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Cache-Control'=>'max-age=0',
);
print "错误的返回类型 -- ", $response->content_type unless $response->content_type eq 'text/html';
if($response->is_success){
my $content=$response->content;
$content=decode('gbk',$content);
#$content=~/.*?请尽快到<(.*)>检查邮件并操作.*/;
#my $mail=$1;
if ($content=~/.*?请尽快到<(.*)>检查邮件并操作.*/){
$mail_list{$id}=$user;
print "写入用户$u 的邮件到文件,完成\n";
$u=encode('utf8',$u);
print ML "$u\t$id\t$1\n";
return 1;
}else{
if($content=~/.*?找不到您输入的用户名.*/){
print "该用户".$u."不存在\n";
}elsif($content=~/.*?本论坛设置了.*/){
print "用户".$u."在一小时内只能提交一次\n";
}else{
print "获取用户 $u 出错\n";
}
return 0;
}
}else{
print "获取当前用户邮件地址失败,失败原因:".$response->status_line;
return 0;
}
}
程序运行后,报错:
Segmentation fault
好像是数组越界什么的,这个程序感觉还是蛮有代表意义的,如果能解决,对我自己对初学者应该都有一定意义吧,希望高手不吝赐教了,谢谢!
由于本人是第一尝试写多线程的程序,没会经验,其中部分思路借鉴了云老大的思路,这里注明一下。原文地址在:
阅读(4230) | 评论(0) | 转发(0) |