分类: LINUX
2008-10-13 10:23:19
代码比较简单:
#!/usr/bin/perl -w
use strict;
# Require exactly one command-line argument: the access log to analyse.
if (@ARGV != 1) {
    print "\t $0 the Logfile\n\n\teg: $0 httpd_20080328.log \n\n";
    exit;
}

my $file       = shift @ARGV;   # path of the log file given on the command line
my $total_line = 0;             # number of lines read from the log
my %url;                        # host name => number of hits
my ($uv, $pv) = (0, 0);         # unique-visitor and page-view totals
my ($effect, $uneffect) = (0, 0);   # $uneffect counts unusable records; $effect is not used by the visible top-level code
my %uv;                         # visitor id => number of hits
my %ip;                         # IPv4 address as an integer => number of hits

# Three-argument open keeps a file name such as "foo |" or ">foo" from being
# interpreted as a mode or a command, and a failed open is reported at once
# instead of silently producing an empty report.
open(F, '<', $file) or die "Cannot open log file '$file': $!\n";
# Convert a dotted-quad address such as "1.2.3.4" into the integer used as the
# key of %ip; the first octet is the most significant one.
sub _ip_to_int {
    my ($dotted) = @_;
    my @octet = split(/\./, $dotted);
    return $octet[0] * 256 * 256 * 256
         + $octet[1] * 256 * 256
         + $octet[2] * 256
         + $octet[3];
}

# Count one hit in %url for the host found in $page_url.
#   $page_url   - URL text taken from a record, or undef when none was found
#   $need_slash - true when the host must be followed by a "/" to be accepted
# A record whose URL is missing, or whose host cannot be extracted, is added
# to $uneffect instead of being counted under an empty host name.
sub _count_host {
    my ($page_url, $need_slash) = @_;
    my $host;
    if (defined $page_url) {
        if ($need_slash) {
            ($host) = $page_url =~ /http:\/\/([a-zA-Z0-9.:_]*)\/.*/;
        }
        else {
            ($host) = $page_url =~ /http:\/\/([a-zA-Z0-9.:_]*)\/*/;
        }
    }
    if (defined $host) {
        $url{$host}++;
    }
    else {
        $uneffect++;
    }
    return;
}

# Return the current-page URL of an 'href' record, or undef when the payload
# matches no known layout.
#   $payload    - the text that follows the 44-character header
#   $single_url - true when the payload holds only the current URL, false when
#                 it also holds the previous URL
# In each layout the pattern that ends with a 4-digit time field is tried
# first, then the same pattern without the time field.
sub _href_current_url {
    my ($payload, $single_url) = @_;
    if ($single_url) {
        if (my ($current) = $payload =~ /\d{4}(.*)([0-9]{4})\&t=\d+/) {
            return $current;
        }
        if (my ($current) = $payload =~ /\d{4}(.*)\&t=\d+/) {
            return $current;
        }
    }
    else {
        # The first capture is the previous URL, the second the current one.
        if (my (undef, $current) = $payload =~ /\d{4}(.*)refe[0-9]{4}(.*)([0-9]{4})&t=\d+/) {
            return $current;
        }
        if (my (undef, $current) = $payload =~ /\d{4}(.*)refe\d{4}(.*)&t=\d+/) {
            return $current;
        }
    }
    return;
}

# Return the current-page URL of an 'adid' record, or undef when the payload
# matches no known layout. Parameters are the same as for _href_current_url.
# Captures are: ad id, current URL, previous URL and, when present, the time.
sub _adid_current_url {
    my ($payload, $single_url) = @_;

    # The two layouts differ only in the time field: the two-URL layout also
    # accepts "-" characters there.
    my $with_time = $single_url
        ? qr/\d{4}([0-9a-z]*)href\d{4}(http.*)refe\d{4}(.*)([0-9]{4})\&t=\d+/
        : qr/\d{4}([0-9a-z]*)href\d{4}(http.*)refe\d{4}(.*)([0-9-]{4})\&t=\d+/;

    if (my (undef, $current) = $payload =~ $with_time) {
        return $current;
    }
    # Fallback without a time field. The captures of THIS match are used; the
    # captures of the failed match above are empty.
    if (my (undef, $current) = $payload =~ /\d{4}([0-9a-z]*)href\d{4}(http.*)refe\d{4}(.*)\&t=\d+/) {
        return $current;
    }
    return;
}

# Read the log line by line and fill %ip, %uv, %url and $uneffect.
while (my $line = <F>) {
    # Tracking record: "...gif,<access time>,<peer ip>,?info=<44-char header><payload>".
    if (my @info = $line =~ /(?:.*\.gif),(.*),\?info=(.{44})(.*)/) {
        my @date_ip = split(/,/, $info[0]);
        my $peer    = $date_ip[1];

        # Header layout: 32 characters of visitor id, 8 of creation time
        # (not needed here) and 4 of action name.
        my $uid    = substr($info[1], 0, 32);
        my $action = substr($info[1], 40, 4);

        $ip{ _ip_to_int($peer) }++;   # hits per address
        $uv{$uid}++;                  # hits per visitor id

        next unless ($action eq 'href' or $action eq 'adid');

        my $payload    = $info[2];
        my @piece      = split(/http/, $payload);
        my $single_url = ($#piece == 1);   # exactly one "http" in the payload

        if ($action eq 'href') {
            # Assign to a scalar first: an empty return must become undef and
            # not vanish from the argument list below.
            my $current_url = _href_current_url($payload, $single_url);
            _count_host($current_url, $single_url);
        }
        else {
            my $current_url = _adid_current_url($payload, $single_url);
            _count_host($current_url, 0);
        }
    }
    # Any other line with at least three commas: the field before the last
    # comma is taken as the address, and the line counts as unusable.
    elsif ($line =~ /(.*),(.*),(.*),/) {
        my $peer = $3;
        $ip{ _ip_to_int($peer) }++;
        $uneffect++;
    }
}
# $. still holds the number of the last line read from F; keep it before the
# handle is closed. It is undefined when nothing was ever read.
$total_line = defined $. ? $. : 0;
close F;

# Format $part as a percentage of $whole ("%5.2f"). A zero or undefined
# $whole yields 0 instead of a fatal "Illegal division by zero".
my $pct_of = sub {
    my ($part, $whole) = @_;
    return sprintf("%5.2f", $whole ? $part * 100 / $whole : 0);
};

#---- hits per host, least visited first ------
my $url_effect = 0;
for my $host (sort { $url{$a} <=> $url{$b} } keys %url) {
    my $percent = $pct_of->($url{$host}, $total_line);
    $url_effect += $url{$host};
    print " URL: $host 的访问量:$url{$host} 访问有效比:$percent%\n";
}
my $url_effect_percent = $pct_of->($url_effect, $total_line);
print " 总访问量:$total_line 有效的url访问量:$url_effect 有效率:$url_effect_percent%\n\n";
#---- end of host statistics -----------

#----- IP statistics -------
ipstatics();
#---- end of IP statistics ----

#------ UV (distinct visitor ids) and PV (hits carrying a visitor id) ------
$uv += keys %uv;
$pv += $_ for values %uv;

my $pv_percent = $pct_of->($pv, $total_line);
my $uv_percent = $pct_of->($uv, $pv);
print " \n UV 总数:$uv PV 总数:$pv PV所占访问百分比:$pv_percent% UV占pv的百分比:$uv_percent%\n";
#-------- IP statistics routine (marked unfinished by the author) -------------
# Match every address counted in the file-level %ip against the ranges listed
# in the IP database and print a per-area report. Takes no arguments and
# returns nothing; it reads %ip and $total_line and writes to STDOUT.
#
# Each database row is "start,end,area" with start/end given as integers.
# NOTE(review): the single forward-moving cursor below only works if the rows
# are sorted ascending and do not overlap - confirm against the real file.
sub ipstatics
{
    my $ipdb_file = "/int_db";
    my (@ipstart, @ipend, @area);
    my %hash;   # area => { uniq => distinct addresses, count => hits }

    # "effect" = address found in the database, "uneffect" = not found.
    # The *_uniq counters count distinct addresses, the others count hits.
    my ($effect, $uneffect, $effect_uniq, $uneffect_uniq) = (0, 0, 0, 0);

    # A missing database is reported but not fatal: every address is then
    # counted as not found and the caller's remaining reports still run.
    if (open(my $ipdb, '<', $ipdb_file)) {
        while (my $row = <$ipdb>) {
            chomp $row;   # otherwise the area name keeps its newline
            my @temp = split /,/, $row;
            next if @temp < 3;   # skip blank or malformed rows
            push @ipstart, $temp[0];
            push @ipend,   $temp[1];
            push @area,    $temp[2];
        }
        close $ipdb;
    }
    else {
        warn "Cannot open IP database '$ipdb_file': $!\n";
    }

    my $max = $#ipstart;
    my $tag = 0;   # index of the range currently under the cursor

    # The addresses must be visited in NUMERIC order for the cursor to work;
    # a plain "sort" would order them as strings.
    for my $ip_int (sort { $a <=> $b } keys %ip) {
        # Move the cursor to the first range that does not end before this address.
        $tag++ while ($tag <= $max and $ip_int > $ipend[$tag]);

        # Past the last range: count the address once as not found.
        if ($tag > $max) {
            $uneffect += $ip{$ip_int};
            $uneffect_uniq++;
            next;
        }

        my ($start, $end) = (int $ipstart[$tag], int $ipend[$tag]);

        # NOTE(review): both bounds are exclusive, so an address equal to the
        # start or the end of a range counts as not found - confirm intended.
        if ($ip_int > $start and $ip_int < $end) {
            $hash{ $area[$tag] }{'uniq'}++;
            $hash{ $area[$tag] }{'count'} += int $ip{$ip_int};
            $effect += $ip{$ip_int};
            $effect_uniq++;
        }
        else {
            $uneffect += $ip{$ip_int};
            $uneffect_uniq++;
        }
    }

    my $total_ip_access = $effect + $uneffect;
    my $uniq_ip_total   = $effect_uniq + $uneffect_uniq;
    my $ip_total        = 0;

    # Format $part as a percentage of $whole; 0 when $whole is zero or undefined.
    my $ratio = sub {
        my ($part, $whole) = @_;
        return sprintf("%5.2f", $whole ? $part * 100 / $whole : 0);
    };

    print "\n##########################################################\n";
    print "地区 \t 去重复的ip数目 访问的ip数目 访问比\n\n";
    for my $region (sort { $hash{$a}{'count'} <=> $hash{$b}{'count'} } keys %hash) {
        my $percent = $ratio->($hash{$region}{'count'}, $total_ip_access);
        $ip_total += $hash{$region}{'count'};
        print "$region \t $hash{$region}{'uniq'} \t $hash{$region}{'count'} \t $percent\n";
    }

    my $effect_per = $ratio->($total_ip_access, $total_line);
    my $hit_per    = $ratio->($effect_uniq, $uniq_ip_total);
    print " \n\n 总的访问数:$total_line 访问的ip总数:$total_ip_access 有效率: $effect_per\n";
    print " \n\n 总的ip数:$total_ip_access 命中ip库ip数:$effect==$ip_total\n"
        . "去重复的ip总数:$uniq_ip_total 去重复后命中ip库的总数:$effect_uniq 命中率: $hit_per\n";
    return;
}