统计结果:
1、CU:最后统计的空间浪费结果是:2282M, end time:2010-04-04 11:38:40, 主题: 351641, 帖子: 1432244, 会员: 23633797, 创建于2003-2-10
2、化工:最后统计的空间浪费结果是:88M, end time:2010-04-05 8:52:40, 主题: 3548, 帖子: 9608, 会员: 19094, 创建于2007-2-4
3、人大经济论坛:今日: 1216, 昨日: 11562, 最高日: 16583, 主题: 585770, 帖子: 4881335, 会员: 1721538, 创建于2004-5-23
4、起点手机论坛:今日: 1687, 昨日: 8143, 会员: 44822
5、我要玩手机论坛:今日: 2201, 昨日: 12656, 会员: 4048683
第一版:[2010-3-31]
$url = "";
$ch = curl_init();
$timeout = 5;
$sum = 0;
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
$contents = curl_exec($ch);
curl_close($ch);
$path = '/tmp';
$p = opendir($path);
if(is_dir($path)){
chdir($path);
echo scandir($p);
$ereg = '
.*<\/td>'; ereg($ereg,$contents,$outcome); print_r($outcome); // foreach($outcome as $string) // echo $string.' '; //$sum += strlen($string); file_put_contents(web,$sum); }else echo "$path is not dir";
closedir($path); ?>
第二版:[2010-4-1] define("CU","");
// $url = "";
//判断网页链接是否已经到头[未完成!] function isLinkEnd($url1,$url2){ $page1 = file_get_contents($url1); $page2 = file_get_contents($url2); $startword = ''; $endword = ''; $startpos1 = strpos($page1,$startword); $endpos1 = strpos($page1,$endword); $startpos2 = strpos($page2,$startword); $endpos2 = strpos($page2,$endword); $result1 = strstr($page1,$startpos1+strlen($startword),$endpos1-$startpos1-$strlen($startword)); $result2 = strstr($page2,$startpos2+strlen($startword),$endpos2-$startpos1-$strlen($startword)); return ($result1 == $result2?0:1); } //自动抽取chinaunix.net上含有回复下载的所有链接,保存为数组返回[已完成!] function linksOf(){ $page = 1; $thisurl = CU."$page".'.html';
while( $page <= 42 ){ $links1[] = $thisurl; $page++; $thisurl = CU."$page".'.html'; } return $links1; } //自动检查所有链接的分页页面地址,保存为数组返回[已完成!] function linksOfPart($links1){ foreach($links1 as $linksBefore){ $contents = file($linksBefore); $ereg1 = ' } } } return $linksPerArticle; } //自动检查所有链接地址,保存为数组返回[尚未完成!] function linksOfAll($linksPerArticle){ foreach($linksPerArticle as $linksBefore){ $contents = file($url); } return $linksOfAll; } //计算打开的每页无效字符字节数[已完成!] function countPerPage($url){ $contents = file($url); $sum = 0; $ereg1 = ' define("CU_F","");
// $url = "";
/**
 * Check whether pagination has reached its end by comparing two pages. [done]
 *
 * Both pages are fetched and the text between $startword and $endword in each
 * page is extracted; identical regions mean the second URL shows the same
 * content as the first, i.e. there is no further page.
 *
 * NOTE(review): the default markers are empty strings in this listing —
 * presumably the original HTML markers were lost when the post was rendered;
 * confirm and pass the real markers. With empty markers both extracted
 * regions are empty and the function always returns 0.
 *
 * @param string $url1      Location of the current page.
 * @param string $url2      Location of the following page.
 * @param string $startword Text that precedes the compared region.
 * @param string $endword   Text that follows the compared region.
 * @return int 0 when the end has been reached (regions identical), 1 otherwise.
 */
function isLinkEnd($url1, $url2, $startword = '', $endword = '')
{
    $page1 = file_get_contents($url1);
    $page2 = file_get_contents($url2);

    $startpos1 = strpos($page1, $startword);
    $endpos1   = strpos($page1, $endword);
    $startpos2 = strpos($page2, $startword);
    $endpos2   = strpos($page2, $endword);

    $skip = strlen($startword);
    $result1 = substr($page1, $startpos1 + $skip, $endpos1 - $startpos1 - $skip);
    // The length for page 2 must be measured from page 2's own start marker;
    // using page 1's offset truncates or over-extends the region whenever the
    // markers sit at different positions in the two pages.
    $result2 = substr($page2, $startpos2 + $skip, $endpos2 - $startpos2 - $skip);

    // Compare as strings so numeric-looking regions ("1" vs "01") are not
    // treated as equal; the casts keep a failed substr() equal to ''.
    // Returns 0 when the end is reached, 1 when it is not.
    return ((string) $result1 === (string) $result2) ? 0 : 1;
}
//自动抽取chinaunix.net上含有回复下载的所有链接,保存为数组返回[已完成!] function linksOf(){ $page = 1; $thisurl = CU."$page".'.html';
while( $page <= 42 ){ $links1[] = $thisurl; $page++; $thisurl = CU."$page".'.html'; } return $links1; } //自动检查所有链接的分页页面地址,保存为数组返回[已完成!] function linksOfPart($links1){ foreach($links1 as $linksBefore){ $contents = file($linksBefore); $ereg1 = ' } } } return $linksPerArticle; } //自动检查所有链接地址,保存为数组返回[已完成!] function linksOfAll($linksPerArticle){ foreach($linksPerArticle as $links){ $i = 1; $page_before = CU_F."$links-$i".'-1.html'; $i++; $page_after = CU_F."$links-$i".'-1.html'; while(isLinkEnd($page_before,$page_after)){ $linksOfAll[] = $page_before; $page_before = $page_after; $i++; $page_after = CU_F."$links-$i".'-1.html'; } $linksOfAll[] = $page_before; } return $linksOfAll; } //计算打开的每页无效字符字节数[已完成!] function countPerPage($url){ $contents = file($url); $sum = 0; $ereg1 = '?> 这一版本中,自动截取CU下载版块页面,统计空间浪费功能已经实现,但程序运行速度很慢,有待改进。 出现错误: 开始时间:2010-04-02 09:37:19
Notice: Undefined variable: sum in /var/www/html/lk.php on line 94
Fatal error: Maximum execution time of 30 seconds exceeded in /var/www/html/lk.php on line 75
进一步改进: define("CU",""); define("CU_F","");
// $url = "";
/**
 * Check whether pagination has reached its end by comparing two pages. [done]
 *
 * Both pages are fetched and the text between $startword and $endword in each
 * page is extracted; identical regions mean the second URL shows the same
 * content as the first, i.e. there is no further page.
 *
 * NOTE(review): the default markers are empty strings in this listing —
 * presumably the original HTML markers were lost when the post was rendered;
 * confirm and pass the real markers. With empty markers both extracted
 * regions are empty and the function always returns 0.
 *
 * @param string $url1      Location of the current page.
 * @param string $url2      Location of the following page.
 * @param string $startword Text that precedes the compared region.
 * @param string $endword   Text that follows the compared region.
 * @return int 0 when the end has been reached (regions identical), 1 otherwise.
 */
function isLinkEnd($url1, $url2, $startword = '', $endword = '')
{
    $page1 = file_get_contents($url1);
    $page2 = file_get_contents($url2);

    $startpos1 = strpos($page1, $startword);
    $endpos1   = strpos($page1, $endword);
    $startpos2 = strpos($page2, $startword);
    $endpos2   = strpos($page2, $endword);

    $skip = strlen($startword);
    $result1 = substr($page1, $startpos1 + $skip, $endpos1 - $startpos1 - $skip);
    // The length for page 2 must be measured from page 2's own start marker;
    // using page 1's offset truncates or over-extends the region whenever the
    // markers sit at different positions in the two pages.
    $result2 = substr($page2, $startpos2 + $skip, $endpos2 - $startpos2 - $skip);

    // Compare as strings so numeric-looking regions ("1" vs "01") are not
    // treated as equal; the casts keep a failed substr() equal to ''.
    // Returns 0 when the end is reached, 1 when it is not.
    return ((string) $result1 === (string) $result2) ? 0 : 1;
}
//自动抽取chinaunix.net上含有回复下载的所有链接,保存为数组返回[已完成!] function linksOf(){ $page = 1; $thisurl = CU."$page".'.html';
while( $page <= 42 ){ $links1[] = $thisurl; $page++; $thisurl = CU."$page".'.html'; } return $links1; } //自动检查所有链接的分页页面地址,保存为数组返回[已完成!] function linksOfPart($links1){ foreach($links1 as $linksBefore){ $contents = file($linksBefore); $ereg1 = ' } } } return $linksPerArticle; } //自动检查所有链接地址,保存为数组返回[已完成!] function linksOfAll($linksPerArticle){ foreach($linksPerArticle as $links){ $i = 1; $page_before = CU_F."$links-$i".'-1.html'; $i++; $page_after = CU_F."$links-$i".'-1.html'; while(isLinkEnd($page_before,$page_after)){ $linksOfAll[] = $page_before; $page_before = $page_after; $i++; $page_after = CU_F."$links-$i".'-1.html'; } $linksOfAll[] = $page_before; } return $linksOfAll; } //计算打开的每页无效字符字节数[已完成!] function countPerPage($url){ $contents = file($url); $sum = 0; $ereg1 = ' | '; $totalnum = 0; $testme = linksOfAll(linksOfPart(linksOf())); foreach($testme as $mylinks){ $totalnum += countPerPage($mylinks); } echo "最后统计的空间浪费结果是:".round($totalnum/1024)."M"; echo "结束时间:".date("Y-m-d H:i:s").' '; ?>
准备再次改进:直接抓取baidu.com的结果,"site:xxx.com 本帖隐藏的内容需要回复才可以浏览",再进行分析,统计数量。
选取的网站有:
1、 -> 1000
2、 -> 290
3、 -> 242
4、人大经济论坛 -> 932
5、天涯社区 -> 1080
6、bbs.duowan.com 多玩游戏论坛 -> 80400 -> 主题: 3516513, 帖子: 79036761, 会员: 8861576 -> 2005-6-29
7、sohu.com 搜狐网 -> 9,490
8、sina.com.cn 新浪网 -> 29100
9、mop.com 猫扑 -> 155
define("CU",""); define("CU_F","");
// $url = "";
/**
 * Check whether pagination has reached its end by comparing two pages. [done]
 *
 * Both pages are fetched and the text between $startword and $endword in each
 * page is extracted; identical regions mean the second URL shows the same
 * content as the first, i.e. there is no further page.
 *
 * NOTE(review): the default markers are empty strings in this listing —
 * presumably the original HTML markers were lost when the post was rendered;
 * confirm and pass the real markers. With empty markers both extracted
 * regions are empty and the function always returns 0.
 *
 * @param string $url1      Location of the current page.
 * @param string $url2      Location of the following page.
 * @param string $startword Text that precedes the compared region.
 * @param string $endword   Text that follows the compared region.
 * @return int 0 when the end has been reached (regions identical), 1 otherwise.
 */
function isLinkEnd($url1, $url2, $startword = '', $endword = '')
{
    $page1 = file_get_contents($url1);
    $page2 = file_get_contents($url2);

    $startpos1 = strpos($page1, $startword);
    $endpos1   = strpos($page1, $endword);
    $startpos2 = strpos($page2, $startword);
    $endpos2   = strpos($page2, $endword);

    $skip = strlen($startword);
    $result1 = substr($page1, $startpos1 + $skip, $endpos1 - $startpos1 - $skip);
    // The length for page 2 must be measured from page 2's own start marker;
    // using page 1's offset truncates or over-extends the region whenever the
    // markers sit at different positions in the two pages.
    $result2 = substr($page2, $startpos2 + $skip, $endpos2 - $startpos2 - $skip);

    // Compare as strings so numeric-looking regions ("1" vs "01") are not
    // treated as equal; the casts keep a failed substr() equal to ''.
    // Returns 0 when the end is reached, 1 when it is not.
    return ((string) $result1 === (string) $result2) ? 0 : 1;
}
//自动抽取chinaunix.net上含有回复下载的所有链接,保存为数组返回[已完成!] function linksOf(){ $page = 1; $thisurl = CU."$page".'.html';
while( $page <= 42 ){ $links1[] = $thisurl; $page++; $thisurl = CU."$page".'.html'; } return $links1; } //自动检查所有链接的分页页面地址,保存为数组返回[已完成!] function linksOfPart($links1){ foreach($links1 as $linksBefore){ $contents = file($linksBefore); $ereg1 = ' } } } return $linksPerArticle; } //自动检查所有链接地址,保存为数组返回[尚未完成!] function linksOfAll($linksPerArticle){ foreach($linksPerArticle as $links){ $i = 1; $page_before = CU_F."$links-$i".'-1.html'; $i++; $page_after = CU_F."$links-$i".'-1.html'; while(isLinkEnd($page_before,$page_after)){ $linksOfAll[] = $page_before; $page_before = $page_after; $i++; $page_after = CU_F."$links-$i".'-1.html'; } $linksOfAll[] = $page_before; } return $linksOfAll; } //计算打开的每页无效字符字节数[已完成!] function countPerPage($url){ if(fopen("$url",'r')){ $contents = file($url); $sum = 0; $ereg1 = ' | '; $totalnum = 0; $testme = linksOfAll(linksOfPart(linksOf())); echo '统计的链接共有:'.count($testme).'页 '; foreach($testme as $key=>$mylinks){ echo '统计页面'.$key.' '; $totalnum += countPerPage($mylinks); } echo "最后统计的空间浪费结果是:".round($totalnum/1024)."M"; echo "结束时间:".date("Y-m-d H:i:s").' '; ?>
最后: lk.php define("CU",""); //define("CU_F",""); define("M_WORD","本帖隐藏的内容需要回复才可以浏览");
/**
 * Check whether pagination has reached its end by comparing two pages. [done]
 *
 * Both pages are fetched (60-second HTTP timeout, retried on failure) and the
 * text between $startword and $endword in each page is extracted; identical
 * regions mean the second URL shows the same content as the first, i.e. there
 * is no further page.
 *
 * NOTE(review): the default markers are empty strings in this listing —
 * presumably the original HTML markers were lost when the post was rendered;
 * confirm and pass the real markers. With empty markers both extracted
 * regions are empty and the function always returns 0.
 *
 * @param string $url1      Location of the current page.
 * @param string $url2      Location of the following page.
 * @param string $startword Text that precedes the compared region.
 * @param string $endword   Text that follows the compared region.
 * @return int 0 when the end has been reached (regions identical) or when
 *             fetching failed more than three times, 1 otherwise.
 */
function isLinkEnd($url1, $url2, $startword = '', $endword = '')
{
    $flag = 0;
    $opts = array(
        'http' => array(
            'method'  => "GET",
            'timeout' => 60,
        )
    );
    // Set a timeout; this works around connection-timeout failures.
    $context = stream_context_create($opts);

    // Retry until both pages load; give up (report "end") after the fourth
    // consecutive failure so a dead link cannot stall the crawl forever.
    while (($page1 = file_get_contents($url1, false, $context)) === FALSE
        || ($page2 = file_get_contents($url2, false, $context)) === FALSE) {
        $flag++;
        echo "flag=$flag ";
        if ($flag > 3) {
            return 0;
        }
    }

    $startpos1 = strpos($page1, $startword);
    $endpos1   = strpos($page1, $endword);
    $startpos2 = strpos($page2, $startword);
    $endpos2   = strpos($page2, $endword);

    $skip = strlen($startword);
    $result1 = substr($page1, $startpos1 + $skip, $endpos1 - $startpos1 - $skip);
    // The length for page 2 must be measured from page 2's own start marker;
    // using page 1's offset truncates or over-extends the region whenever the
    // markers sit at different positions in the two pages.
    $result2 = substr($page2, $startpos2 + $skip, $endpos2 - $startpos2 - $skip);

    // Compare as strings so numeric-looking regions ("1" vs "01") are not
    // treated as equal; the casts keep a failed substr() equal to ''.
    // Returns 0 when the end is reached, 1 when it is not.
    return ((string) $result1 === (string) $result2) ? 0 : 1;
}
/**
 * Build the URLs of the numbered listing pages "<prefix><n>.html". [done]
 *
 * @param string $thisurl   URL prefix to which the page number is appended.
 * @param int    $pagetotal Number of pages; pages 1..$pagetotal are generated.
 * @return array List of page URLs in ascending page order; empty when
 *               $pagetotal is less than 1.
 */
function linksOf($thisurl, $pagetotal)
{
    // Start from an empty list so a page count below 1 yields array()
    // instead of an undefined variable.
    $links1 = array();
    for ($page = 1; $page <= $pagetotal; $page++) {
        $links1[] = $thisurl . $page . '.html';
    }
    return $links1;
}
//自动检查所有链接的分页页面地址,保存为数组返回[已完成!] function linksOfPart($links1){ foreach($links1 as $linksBefore){ $flag = 0; while(count($contents = file($linksBefore)) == 0){ $flag++; if($flag > 3)break; } if($flag > 3)continue; $ereg1 = ' } } } return $linksPerArticle; }
/**
 * Expand every article identifier into the URLs of all its pages. [done]
 *
 * For each entry, page URLs of the form CU_F . "<entry>-<n>-1.html" are
 * probed with isLinkEnd(), starting at n = 1, until two consecutive pages are
 * reported as identical; every distinct page found is collected.
 *
 * @param array $linksPerArticle Article identifiers to expand.
 * @return array All page URLs found, in discovery order; empty for empty input.
 */
function linksOfAll($linksPerArticle)
{
    // Start from an empty list so empty input returns array() rather than an
    // undefined variable (callers pass the result to count() and foreach).
    $linksOfAll = array();

    foreach ($linksPerArticle as $links) {
        $i = 1;
        $page_before = CU_F . "$links-$i" . '-1.html';
        $i++;
        $page_after = CU_F . "$links-$i" . '-1.html';

        // isLinkEnd() returns 1 while the next page still differs.
        while (isLinkEnd($page_before, $page_after)) {
            $linksOfAll[] = $page_before;
            $page_before = $page_after;
            $i++;
            $page_after = CU_F . "$links-$i" . '-1.html';
        }

        // The last distinct page is not added inside the loop.
        $linksOfAll[] = $page_before;
    }

    return $linksOfAll;
}
//计算打开的每页无效字符字节数[已完成!] function countPerPage($url){ $flag = 0; while( ( $contents = file($url)) == FALSE ){ $flag++;echo "flag=$flag "; if($flag > 3) return 0; }
$sum = 0; $ereg1 = ' | '; $totalnum = 0; $savelink = linksOfPart(linksOf()); if(!($fp = fopen('D:\xampp\htdocs\savepartlink.txt','w'))) exit; fwrite($fp,serialize($savelink));
fclose($fp); echo 'file savepartlink.txt save success '; echo "now is:".date("Y-m-d H:i:s").' ; $testme = linksOfAll($savelink); if(!($fp = fopen('D:\xampp\htdocs\savealllink.txt','w'))) exit; fwrite($fp,serialize($testme));
fclose($fp); echo 'file savealllink.txt save success '; echo 'total page is:'.count($testme).' ';
foreach($testme as $key=>$mylinks){ echo '统计页面'.$key.' '; $totalnum += countPerPage($mylinks); }
echo "最后统计的空间浪费结果是:".round($totalnum/1024)."M";
echo "end time:".date("Y-m-d H:i:s").' ';*/ ?> huagong.php include 'lk.php'; define("CU_F",""); define("WENJIAN","huagong.txt"); //取得全部链接 // if(! file_exists(WENJIAN)){ $target_link = array(''=>19, ''=>27, ''=>25, ''=>10, ''=>30, ''=>5, ''=>5, ''=>5, ''=>6, ''=>26, ''=>1 );
// Expand each $target_link entry (passed to linksOf() as URL prefix and
// page count) into its list of listing-page URLs.
foreach ($target_link as $prefix => $pages) {
    $links_zhuan[] = linksOf($prefix, $pages);
}

// Flatten the per-entry lists into one list of listing pages.
foreach ($links_zhuan as $group) {
    foreach ($group as $listingUrl) {
        $links[] = $listingUrl;
    }
}

// Collect all pages that need to be counted.
$savelink = linksOfPart($links);

// Lift the execution time limit before the long crawl.
set_time_limit(0);
echo 'start time:' . date("Y-m-d H:i:s") . ' ';

$totalnum = 0;
$testme = linksOfAll($savelink);

echo 'total page is:' . count($testme) . ' ';

// Sum the per-page byte counts, reporting progress after each page.
foreach ($testme as $index => $pageUrl) {
    $totalnum += countPerPage($pageUrl);
    echo '统计页面' . $index . ' ';
}

echo '最后统计的空间浪费结果是:' . round($totalnum / 1024) . 'M';

echo 'end time:' . date("Y-m-d H:i:s") . ' ';
?>
阅读(2043) | 评论(0) | 转发(0) |
|
|
|