关于simple_html_dom的使用,我的博客里已经讲得很清楚。Simple HTML DOM是一款非常强大的HTML DOM解析器,它能帮助我们用PHP解析HTML文档(包括不符合W3C标准的HTML文档),并且可以像jQuery那样操作DOM元素,通过元素的id、class、tag等来查找定位。我们可以利用Simple HTML DOM来采集所需要的数据。
需求:我们需要采集新浪【新浪天气】频道中所有城市今、明、后3天的天气情况。
分析:根据需求可以分析出,所有城市天气的List页(列表页)入口地址如下:http://php.weather.sina.com.cn/search_sheng.php (即下面脚本中的$urls)。
第一步:
新建数据库及数据表,SQL如下:
-
-- Create the database and select it so the table below is created inside it.
CREATE DATABASE weather;
USE weather;

-- One row per city per forecast date; the scraper writes rows with REPLACE INTO,
-- so re-running it for the same (city_name, date) overwrites the earlier row.
CREATE TABLE `weather` (
  `city_name` varchar(50) NOT NULL DEFAULT '',
  `detail_url` varchar(255) DEFAULT '',
  -- day_* columns are filled from the first condition/wind/temperature group of a table row
  `day_conditions` varchar(255) DEFAULT '',
  `day_wind` varchar(255) DEFAULT '',
  `day_highertemp` varchar(255) DEFAULT '',
  -- yesterday_* columns are filled from the second group of the same table row
  `yesterday_conditions` varchar(255) DEFAULT '',
  `yesterday_wind` varchar(255) DEFAULT '',
  `yesterday_lowertemp` varchar(255) DEFAULT '',
  -- forecast date as a 'Y-m-d' string; part of the primary key, hence NOT NULL
  `date` varchar(255) NOT NULL DEFAULT '',
  PRIMARY KEY (`city_name`,`date`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=gb2312;
第二步:
编写PHP脚本getWeather.php(一个文件,需要在DOS命令行下运行)。
注意:此PHP文件需要引入simple_html_dom.php文件(可自行下载),引入方式为:require_once("simple_html_dom.php");
-
<?php
// Weather scraper bootstrap: loads the HTML parser, lifts the runtime limits
// and opens the database connection used by the rest of the script.
require_once("simple_html_dom.php");

// The crawl parses many pages in a single run, so raise the memory ceiling
// and disable the execution time limit.
ini_set('memory_limit','1000M');
ini_set("max_execution_time",'0');
header("Content-type: text/html;charset=gb2312");

// Overall start time of the run.
$startS=utime();

//------ Open the database connection used to store the scraped data ------//
$mysqli = mysqli_connect("localhost","root","root","weather")or die("Failed to connect!".mysqli_connect_error());

// Set the connection charset through the API rather than a raw "set names"
// query, so the client library also knows which charset is in effect.
mysqli_set_charset($mysqli,"gb2312");
-
/**
 * Run a query and return its first row as an associative array.
 *
 * @param mysqli $conn Open mysqli connection.
 * @param string $sql  SQL statement to execute.
 * @return array First result row keyed by column name, or an empty array when
 *               the query fails, produces no result set, or matches no rows.
 */
function select_one($conn,$sql){
$res = mysqli_query($conn,$sql);
// mysqli_query() returns false on failure and true for statements that have
// no result set; only a mysqli_result object can be fetched from.
if(!($res instanceof mysqli_result)){
return array();
}
$row = mysqli_fetch_assoc($res);
mysqli_free_result($res);
// mysqli_fetch_assoc() yields null when there are no rows; normalise to array.
return is_array($row) ? $row : array();
}
-
//------ Step 1: collect the list-page URLs from the index page ------//
// Start time 1
$start1=utime();
$urls = "http://php.weather.sina.com.cn/search_sheng.php";
$htmlCN = file_get_html($urls);
if(!$htmlCN){
die("Failed to load ".$urls);
}
$listUrl = array();
// Collect the links to every list page
foreach($htmlCN->find(".city_nav ol li a") as $k => $v){
$href = "http://php.weather.sina.com.cn".$v->href;
// Drop the last query parameter; the day selector is appended below instead.
// When the link has no '&' at all, keep it unchanged.
$cut = strrpos($href,'&');
$listUrl[$k] = ($cut === false) ? $href : substr($href,0,$cut);
}
$htmlCN->clear(); // clean up memory
unset($htmlCN);

$now = date("Y-m-d");
$tom = date("Y-m-d",strtotime("+1 days"));
$tom2 = date("Y-m-d",strtotime("+2 days"));
$allDay = array();
// Build the today / tomorrow / day-after URLs for every list page
foreach($listUrl as $k => $v){
$allDay[$k][$now] = $v."&day=0&dpc=1";
$allDay[$k][$tom] = $v."&day=1&dpc=1";
$allDay[$k][$tom2] = $v."&day=2&dpc=1";
}
unset($listUrl);
// End time 1
$end1=utime();
$run1=$end1-$start1;
echo date("Y-m-d H:i:s")."__The first weather is going :".substr($run1,0,5)." seconds \r\n";

//------ Step 2: scrape every list page and store its rows ------//
// The scraped text is bound as statement parameters instead of being
// concatenated into the SQL string, so no manual quote stripping is needed.
$insertSql = "REPLACE INTO weather(city_name,detail_url,day_conditions,day_wind,day_highertemp,yesterday_conditions,yesterday_wind,yesterday_lowertemp,date) VALUES (?,?,?,?,?,?,?,?,?)";
$insertStmt = mysqli_prepare($mysqli,$insertSql) or die("Failed".mysqli_error($mysqli));
// Parameters are bound by reference: assigning the variables below and
// calling mysqli_stmt_execute() sends their current values.
mysqli_stmt_bind_param($insertStmt,'sssssssss',$cityName,$detailUrl,$dayConditions,$dayWind,$dayHigherTemp,$yesterdayConditions,$yesterdayWind,$yesterdayLowerTemp,$rowDate);

foreach($allDay as $keys => $days){
// Start time 2
$start2=utime();
foreach($days as $day => $value){
// Start time 3
$start3=utime();
$htmlS = file_get_html($value);
if(!$htmlS){
// A page that cannot be fetched or parsed is skipped instead of aborting the run.
echo date("Y-m-d H:i:s")."__Failed to load : ".$value." \r\n";
continue;
}

// Link targets of the table, in document order; the links labelled
// with the "detail" caption are ignored.
// NOTE(review): the caption literal only matches if this file is saved in
// the same encoding as the fetched page - confirm.
$UrlDetail = array();
foreach($htmlS->find("#tab_01_ctn tbody tr td a") as $v){
if(trim($v->plaintext)=='详情'){
continue;
}
$UrlDetail[] = trim($v->href);
}

// Flatten every table row to its text with '@' between the cells.
$rowTexts = array();
foreach($htmlS->find("#tab_01_ctn tbody tr") as $k => $v){
$rowTexts[$k] = preg_replace('/\s+/','@',trim($v->plaintext));
}
// The first two rows are skipped (table header rows).
unset($rowTexts[0]);
unset($rowTexts[1]);

// Release the parsed DOM before moving on to the next page.
$htmlS->clear();
unset($htmlS);

// $k counts the data rows and pairs each row with its entry in $UrlDetail.
$k = -1;
foreach($rowTexts as $rowText){
$k++;
$firstAt = strpos($rowText,'@');
$lastAt = strrpos($rowText,'@');
if($lastAt === false){
// Row without any cell separator: nothing usable to store.
continue;
}
if(substr_count($rowText,"@")==10){
// Rows with an extra leading cell: drop the first and the last cell.
$rowText = substr($rowText,$firstAt+1,$lastAt-$firstAt-1);
}else{
// Regular rows: drop the last cell only.
$rowText = substr($rowText,0,$lastAt);
}
// Pad short rows so every expected column index exists.
$cells = array_pad(explode('@',$rowText),9,'');
if($cells[0] === ''){
continue;
}

$cityName = $cells[0];
$detailUrl = isset($UrlDetail[$k]) ? $UrlDetail[$k] : '';
$dayConditions = $cells[1];
$dayWind = $cells[2].$cells[3];
$dayHigherTemp = $cells[4];
$yesterdayConditions = $cells[5];
$yesterdayWind = $cells[6].$cells[7];
$yesterdayLowerTemp = $cells[8];
$rowDate = $day;

// Store the row
mysqli_stmt_execute($insertStmt) or die("Failed".mysqli_stmt_error($insertStmt));
echo date("Y-m-d H:i:s")."__The insertSQL is going : -+- [".$k."] -+- \r\n";
}
unset($rowTexts);
unset($UrlDetail);

// End time 3
$end3=utime();
$run3=$end3-$start3;
echo date("Y-m-d H:i:s")."__The three weather is going :".substr($run3,0,5)."-+-".$day." seconds \r\n";
}
// End time 2
$end2=utime();
$run2=$end2-$start2;
echo date("Y-m-d H:i:s")."__The second weather is going :".substr($run2,0,5)."-+-".$keys." seconds \r\n";
}
mysqli_stmt_close($insertStmt);
-
//————————一些功能函数-START———————-//
-
//替换特殊字串函数
-
function removeQuot($string){
-
if(strstr($string, “‘”)){
-
$string = str_replace(“‘”,”
第三步:
在命令行下运行(通常这种采集脚本数据量很大),确保你的当前环境能用命令行运行。
具体看情况而行,下面截图是我命令行的运行方式:
第四步:
如上图(2)脚本已经运行完毕,花费170.5秒。最终得到数据如下(只截一部分):
阅读(1910) | 评论(0) | 转发(0) |