Chinaunix首页 | 论坛 | 博客
  • 博客访问: 161797
  • 博文数量: 19
  • 博客积分: 470
  • 博客等级: 下士
  • 技术积分: 252
  • 用 户 组: 普通用户
  • 注册时间: 2012-03-06 09:16
文章分类

全部博文(19)

文章存档

2013年(2)

2012年(17)

分类: PHP

2013-07-29 20:56:58

关于simple_html_dom的使用,我的博客里讲得很清楚。Simple HTML DOM是一款非常强大的HTML DOM解析器,它能帮助我们用PHP解析HTML文档对象(包括不符合W3C标准的HTML文档),并且像jQuery那样操作DOM元素,通过元素的id、class、tag等来查找定位。我们可以利用Simple HTML DOM来采集我们所需要的数据。


需求:我们需要采集新浪的【新浪天气】频道的所有城市的今、明、后3天的天气情况。
分析:根据要求我分析出,所有城市天气的List页(列表页)地址如下:http://php.weather.sina.com.cn/search_sheng.php

第一步:
新建数据库及数据表:


点击(此处)折叠或打开

  1. CREATE DATABASE weather;
  2. USE weather;
  3. -- Forecast rows written by getWeather.php with REPLACE INTO; (city_name, date) identifies a row.
  4. CREATE TABLE `weather` (
  5. `city_name` varchar(50) NOT NULL DEFAULT '',
  6. `detail_url` varchar(255) DEFAULT '',
  7. `day_conditions` varchar(255) DEFAULT '',
  8. `day_wind` varchar(255) DEFAULT '',
  9. `day_highertemp` varchar(255) DEFAULT '',
  10. `yesterday_conditions` varchar(255) DEFAULT '',
  11. `yesterday_wind` varchar(255) DEFAULT '',
  12. `yesterday_lowertemp` varchar(255) DEFAULT '',
  13. `date` varchar(255) NOT NULL DEFAULT '',
  14. PRIMARY KEY (`city_name`,`date`)
  15. ) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=gb2312;

第二步:
编写PHP脚本getWeather.php(一个文件,需要在DOS命令行下运行)。
注意:在此php文件中需要引入simple_html_dom.php文件(可自行下载),引入方式:require_once("simple_html_dom.php");


点击(此处)折叠或打开

  1. <?php // getWeather.php: command-line scraper that stores Sina weather listings in the `weather` table.
  2. require_once("simple_html_dom.php");
  3. ini_set('memory_limit','1000M');
  4. ini_set("max_execution_time",0);
  5. header("Content-type: text/html;charset=gb2312");
  6. // Overall start time
  7. $startS=utime(); // utime() is defined outside the lines shown here; presumably returns the current time in seconds (confirm)
  8. //------ Open the database connection (scraped rows are written as they are fetched) START ------//
  9. $mysqli = mysqli_connect("localhost","root","root","weather")or die("Failed to connect!".mysqli_connect_error());
  10. mysqli_query($mysqli,"set names gb2312");
  11. function select_one($conn,$sql){ // Run $sql on the mysqli link $conn and return its first row as an associative array; array() if the query fails or yields no row.
  12. $res = mysqli_query($conn,$sql);
  13. $result = array();
  14. if($res instanceof mysqli_result){ // mysqli_query() gives false on failure and true for non-SELECT statements; neither has rows to fetch
  15. $result = mysqli_fetch_assoc($res) ?: array(); // mysqli_fetch_assoc() returns null when there is no row; keep the return type an array
  16. }
  17. return $result;
  18. }
  19. // Timer 1 start: collecting the list-page links
  20. $start1=utime();
  21. $urls = "http://php.weather.sina.com.cn/search_sheng.php";
  22. $htmlCN = file_get_html($urls) or die("Failed to load ".$urls); // file_get_html() returns false when the page cannot be loaded
  23. $li = $htmlCN->find(".city_nav ol li a");
  24. $listUrl = array();
  25. // Collect the weather links for all cities
  26. foreach($li as $k => $v){
  27. $v->href = "http://php.weather.sina.com.cn".$v->href;
  28. $listUrl[$k] = substr($v->href,0,strrpos($v->href,'&')); // cut the URL at its last '&' so the day/dpc parameters can be appended below
  29. }
  30. unset($li);
  31. $htmlCN->clear(); // clean up memory
  32. unset($htmlCN);
  33. $now = date("Y-m-d");
  34. $tom = date("Y-m-d",strtotime("+1 days"));
  35. $tom2 = date("Y-m-d",strtotime("+2 days"));
  36. $allDay = array();
  37. // From each collected link, build the URLs for today, tomorrow and the day after
  38. foreach($listUrl as $k => $v){
  39. $allDay[$k][$now] = $v."&day=0&dpc=1";
  40. $allDay[$k][$tom] = $v."&day=1&dpc=1";
  41. $allDay[$k][$tom2] = $v."&day=2&dpc=1";
  42. }
  43. unset($listUrl);
  44. // Timer 1 end
  45. $end1=utime();
  46. $run1=$end1-$start1;
  47. echo date("Y-m-d H:i:s")."__The first weather is going :".substr($run1,0,5)." seconds \r\n";
  48. foreach($allDay as $keys => $days){
  49. // Timer 2 start: one collected link, all three days
  50. $start2=utime();
  51. foreach($days as $day => $value){
  52. // Timer 3 start: one page (one link, one day)
  53. $start3=utime();
  54. $htmlS = file_get_html($value); if(!$htmlS){ echo date("Y-m-d H:i:s")."__Failed to load ".$value." \r\n"; continue; } // skip pages that cannot be loaded instead of calling find() on false
  55. $city_a = $htmlS->find("#tab_01_ctn tbody tr td a");
  56. $newArr = array();
  57. foreach($city_a as $k => $v){
  58. if(trim($v->plaintext)=='详情'){ // links whose text is '详情' are skipped
  59. continue;
  60. }
  61. $newArr[$k]['url'] = trim($v->href);
  62. }
  63. unset($city_a);
  64. $UrlDetail = array();
  65. foreach($newArr as $k => $v){
  66. $UrlDetail[] = $v['url']; // re-index from 0 so positions line up with the parsed rows below
  67. }
  68. unset($newArr);
  69. $newArr2 = array();
  70. $city_td = $htmlS->find("#tab_01_ctn tbody tr");
  71. foreach($city_td as $k => $v){
  72. $newArr2[$k] = trim(preg_replace('/\s+/','@',trim($v->plaintext))); // collapse each whitespace run into a single '@' field separator
  73. }
  74. unset($newArr2[0]); // drop the first two rows (presumably table header rows; confirm against the page)
  75. unset($newArr2[1]);
  76. unset($city_td); $htmlS->clear(); unset($htmlS); // release the page DOM, as is done for $htmlCN above
  77. $newArr3 = array();
  78. foreach($newArr2 as $k => $v){
  79. if(substr_count($v,'@')==10){ // rows with 10 separators: drop both the first and the last field
  80. $newArr3[] = substr($v,strpos($v,'@')+1,strrpos($v,'@')-strpos($v,'@')-1);
  81. }else{ // other rows: drop only the last field
  82. $newArr3[] = substr($v,0,strrpos($v,'@'));
  83. }
  84. }
  85. unset($newArr2);
  86. $newArr4 = array();
  87. foreach($newArr3 as $k => $v){
  88. $newArr4[$k] = explode('@',$v);
  89. }
  90. unset($newArr3);
  91. $weathers = array();
  92. foreach($newArr4 as $k => $v){ // every value is wrapped in single quotes here because it is pasted into the SQL text below
  93. $weathers[$k]['city_name'] = "'".removeQuot($v[0])."'";
  94. $weathers[$k]['detail_url'] = "'".removeQuot($UrlDetail[$k])."'";
  95. $weathers[$k]['day_conditions'] = "'".removeQuot($v[1])."'";
  96. $weathers[$k]['day_wind'] = "'".removeQuot($v[2].$v[3])."'";
  97. $weathers[$k]['day_highertemp'] = "'".removeQuot($v[4])."'";
  98. $weathers[$k]['yesterday_conditions'] = "'".removeQuot($v[5])."'";
  99. $weathers[$k]['yesterday_wind'] = "'".removeQuot($v[6].$v[7])."'";
  100. $weathers[$k]['yesterday_lowertemp'] = "'".removeQuot($v[8])."'";
  101. $weathers[$k]['date'] = "'".removeQuot($day)."'";
  102. }
  103. unset($newArr4);
  104. // Write the rows to the database, one statement per row //
  105. foreach($weathers as $k => $v){
  106. if(!empty($v)&&is_array($v)){
  107. $inserSql = "REPLACE INTO weather(".implode(',',array_keys($v)).") VALUES (".implode(',',$v).")"; // NOTE(review): scraped text is concatenated into SQL; removeQuot() is the only sanitising step visible here, so consider prepared statements
  108. mysqli_query($mysqli,$inserSql) or die("Failed".mysqli_error($mysqli)); // mysqli_error() requires the connection argument
  109. }
  110. echo date("Y-m-d H:i:s")."__The insertSQL is going : -+- [".$k."] -+- \r\n";
  111. }
  112. unset($weathers);
  113. // Timer 3 end
  114. $end3=utime();
  115. $run3=$end3-$start3;
  116. echo date("Y-m-d H:i:s")."__The three weather is going :".substr($run3,0,5)."-+-".$day." seconds \r\n";
  117. }
  118. // Timer 2 end
  119. $end2=utime();
  120. $run2=$end2-$start2;
  121. echo date("Y-m-d H:i:s")."__The second weather is going :".substr($run2,0,5)."-+-".$keys." seconds \r\n";
  122. }
  123. //————————一些功能函数-START———————-//
  124. //替换特殊字串函数
  125. function removeQuot($string){
  126. if(strstr($string, “‘”)){
  127. $string = str_replace(“‘”,

第三步:
在命令行下运行(通常这种采集脚本数据量很大),确保你的当前环境能用命令行运行。

具体看情况而行,下面截图是我命令行的运行方式:




第四步:
如上图(2)脚本已经运行完毕,花费170.5秒。最终得到数据如下(只截一部分):



阅读(1910) | 评论(0) | 转发(0) |
2

上一篇:我的PHP开发个人博客终于开通了

下一篇:没有了

给主人留下些什么吧!~~