Chinaunix首页 | 论坛 | 博客
  • 博客访问: 1691733
  • 博文数量: 410
  • 博客积分: 9563
  • 博客等级: 中将
  • 技术积分: 4517
  • 用 户 组: 普通用户
  • 注册时间: 2010-07-03 19:59
个人简介

文章分类

全部博文(410)

文章存档

2017年(6)

2016年(1)

2015年(3)

2014年(4)

2013年(32)

2012年(45)

2011年(179)

2010年(140)

分类: Python/Ruby

2011-02-01 09:54:30

最近需要通过网页抓取天气数据,对三种采集方法进行了对比
1、采集xml数据
  1. def getWeather0():
  2.     '''
  3.     从 取得天气数据(xml格式)
  4.     直接保存为xml格式
  5.     来源格式说明:
  6.     String(0) 到 String(4):省份0,城市1,城市代码2,城市图片名称3,最后更新时间4。
  7.     String(5) 到 String(11):当天的 气温5,概况6,风向和风力7,天气趋势开始图片
  8.         名称(以下称:图标一)8,天气趋势结束图片名称(以下称:图标二)9,现在的天
  9.         气实况10,天气和生活指数11。
  10.     String(12) 到 String(16):第二天的 气温12,概况13,风向和风力14,
  11.             图标一15,图标二16。
  12.     String(17) 到 String(21):第三天的 气温17,概况18,风向和风力19,
  13.             图标一20,图标二21。
  14.     String(22) 被查询的城市或地区的介绍
  15.     输出格式说明:(参见listToxml()输入要求)
  16.     '''
  17.     #相应参数:如果网址参数发生变化, 修改以下部分
  18.     ############################################
  19.     ID = ''
  20.     URL = ''.join(("",
  21.             "WebServices/Weatherwebservice.asmx/",
  22.             "getWeatherbyCityName?theCityName=",
  23.              ID))
  24.     FILENAME = "template/wea0.xml"
  25.     SOURCE = "0"
  26.     ############################################
  27.     try:
  28.         # 获取网页源文件
  29.         sock = urllib.urlopen(URL)
  30.         strxml = sock.read()
  31.         dom=xml.dom.minidom.parseString(strxml)
  32.         root=dom.documentElement
  33.         #验证xml的第二个string元素, 如果是表示数据正常
  34.         strlist=root.getElementsByTagName("string")
  35.         if strlist[1].hasChildNodes():
  36.             areastr=strlist[1].firstChild.data
  37.         else:
  38.             log('weather0获取xml文件失败')
  39.             log(strxml)
  40.             return False
  41.         if areastr== u'城市名称':
  42.             #生成一个包含所有天气字符串的列表
  43.             weatherlist = []
  44.             for eachstr in strlist:
  45.                 weatherlist.append(getText(eachstr))
  46.             resultlist = [u'0', weatherlist[4].replace("-", "/"),
  47.                     weatherlist[6], weatherlist[5], weatherlist[7],
  48.                     weatherlist[8], weatherlist[13],weatherlist[12],
  49.                     weatherlist[15],weatherlist[18],weatherlist[17],
  50.                     weatherlist[20]]
  51.             listToxml(resultlist, "template/wea0.xml")
  52.             log('weather0获取天气信息成功', 'logs/running.log')
  53.         else:
  54.             log('weather0获取xml文件失败')
  55.             log(strxml)
  56.             log('weather0获取xml文件失败', 'logs/running.log')
  57.             return False
  58.     except Exception, ex:
  59.         #如果错误, 记入日志
  60.         log('严重错误:weather0获取xml文件失败')
  61.         errlog('getWeather0', ex, sys.exc_info())
  62.         return False
  63.     return True
2、通过正则表达式抓取网页
  1. def getWeather2():
  2.     '''
  3.         从qq.ip138.com 取得天气数据(html格式), 输出为xml格式
  4.     '''
  5.     #相应参数:如果网址参数发生变化, 修改以下部分
  6.     ############################################

  7.     URL = "***.htm"

  8.     ############################################

  9.     reDay = re.compile(r'(?<=日期).*星期.+?(?=)',
  10.             re.I|re.S|re.U)
  11.     reWeather = re.compile(r'(?<=align\="center">天气).+?(?=,
  12.             re.I|re.S|re.U)
  13.     reTemperature = re.compile(r'(?<=align\="center">气温).+?(?=,
  14.             re.I|re.S|re.U)
  15.     reWind = re.compile(r'(?<=align\="center">风向).+?(?=,
  16.             re.I|re.S|re.U)
  17.     rePic = reWeather
  18.     reEachDay = re.compile(r'(\d{4}-\d{1,2}-\d{1,2})',re.I|re.S|re.U)
  19.     
  20.     weadata = []
  21.     for i in range(12):
  22.         weadata.append(u'')
  23.     try:
  24.         #获取网页源文件
  25.         sock = urllib.urlopen(URL)
  26.         strhtml = sock.read()
  27.         strhtml = unicode(strhtml, 'gb2312','ignore').encode('utf-8','ignore')

  28.         # 正则表达式取得各段
  29.         dayPara = re.findall(reDay, strhtml)
  30.         weatherPara = re.findall(reWeather, strhtml)
  31.         temperaturePara = re.findall(reTemperature, strhtml)
  32.         windPara = re.findall(reWind, strhtml)
  33.         picPara = re.findall(rePic, strhtml)
  34.         #获取日期
  35.         theDays= re.findall(reEachDay, dayPara[0])
  36.         firstDay = datetime.datetime.strptime(theDays[1],'%Y-%m-%d')
  37.         nextDay = firstDay + datetime.timedelta(1)
  38.         lastDay = firstDay + datetime.timedelta(2)
  39.         weadata[0] = u'2'
  40.         weadata[1] = unicode(theDays[0].replace('-', '/'))
  41.         weadata[2] = unicode(firstDay.month)+u'月'+unicode(firstDay.day)+u'日 '
  42.         weadata[6] = unicode(nextDay.month)+u'月'+unicode(nextDay.day)+u'日 '
  43.         weadata[9] = unicode(lastDay.month)+u'月'+unicode(lastDay.day)+u'日 '

  44.         #获取天气概况
  45.         theWeathers= re.findall(r'(?<=br/>).+?(?=,weatherPara[0])
  46.         weadata[2] += unicode(theWeathers[1].decode('utf-8'))
  47.         weadata[6] += unicode(theWeathers[2] .decode('utf-8'))
  48.         weadata[9] += unicode(theWeathers[3] .decode('utf-8'))
  49.         # 获取温度信息
  50.         # [0] 当前温度 [1]明日最高 [2]明日最低[3]后日最高[4]后日最低
  51.         theGrades = re.findall('(-?\d+℃)', temperaturePara[0])
  52.         weadata[3] = unicode(theGrades[2].decode('utf-8')
  53.                 ) + u'/' +unicode(theGrades[3].decode('utf-8'))
  54.         weadata[7] = unicode(theGrades[4].decode('utf-8')
  55.                 ) + u'/' +unicode(theGrades[5].decode('utf-8'))
  56.         weadata[10] = unicode(theGrades[6].decode('utf-8')
  57.                 ) + u'/' +unicode(theGrades[7].decode('utf-8'))
  58.         #获取风向
  59.         # [0] 当前风向 [1]明日 [2]后日
  60.         theWinds = re.findall(r'(?<=td>).+?(?=)', windPara[0])
  61.         weadata[4] = unicode(theWinds[1].decode('utf-8'))
  62.         #获取天气图标
  63.         thePics = re.findall(r'/image/(..\.gif)"', picPara[0])
  64.         weadata[5] = unicode(thePics[1].decode('utf-8'))
  65.         weadata[8] = unicode(thePics[2].decode('utf-8'))
  66.         weadata[11] = unicode(thePics[3].decode('utf-8'))

  67.         listToxml(weadata, "template/wea2.xml")
  68.         log('weather2获取天气信息成功', 'logs/running.log')
  69.     except Exception, ex:
  70.         #如果错误, 记入日志
  71.         log('严重错误:weather2获取xml文件失败')
  72.         errlog('getWeather2', ex, sys.exc_info())
  73.         log('weather2获取xml文件失败', 'logs/running.log')
  74.         return False
  75.     return True

3、通过Python的BeautifulSoup模块分析html网页
  1. def getWeather1():
  2.     '''
  3.         从***.shtml
  4.         取得天气数据(html格式), 输出为xml格式
  5.     '''
  6.     #相应参数:如果网址参数发生变化, 修改以下部分
  7.     ############################################

  8.     URL = "***.shtml"

  9.     ############################################

  10.     weadata = []
  11.     for i in range(12):
  12.         weadata.append(u'')
  13.     try:
  14.         # 获取网页源文件
  15.         sock = urllib.urlopen(URL)
  16.         strhtml = sock.read()
  17.         soup = BeautifulSoup(strhtml)
  18.         #取得当日日期
  19.         daystr = soup.find("div", "weatherYubao").find("h1", "weatheH1").text
  20.         strday = re.search(r'\d{4}-\d{1,2}-\d{1,2}',daystr).group()
  21.         firstDay = datetime.datetime.strptime(strday,'%Y-%m-%d')
  22.         nextDay = firstDay + datetime.timedelta(1)
  23.         lastDay = firstDay + datetime.timedelta(2)
  24.         weadata[0] = u'1'
  25.         weadata[1] = unicode(strday.replace('-', '/') )
  26.         weadata[2] = unicode(firstDay.month)+u'月'+unicode(firstDay.day)+u'日 '
  27.         weadata[6] = unicode(nextDay.month)+u'月'+unicode(nextDay.day)+u'日 '
  28.         weadata[9] = unicode(lastDay.month)+u'月'+unicode(lastDay.day)+u'日 '

  29.         #取得有关天气的标签
  30.         wealist = soup.find("div", "weatherYubaoBox").findAll("table", "yuBaoTable")
  31.         if len(wealist) == 3:
  32.             #取得第一天信息
  33.             daytr = wealist[0].findAll("td")
  34.             #图片
  35.             thePic = os.path.basename(daytr[2].img['src'])
  36.             weadata[5] = thePic
  37.             #天气
  38.             theWeather = daytr[3].text
  39.             weadata[2] += theWeather
  40.             #温度
  41.             gradehighstr=daytr[4].findAll("strong")
  42.             theHighGrade = gradehighstr[0].text + gradehighstr[1].text
  43.             gradelowerstr=daytr[10].findAll("strong")
  44.             theLowerGrade = gradelowerstr[0].text + gradelowerstr[1].text
  45.             weadata[3] = theLowerGrade + u'/' + theHighGrade
  46.             #风向
  47.             weadata[4] = daytr[5].text + daytr[6].text

  48.             #取得第二天信息
  49.             daytr = wealist[1].findAll("td")
  50.             #图片
  51.             thePic = os.path.basename(daytr[2].img['src'])
  52.             weadata[8] = thePic
  53.             #天气
  54.             theWeather = daytr[3].text
  55.             weadata[6] += theWeather
  56.             #温度
  57.             gradehighstr=daytr[4].findAll("strong")
  58.             theHighGrade = gradehighstr[0].text + gradehighstr[1].text
  59.             gradelowerstr=daytr[10].findAll("strong")
  60.             theLowerGrade = gradelowerstr[0].text + gradelowerstr[1].text
  61.             weadata[7] = theLowerGrade + u'/' + theHighGrade

  62.             #取得第三天信息
  63.             daytr = wealist[2].findAll("td")
  64.             #图片
  65.             thePic = os.path.basename(daytr[2].img['src'])
  66.             weadata[11] = thePic
  67.             #天气
  68.             theWeather = daytr[3].text
  69.             weadata[9] += theWeather
  70.             #温度
  71.             gradehighstr=daytr[4].findAll("strong")
  72.             theHighGrade = gradehighstr[0].text + gradehighstr[1].text
  73.             gradelowerstr=daytr[10].findAll("strong")
  74.             theLowerGrade = gradelowerstr[0].text + gradelowerstr[1].text
  75.             weadata[10] = theLowerGrade + u'/' + theHighGrade

  76.             listToxml(weadata, "template/wea1.xml")
  77.             log('weather1获取天气信息成功', 'logs/running.log')

  78.         else:
  79.             log('weather1获取html文件失败')
  80.             log(strhtml)
  81.             log('weather1获取html文件失败', 'logs/running.log')
  82.             return False

  83.         return True

通过对比,三种方法中,xml的方式最准确,但是如果没有xml信息源,必须采集html网页。采集html时,BeautifulSoup的方法比正则表达式方便得多,正则表达式维护起来比较困难。
阅读(2345) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~