最近需要通过网页抓取天气数据,对三种采集方法进行了对比
1、采集xml数据
- def getWeather0():
-
'''
-
从 取得天气数据(xml格式)
-
直接保存为xml格式
-
来源格式说明:
-
String(0) 到 String(4):省份0,城市1,城市代码2,城市图片名称3,最后更新时间4。
-
String(5) 到 String(11):当天的 气温5,概况6,风向和风力7,天气趋势开始图片
-
名称(以下称:图标一)8,天气趋势结束图片名称(以下称:图标二)9,现在的天
-
气实况10,天气和生活指数11。
-
String(12) 到 String(16):第二天的 气温12,概况13,风向和风力14,
-
图标一15,图标二16。
-
String(17) 到 String(21):第三天的 气温17,概况18,风向和风力19,
-
图标一20,图标二21。
-
String(22) 被查询的城市或地区的介绍
-
输出格式说明:(参见listToxml()输入要求)
-
'''
-
#相应参数:如果网址参数发生变化, 修改以下部分
-
############################################
- ID = ''
-
URL = ''.join(("",
-
"WebServices/Weatherwebservice.asmx/",
-
"getWeatherbyCityName?theCityName=",
- ID))
-
FILENAME = "template/wea0.xml"
-
SOURCE = "0"
-
############################################
-
try:
-
# 获取网页源文件
-
sock = urllib.urlopen(URL)
-
strxml = sock.read()
-
dom=xml.dom.minidom.parseString(strxml)
-
root=dom.documentElement
-
#验证xml的第二个string元素, 如果是表示数据正常
-
strlist=root.getElementsByTagName("string")
-
if strlist[1].hasChildNodes():
-
areastr=strlist[1].firstChild.data
-
else:
-
log('weather0获取xml文件失败')
-
log(strxml)
-
return False
-
if areastr== u'城市名称':
-
#生成一个包含所有天气字符串的列表
-
weatherlist = []
-
for eachstr in strlist:
-
weatherlist.append(getText(eachstr))
-
resultlist = [u'0', weatherlist[4].replace("-", "/"),
-
weatherlist[6], weatherlist[5], weatherlist[7],
-
weatherlist[8], weatherlist[13],weatherlist[12],
-
weatherlist[15],weatherlist[18],weatherlist[17],
-
weatherlist[20]]
-
listToxml(resultlist, "template/wea0.xml")
-
log('weather0获取天气信息成功', 'logs/running.log')
-
else:
-
log('weather0获取xml文件失败')
-
log(strxml)
-
log('weather0获取xml文件失败', 'logs/running.log')
-
return False
-
except Exception, ex:
-
#如果错误, 记入日志
-
log('严重错误:weather0获取xml文件失败')
-
errlog('getWeather0', ex, sys.exc_info())
-
return False
-
return True
2、通过正则表达式抓取网页
- def getWeather2():
-
'''
-
从qq.ip138.com 取得天气数据(html格式), 输出为xml格式
-
'''
-
#相应参数:如果网址参数发生变化, 修改以下部分
-
############################################
-
-
URL = "***.htm"
-
-
############################################
-
-
reDay = re.compile(r'(?<=日期).*星期.+?(?=)',
-
re.I|re.S|re.U)
-
reWeather = re.compile(r'(?<=align\="center">天气).+?(?=,
-
re.I|re.S|re.U)
-
reTemperature = re.compile(r'(?<=align\="center">气温).+?(?=,
-
re.I|re.S|re.U)
-
reWind = re.compile(r'(?<=align\="center">风向).+?(?=,
-
re.I|re.S|re.U)
-
rePic = reWeather
-
reEachDay = re.compile(r'(\d{4}-\d{1,2}-\d{1,2})',re.I|re.S|re.U)
-
-
weadata = []
-
for i in range(12):
-
weadata.append(u'')
-
try:
-
#获取网页源文件
-
sock = urllib.urlopen(URL)
-
strhtml = sock.read()
-
strhtml = unicode(strhtml, 'gb2312','ignore').encode('utf-8','ignore')
-
-
# 正则表达式取得各段
-
dayPara = re.findall(reDay, strhtml)
-
weatherPara = re.findall(reWeather, strhtml)
-
temperaturePara = re.findall(reTemperature, strhtml)
-
windPara = re.findall(reWind, strhtml)
-
picPara = re.findall(rePic, strhtml)
-
#获取日期
-
theDays= re.findall(reEachDay, dayPara[0])
-
firstDay = datetime.datetime.strptime(theDays[1],'%Y-%m-%d')
-
nextDay = firstDay + datetime.timedelta(1)
-
lastDay = firstDay + datetime.timedelta(2)
-
weadata[0] = u'2'
-
weadata[1] = unicode(theDays[0].replace('-', '/'))
-
weadata[2] = unicode(firstDay.month)+u'月'+unicode(firstDay.day)+u'日 '
-
weadata[6] = unicode(nextDay.month)+u'月'+unicode(nextDay.day)+u'日 '
-
weadata[9] = unicode(lastDay.month)+u'月'+unicode(lastDay.day)+u'日 '
-
-
#获取天气概况
-
theWeathers= re.findall(r'(?<=br/>).+?(?=,weatherPara[0])
-
weadata[2] += unicode(theWeathers[1].decode('utf-8'))
-
weadata[6] += unicode(theWeathers[2] .decode('utf-8'))
-
weadata[9] += unicode(theWeathers[3] .decode('utf-8'))
-
# 获取温度信息
-
# [0] 当前温度 [1]明日最高 [2]明日最低[3]后日最高[4]后日最低
-
theGrades = re.findall('(-?\d+℃)', temperaturePara[0])
-
weadata[3] = unicode(theGrades[2].decode('utf-8')
-
) + u'/' +unicode(theGrades[3].decode('utf-8'))
-
weadata[7] = unicode(theGrades[4].decode('utf-8')
-
) + u'/' +unicode(theGrades[5].decode('utf-8'))
-
weadata[10] = unicode(theGrades[6].decode('utf-8')
-
) + u'/' +unicode(theGrades[7].decode('utf-8'))
-
#获取风向
-
# [0] 当前风向 [1]明日 [2]后日
-
theWinds = re.findall(r'(?<=td>).+?(?=)', windPara[0])
-
weadata[4] = unicode(theWinds[1].decode('utf-8'))
-
#获取天气图标
-
thePics = re.findall(r'/image/(..\.gif)"', picPara[0])
-
weadata[5] = unicode(thePics[1].decode('utf-8'))
-
weadata[8] = unicode(thePics[2].decode('utf-8'))
-
weadata[11] = unicode(thePics[3].decode('utf-8'))
-
-
listToxml(weadata, "template/wea2.xml")
-
log('weather2获取天气信息成功', 'logs/running.log')
-
except Exception, ex:
-
#如果错误, 记入日志
-
log('严重错误:weather2获取xml文件失败')
-
errlog('getWeather2', ex, sys.exc_info())
-
log('weather2获取xml文件失败', 'logs/running.log')
-
return False
-
return True
3、通过Python的BeautifulSoup模块分析html网页
- def getWeather1():
-
'''
-
从***.shtml
-
取得天气数据(html格式), 输出为xml格式
-
'''
-
#相应参数:如果网址参数发生变化, 修改以下部分
-
############################################
-
-
URL = "***.shtml"
-
-
############################################
-
-
weadata = []
-
for i in range(12):
-
weadata.append(u'')
-
try:
-
# 获取网页源文件
-
sock = urllib.urlopen(URL)
-
strhtml = sock.read()
-
soup = BeautifulSoup(strhtml)
-
#取得当日日期
-
daystr = soup.find("div", "weatherYubao").find("h1", "weatheH1").text
-
strday = re.search(r'\d{4}-\d{1,2}-\d{1,2}',daystr).group()
-
firstDay = datetime.datetime.strptime(strday,'%Y-%m-%d')
-
nextDay = firstDay + datetime.timedelta(1)
-
lastDay = firstDay + datetime.timedelta(2)
-
weadata[0] = u'1'
-
weadata[1] = unicode(strday.replace('-', '/') )
-
weadata[2] = unicode(firstDay.month)+u'月'+unicode(firstDay.day)+u'日 '
-
weadata[6] = unicode(nextDay.month)+u'月'+unicode(nextDay.day)+u'日 '
-
weadata[9] = unicode(lastDay.month)+u'月'+unicode(lastDay.day)+u'日 '
-
-
#取得有关天气的标签
-
wealist = soup.find("div", "weatherYubaoBox").findAll("table", "yuBaoTable")
-
if len(wealist) == 3:
-
#取得第一天信息
-
daytr = wealist[0].findAll("td")
-
#图片
-
thePic = os.path.basename(daytr[2].img['src'])
-
weadata[5] = thePic
-
#天气
-
theWeather = daytr[3].text
-
weadata[2] += theWeather
-
#温度
-
gradehighstr=daytr[4].findAll("strong")
-
theHighGrade = gradehighstr[0].text + gradehighstr[1].text
-
gradelowerstr=daytr[10].findAll("strong")
-
theLowerGrade = gradelowerstr[0].text + gradelowerstr[1].text
-
weadata[3] = theLowerGrade + u'/' + theHighGrade
-
#风向
-
weadata[4] = daytr[5].text + daytr[6].text
-
-
#取得第二天信息
-
daytr = wealist[1].findAll("td")
-
#图片
-
thePic = os.path.basename(daytr[2].img['src'])
-
weadata[8] = thePic
-
#天气
-
theWeather = daytr[3].text
-
weadata[6] += theWeather
-
#温度
-
gradehighstr=daytr[4].findAll("strong")
-
theHighGrade = gradehighstr[0].text + gradehighstr[1].text
-
gradelowerstr=daytr[10].findAll("strong")
-
theLowerGrade = gradelowerstr[0].text + gradelowerstr[1].text
-
weadata[7] = theLowerGrade + u'/' + theHighGrade
-
-
#取得第三天信息
-
daytr = wealist[2].findAll("td")
-
#图片
-
thePic = os.path.basename(daytr[2].img['src'])
-
weadata[11] = thePic
-
#天气
-
theWeather = daytr[3].text
-
weadata[9] += theWeather
-
#温度
-
gradehighstr=daytr[4].findAll("strong")
-
theHighGrade = gradehighstr[0].text + gradehighstr[1].text
-
gradelowerstr=daytr[10].findAll("strong")
-
theLowerGrade = gradelowerstr[0].text + gradelowerstr[1].text
-
weadata[10] = theLowerGrade + u'/' + theHighGrade
-
-
listToxml(weadata, "template/wea1.xml")
-
log('weather1获取天气信息成功', 'logs/running.log')
-
-
else:
-
log('weather1获取html文件失败')
-
log(strhtml)
-
log('weather1获取html文件失败', 'logs/running.log')
-
return False
-
-
return True
通过对比,三种方法中,xml的方式最准确;但如果没有xml信息源,就必须采集html网页。BeautifulSoup的方法维护起来比正则表达式方便得多,正则表达式维护起来比较困难。
阅读(2428) | 评论(0) | 转发(0) |