分类: 服务器与存储
2008-07-21 23:58:23
python-chinese 的邮件归档的html格式比较简单,很好抓取。源代码如下,就当是学习Python的习作了。代码考虑到了网络IO读取失败的可能性,下载时没有使用多线程,只是挨个下载,就让机器慢慢工作吧。
#!/usr/bin/python
# -*- coding: cp936 -*-
# Filename: gethtml.py
import os
import urllib
import winsound

from sgmllib import SGMLParser
#to parse
class URLLister(SGMLParser):
    """SGML parser that collects the first link of each table row.

    After feed(), `urls` holds one href per <tr> encountered (the first
    <a href> inside it).  `titles` is initialised for interface
    compatibility but never populated by this parser.
    """

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.titles = []
        self.isBody = False

    def start_tr(self, attrs):
        # A new table row: arm the flag so the next anchor is recorded.
        self.isBody = True

    def start_a(self, attrs):
        if not self.isBody:
            return
        for key, value in attrs:
            if key == 'href':
                self.urls.append(value)
        # Only the first anchor of the row counts.
        self.isBody = False
class MailLister(SGMLParser):
    """SGML parser for a monthly index page: gathers per-mail links.

    Links are collected from <li> items appearing between the first <p>
    and the anchor named "end"; results accumulate in `mailUrls`.
    """

    def reset(self):
        SGMLParser.reset(self)
        self.mailUrls = []
        self.isBody = False
        self.times = 0
        self.inLi = False

    def start_p(self, attrs):
        # Only the very first <p> switches us into the body section.
        if self.times == 0:
            self.isBody = True
            self.times = 1

    def start_li(self, attrs):
        # List items matter only while we are inside the body section.
        self.inLi = self.isBody

    def start_a(self, attrs):
        names = [value for key, value in attrs if key == 'name']
        if names and names[0] == "end":
            # <a name="end"> marks the end of the mail list.
            self.isBody = False
            self.inLi = False
        if self.inLi:
            for key, value in attrs:
                if key == 'href':
                    self.mailUrls.append(value)
            self.inLi = False
def getAllPaperMail(baseUrl = ""):
sock = urllib.urlopen(baseUrl)
htmlSource = sock.read()
sock.close()
parser = URLLister()
parser.feed(htmlSource)
for url in parser.urls:
indexurl = baseUrl + url
myList = url.split('/')
path = myList[0]
path = path.replace(' ',"")
filename = myList[1]
newurl = baseUrl + path
if not os.path.exists(path):
os.mkdir(path)
filename = path + '/' + filename
#download index page
urllib.urlretrieve(indexurl, filename)
#get papermail
print "parse url= ",newurl
getPaperMail(newurl,path)
winsound.Beep(783,200)
def getPaperMail(baseUrl,path):
isOK = False
try1 = 0
try2 = 0
htmlSource = ""
while try1 < 6 and isOK == False:
try:
sock = urllib.urlopen(baseUrl)
htmlSource = sock.read()
sock.close()
except:
try2 = try2+1
pass
if try2 > try1:
try1 = try1 + 1
else:
isOK = True
if isOK == False:
print "get pageurl fail---",pageurl
winsound.Beep(783,200)
return
parser = MailLister()
parser.feed(htmlSource)
for url in parser.mailUrls:
newurl = baseUrl + '/' + url
filename = path + '/' + url
#download each mail
print "get: ",newurl
isOK = False
try1 = 0
try2 = 0
while try1 < 6 and isOK == False:
try:
urllib.urlretrieve(newurl, filename)
except:
try2 = try2+1
if try2 > try1:
try1 = try1 + 1
else:
isOK = True
if __name__ == "__main__":
    print "Start."
    # NOTE(review): `url` is intentionally blank in this published snippet;
    # set it to the archive's base URL (a trailing '/' is expected by the
    # string joins in getAllPaperMail) before running.
    url = r''
    getAllPaperMail(url)
    print "The end."