import urllib
from HTMLParser import HTMLParser
import re
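#a small Python 2 page-grabbing script: it fetches pages with urllib,
#extracts links with HTMLParser, rewrites relative links to absolute
#ones, and can save the linked pages to disk

#HtmlParserTool: fetches a URL and keeps the raw page text in
#self.urlcontent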
class HtmlParserTool():
    def __init__(self):
        self.urlcontent = ''
        return

    def set_url(self, url):
        self.urlcontent = urllib.urlopen(url).read()
        if 0 == len(self.urlcontent):
            print 'HtmlParserTool: read url', url, 'error\n'
        return
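
#HtmlParserGetUrl: collects the href value of every <a> tag it sees
#into self.url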
class HtmlParserGetUrl(HTMLParser):
    def __init__(self):
        self.url = []
        self.urltag = 0
        HTMLParser.__init__(self)
        return

    def handle_starttag(self, tag, attrs):
        if 'a' == tag:
            hrefv = ""
            for name, value in attrs:
                if name == 'href':
                    hrefv = value
            self.url.append(hrefv)
            self.urltag = 1
        return

    def handle_data(self, data):
        return

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0
        return
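
#HtmlSavePage: downloads one page and rewrites its relative src/href/
#background links into absolute ones before writing it to disk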
class HtmlSavePage():
    def __init__(self):
        self.Parser = HtmlParserGetUrl()
        return

    #urlset: prefix relative links in page with subsite; the link list
    #comes from HtmlParserGetUrl
    def urlset(self, subsite, page):
        self.Parser.feed(page)
        for i in range(0, len(self.Parser.url)):
            #absolute links need no rewriting
            if -1 != self.Parser.url[i].find('http://'):
                continue
            index = 0
            find = 0
            #bounded search loop; j itself is unused
            for j in range(0, len(self.Parser.url)):
                #find() with a start argument keeps index absolute in page
                index = page.find(self.Parser.url[i], index)
                if -1 != index:
                    index2 = page[0:index].rfind('href')
                    if -1 == index2:
                        index2 = page[0:index].rfind('HREF')
                    if -1 != index2:
                        #accept only an occurrence sitting right after
                        #href="; a farther occurrence is link text, so
                        #skip past it and keep searching
                        if index - index2 > len('href="'):
                            index = index + len(self.Parser.url[i])
                            continue
                        else:
                            find = 1
                    else:
                        find = 0
                break
            if 1 == find:
                page = page[0:index] + subsite + page[index:len(page)]
        return
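
    #save: fetch url, rewrite relative links against site/subsite, and
    #write the rewritten page to path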
    def save(self, path, url, site):
        print path, '\n', url
        webtool = HtmlParserTool()
        webtool.set_url(url)
        page = webtool.urlcontent
        #replace root-relative links: '/xxx' -> 'site/xxx'
        page = re.sub(r'(src|SRC)="//*', 'src="' + site + '/', page)
        page = re.sub(r'(href|HREF)="//*', 'href="' + site + '/', page)
        page = re.sub(r'(background|BACKGROUND)="//*', 'background="' + site + '/', page)
        #replace page-relative links: 'xxx/xxx' -> 'subsite/xxx/xxx'
        index = url.rfind('/')
        if -1 != index:
            subsite = url[0:index]
            page = re.sub(r'(src|SRC)="(?!/|http|\.\.)', 'src="' + subsite + '/', page)
            page = re.sub(r'(href|HREF)="(?!/|http|\.\.)', 'href="' + subsite + '/', page)
            page = re.sub(r'(background|BACKGROUND)="(?!/|http|\.\.)', 'background="' + subsite + '/', page)
        #replace parent-relative links: '../../xxx' -> two directory levels up
        index = url[0:index].rfind('/')
        if -1 != index:
            index = url[0:index].rfind('/')
            if -1 != index:
                url = url[0:index]
                print 'siteurl:', url
                page = re.sub(r'(src|SRC)="\.\./\.\./', 'src="' + url + '/', page)
                page = re.sub(r'(href|HREF)="\.\./\.\./', 'href="' + url + '/', page)
                page = re.sub(r'(background|BACKGROUND)="\.\./\.\./', 'background="' + url + '/', page)
        savefile = open(path, 'w')
        savefile.write(page)
        savefile.close()
        return
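
#HtmlParserUrlTp1: collects only the <a> links whose href matches a
#caller-supplied regex, together with the anchor text as the title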
class HtmlParserUrlTp1(HTMLParser):
    def __init__(self):
        self.url = []
        self.urltitle = []
        self.urltag = 0
        self.urlregex = ''
        self.urltmp = ''
        HTMLParser.__init__(self)
        return

    def set_urlregex(self, regex):
        self.urlregex = regex
        return

    def handle_starttag(self, tag, attrs):
        if 'a' == tag:
            for name, value in attrs:
                if name == 'href':
                    self.urltmp = value
            #print self.urltmp,'\n'
            if re.match(self.urlregex, self.urltmp):
                self.urltag = 1
            else:
                self.urltmp = ''
        return

    def handle_data(self, data):
        if 1 == self.urltag:
            if len(data) > 2:
                #drop links that were already collected
                for i in range(0, len(self.url)):
                    if self.url[i] == self.urltmp:
                        self.urltmp = ''
                        return
                self.url.append(self.urltmp)
                self.urltitle.append(data)
                self.urltmp = ''
        return

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0
        return

    def set_url(self, url):
        #note: the fetched content is discarded here; fetching is done
        #through HtmlParserTool instead
        urllib.urlopen(url).read()
        return
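
#HtmlTool: the driver class; fetches an index page, extracts matching
#links with HtmlParserUrlTp1, logs 'url <- title' lines to a file, and
#optionally saves each linked page through HtmlSavePage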
class HtmlTool():
    def __init__(self):
        self.HtmlTp = ''
        self.urlfile = ''
        self.site = ''
        self.subsite = ''
        self.savepath = ''
        self.needsavepage = 1 #save the fetched pages? 0 = no, 1 = yes
        self.code = '' #html text encoding, such as gbk, utf-8, etc.
        self.regex = '' #regex handed to the url parser
        self.UrlTool = HtmlParserTool()
        return

    def set_no_save_page(self):
        self.needsavepage = 0
        return

    def set_code(self, code):
        self.code = code
        return

    def set_urlregex(self, regex):
        self.regex = regex
        return

    def set_savepath(self, path):
        self.savepath = path
        return

    def get_site(self):
        #site is the host part, subsite the url without its last component
        urlhead = 'http://'
        index = self.urlfile[len(urlhead):].find('/')
        self.site = self.urlfile[0:index + len(urlhead)]
        index = self.urlfile.rfind('/')
        self.subsite = self.urlfile[0:index]
        print 'site:', self.site
        print 'sub site:', self.subsite
        return

    def set_tp(self, tp):
        self.HtmlTp = tp
        return

    def open_file(self, file):
        self.file = open(file, 'w')
        return

    def close_file(self):
        self.file.close()
        return

    def set_url(self, url):
        self.urlfile = url
        self.get_site()
        if 'URL_TP1' == self.HtmlTp:
            self.HtmlParser = HtmlParserUrlTp1()
        self.UrlTool.set_url(url)
        if '' == self.UrlTool.urlcontent:
            print 'get', url, 'error'
            return
        else:
            urlfile = open('urlcontent.txt', 'w')
            urlfile.write(self.UrlTool.urlcontent)
            urlfile.close()
        if '' != self.regex:
            self.HtmlParser.set_urlregex(self.regex)
        if '' != self.code:
            self.HtmlParser.feed(self.UrlTool.urlcontent.decode(self.code))
        else:
            self.HtmlParser.feed(self.UrlTool.urlcontent)
        if 0 == len(self.HtmlParser.url):
            print 'get url error'
            return
        #print len(self.HtmlParser.url),len(self.HtmlParser.urltitle)
        for i in range(0, len(self.HtmlParser.url)):
            fileurl = self.subsite + '/' + self.HtmlParser.url[i]
            self.file.write(fileurl)
            self.file.write(' <- ')
            if 0 == len(self.code):
                self.file.write(self.HtmlParser.urltitle[i])
            else:
                self.file.write(self.HtmlParser.urltitle[i].encode('utf-8'))
            #print fileurl
            self.file.write("\r\n")
            if 1 == self.needsavepage:
                savetool = HtmlSavePage()
                savetool.save(self.savepath + self.HtmlParser.url[i], fileurl, self.site)
                self.file.write("\r\n")
        print 'run end\n'
        return
#switch: 1 = grab blog pages from CU, 2 = scan forum threads
s = 2
#get blog from CU
if 1 == s:
    webtool = HtmlTool()
    webtool.set_tp('URL_TP1')
    #webtool.set_no_save_page()
    webtool.set_savepath('./web/')
    webtool.open_file('web.txt')
    #webtool.set_code('gbk')
    webtool.set_urlregex(r'showart_[0-9]*\.html')
    webtool.set_url('http://blog.chinaunix.net/u3/105068/index.html')
    webtool.close_file()
#showart
if 2 == s:
    webtool = HtmlTool()
    webtool.set_tp('URL_TP1')
    webtool.set_no_save_page()
    webtool.open_file('web.txt')
    webtool.set_code('gbk')
    webtool.set_urlregex(r'thread-[0-9]*-1-1\.html')
    webtool.set_url('http://linux.chinaunix.net/bbs/forum-8-1.html')
    webtool.close_file()
    print len(webtool.HtmlParser.url)
    for i in range(0, len(webtool.HtmlParser.url)):
        webfiltertool = HtmlParserTool()
        webfiltertool.set_url('http://linux.chinaunix.net/bbs/' + webtool.HtmlParser.url[i])
        if -1 != webfiltertool.urlcontent.find('001.gif'):
            print 'find:(', webtool.HtmlParser.url[i], ') -> ', webtool.HtmlParser.urltitle[i].encode('utf-8'), '\n'
#test
subject = '<a SRC="aaaaa"><a src=""><a src="/aaaa"><a src="../../aaa">'
result = re.sub(r'(src|SRC)="(?!(/|http|\.\.))', 'src="http:/bbx/', subject)
print result
result = re.sub(r'src="\.\./\.\./', 'src="http:/xxv/', result)
print result
if re.match(r'showart_[0-9]*\.html', 'showart_2106707.html'):
    print 'ok'
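
#a minimal standalone check of HtmlParserUrlTp1; the snippet and regex
#below are made-up examples for illustration, not from a real page
tp1 = HtmlParserUrlTp1()
tp1.set_urlregex(r'showart_[0-9]*\.html')
tp1.feed('<a href="showart_123.html">first post</a>'
         '<a href="index.html">home</a>')
print tp1.url, tp1.urltitle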