2008-10-14 16:58:54
import os
import re
import urllib2

def downURL(url, filename):
    # Fetch url and save the response body to filename.
    # Returns 1 on success, 0 if the request fails
    # (the original fell through after a bare except and
    # then used the unbound file object).
    try:
        fp = urllib2.urlopen(url)
    except urllib2.URLError:
        print "download exception:", url
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1
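On its own, downURL is just a fetch-to-file helper; a minimal check might look like this (the URL and output name are hypothetical, for illustration only):

if downURL('http://example.com/', 'example.html'):
    print "saved example.html"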
def spider(startURL, times):
    # Breadth-first crawl: download each queued page, extract its links,
    # and queue any URL not yet seen, stopping after `times` pages.
    urls = [startURL]
    i = 0
    while urls:
        if i > times:
            break
        i += 1  # the original left this commented out, so the limit never fired
        url = urls.pop(0)
        print url, len(urls)
        filename = get_file_name(url)
        print filename
        downURL(url, filename)
        content = get_content(filename)
        urllist = get_url(content)
        for tmp_url in urllist:
            new_url = url + tmp_url  # links are treated as relative to the current page
            if urls.count(new_url) == 0:
                urls.append(new_url)
    return 1

def get_file_name(url):
    # Map a URL to a local path. basename was blank in the original post
    # (the site prefix to strip was lost when the post was scraped).
    basename = ''
    path = url.replace(basename, '')
    print path
    if path[len(path) - 1] == '/':
        # A directory URL: create the directory and store its page
        # as a hidden '.html' file inside it.
        if not os.access(path, os.F_OK):
            os.mkdir(path)
        return path.rstrip('.') + '.html'
    return path

def get_content(file_name):
    # Read a previously downloaded page back from disk.
    op = open(file_name, 'rb')
    content = ''
    while 1:
        s = op.read()
        if not s:
            break
        content += s
    op.close()
    return content

def get_url(html_content):
    # Collect the href value of each anchor tag, without duplicates.
    # The regex was garbled in the original post; a non-greedy anchor
    # pattern is assumed here, chosen to match the split logic below.
    name_array = []
    pattern = re.compile('<a href=".*?">')
    for con in pattern.findall(html_content):
        con_1 = con.split('>')
        con_2 = con_1[0].split('"')
        new_name = con_2[1]
        if name_array.count(new_name) == 0:
            name_array.append(new_name)
    return name_array

if __name__ == '__main__':
    spider('', 5)  # the start URL was stripped from the original post
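To sanity-check the reconstructed anchor pattern in get_url, you can feed it a fragment by hand; the sample markup below is invented for illustration:

sample = '<a href="docs/">docs</a> <a href="a.html">a</a>'
print get_url(sample)   # -> ['docs/', 'a.html']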