pattern = re.compile("[^\>]+.shtml") while 1:
s = fp.read() ifnot s: break
urls = pattern.findall(s)
fp.close() return urls
def spider(startURL, times):
    """Crawl breadth-first from ``startURL``, fetching pages in queue order.

    URLs are taken from the front of a FIFO queue. Each one is passed to
    ``downURL()`` together with a sequential target name (``'0.htm'``,
    ``'1.htm'``, ...), and the links reported by ``getURL()`` for that URL
    are appended to the back of the queue.

    Args:
        startURL: URL the crawl starts from.
        times: Crawl budget. The loop stops once the page counter exceeds
            this value, so at most ``times + 1`` URLs are handed to
            ``downURL()``. Links are only collected from a page while
            fewer than ``times`` URLs are pending.

    Returns:
        Always ``1``.
    """
    pending = [startURL]
    # Every URL that has ever been queued. Checking only the pending queue
    # would let an already-fetched page be queued again as soon as another
    # page (or the page itself) links back to it, wasting the crawl budget
    # on duplicate downloads.
    seen = set(pending)
    i = 0
    # The bound is inclusive on purpose: it preserves the original stop
    # condition (break once i > times).
    while pending and i <= times:
        url = pending.pop(0)
        # A single parenthesised argument prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("%s %d" % (url, len(pending)))
        downURL(url, str(i) + '.htm')
        i = i + 1
        # Stop collecting new links once the queue already holds enough
        # URLs to cover the budget.
        if len(pending) < times:
            for link in getURL(url):
                if link not in seen:
                    seen.add(link)
                    pending.append(link)
    return 1
# Script entry: start the crawl as soon as the module is executed.
# NOTE(review): the start URL is an empty string -- presumably a placeholder
# to be filled in before running; confirm.
spider(startURL='', times=10)