该程序通过获取页面的源码,从源码中读取数字来组成下一个链接的URL。
import os
import sys
import urllib
import re
def getsrc(url):
    """Download ``url`` and return the first run of two or more digits in it.

    Args:
        url: Address of the page to fetch.

    Returns:
        The matched digits, as a string.

    Raises:
        ValueError: If the page source contains no run of at least two
            digits. The page source is printed first so the reason the
            chain stopped can be inspected.
    """
    # Imported locally so the function works on both Python 3 and Python 2
    # without touching the file-level imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        src = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    # Python 3 hands back bytes; the text pattern below needs str.
    if not isinstance(src, str):
        src = src.decode('utf-8', 'replace')

    # Same match as the original '[\d]\d+': at least two consecutive digits.
    num = re.search(r'\d{2,}', src)
    if num is None:
        # Show the page so the caller can see why no number was found,
        # then fail explicitly instead of dereferencing None.
        print(src)
        raise ValueError('no number found in page source of %r' % (url,))
    return num.group(0)
if __name__ == '__main__':
    # Walk the chain of pages: fetch the current URL, pull the number out of
    # its source with getsrc(), and build the next URL from that number.
    # NOTE(review): both URL literals are empty strings here -- presumably
    # the base address was stripped from this copy; confirm before running.
    url = ''
    for hop in range(400):  # at most 400 links are followed
        print(hop)
        found = getsrc(url)
        # Use the returned number to create the next URL.
        url = '' + str(found)
        print(found)
|
阅读(482) | 评论(0) | 转发(0) |