今天Graduate版聚归来,已是晚上10点过了,昨天刚写了一个在水源上自动发文的脚本,所以又突然想写一个自动下载图片的Python脚本。
虽然类似的工具早有人写过(叫BBSPicSpider),不过它是用.NET来做的。反正就当学习Python的练手,于是开始试着编写。过程还是比较顺利,不过当中遇到很多小问题,查了很多Python的库文档加上百度,得以解决。今天太晚了,过两天再总结一下。
先把代码贴上了,不过这个是1.0版本,基本功能已经可以实现了,不过有一点小BUG,有待进一步研究改进,有兴趣的朋友可以交流一下:)
1 #!/usr/bin/python
2
3 # Download pictures from PPPerson @ bbs.sjtu.edu.cn
4
5 import re
6 import os
7 from urllib import ContentTooShortError
8 from urllib2 import URLError
9 import urllib, urllib2
10
11 def requestURL( url, datas, headers = None ) :
12 """Request a url"""
13
14 if not headers :
15 headers = { 'User-Agent' : 'Mozilla/3.0' }
16 # request
17 req = urllib2.Request( url, datas, headers )
18 # open url
19 try :
20 res = urllib2.urlopen( req )
21 except URLError, e :
22 if hasattr( e, 'reason' ) :
23 print "Failed to reach server: ", e.reason
24 elif hasattr( e, 'code' ) :
25 print "Can't fulfill the requset: ",e.code
26 else :
27 pass #print "Requset is successful\n"
28
29 return res
30
def getSubjectsURL( cmpPattern ) :
    """Fetch the board index page and return all subject matches.

    cmpPattern is a pre-compiled regular expression; its findall()
    result over the raw page body is returned unchanged.
    """

    # NOTE(review): the board URL appears to have been stripped when
    # this post was published -- it must point at the board page on
    # bbs.sjtu.edu.cn before the script will work.
    board_url = ''

    # POST body selecting the PPPerson board
    post_body = urllib.urlencode( { 'board' : 'PPPerson' } )

    # fetch the page
    response = requestURL( board_url, post_body )

    # every subject entry matching the pattern
    return cmpPattern.findall( response.read() )
50
def getImagesURL( url, cmpPattern ) :
    """Return the unique image URLs found on one subject page.

    url        -- the subject page's query string (sent as POST data).
    cmpPattern -- pre-compiled regex matching image links in the page.

    The returned list has duplicates removed; like the original manual
    set-copy loop, element order is not preserved.
    """

    # NOTE(review): the subject base URL appears to have been stripped
    # by the blog platform -- restore it before running.
    subject_url = ''
    datas = url

    # request and get a response
    res = requestURL( subject_url, datas )

    # findall may yield the same image URL several times (e.g. thumbnail
    # and full-size links); a set keeps each one once.  list(set(...))
    # replaces the original set/append loop with the idiomatic form.
    imagesURL = list( set( cmpPattern.findall( res.read() ) ) )

    return imagesURL
75
76 def downloadImage( imageURL, subID ) :
77 """Download images"""
78
79 # image url
80 image_url = '' + imageURL
81
82 # create the directory to store images
83 # if not os.path.exists( './download' ) :
84 try :
85 os.makedirs( './download/' + subID )
86 except OSError :
87 pass
88 #print "Failed to create directories"
89
90
91 # get filename of image
92 filename = 'download/' + subID + '/' + imageURL.split( '/' )[-1]
93
94 # clear the cache that may have been built up
95 # by previous calls to urlretrieve()
96 urllib.urlcleanup()
97
98 # retrieve the image
99 try :
100 urllib.urlretrieve( image_url, filename )
101 except ContentTooShortError :
102 print "The data available was less than that of expected"
103 print "Downloading file %s was interrupted" \
104 % os.path.basename( filename )
105 else :
106 # get the size of file
107 size = os.path.getsize( filename ) / 1024
108 print ">>>File %s (%s Kb) was done..." % ( filename, size )
109
110
111 if __name__ == '__main__' :
112
113 # create compiled regular expression pattern
114 findSubjectsPattern = re.compile( \
115 r'(\d+) | .*?', re.I | re.DOTALL ) 116 findImagesPattern = re.compile( r' ', re.I ) 117 118 # get subjects' url list 119 subjectsList = getSubjectsURL( findSubjectsPattern ) 120 121 print "Downloading begins...\n" 122 123 filecount = 1 124 125 for i in range( len( subjectsList ) ) : 126 # get images url list 127 print "\nSubject %s begins..." % subjectsList[i][0] 128 imagesList = getImagesURL( subjectsList[i][1], findImagesPattern ) 129 # download all iamges 130 for j in range( len(imagesList) ) : 131 downloadImage( imagesList[j], subjectsList[i][0] ) 132 filecount += 1 133 134 print "\nAll downloads were done" 135 print "%d files were downloaded totally\n" % filecount 136
阅读(1769) | 评论(1) | 转发(0) |
给主人留下些什么吧!~~
chinaunix网友2009-03-08 12:53:19
能不能写自动登陆bbs自动发帖的脚本啊?
|