Chinaunix首页 | 论坛 | 博客
  • 博客访问: 125062
  • 博文数量: 19
  • 博客积分: 810
  • 博客等级: 准尉
  • 技术积分: 200
  • 用 户 组: 普通用户
  • 注册时间: 2008-11-14 23:34
文章分类

全部博文(19)

文章存档

2010年(2)

2009年(12)

2008年(5)

我的朋友

分类: Python/Ruby

2008-11-16 03:01:22

      今天Graduate版聚归来,已是晚上10点过了,昨天刚写了一个在水源上自动发文的脚本,所以又突然想写一个自动下载图片的Python脚本。
      虽然类似的工具早有人写过(叫BBSPicSpider),不过它是用.NET来做的。反正就当学习Python的练手,于是开始试着编写。过程还是比较顺利,不过当中遇到很多小问题,查了很多Python的库文档加上百度,得以解决。今天太晚了,过两天再总结一下。
      先把代码贴上了,不过这个是1.0版本,基本功能已经可以实现了,不过有一点小BUG,有待进一步研究改进,有兴趣的朋友可以交流一下:)

  1 #!/usr/bin/python
  2
  3 # Download pictures from PPPerson @ bbs.sjtu.edu.cn
  4
  5 import re
  6 import os
  7 from urllib  import ContentTooShortError
  8 from urllib2 import URLError
  9 import urllib, urllib2
 10
 11 def requestURL( url, datas, headers = None ) :
 12     """Request a url"""
 13         
 14     if not headers :
 15         headers = { 'User-Agent' : 'Mozilla/3.0' }
 16     # request
 17     req = urllib2.Request( url, datas, headers )
 18     # open url
 19     try :
 20         res = urllib2.urlopen( req )
 21     except URLError, e :
 22         if hasattr( e, 'reason' ) :
 23             print "Failed to reach server: ", e.reason
 24         elif hasattr( e, 'code' ) :
 25             print "Can't fulfill the requset: ",e.code
 26     else :
 27         pass #print "Requset is successful\n"
 28
 29     return res
 30
def getSubjectsURL( cmpPattern ) :
    """Fetch the board page and return every match of *cmpPattern* in it."""

    # Board page URL (left blank in the published listing).
    board_url = ''

    # Encode the board name as the POST body.
    datas = urllib.urlencode( { 'board' : 'PPPerson' } )

    # Fetch the page and scan it for subject links.
    response = requestURL( board_url, datas )
    subjectsURL = cmpPattern.findall( response.read() )

    return subjectsURL
 50
def getImagesURL( url, cmpPattern ) :
    """Return the de-duplicated image URLs found on one subject page.

    *url* is the subject's URL fragment, sent as the request body;
    *cmpPattern* is a compiled regex whose matches are the image URLs.
    Duplicate URLs are removed (order is not preserved, as before).
    """

    # Subject page URL (left blank in the published listing);
    # the subject-specific part travels in the request body.
    subject_url = ''
    datas = url

    # request and get a response
    res = requestURL( subject_url, datas )

    # findall() may report the same picture several times; a set
    # removes the duplicates in one step (the original rebuilt the
    # list with an explicit loop over the set).
    imagesURL = list( set( cmpPattern.findall( res.read() ) ) )

    return imagesURL
 75
 76 def downloadImage( imageURL, subID ) :
 77     """Download images"""
 78
 79     # image url
 80     image_url = '' + imageURL
 81
 82     # create the directory to store images
 83     # if not os.path.exists( './download' ) :
 84     try :
 85         os.makedirs( './download/' + subID )
 86     except OSError :
 87         pass
 88         #print "Failed to create directories"
 89
 90     
 91     # get filename of image
 92     filename = 'download/' + subID + '/' + imageURL.split( '/' )[-1]
 93
 94     # clear the cache that may have been built up
 95     # by previous calls to urlretrieve()
 96     urllib.urlcleanup()
 97     
 98     # retrieve the image
 99     try :
100         urllib.urlretrieve( image_url, filename )
101     except ContentTooShortError :
102         print "The data available was less than that of expected"
103         print "Downloading file %s was interrupted\
104                         % os.path.basename( filename )
105     else :
106         # get the size of file
107         size = os.path.getsize( filename ) / 1024
108         print ">>>File %s (%s Kb) was done..." % ( filename, size )
109
110
111 if __name__ == '__main__' :
112     
113     # create compiled regular expression pattern
114     findSubjectsPattern = re.compile( \
115                     r'(\d+).*?', re.I | re.DOTALL )
116     findImagesPattern   = re.compile( r'', re.I )
117
118     # get subjects' url list
119     subjectsList = getSubjectsURL( findSubjectsPattern )
120
121     print "Downloading begins...\n"
122
123     filecount = 1
124
125     for i in range( len( subjectsList ) ) :
126         # get images url list
127         print "\nSubject %s begins..." % subjectsList[i][0]
128         imagesList = getImagesURL( subjectsList[i][1], findImagesPattern )
129         # download all iamges
130         for j in range( len(imagesList) ) :
131             downloadImage( imagesList[j], subjectsList[i][0] )
132             filecount += 1
133
134     print "\nAll downloads were done"
135     print "%d files were downloaded totally\n" % filecount
136
阅读(1769) | 评论(1) | 转发(0) |
给主人留下些什么吧!~~

chinaunix网友2009-03-08 12:53:19

能不能写自动登陆bbs自动发帖的脚本啊?