version: Python 3.5

# -*- coding: utf-8 -*-
import os
import re
import socket
import urllib.parse
import urllib.request
# import requests

# abort a stalled download after 5 seconds
socket.setdefaulttimeout(5)

keyword = ""
i = 0


def downloadPic(html, word):
    """Extract every objURL from a Baidu image-search result page and save the .jpg files."""
    global i
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('keyword: ' + keyword + ' downloading...')
    save_dir = os.path.join('pictures', keyword)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    for each in pic_url:
        if not each.endswith('.jpg'):
            print('not a jpg picture: ' + each)
            continue
        print('Downloading index(' + str(i + 1) + '), url: ' + str(each))
        try:
            path = os.path.join(save_dir, word + '_' + str(i) + '.jpg')
            print(path)
            urllib.request.urlretrieve(each, path)
            # pic = requests.get(each, timeout=10)
        except Exception:
            print('failed to download the current picture')
            continue
        # work around the encoding issue so that Chinese file names can be stored
        # fp = open(path.encode('cp936'), 'wb')
        # fp.write(pic.content)
        # fp.close()
        i += 1


if __name__ == '__main__':
    word = input("Input key word: ")
    keyword = word
    word = urllib.parse.quote(word)
    page = 0
    while True:  # runs until interrupted; each flip page returns 20 results
        url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn=' + str(page) + '&ct=&ic=0&lm=-1&width=0&height=0'
        # result = requests.get(url)
        print('download url: ' + url)
        result = urllib.request.urlopen(url).read()
        result = result.decode('utf-8')
        downloadPic(result, word)
        page += 20
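The commented-out requests calls in the script hint at an alternative download path. A minimal sketch of that variant, assuming the third-party requests package is installed (it is not imported by the script above), could look like this; download_with_requests is a hypothetical helper, not part of the original code:

# hypothetical requests-based variant of the download step
import os
import requests

def download_with_requests(pic_url, save_dir, word, start_index=0):
    os.makedirs(save_dir, exist_ok=True)
    i = start_index
    for each in pic_url:
        if not each.endswith('.jpg'):
            continue
        try:
            pic = requests.get(each, timeout=10)  # fetch the image bytes
            pic.raise_for_status()
        except requests.RequestException:
            print('failed to download the current picture')
            continue
        # writing the raw bytes in binary mode sidesteps the cp936 filename workaround
        # mentioned in the original comments
        with open(os.path.join(save_dir, word + '_' + str(i) + '.jpg'), 'wb') as fp:
            fp.write(pic.content)
        i += 1
    return i

Saving pic.content through a binary file handle keeps the image data intact and avoids the Python 2 style encode/decode dance that the commented-out lines were trying to perform.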
The code above is adapted from code by voidsky; original post:
http://blog.csdn.net/hk2291976/article/details/51188728
Thanks to voidsky, whose blog covers more web-crawler techniques and is recommended reading.