抓取网页图片-使用python
2010-03-05 21:35
1. # -*- coding: utf-8 -*-
2. """
3. some function by metaphy,2007-04-03,copyleft
4. version 0.2
5. """
6. import urllib, httplib, urlparse
7. import re
8. import random
9.
def httpExists(url, max_redirects=5):
    """Report whether a HEAD request for *url* ends in a 200 response.

    The URL is split into scheme, host, optional port, path and query.
    A HEAD request is sent for path plus query, and redirects are followed
    up to *max_redirects* times.

    Args:
        url: Absolute http:// or https:// URL to probe.
        max_redirects: How many redirect hops may still be followed before
            giving up. Bounds the recursion when a server redirects in a
            loop.

    Returns:
        True when the final response status is 200. False for any other
        status, an invalid port, too many redirects, or any exception
        raised while connecting (the problem is printed, not raised).
    """
    scheme, host, path, query = urlparse.urlsplit(url)[:4]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        # no port specified, use the default port of the connection class
        port = None

    # An empty path is not a valid request target, and the query string is
    # part of what identifies the resource, so both must be sent.
    target = path or '/'
    if query:
        target = target + '?' + query

    connection = None
    try:
        if scheme == 'https':
            connection = httplib.HTTPSConnection(host, port=port)
        else:
            connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", target)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status in (301, 302, 303, 307):
            # Redirect: probe the new location, but only a bounded number
            # of times so a redirect loop cannot recurse forever.
            if max_redirects <= 0:
                print("Too many redirects : %s" % (url,))
                found = False
            else:
                location = urlparse.urljoin(url, resp.getheader('location', ''))
                found = httpExists(location, max_redirects - 1)
        else:
            # everything else -> not found
            print("Status %d %s : %s" % (resp.status, resp.reason, url))
            found = False
    except Exception as e:
        # Best-effort probe: any failure means "not reachable".
        print("%s %s %s" % (e.__class__, e, url))
        found = False
    finally:
        if connection is not None:
            connection.close()
    return found
39.
def gGetHtmlLines(url):
    """Fetch the page at *url* and return its content as a list of lines.

    Args:
        url: Address of the page, or None.

    Returns:
        The list produced by ``readlines()`` on the opened page, or None
        when *url* is None, when ``httpExists(url)`` is false, or when
        opening or reading the page raises (a message is printed).
    """
    if url is None:
        return None
    if not httpExists(url):
        return None
    try:
        page = urllib.urlopen(url)
        try:
            return page.readlines()
        finally:
            # Release the handle even when the read fails part-way.
            page.close()
    except Exception:
        print("gGetHtmlLines() error!")
        return None
def gGetHtml(url):
    """Fetch the page at *url* and return its content as one string.

    Args:
        url: Address of the page, or None.

    Returns:
        The value of ``read()`` on the opened page, or None when *url* is
        None, when ``httpExists(url)`` is false, or when opening or reading
        the page raises (a message is printed).
    """
    if url is None:
        return None
    if not httpExists(url):
        return None
    try:
        page = urllib.urlopen(url)
        try:
            return page.read()
        finally:
            # Release the handle even when the read fails part-way.
            page.close()
    except Exception:
        print("gGetHtml() error!")
        return None
64.
def gGetFileName(url):
    """Return the file name part of *url*: the text after the last "/".

    Args:
        url: URL string, or None.

    Returns:
        None when *url* is None; otherwise the substring following the last
        "/" (the whole string when there is no "/", and "" when the URL is
        empty or ends with "/").
    """
    if url is None:
        return None
    # rsplit with maxsplit=1 always yields at least one element, so the
    # empty string needs no special case.
    return url.rsplit("/", 1)[-1]
71.
def gRandFilename(type):
    """Generate a random file name with the given extension.

    The stem is 16 pairs of one random uppercase ASCII letter (codes 65-90)
    followed by one random digit (codes 48-57).

    NOTE(review): the original listing lost its indentation; this assumes
    both appends belong to the loop body - confirm against the author's
    source.

    Args:
        type: File extension without the leading dot, e.g. "jpg".

    Returns:
        A string of the form "<stem>.<type>".
    """
    pieces = []
    for _ in range(16):
        pieces.append(chr(random.randint(65, 90)))  # 'A'..'Z'
        pieces.append(chr(random.randint(48, 57)))  # '0'..'9'
    return ''.join(pieces) + '.' + type
def gGetAbslLink(url, link):
    """Resolve *link*, as found on the page at *url*, to an absolute address.

    Resolution is delegated to ``urlparse.urljoin``. That covers
    root-relative ("/x"), absolute ("http://..."), parent-relative
    ("../x") and plain relative links, and also protocol-relative
    ("//host/x") links and "./" segments.

    Args:
        url: Address of the page the link was found on.
        link: The link text as it appears in the page.

    Returns:
        None when either argument is None; *url* unchanged when either
        argument is the empty string; otherwise the absolute address.
    """
    if url is None or link is None:
        return None
    if url == '' or link == '':
        return url
    return urlparse.urljoin(url, link)
94.
def gGetRegList(linesList, regx):
    """Collect the distinct capture-group values of *regx* over the lines.

    Every match on every line is examined (case-insensitively), so several
    hits on the same line are all reported. Values are returned in order of
    first appearance, without duplicates.

    Args:
        linesList: Iterable of text lines, or None.
        regx: Regular expression containing one or more capture groups.

    Returns:
        None when *linesList* is None; otherwise a list of captured
        strings. Groups that did not take part in a match are skipped.
    """
    if linesList is None:
        return None
    pattern = re.compile(regx, re.IGNORECASE)  # compiled once, outside the loop
    seen = set()  # O(1) duplicate check; rtnList keeps the order
    rtnList = []
    for line in linesList:
        for match in pattern.finditer(line):
            for foundStr in match.groups():
                if foundStr is None or foundStr in seen:
                    continue
                seen.add(foundStr)
                rtnList.append(foundStr)
    return rtnList
def gDownloadWithFilename(url, savePath, file):
    """Download *url* and store it in *savePath* under the name *file*.

    Args:
        url: Address of the resource to download.
        savePath: Directory to write into. A trailing path separator is
            optional because the target path is built with os.path.join.
        file: File name to create inside *savePath*.

    Returns:
        None. An IOError while downloading or writing is reported with a
        printed message instead of being raised.
    """
    import os.path  # local import: the module header does not import os

    try:
        fp = urllib.URLopener().open(url)
        try:
            data = fp.read()
        finally:
            fp.close()
        # Binary mode: the payload is image data, not text.
        with open(os.path.join(savePath, file), 'wb') as out:
            out.write(data)
    except IOError:
        print("download error!" + url)
120.
def gDownload(url, savePath):
    """Download *url* into *savePath*, naming the file after the URL.

    The file name is whatever gGetFileName() extracts from *url*. When no
    name can be derived (a None URL, or a URL ending in "/"), nothing is
    downloaded and an error message is printed.

    Args:
        url: Address of the resource to download.
        savePath: Directory to write into.
    """
    fileName = gGetFileName(url)
    if not fileName:
        # Without a file name there is nothing sensible to write to.
        print("download error!" + str(url))
        return
    gDownloadWithFilename(url, savePath, fileName)
127.
def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download every .jpg referenced by a src attribute on a page.

    Args:
        downloadUrl: Address of the page to scan.
        savePath: Directory the images are written into.

    Returns:
        None. Returns early when the page could not be read.
    """
    lines = gGetHtmlLines(downloadUrl)
    # The group captures the whole reference including its extension, so
    # the extension's original case is kept. The value may be double-quoted,
    # single-quoted or unquoted.
    regx = r"""src\s*=\s*["']?(\S+\.jpg)"""
    lists = gGetRegList(lines, regx)
    if lists is None:
        return
    for jpg in lists:
        gDownload(gGetAbslLink(downloadUrl, jpg), savePath)
def gGetHttpAddr(url):
    """Return the site root ("scheme://host") of *url*.

    Args:
        url: Absolute URL such as "http://a.com/b/c.html".

    Returns:
        The scheme and host joined by "//", without a trailing slash.
        The empty string when *url* is empty or contains fewer than two
        "/" separators (so no host part can be identified).
    """
    if url == '':
        return ''
    # "http://a.com/b" splits into ['http:', '', 'a.com', 'b'].
    arr = url.split("/", 3)
    if len(arr) < 3:
        return ''
    return arr[0] + "//" + arr[2]
def gGetHttpAddrFather(url):
    """Return the directory that contains the resource named by *url*.

    Args:
        url: Absolute URL such as "http://a.com/b/c.html".

    Returns:
        "scheme://host/" followed by every path segment except the last,
        each terminated by "/" (for example "http://a.com/b/"). The empty
        string when *url* is empty or contains fewer than two "/"
        separators.
    """
    if url == '':
        return ''
    arr = url.split("/")
    if len(arr) < 3:
        return ''
    # arr[3:-1] are the directory segments; the final element is the file.
    directories = ''.join(segment + '/' for segment in arr[3:-1])
    return arr[0] + '//' + arr[2] + '/' + directories
152.
def gGetHttpAddrFatherAssign(url, link):
    """Resolve a parent-relative *link* ("../x") against the page *url*.

    Each leading ".." in *link* removes one directory from the page's
    location. Climbing stops at the site root, so the scheme and host are
    never removed. A link without any leading ".." is resolved relative to
    the page's own directory.

    Args:
        url: Absolute URL of the page the link was found on.
        link: Relative link, normally starting with one or more "..".

    Returns:
        The absolute address, or the empty string when either argument is
        empty.
    """
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")

    # Count the leading ".." segments (number of levels to climb).
    numOfFather = 0
    while numOfFather < len(linkArray) and linkArray[numOfFather] == '..':
        numOfFather += 1

    # Drop the file name plus one directory per level, but always keep the
    # first three components: 'scheme:', '' and 'host'.
    keep = max(len(urlArray) - 1 - numOfFather, 3)
    partUrl = '/'.join(urlArray[:keep])
    partLink = ''.join('/' + segment for segment in linkArray[numOfFather:])
    return partUrl + partLink
171.
def gGetHtmlLink(url):
    """List the absolute addresses of the .htm/.html links on a page.

    Each newly found link is also printed.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of distinct absolute link addresses in order of first
        appearance; empty when the page could not be read.
    """
    rtnList = []
    lines = gGetHtmlLines(url)
    # The group captures the whole reference including ".htm" or ".html",
    # so the real extension is kept. The value may be double-quoted,
    # single-quoted or unquoted.
    regx = r"""href\s*=\s*["']?(\S+\.html?)"""
    found = gGetRegList(lines, regx)
    if found is None:
        # gGetRegList returns None when the page fetch failed.
        return rtnList
    for link in found:
        link = gGetAbslLink(url, link)
        if link not in rtnList:
            rtnList.append(link)
            print(link)
    return rtnList
184.
def gDownloadAllJpg(url, savePath):
    """Download the .jpg images of a page and of the pages it links to.

    First the images on *url* itself are fetched, then the images on every
    .htm/.html page that *url* links to (one level deep).

    Args:
        url: Address of the starting page.
        savePath: Directory the images are written into.
    """
    gDownloadHtmlJpg(url, savePath)
    # Fetch the images on the linked pages as well.
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)
阅读(1426) | 评论(0) | 转发(0) |