Category: Python/Ruby

2010-09-21 00:34:36

Scraping web page images with Python
# -*- coding: utf-8 -*-
"""
some functions by metaphy, 2007-04-03, copyleft
version 0.2
"""
import urllib, httplib, urlparse
import re
import random

10. """judge url exists or not,by others"""
11. def httpExists(url):
12.     host, path = urlparse.urlsplit(url)[1:3]
13.     if ':' in host:
14.         # port specified, try to use it
15.         host, port = host.split(':', 1)
16.         try:
17.             port = int(port)
18.         except ValueError:
19.             print 'invalid port number %r' % (port,)
20.             return False
21.     else:
22.         # no port specified, use default port
23.         port = None
24.     try:
25.         connection = httplib.HTTPConnection(host, port=port)
26.         connection.request("HEAD", path)
27.         resp = connection.getresponse( )
28.         if resp.status == 200:       # normal 'found' status
29.             found = True
30.         elif resp.status == 302:     # recurse on temporary redirect
31.             found = httpExists(urlparse.urljoin(url,resp.getheader('location', '')))
32.         else:                        # everything else -> not found
33.             print "Status %d %s : %s" % (resp.status, resp.reason, url)
34.             found = False
35.     except Exception, e:
36.         print e.__class__, e, url
37.         found = False
38.     return found
39.  
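# Quick check (hypothetical URL, needs network access):
#   >>> httpExists('http://www.example.com/index.html')
#   True
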
40. """get html src,return lines[]"""
41. def gGetHtmlLines(url):
42.     if url==None : return
43.     if not httpExists(url): return  
44.     try:
45.         page = urllib.urlopen(url)    
46.         html = page.readlines()
47.         page.close()
48.         return html
49.     except:
50.         print "gGetHtmlLines() error!"
51.         return
52. """get html src,return string"""
53. def gGetHtml(url):
54.     if url==None : return
55.     if not httpExists(url): return  
56.     try:
57.         page = urllib.urlopen(url)    
58.         html = page.read()
59.         page.close()
60.         return html
61.     except:
62.         print "gGetHtml() error!"
63.         return
64.  
65. """根据url获取文件名"""
66. def gGetFileName(url):
67.     if url==None: return None
68.     if url=="" : return ""
69.     arr=url.split("/")
70.     return arr[len(arr)-1]
71.  
72. """生成随机文件名"""
73. def gRandFilename(type):
74.     fname = ''
75.     for i in range(16):
76.         fname = fname + chr(random.randint(65,90))
77.         fname = fname + chr(random.randint(48,57))
78.     return fname + '.' + type
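
# One possible output (16 letter/digit pairs, 32 chars before the extension):
#   >>> gRandFilename('jpg')
#   'A1B2C3D4E5F6G7H8I9J0K1L2M3N4O5P6.jpg'
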
79. """根据url和其上的link,得到link的绝对地址"""
80. def gGetAbslLink(url,link):
81.     if url==None or link == None : return  
82.     if url=='' or link=='' : return url  
83.     addr = ''  
84.     if link[0] == '/' :  
85.         addr = gGetHttpAddr(url) + link  
86.     elif len(link)>3 and link[0:4] == 'http':
87.         addr = link  
88.     elif len(link)>2 and link[0:2] == '..':
89.         addr = gGetHttpAddrFatherAssign(url,link)
90.     else:
91.         addr = gGetHttpAddrFather(url) + link  
92.  
93.     return addr  
94.  
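# Resolution sketch, one hypothetical example per branch:
#   gGetAbslLink('http://a.com/b/c.htm', '/d.jpg')    -> 'http://a.com/d.jpg'
#   gGetAbslLink('http://a.com/b/c.htm', 'd.jpg')     -> 'http://a.com/b/d.jpg'
#   gGetAbslLink('http://a.com/b/c.htm', '../d.jpg')  -> 'http://a.com/d.jpg'
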
95. """根据输入的lines,匹配正则表达式,返回list"""
96. def gGetRegList(linesList,regx):
97.     if linesList==None : return  
98.     rtnList=[]
99.     for line in linesList:
100.         matchs = re.search(regx, line, re.IGNORECASE)
101.         if matchs!=None:
102.             allGroups = matchs.groups()
103.             for foundStr in allGroups:
104.                 if foundStr not in rtnList:
105.                     rtnList.append(foundStr)
106.     return rtnList
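
# Example (hypothetical input; captures the src value minus the extension):
#   >>> gGetRegList(['<img src="/pics/cat.jpg">'], r"""src\s*="?(\S+)\.jpg""")
#   ['/pics/cat']
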
107. """根据url下载文件,文件名参数指定"""
108. def gDownloadWithFilename(url,savePath,file):
109.     #参数检查,现忽略
110.     try:
111.         urlopen=urllib.URLopener()
112.         fp = urlopen.open(url)
113.         data = fp.read()
114.         fp.close()
115.         file=open(savePath + file,'w+b')
116.         file.write(data)
117.         file.close()
118.     except IOError:
119.         print "download error!"+ url
120.          
121. """根据url下载文件,文件名自动从url获取"""
122. def gDownload(url,savePath):
123.     #参数检查,现忽略
124.     fileName = gGetFileName(url)
125.     #fileName =gRandFilename('jpg')
126.     gDownloadWithFilename(url,savePath,fileName)
127.          
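# Usage sketch (hypothetical URL; savePath must end with a separator,
# since it is concatenated directly with the file name):
#   gDownload('http://www.example.com/pics/cat.jpg', 'd:/pictures/')
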
128. """根据某网页的url,下载该网页的jpg"""
129. def gDownloadHtmlJpg(downloadUrl,savePath):
130.     lines= gGetHtmlLines(downloadUrl)
131.     regx = r"""src\s*="?(\S+)\.jpg"""
132.     lists =gGetRegList(lines,regx)
133.     if lists==None: return  
134.     for jpg in lists:
135.         jpg = gGetAbslLink(downloadUrl,jpg) + '.jpg'
136.         gDownload(jpg,savePath)
137.    ###     print gGetFileName(jpg)
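
# Usage sketch (hypothetical page URL):
#   gDownloadHtmlJpg('http://www.example.com/photos/index.htm', 'd:/pictures/')
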
138. """根据url取主站地址"""
139. def gGetHttpAddr(url):
140.     if url== '' : return ''
141.     arr=url.split("/")
142.     return arr[0]+"//"+arr[2]
143. """根据url取上级目录"""
144. def gGetHttpAddrFather(url):
145.     if url=='' : return ''
146.     arr=url.split("/")
147.     addr = arr[0]+'//'+arr[2]+ '/'
148.     if len(arr)-1>3 :
149.         for i in range(3,len(arr)-1):
150.             addr = addr + arr[i] + '/'
151.     return addr
152.  
153. """根据url和上级的link取link的绝对地址"""
154. def gGetHttpAddrFatherAssign(url,link):
155.     if url=='' : return ''
156.     if link=='': return ''
157.     linkArray=link.split("/")
158.     urlArray = url.split("/")
159.     partLink =''
160.     partUrl = ''
161.     for i in range(len(linkArray)):         
162.         if linkArray[i]=='..':  
163.             numOfFather = i + 1    #上级数
164.         else:
165.             partLink = partLink + '/' + linkArray[i]
166.     for i in range(len(urlArray)-1-numOfFather):
167.         partUrl = partUrl + urlArray[i]  
168.         if i < len(urlArray)-1-numOfFather -1 :  
169.             partUrl = partUrl + '/'
170.     return partUrl + partLink
171.  
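# Worked example (hypothetical URLs): each '..' strips one directory level
# from the url before the rest of the link path is appended:
#   gGetHttpAddrFatherAssign('http://a.com/b/c/d.htm', '../e.jpg')
#   -> 'http://a.com/b/e.jpg'
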
172. """根据url获取其上的相关htm、html链接,返回list"""
173. def gGetHtmlLink(url):
174.     #参数检查,现忽略
175.     rtnList=[]
176.     lines=gGetHtmlLines(url)
177.     regx = r"""href="?(\S+)\.htm"""
178.     for link in gGetRegList(lines,regx):
179.         link = gGetAbslLink(url,link) + '.htm'
180.         if link not in rtnList:
181.             rtnList.append(link)
182.             print link
183.     return rtnList
184.  
185. """根据url,抓取其上的jpg和其链接htm上的jpg"""
186. def gDownloadAllJpg(url,savePath):
187.     #参数检查,现忽略
188.     gDownloadHtmlJpg(url,savePath)
189.     #抓取link上的jpg
190.     links=gGetHtmlLink(url)
191.     for link in links:
192.         gDownloadHtmlJpg(link,savePatk,save6
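
A minimal driver sketch, assuming a reachable start page and an existing save directory (both hypothetical placeholders here):

if __name__ == '__main__':
    # hypothetical values: point these at a real page and an existing directory
    startUrl = 'http://www.example.com/photos/index.htm'
    savePath = 'd:/pictures/'    # trailing separator required (see gDownloadWithFilename)
    gDownloadAllJpg(startUrl, savePath)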