
Category: Python/Ruby

2010-07-07 21:09:32

Adapted from an example on IBM developerWorks:
Command: $ python ./minispider.py <site> /
It saves the JPG images found on the given page. The sparrow may be small, but it has all the vital organs.

#!/usr/local/bin/python

import httplib
import sys
import re
import urllib2
from HTMLParser import HTMLParser



class miniHTMLParser( HTMLParser ):

  # Class-level queues, as in the original IBM example: viewedQueue
  # holds links already seen, instQueue links still to visit
  viewedQueue = []
  instQueue = []
  index = 0   # running counter used to name the saved images

  def get_next_link( self ):
    # Next link in breadth-first order, or '' when the queue is empty
    if self.instQueue == []:
      return ''
    else:
      return self.instQueue.pop(0)


  def gethtmlfile( self, site, page ):
    # Fetch a page from the site; return '' on any failure
    try:
      httpconn = httplib.HTTPConnection(site)
      httpconn.request("GET", page)
      resp = httpconn.getresponse()
      resppage = resp.read()
    except Exception:
      resppage = ""

    return resppage


  def handle_starttag( self, tag, attrs ):
    if tag == 'a':
      # The first attribute of the anchor is assumed to be href
      newstr = str(attrs[0][1])
    elif tag == 'img':
      newstr = str(attrs[0][1])
      print 'found an img:', newstr
      # Only fetch absolute http links to jpg/jpeg images
      if (re.search('jpg', newstr) != None or re.search('jpeg', newstr) != None) and re.search('http', newstr):
        print 'downloading ', newstr
        req = urllib2.Request(newstr)
        response = urllib2.urlopen(req)
        self.index += 1
        f = open( str(self.index) + '.jpg', "wb" )
        f.write(response.read())
        f.close()
    # Link-queueing logic from the original IBM example, left disabled
    # (the commented lines had lost their indentation; restored here):
    #   print "adding", newstr
    #   self.instQueue.append( newstr )
    #   if re.search('http', newstr) == None:
    #     if re.search('mailto', newstr) == None:
    #       if re.search('htm', newstr) != None:
    #         if (newstr in self.viewedQueue) == False:
    #           print "  adding", newstr
    #           self.instQueue.append( newstr )
    #           self.viewedQueue.append( newstr )
    #         else:
    #           print "  ignoring", newstr
    #       else:
    #         print "  ignoring", newstr
    #     else:
    #       print "  ignoring", newstr


def main():

  # Expect exactly two arguments: the site and the starting page
  # (the original "sys.argv[1] == ''" test raised IndexError when
  # arguments were missing)
  if len(sys.argv) != 3:
    print "usage is ./minispider.py site link"
    sys.exit(2)

  mySpider = miniHTMLParser()

  link = sys.argv[2]

  # The full IBM example wraps everything below in "while link != '':"
  # to crawl breadth-first; this version fetches a single page.

  print "\nChecking link ", link

  # Get the file from the site and link
  retfile = mySpider.gethtmlfile( sys.argv[1], link )

  # Feed the file into the HTML parser
  mySpider.feed(retfile)

  # Get the next link in level traversal order
  link = mySpider.get_next_link()

  mySpider.close()

  print "\ndone\n"

if __name__ == "__main__":
  main()


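The script above is Python 2 only: httplib, urllib2, and HTMLParser were all renamed in Python 3, and the print statements became functions. As a reference, here is a minimal sketch of the same image-grabbing parser ported to Python 3; this is my port, not the author's code, and the class name ImgParser and the example URL are made up for illustration:

#!/usr/bin/env python3
from html.parser import HTMLParser
from urllib.request import urlopen

class ImgParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.index = 0   # running counter used to name the saved images

    def handle_starttag(self, tag, attrs):
        if tag != 'img':
            return
        src = dict(attrs).get('src') or ''
        # Same filter as above: absolute http links containing jpg/jpeg
        if src.startswith('http') and ('jpg' in src or 'jpeg' in src):
            self.index += 1
            with open('%d.jpg' % self.index, 'wb') as out:
                out.write(urlopen(src).read())

# Usage sketch (hypothetical URL):
# parser = ImgParser()
# parser.feed(urlopen('http://www.example.com/').read().decode('utf-8', 'replace'))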
An image spider written in Python
His code:

#!/usr/local/bin/python
import os
import sys
import re
import urllib

URL_REG = re.compile(r'(http://[^/\\]+)', re.I)
# Matches <img ... src="..."> with either quote style; the leading "<img"
# of this pattern was swallowed by the blog's HTML rendering and is
# restored here
IMG_REG = re.compile(r'<img[^>]*?src=([\'"])(.*?)\1', re.I)

def download(dir, url):
    '''Download the images found in a web page.

    @dir  local directory to save the images into
    @url  URL of the page
    '''
    m = URL_REG.match(url)
    if not m:
        print '[Error]Invalid URL: ', url
        return
    host = m.group(1)

    if not os.path.isdir(dir):
        os.mkdir(dir)

    # Fetch the HTML and extract the image URLs
    html = urllib.urlopen(url).read()
    imgs = [item[1].lower() for item in IMG_REG.findall(html)]
    # Turn each src into an absolute URL: keep absolute ones, prefix the
    # host for root-relative paths, and the page URL otherwise
    f = lambda path: path if path.startswith('http://') else \
        host + path if path.startswith('/') else url + '/' + path
    imgs = list(set(map(f, imgs)))
    print '[Info]Find %d images.' % len(imgs)

    # Download the images
    for idx, img in enumerate(imgs):
        name = img.split('/')[-1]
        path = os.path.join(dir, name)
        try:
            print '[Info]Download(%d): %s' % (idx + 1, img)
            urllib.urlretrieve(img, path)
        except:
            print "[Error]Can't download(%d): %s" % (idx + 1, img)

def main():
    if len(sys.argv) != 3:
        print 'Invalid argument count.'
        return   # this return was mis-indented in the original, which
                 # made main() exit before ever calling download()
    dir, url = sys.argv[1:]
    download(dir, url)

if __name__ == '__main__':
    # download('D:\\Imgs', '')
    main()
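One fragile spot in this version is the hand-rolled URL joining in the lambda inside download(): it mishandles relative paths like '../img/x.jpg' and pages whose URL already ends in a file name. A minimal sketch of the standard-library alternative, urlparse.urljoin (Python 2; the helper name absolutize is mine, for illustration):

import urlparse

def absolutize(page_url, src):
    # urljoin resolves src against the page it appeared on, handling
    # absolute URLs, root-relative paths and '../' segments alike
    return urlparse.urljoin(page_url, src)

# absolutize('http://example.com/a/b.html', 'img/x.jpg')
#   -> 'http://example.com/a/img/x.jpg'
# absolutize('http://example.com/a/b.html', '/img/x.jpg')
#   -> 'http://example.com/img/x.jpg'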

