A web-scraping tool written in Python that automatically downloads all articles from a Baidu Space
2010-06-12 14:12
Introduction:
A Python program I wrote that automatically downloads every article from a specified Baidu Space.
Usage is simple: after installing Python (the script is written for Python 2, as it uses urllib2), save it as down.py and run it from the folder that contains the file, passing the address of the space as the only argument.
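For example (the username in the URL is just a placeholder, not a real space):

    python down.py http://hi.baidu.com/some_username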
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# *********************************************************************************
# Copyright (C) 2010 yangyingchao@gmail.com
# Author: yangyingchao
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING. If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# *********************************************************************************
import urllib2
import re
import sys

# Filled in by __main__ once the username is known.
pattern = ""
reg_tail = ""
username = ""
def downURL(url, filename):
    """Download url and save the response body to a local file named filename."""
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1


def getURL(url):
    """Fetch one index page and return every article link found on it."""
    print "Parsing %s" % url
    try:
        fp = urllib2.urlopen(url)
        contents = fp.readlines()
    except:
        print "exception"
        return []

    item_list = []
    for s in contents:
        urls = pattern.findall(s)
        if urls:
            item_list.extend(urls)
    fp.close()
    return item_list
def reptile(base_url):
    """
    Download all articles from base_url.

    Arguments:
    - `base_url`: URL of the Baidu Space.
    """
    base_page = base_url.rstrip("/") + "/blog/index/"
    sign_tail = u"尾页"
    tail = ""
    total_page = 10
    global username
    print username

    # Fetch the first index page and look for the "尾页" (last page) link,
    # which tells us how many index pages there are in total.
    try:
        fp = urllib2.urlopen(base_page + "0")
    except:
        print "%s: No such url" % base_page
        print sys.exc_info()
    else:
        for s in fp.readlines():
            if sign_tail in s.decode("gbk"):
                tail = s.decode("gbk")
                break
        fp.close()

    if tail:
        # Pull the number of the last page out of the text just before "尾页".
        pos = tail.rfind(u"尾页")
        total_page = int(tail[:pos - 3].split("/")[-1])

    # Collect article links from every index page.
    output_list = []
    for idx in range(total_page + 1):
        item_page = "%s%d" % (base_page, idx)
        item_list = getURL(item_page)
        if item_list:
            output_list.extend(item_list)

    # Drop duplicates, then download each article next to the script.
    # Links on the index pages are relative; rebuild absolute hi.baidu.com URLs.
    item_list = list(set(output_list))
    for item in item_list:
        down_url = item.replace("/%s" % username, "http://hi.baidu.com/%s" % username)
        local_file = down_url.split("/")[-1]
        downURL(down_url, local_file)
    print "Total: %d articles." % len(item_list)
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s <url of baidu space>" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        sys.exit(1)
    base_url = sys.argv[1]
    if not base_url.startswith("http://hi.baidu.com/"):
        print "Wrong Type of URL??", "It works on Baidu Space only."
        sys.exit(1)

    username = base_url.rstrip("/").split("/")[-1]
    print username

    reg_tail = re.compile(u"%s.*?尾页" % username)
    pattern = re.compile("/%s/blog/item/.*?\.html" % username)

    reptile(base_url)
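To see what the article-link pattern built above actually matches, here is a small self-contained check; the username and the HTML line are made-up examples, not data from any real space:

    import re

    username = "example_user"  # hypothetical username, for illustration only
    pattern = re.compile(r"/%s/blog/item/.*?\.html" % username)

    line = '<a href="/example_user/blog/item/0123456789abcdef.html">a post</a>'
    print pattern.findall(line)  # -> ['/example_user/blog/item/0123456789abcdef.html']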
Author: yangyingchao, 2010-06-12