Category: Python/Ruby
2007-03-26 23:56:51
#!/usr/bin/env python
# Copyright (c) xiaosuo
# License GPL v2 or above.

import sys
import getopt
import urllib2
import urlparse
import distutils.dir_util
import htmllib
import formatter


class UrlQueue:
    """Tracks which URLs under the base have been fetched or are pending."""

    def __init__(self, baseUrl):
        self.base = baseUrl[0:baseUrl.rfind("/") + 1]
        self.finished = []
        self.pending = []

    def hasPending(self):
        return len(self.pending) > 0

    def baseUrl(self):
        return self.base

    def pop(self):
        url = self.pending.pop()
        self.finished.append(url)
        return url

    def append(self, url):
        # Only queue URLs that resolve under the base, and only once.
        absUrl = urlparse.urljoin(self.base, url)
        baseUrlLen = len(self.base)
        if len(absUrl) <= baseUrlLen or absUrl[0:baseUrlLen] != self.base:
            return False
        url = absUrl[baseUrlLen:]
        if url in self.finished or url in self.pending:
            return False
        self.pending.append(url)
        return True


class UrlFilter(htmllib.HTMLParser):
    """Collects the href of every anchor in a page."""

    def __init__(self):
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.hrefList = []

    def anchor_bgn(self, href, name, type):
        self.hrefList.append(href)

    def clear(self):
        self.hrefList = []


class Grabber:
    def __init__(self, initUrl):
        self.initUrl = initUrl
        self.urlQueue = UrlQueue(initUrl)
        self.urlFilter = UrlFilter()

    def run(self):
        self.urlQueue.append(self.initUrl)
        # Walk the article list pages until one no longer links to any post.
        i = 0
        while True:
            listUrl = "article_0_%d.html" % i
            absUrl = urlparse.urljoin(self.urlQueue.baseUrl(), listUrl)
            print "Fetching %s" % absUrl
            page = urllib2.urlopen(absUrl).read()
            self.urlFilter.clear()
            self.urlFilter.feed(page)
            self.urlFilter.close()
            valid = False
            for url in self.urlFilter.hrefList:
                self.urlQueue.append(url)
                if url[0:8] == "showart_":
                    valid = True
            if not valid:
                break
            file(listUrl, "w").write(page)
            i = i + 1
        # Then drain the queue of everything the list pages linked to.
        while self._grab():
            pass

    def _grab(self):
        if not self.urlQueue.hasPending():
            return False
        url = self.urlQueue.pop()
        absUrl = urlparse.urljoin(self.urlQueue.baseUrl(), url)
        print "Fetching %s" % absUrl
        page = urllib2.urlopen(absUrl).read()
        pos = url.rfind("/")
        if pos != -1:
            distutils.dir_util.mkpath(url[0:pos])
        file(url, "w").write(page)
        # Only .htm/.html pages are parsed for further links.
        pos = url.rfind(".")
        if pos == -1:
            return True
        if url[pos + 1:pos + 4].lower() != "htm":
            return True
        self.urlFilter.clear()
        self.urlFilter.feed(page)
        self.urlFilter.close()
        for url in self.urlFilter.hrefList:
            self.urlQueue.append(url)
        return True


def showHelp(prg):
    print "CUBlog backup script.\n"
    print "Usage: %s [option]... initUrl" % prg
    print "Options:"
    print "  -h, --help  Show the help information"


if __name__ == "__main__":
    # parse the arguments
    try:
        (opts, args) = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.GetoptError:
        print "Wrong command line arguments."
        showHelp(sys.argv[0])
        sys.exit(1)
    for (o, a) in opts:
        if o in ("-h", "--help"):
            showHelp(sys.argv[0])
            sys.exit(0)
    if len(args) == 0:
        showHelp(sys.argv[0])
        sys.exit(1)
    url = args[0]
    if url.rfind("/") == len(url) - 1:
        url += "index.html"
    Grabber(url).run()
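To see what the two helper classes do in isolation, here is a minimal interactive sketch. Assumptions: Python 2 (the script depends on urllib2/htmllib, which are Python 2 only), the classes above are already defined, and the URLs are made-up examples:

# UrlFilter collects the href of every anchor it sees.
f = UrlFilter()
f.feed('<a href="showart_123.html">post</a> <a href="http://other.example/">elsewhere</a>')
f.close()
print f.hrefList                          # ['showart_123.html', 'http://other.example/']

# UrlQueue deduplicates and keeps only URLs that resolve under the base.
q = UrlQueue("http://blog.chinaunix.net/u/5251/index.html")
print q.append("showart_123.html")        # True: resolves under the base
print q.append("showart_123.html")        # False: already pending
print q.append("http://other.example/")   # False: outside the base
print q.pending                           # ['showart_123.html']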
xiaosuo@gentux python $ ./cubk http://blog.chinaunix.net/u/5251/
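Run this way, the script prints a "Fetching ..." line per page, saves each list page (article_0_0.html, article_0_1.html, ...) until one no longer links to any showart_* post, then saves everything those pages link to under the blog's base URL into the current directory, creating subdirectories as needed via distutils.dir_util.mkpath. Only files whose extension starts with "htm" are parsed for further links; other linked files under the base are saved as-is.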