Chinaunix首页 | 论坛 | 博客
  • 博客访问: 4496730
  • 博文数量: 192
  • 博客积分: 10014
  • 博客等级: 上将
  • 技术积分: 8232
  • 用 户 组: 普通用户
  • 注册时间: 2006-07-21 00:22







分类: Python/Ruby

2008-09-22 17:25:38


import os
import md5
import urllib
import urllib2
import mimetypes
#from gzip import GzipFile
import cStringIO
from cPickle import loads,dumps
import cookielib

class MozillaCacher(object):
    """A dictionary like object, that can cache results on a storage device.

    Every entry is stored as one file inside ``cachedir``. The entry name is
    used verbatim as the file name, so names must be valid file names (the
    hexadecimal digests produced by MozillaEmulator._hash() are).
    """

    def __init__(self,cachedir='.cache'):
        """Remember ``cachedir`` and create it if it does not exist yet.

        @param cachedir: Directory that holds the cache files.
        """
        self.cachedir = cachedir
        if not os.path.isdir(cachedir):
            os.makedirs(cachedir)

    def name2fname(self,name):
        """Return the path of the file that backs the entry ``name``."""
        return os.path.join(self.cachedir,name)

    def _checked_fname(self,name):
        """Return the backing file path; raise TypeError for non-str names."""
        if not isinstance(name,str):
            raise TypeError()
        return self.name2fname(name)

    def __getitem__(self,name):
        """Return the raw content stored under ``name``.

        @raise TypeError: ``name`` is not a str.
        @raise IndexError: There is no entry called ``name``. (IndexError,
            not KeyError, is kept so existing callers continue to work.)
        """
        fname = self._checked_fname(name)
        if not os.path.isfile(fname):
            raise IndexError()
        with open(fname,'rb') as f:
            return f.read()

    def __setitem__(self,name,value):
        """Store ``value`` (raw data) under ``name``, replacing any old entry.

        @raise TypeError: ``name`` is not a str.
        """
        fname = self._checked_fname(name)
        # 'wb' truncates an existing file, so no separate unlink is needed.
        with open(fname,'wb') as f:
            f.write(value)

    def __delitem__(self,name):
        """Remove the entry ``name``; a missing entry is silently ignored.

        @raise TypeError: ``name`` is not a str.
        """
        fname = self._checked_fname(name)
        if os.path.isfile(fname):
            os.unlink(fname)

    def __iter__(self):
        """Iteration over the cache is not supported."""
        raise NotImplementedError()

    def __contains__(self,name):
        """Support ``name in cacher``.

        Without this method the ``in`` operator would fall back to
        __iter__() and fail with NotImplementedError.
        """
        return self.has_key(name)

    def has_key(self,name):
        """Return True when an entry called ``name`` exists."""
        return os.path.isfile(self.name2fname(name))

class MozillaEmulator(object):
    """HTTP client that sends browser-like headers, keeps cookies between
    requests and can cache the downloaded pages."""

    # Sentinel default for ``cacher``. ``None`` already means "no caching",
    # and a literal ``{}`` default would be one dict shared by every instance.
    _PRIVATE_DICT = object()

    def __init__(self,cacher=_PRIVATE_DICT,trycount=0):
        """Create a new MozillaEmulator object.

        @param cacher: A dictionary like object, that can cache search results on a storage device.
            You can use a simple dictionary here, but it is not recommended.
            You can also put None here to disable caching completely.
            When omitted, a new in-memory dictionary private to this object is used.
        @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
                A value of 0 means no retrying. A value of 1 means one retry. etc.
        """
        if cacher is self._PRIVATE_DICT:
            cacher = {}
        self.cacher = cacher
        self.cookies = cookielib.CookieJar()
        self.debug = False
        self.trycount = trycount

    def _hash(self,data):
        """Return the hexadecimal MD5 digest of ``data``.

        The digest is only used as a cache key, not for security. Text that
        is not already a byte string is encoded as UTF-8 first.
        """
        import hashlib  # local import: replaces the deprecated md5 module
        if not isinstance(data,bytes):
            data = data.encode('utf-8')
        return hashlib.md5(data).hexdigest()

    def _cache_key(self,url,postdata=None):
        """Return the cache key of a request.

        A GET request is keyed by its URL alone. A POST request also mixes
        in the posted data, so that it is never answered with the cached
        GET page of the same URL.
        """
        if postdata is None:
            return self._hash(url)
        return self._hash(self._hash(url) + self._hash(postdata))

    def _is_cached(self,key):
        """Return True when caching is enabled and ``key`` is in the cache."""
        cacher = self.cacher
        if cacher is None:
            return False
        # Prefer has_key(): a cacher may implement it without supporting
        # the ``in`` operator.
        if hasattr(cacher,'has_key'):
            return cacher.has_key(key)
        return key in cacher

    def build_opener(self,url,postdata=None,extraheaders=None,forbid_redirect=False):
        """Build the request and the opener for ``url``.

        @param url: The URL to open.
        @param postdata: Data to POST, or None for a GET request.
        @param extraheaders: Dict of HTTP headers that are added to (or
            override) the default request headers.
        @param forbid_redirect: Use HTTPNoRedirector instead of the standard
            redirect handler.
        @return: A (request, opener) tuple.
        """
        txheaders = {
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        }
        if extraheaders:
            txheaders.update(extraheaders)
        # Passing postdata to Request is what turns the request into a POST.
        req = urllib2.Request(url, postdata, txheaders)
        if forbid_redirect:
            redirector = HTTPNoRedirector()
        else:
            redirector = urllib2.HTTPRedirectHandler()

        http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
        https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)

        u = urllib2.build_opener(http_handler,https_handler,urllib2.HTTPCookieProcessor(self.cookies),redirector)
        u.addheaders = [('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; hu-HU; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
        return (req,u)

    def _read_body(self,response,fd,onprogress):
        """Read ``response`` in 1KB chunks.

        The chunks are written to ``fd`` when it is given, otherwise they
        are collected. Returns the collected data (empty when ``fd`` was
        used).
        """
        length = int(response.headers.get('content-length',0))
        dlength = 0
        chunks = []
        while True:
            chunk = response.read(1024)
            dlength += len(chunk)
            if chunk:
                if fd:
                    fd.write(chunk)
                else:
                    chunks.append(chunk)
            # onprogress is also called for the final, empty read.
            if onprogress:
                onprogress(length,dlength)
            if not chunk:
                break
        # Joining once avoids re-copying the whole body for every chunk.
        return b''.join(chunks)

    def download(self,url,postdata=None,extraheaders=None,forbid_redirect=False,
            trycount=None,fd=None,onprogress=None,only_head=False):
        """Download an URL with GET or POST methods.

        @param url: The URL to download.
        @param postdata: It can be a string that will be POST-ed to the URL.
            When None is given, the method will be GET instead.
        @param extraheaders: You can add/modify HTTP headers with a dict here.
        @param forbid_redirect: Set this flag if you do not want to handle
            HTTP 301 and 302 redirects.
        @param trycount: Specify the maximum number of retries here.
            0 means no retry on error. Using -1 means infinite retrying.
            None means the default value (that is self.trycount).
        @param fd: You can pass a file descriptor here. In this case,
            the data will be written into the file. Please note that
            when you save the raw data into a file then it won't be cached.
        @param onprogress: A function that has two parameters:
            the size of the resource and the downloaded size. This will be
            called for each 1KB chunk. (If the HTTP header does not contain
            the content-length field, then the size parameter will be zero!)
        @param only_head: Create the openerdirector and return it. In other
            words, this will not retrieve any content except HTTP headers.

        @return: The raw HTML page data, unless fd was specified. When fd
            was given, the return value is undefined.
        @raise urllib2.URLError: The request still fails after the allowed
            number of retries.
        """
        if trycount is None:
            trycount = self.trycount
        key = self._cache_key(url,postdata)
        cnt = 0
        while True:
            try:
                # NOTE: a cache hit is returned as is, even when fd or
                # only_head was given.
                if self._is_cached(key):
                    return self.cacher[key]
                req,u = self.build_opener(url,postdata,extraheaders,forbid_redirect)
                openerdirector = u.open(req)
                if self.debug:
                    print("%s %s" % (req.get_method(),url))
                    print("%s %s" % (openerdirector.code,openerdirector.msg))
                    print(openerdirector.headers)
                if only_head:
                    return openerdirector
                data = self._read_body(openerdirector,fd,onprogress)
                if (not fd) and (self.cacher is not None):
                    self.cacher[key] = data
                return data
            except urllib2.URLError:
                cnt += 1
                if (trycount > -1) and (trycount < cnt):
                    raise
                # Retry :-)
                if self.debug:
                    print("MozillaEmulator: urllib2.URLError, retryting  %s" % cnt)

    def post_multipart(self,url,fields, files, forbid_redirect=True):
        """Post fields and files to an http host as multipart/form-data.

        fields is a sequence of (name, value) elements for regular form fields.
        files is a sequence of (name, filename, value) elements for data to be uploaded as files
        Return the server's response page.
        """
        content_type, post_data = encode_multipart_formdata(fields, files)
        result = self.download(url,post_data,{
            'Content-Type': content_type,
            'Content-Length': str(len(post_data))
        },forbid_redirect=forbid_redirect)
        return result

class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
    """This is a custom http redirect handler that FORBIDS redirection.

    Instead of following a 301/302 response it raises urllib2.HTTPError.
    The error carries the redirect target in its ``newurl`` attribute
    (None when the response names no target).
    """

    @staticmethod
    def _redirect_target(headers):
        """Return the first 'location' (or else 'uri') header value, or None."""
        for name in ('location','uri'):
            if name in headers:
                if hasattr(headers,'getheaders'):
                    return headers.getheaders(name)[0]
                # Header objects without getheaders() are expected to offer
                # the email.message style get_all() instead.
                return headers.get_all(name)[0]
        return None

    def http_error_302(self, req, fp, code, msg, headers):
        """Raise HTTPError for the redirect response instead of following it.

        @raise urllib2.HTTPError: Always.
        """
        e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        if e.code in (301,302):
            # Always assign newurl: a redirect without a target header must
            # still surface as HTTPError, not as an unbound local variable.
            e.newurl = self._redirect_target(headers)
        raise e

    # The base class binds http_error_301 to its own 302 handler, so the
    # override above does not cover 301 unless it is re-bound here.
    http_error_301 = http_error_302

def encode_multipart_formdata(fields, files):
    """Encode form fields and files as a multipart/form-data request body.

    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be uploaded as files
    Return (content_type, body) ready for httplib.HTTP instance

    Every part is converted to a byte string (text is encoded as UTF-8), so
    text fields and binary file contents can be mixed in one body.
    """
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'

    def to_bytes(part):
        """Return ``part`` unchanged if it is a byte string, else UTF-8 encoded."""
        if isinstance(part, bytes):
            return part
        return part.encode('utf-8')

    L = []
    for (key, value) in fields:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"' % key)
        # An empty line separates the part headers from the part content.
        L.append('')
        L.append(value)
    for (key, filename, value) in files:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
        L.append('Content-Type: %s' % get_content_type(filename))
        L.append('')
        L.append(value)
    L.append('--' + BOUNDARY + '--')
    # Terminate the closing boundary line with CRLF as well.
    L.append('')
    body = to_bytes(CRLF).join(to_bytes(part) for part in L)
    content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
    return content_type, body

def get_content_type(filename):
    """Return the MIME type guessed from ``filename``.

    Falls back to 'application/octet-stream' when no type can be guessed.
    """
    guessed = mimetypes.guess_type(filename)[0]
    if guessed:
        return guessed
    return 'application/octet-stream'


# Usage example. It runs only when the file is executed as a script, so that
# importing the module does not perform any network request.
# NOTE(review): the URLs and the upload field names below are placeholders;
# the real values are not present in this copy of the file -- replace them.
if __name__ == '__main__':
    dl = MozillaEmulator()
    # Make sure that we get cookies from the server before logging in
    frontpage = dl.download("https://example.com/login.php")
    # Sign in POST
    post_data = "action=sign_in&username=user1&password=pwd1"
    page = dl.download("https://example.com/sign_in.php",post_data)
    if "Welcome" in page:
        # Send a file
        with open("inventory.txt","rb") as f:
            fdata = f.read()
        dl.post_multipart("https://example.com/upload_file.php",
            [('uploadType','Inventory')],
            [('uploadFileName','inventory.txt',fdata)])

阅读(1774) | 评论(0) | 转发(0) |