Chinaunix首页 | 论坛 | 博客
  • 博客访问: 4489307
  • 博文数量: 192
  • 博客积分: 10014
  • 博客等级: 上将
  • 技术积分: 8232
  • 用 户 组: 普通用户
  • 注册时间: 2006-07-21 00:22
文章分类

全部博文(192)

文章存档

2011年(4)

2009年(14)

2008年(174)

我的朋友

分类: Python/Ruby

2008-09-22 17:25:38

 
 

import os
import md5
import urllib
import urllib2
import mimetypes
#from gzip import GzipFile
import cStringIO
from cPickle import loads,dumps
import cookielib


class MozillaCacher(object):
    """A dictionary like object, that can cache results on a storage device."""
    def __init__(self,cachedir='.cache'):
        self.cachedir = cachedir
        if not os.path.isdir(cachedir):
            os.mkdir(cachedir)
    def name2fname(self,name):
        return os.path.join(self.cachedir,name)
    def __getitem__(self,name):
        if not isinstance(name,str):
            raise TypeError()
        fname = self.name2fname(name)
        if os.path.isfile(fname):
            return file(fname,'rb').read()
        else:
            raise IndexError()
    def __setitem__(self,name,value):
        if not isinstance(name,str):
            raise TypeError()
        fname = self.name2fname(name)
        if os.path.isfile(fname):
            os.unlink(fname)
        f = file(fname,'wb+')
        try:
            f.write(value)
        finally:
            f.close()
    def __delitem__(self,name):
        if not isinstance(name,str):
            raise TypeError()
        fname = self.name2fname(name)
        if os.path.isfile(fname):
            os.unlink(fname)
    def __iter__(self):
        raise NotImplementedError()
    def has_key(self,name):
        return os.path.isfile(self.name2fname(name))

class MozillaEmulator(object):
    def __init__(self,cacher={},trycount=0):
        """Create a new MozillaEmulator object.

        @param cacher: A dictionary like object, that can cache search results on a storage device.
            You can use a simple dictionary here, but it is not recommended.
            You can also put None here to disable caching completely.
        @param trycount: The download() method will retry the operation if it fails. You can specify -1 for infinite retrying.
                A value of 0 means no retrying. A value of 1 means one retry. etc."
""
        self.cacher = cacher
        self.cookies = cookielib.CookieJar()
        self.debug = False
        self.trycount = trycount
    def _hash(self,data):
        h = md5.new()
        h.update(data)
        return h.hexdigest()

    def build_opener(self,url,postdata=None,extraheaders={},forbid_redirect=False):
        txheaders = {
            'Accept':'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language':'en,hu;q=0.8,en-us;q=0.5,hu-hu;q=0.3',
# 'Accept-Encoding': 'gzip, deflate',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
# 'Keep-Alive': '300',
# 'Connection': 'keep-alive',
# 'Cache-Control': 'max-age=0',
        }
        for key,value in extraheaders.iteritems():
            txheaders[key] = value
        req = urllib2.Request(url, postdata, txheaders)
        self.cookies.add_cookie_header(req)
        if forbid_redirect:
            redirector = HTTPNoRedirector()
        else:
            redirector = urllib2.HTTPRedirectHandler()

        http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
        https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)

        u = urllib2.build_opener(http_handler,https_handler,urllib2.HTTPCookieProcessor(self.cookies),redirector)
        u.addheaders = [('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; hu-HU; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
        if not postdata is None:
            req.add_data(postdata)
        return (req,u)

    def download(self,url,postdata=None,extraheaders={},forbid_redirect=False,
            trycount=None,fd=None,onprogress=None,only_head=False):
        """Download an URL with GET or POST methods.

        @param postdata: It can be a string that will be POST-ed to the URL.
            When None is given, the method will be GET instead.
        @param extraheaders: You can add/modify HTTP headers with a dict here.
        @param forbid_redirect: Set this flag if you do not want to handle
            HTTP 301 and 302 redirects.
        @param trycount: Specify the maximum number of retries here.
            0 means no retry on error. Using -1 means infinite retring.
            None means the default value (that is self.trycount).
        @param fd: You can pass a file descriptor here. In this case,
            the data will be written into the file. Please note that
            when you save the raw data into a file then it won't be cached.
        @param onprogress: A function that has two parameters:
            the size of the resource and the downloaded size. This will be
            called for each 1KB chunk. (If the HTTP header does not contain
            the content-length field, then the size parameter will be zero!)
        @param only_head: Create the openerdirector and return it. In other
            words, this will not retrieve any content except HTTP headers.

        @return: The raw HTML page data, unless fd was specified. When fd
            was given, the return value is undefined.
        "
""
        if trycount is None:
            trycount = self.trycount
        cnt = 0
        while True:
            try:
                key = self._hash(url)
                if (self.cacher is None) or (not self.cacher.has_key(key)):
                    req,u = self.build_opener(url,postdata,extraheaders,forbid_redirect)
                    openerdirector = u.open(req)
                    if self.debug:
                        print req.get_method(),url
                        print openerdirector.code,openerdirector.msg
                        print openerdirector.headers
                    self.cookies.extract_cookies(openerdirector,req)
                    if only_head:
                        return openerdirector
                    if openerdirector.headers.has_key('content-length'):
                        length = long(openerdirector.headers['content-length'])
                    else:
                        length = 0
                    dlength = 0
                    if fd:
                        while True:
                            data = openerdirector.read(1024)
                            dlength += len(data)
                            fd.write(data)
                            if onprogress:
                                onprogress(length,dlength)
                            if not data:
                                break
                    else:
                        data = ''
                        while True:
                            newdata = openerdirector.read(1024)
                            dlength += len(newdata)
                            data += newdata
                            if onprogress:
                                onprogress(length,dlength)
                            if not newdata:
                                break
                        #data = openerdirector.read()
                        if not (self.cacher is None):
                            self.cacher[key] = data
                else:
                    data = self.cacher[key]
                #try:
                # d2= GzipFile(fileobj=cStringIO.StringIO(data)).read()
                # data = d2
                #except IOError:
                # pass
                return data
            except urllib2.URLError:
                cnt += 1
                if (trycount > -1) and (trycount < cnt):
                    raise
                # Retry :-)
                if self.debug:
                    print "MozillaEmulator: urllib2.URLError, retryting ",cnt


    def post_multipart(self,url,fields, files, forbid_redirect=True):
        """Post fields and files to an http host as multipart/form-data.
        fields is a sequence of (name, value) elements for regular form fields.
        files is a sequence of (name, filename, value) elements for data to be uploaded as files
        Return the server's response page.
        "
""
        content_type, post_data = encode_multipart_formdata(fields, files)
        result = self.download(url,post_data,{
            'Content-Type': content_type,
            'Content-Length': str(len(post_data))
        },forbid_redirect=forbid_redirect
        )
        return result


class HTTPNoRedirector(urllib2.HTTPRedirectHandler):
    """This is a custom http redirect handler that FORBIDS redirection."""
    def http_error_302(self, req, fp, code, msg, headers):
        e = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
        if e.code in (301,302):
            if 'location' in headers:
                newurl = headers.getheaders('location')[0]
            elif 'uri' in headers:
                newurl = headers.getheaders('uri')[0]
            e.newurl = newurl
        raise e


           
def encode_multipart_formdata(fields, files):
    """
    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be uploaded as files
    Return (content_type, body) ready for httplib.HTTP instance
    "
""
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    CRLF = '\r\n'
    L = []
    for (key, value) in fields:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"' % key)
        L.append('')
        L.append(value)
    for (key, filename, value) in files:
        L.append('--' + BOUNDARY)
        L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
        L.append('Content-Type: %s' % get_content_type(filename))
        L.append('')
        L.append(value)
    L.append('--' + BOUNDARY + '--')
    L.append('')
    body = CRLF.join(L)
    content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
    return content_type, body

def get_content_type(filename):
    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'


# HOW TO USE

dl = MozillaEmulator()
# Make sure that we get cookies from the server before logging in
frontpage = dl.download("")
# Sign in POST
post_data = "action=sign_in&username=user1&password=pwd1"
page = dl.download("",post_data)
if "Welcome" in page:
    # Send a file
    fdata = file("inventory.txt","rb").read()
    dl.post_multipart('',
                    [('uploadType','Inventory'),('otherfield','othervalue')],
                    [('uploadFileName','inventory.txt',fdata)]
                )

阅读(1759) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~