Chinaunix首页 | 论坛 | 博客
  • 博客访问: 438900
  • 博文数量: 161
  • 博客积分: 5005
  • 博客等级: 上校
  • 技术积分: 1090
  • 用 户 组: 普通用户
  • 注册时间: 2008-10-20 16:38
文章分类

全部博文(161)

文章存档

2011年(21)

2010年(33)

2009年(89)

2008年(18)

我的朋友

分类: Python/Ruby

2009-06-27 04:35:12

#coding: utf-8

#import pickle
import codecs
import copy
import re
from urllib2 import build_opener, HTTPCookieProcessor
from cookielib import CookieJar

from ClientForm import ParseFile
from BeautifulSoup import BeautifulSoup

class Browser:
    '''A simple browser supporting filling forms and handling cookies.

    A page is fetched through a cookie-aware urllib2 opener, its raw body is
    dumped to a scratch file, and ClientForm parses the forms back out of
    that file.
    '''
    # Defined on the class, so every Browser instance shares one cookie jar
    # and one opener.
    cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cj))

    # Scratch file written by go() and read back by getForms().
    _FORM_DUMP = 'feedform'

    def __init__(self):
        self.page = u'no page'
        self.url = u'blank'
        self.forms = None

    def go(self, url='blank'):
        '''Load url and make it the current page.

        url should be a str or urllib2.Request object. The special value
        'blank' shows a built-in welcome page without any network access.
        '''
        if url == 'blank':
            # 'blank' is not a fetchable URL, so stop here instead of
            # handing it to the opener.
            self.page = u'welcome to Browser'
            self.url = u'blank'
            self.forms = None
            return
        response = self.opener.open(url)
        try:
            self.url = response.geturl()
            raw_page = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        self._dump_file(raw_page, self._FORM_DUMP)
        self.page = raw_page.decode('utf-8')
        print("=>  %s" % self.url)
        self.forms = self.getForms()

    def show(self, codeset='utf-8'):
        '''Return the current page HTML source.

        codeset is accepted for interface compatibility but is not used:
        the page is returned as decoded by go().
        '''
        return self.page

    def getForms(self):
        '''Parse, store and return all forms of the current page.

        The forms are read from the scratch file written by go().
        '''
        fp = open(self._FORM_DUMP, 'rb')
        try:
            self.forms = ParseFile(fp, self.url, backwards_compat=False)
        finally:
            fp.close()
        return self.forms

    def setForm(self, position=0, **kargs):
        """Assign values to controls of one form on the current page.

        position - index of the form in self.forms.
        kargs - control name / value pairs to set on that form.

        Example:
            browser.setForm(0, user='jason', password='secret')
        """
        for key, value in kargs.items():
            self.forms[position][key] = value

    def submitForm(self, position=0):
        """Submit the form at position and load the resulting page."""
        self.go(self.forms[position].click())

    def _copy_response(self, response):
        """Return a shallow copy of a response-like object."""
        return copy.copy(response)

    def _dump_file(self, html, filename='temp'):
        """Write html to the file named filename, replacing its contents."""
        # Binary mode: go() passes the raw response bytes, which must reach
        # the file untranslated.
        fp = open(filename, 'wb')
        try:
            fp.write(html)
        finally:
            fp.close()


class BaseNavigator:
    """Abstract navigator. Very basic."""
    # Created once when the class is defined, so this Browser is shared by
    # every navigator instance, including instances of subclasses.
    browser = Browser()

    def login(self):
        """Login process for website.

        Subclass should override this method; the base implementation
        does nothing.
        """
        pass


class XnNavigator(BaseNavigator):
    """Navigator for xiaonei.com.

    Logs in, then keeps jumping from the current page to the profile of a
    buddy linked on it.
    """
    # NOTE(review): these three values look truncated (no scheme or host,
    # and the prefix ends in "?=" with no parameter name) -- confirm the
    # intended URLs before running.
    URL_LOGIN = ""
    URL_FRIEND_REX = r"^\.com/profile\.do\?(portal=.+&)?id=\d+$"
    URL_GET_PROFILE_PRIFIX = ".com/profile.do?="

    def __init__(self, settings):
        """Store the settings.

        settings - dict with the keys 'USER', 'PSW' and 'PRINT_LOG'.
        """
        self.USER = settings['USER']
        self.PSW = settings['PSW']
        self.PRINT_LOG = settings['PRINT_LOG']
        self.root = (None,)
        self.visited = []   # buddies already visited, in visiting order
        self.buddy_li = []  # every distinct buddy seen so far

    def login(self):
        """Login to Xiaonei.com with the credentials given in settings."""
        self._pl("Login to Xiaonei...")
        self.browser.go(self.URL_LOGIN)
        # Use the configured account rather than credentials baked into
        # the source.
        self.browser.setForm(email=self.USER, password=self.PSW)
        self.browser.submitForm()
        self._pl("Login done.")

    def start(self):
        """Log in and travel from buddy to buddy.

        Runs until no unvisited buddy is known any more (or until the
        process is killed).
        """
        self.login()

        self._pl("Start travelling...")
        while True:
            page_buddies = self._parse_buddy(self.browser.show())
            for b in page_buddies:
                if b not in self.buddy_li:
                    self.buddy_li.append(b)

            # Prefer a buddy linked from the current page; otherwise fall
            # back to any buddy seen earlier that has not been visited.
            buddy = self._pick_unvisited(page_buddies)
            if buddy is None:
                buddy = self._pick_unvisited(self.buddy_li)
            if buddy is None:
                # Nothing left to visit: stop instead of failing on
                # buddy[0] or revisiting the same profile forever.
                self._pl("No unvisited buddy left, stopping.")
                break

            self.browser.go(self.URL_GET_PROFILE_PRIFIX + buddy[0])
            self._pl("Visited at %s %s ." % (buddy[0], buddy[1]))
            self.visited.append(buddy)

    def stop(self):
        """Stop navigation"""
        pass

    #maybe this function should implement in a class Buddy.
    def leave_message(self, message, user_id):
        """Leave a message to specific user.

        Beware that you should not bother your buddies when use this
        function.
        """
        pass

    def _pl(self, message):
        """_pl stands for _print_log: print message if logging is enabled."""
        if self.PRINT_LOG:
            print(message)

    def _pick_unvisited(self, candidates):
        """Return the last buddy in candidates not yet visited, or None."""
        for candidate in reversed(candidates):
            if candidate not in self.visited:
                return candidate
        return None

    def _parse_buddy(self, html):
        """Parse the HTML page to load the buddy list.

        html - str or unicode type contains html source.
        Returns a list of tuples, one per link whose href matches
        URL_FRIEND_REX and which has contents. Each tuple holds the text
        after the last '=' of the href and the link's first content node,
        like this: ('4154512', 'Jason Bourne')
        """
        soup = BeautifulSoup(html)
        # ok, there is no error-checking at all
        links = soup.findAll('a', href=re.compile(self.URL_FRIEND_REX))
        return [(tag['href'].split('=')[-1], tag.contents[0])
                for tag in links if tag.contents]  # only retain those with contents



if __name__ == '__main__':
    import os

    # Credentials are taken from the environment so that no account data
    # has to be written into the source file.
    username = os.environ.get('XN_USER')
    password = os.environ.get('XN_PSW')
    if not username or not password:
        raise SystemExit("Set the XN_USER and XN_PSW environment variables "
                         "before running.")

    setting = {'USER': username,
               'PSW': password,
               'PRINT_LOG': True}  # whether print log
    xn = XnNavigator(setting)
    xn.start()

阅读(1060) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~