#coding: utf-8
#import pickle
import codecs
import copy
import re
from urllib2 import build_opener, HTTPCookieProcessor
from cookielib import CookieJar
from ClientForm import ParseFile
from BeautifulSoup import BeautifulSoup
class Browser:
    '''A simple browser supporting filling forms and handling cookies.

    The cookie jar and the opener are class attributes, so every Browser
    instance shares the same cookies.

    Instance attributes:
        page  -- unicode source of the current page.
        url   -- URL of the current page (u'blank' before any navigation).
        forms -- result of ClientForm.ParseFile for the current page, or
                 None when no fetched page is loaded.
    '''
    # Scratch file (relative to the working directory) that go() dumps the
    # raw page into and getForms() parses the forms from.
    FORM_DUMP_FILE = 'feedform'

    cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cj))

    def __init__(self):
        self.page = u'no page'
        self.url = u'blank'
        self.forms = None

    def go(self, url='blank'):
        '''Load url and make it the current page.

        url should be a str or urllib2.Request object. The special value
        'blank' (the default) shows the built-in welcome page without
        opening any connection.

        Raises whatever the opener raises for a failed request, and
        UnicodeDecodeError when the fetched page is not valid UTF-8.
        '''
        if url == 'blank':
            # Nothing to fetch: reset to the built-in page and stop here
            # instead of handing the pseudo-URL to the opener.
            self.page = u'welcome to Browser'
            self.url = u'blank'
            self.forms = None
            return
        response = self.opener.open(url)
        try:
            self.url = response.geturl()
            raw_page = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        self._dump_file(raw_page, self.FORM_DUMP_FILE)
        self.page = raw_page.decode('utf-8')
        # One parenthesised argument, so the output is the same whether
        # print is a statement or a function.
        print("=>  %s" % self.url)
        self.forms = self.getForms()

    def show(self, codeset='utf-8'):
        '''Return the source of the current page.

        codeset is currently not used: the page is returned exactly as it
        is stored in self.page.
        '''
        return self.page

    def getForms(self):
        '''Parse, store and return all forms of the current page.

        The forms are read back from the dump file written by go(), so the
        result only reflects the last page loaded through go().
        '''
        fp = open(self.FORM_DUMP_FILE, 'r')
        try:
            self.forms = ParseFile(fp, self.url, backwards_compat=False)
        finally:
            fp.close()
        return self.forms

    def setForm(self, position=0, **kargs):
        """Fill in controls of the form at index position.

        Each keyword argument names a control and gives its new value.

        Example:
            browser.setForm(0, user='jason', password='secret')
        """
        for key, value in kargs.items():
            self.forms[position][key] = value

    def submitForm(self, position=0):
        """Submit the form at index position and load the result via go()."""
        self.go(self.forms[position].click())

    def _copy_response(self, response):
        """Return a shallow copy of a response-like object."""
        return copy.copy(response)

    def _dump_file(self, html, filename='temp'):
        """Write html to the file named filename, replacing its contents."""
        fp = open(filename, 'w')
        try:
            fp.write(html)
        finally:
            fp.close()
class BaseNavigator:
    """Bare-bones abstract navigator that concrete site navigators extend."""
    # A single Browser is built when the class body runs and is shared by
    # the class and all of its subclasses; no per-instance browser is made.
    browser = Browser()

    def login(self):
        """Perform the login process for the website.

        Does nothing here; subclasses should override this method.
        """
class XnNavigator(BaseNavigator):
    """Navigator for xiaonei.com.

    Logs in with the configured account and then moves from profile to
    profile by following the buddy links found on each page it loads.
    """
    # NOTE(review): these URL constants look truncated (no scheme or host,
    # and the profile prefix ends in "?=" while the buddy pattern matches
    # "id=") -- confirm the real values before running.
    URL_LOGIN = ""
    URL_FRIEND_REX = r"^\.com/profile\.do\?(portal=.+&)?id=\d+$"
    URL_GET_PROFILE_PRIFIX = ".com/profile.do?="

    def __init__(self, settings):
        """Read the configuration.

        settings -- mapping with the keys 'USER' (value for the login
                    form's email field), 'PSW' (password) and 'PRINT_LOG'
                    (True to print progress messages).

        Raises KeyError when one of the keys is missing.
        """
        self.USER = settings['USER']
        self.PSW = settings['PSW']
        self.PRINT_LOG = settings['PRINT_LOG']
        self.root = (None,)
        self.visited = []   # buddies already visited, in visiting order
        self.buddy_li = []  # every distinct buddy seen so far

    def login(self):
        """Login to Xiaonei.com with the account given in the settings."""
        self._pl("Login to Xiaonei...")
        self.browser.go(self.URL_LOGIN)
        # Credentials come from the settings, never from the source code.
        self.browser.setForm(email=self.USER, password=self.PSW)
        self.browser.submitForm()
        self._pl("Login done.")

    def start(self):
        """Log in and keep visiting buddies that were not visited yet.

        Each round parses the buddy links of the current page, records new
        ones in self.buddy_li and visits one buddy that is not yet in
        self.visited, preferring buddies listed on the current page.
        Returns once no unvisited buddy is known.
        """
        self.login()
        self._pl("Start travelling...")
        while True:
            page_buddies = self._parse_buddy(self.browser.show())
            # Record each buddy once so the list does not grow without
            # bound while the same pages are parsed again and again.
            for found in page_buddies:
                if found not in self.buddy_li:
                    self.buddy_li.append(found)
            buddy = self._pick_unvisited(page_buddies)
            if buddy is None:
                # Dead end on this page: go back to buddies seen earlier.
                buddy = self._pick_unvisited(self.buddy_li)
            if buddy is None:
                self._pl("No unvisited buddy left.")
                return
            self.browser.go(self.URL_GET_PROFILE_PRIFIX + buddy[0])
            self._pl("Visited at %s %s ." % (buddy[0], buddy[1]))
            self.visited.append(buddy)

    def stop(self):
        """Stop navigation. Not implemented yet."""

    # TODO: maybe this method belongs in a separate Buddy class.
    def leave_message(self, message, user_id):
        """Leave a message to a specific user. Not implemented yet.

        Beware that you should not bother your buddies when using this.
        """

    def _pl(self, message):
        """Print message when logging is enabled (_pl = _print_log)."""
        # Only the exact value True enables logging.
        if self.PRINT_LOG is True:
            print(message)

    def _pick_unvisited(self, candidates):
        """Return the last buddy of the list candidates that is not in
        self.visited, or None when all of them were visited."""
        for buddy in reversed(candidates):
            if buddy not in self.visited:
                return buddy
        return None

    def _parse_buddy(self, html):
        """Parse an HTML page and return the buddies linked from it.

        html -- str or unicode containing the HTML source.

        Returns a list of (id, name) tuples such as
        ('4154512', 'Jason Bourne'): the id is the text after the last '='
        of the link target and the name is the first child of the link.
        """
        soup = BeautifulSoup(html)
        # There is no error checking here: any parser error propagates.
        links = soup.findAll('a', href=re.compile(self.URL_FRIEND_REX))
        # Links without contents carry no name and are skipped.
        return [(tag['href'].split('=')[-1], tag.contents[0])
                for tag in links if tag.contents]
if __name__ == '__main__':
    # Imported here because they are only needed when run as a script.
    import getpass
    import sys

    # The account is supplied at run time so that no credentials have to
    # be written into this file: user name as the first command-line
    # argument, password via an interactive prompt that does not echo.
    if len(sys.argv) < 2:
        sys.exit("usage: %s USERNAME" % sys.argv[0])
    settings = {'USER': sys.argv[1],
                'PSW': getpass.getpass("Password: "),
                'PRINT_LOG': True}  # whether to print the log
    xn = XnNavigator(settings)
    xn.start()