#!/usr/bin/python
#encoding=utf-8
# Before using, find and update the username and password below
import cookielib, urllib2, urllib, sys, time
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup # For processing HTML
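# Note: this script targets Python 2 and the legacy BeautifulSoup 3 API.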
def formalize(text):
    # Strip blank lines and separate the remaining lines with blank lines (helper, currently unused).
    result = ''
    lines = text.split(u'\n')
    for line in lines:
        line = line.strip()
        if len(line) == 0:
            continue
        result += line + u'\n\n'
    return result
# Log in to Xiaonei
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
exheaders = [("User-Agent","Mozilla/4.0 (compatible; MSIE 7.1; Windows NT 5.1; SV1)"),]
opener.addheaders=exheaders
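# All requests made through this opener share the cookie jar, so the login session carries over to the posting requests below.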
url_login = '' #TODO: set the Xiaonei login URL (left blank here)
body = (('email','xxxxx@gmail.com'), ('password','*********')) #TODO: replace with your own email and password
print "ERROR! you need to update the password to be successful!"
req1 = opener.open(url_login, urllib.urlencode(body)) # after this request the login cookie is stored
# Download Qiushibaike pages and post the items one by one
body = {'relative_optype':'publisher', 'blogControl':'1'}
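# Base form fields for the blog post; 'title' and 'body' are filled in inside the loop below.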
url_post = 'http://blog.xiaonei.com/NewEntry.do'
# Posting loop
count = 0
for i in range(11, 12):
    url = "%d" % i #TODO: prepend the Qiushibaike page URL (only the page number is kept here)
    data = urllib2.urlopen(url).readlines()
    soup = BeautifulSoup("".join(data))
    contents = soup.findAll('div', "content")
    stories = [str(text) for text in contents]
    for story in stories:
        count += 1
        print "processing page %d, %d items added" % (i, count)
        minisoup = BeautifulSoup(story)
        #text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
        #text = urllib.unquote(unescape(text, {'&quot;': '"'}))
        text = str(minisoup)
        #text = text.encode("utf-8")
        title = '糗事-%d' % count
        text += ' 来自糗事百科 '
        body['title'] = title
        body['body'] = text
        req2 = opener.open(url_post, urllib.urlencode(body)) # if nothing went wrong, the entry has been posted