Chinaunix首页 | 论坛 | 博客
  • 博客访问: 205388
  • 博文数量: 48
  • 博客积分: 1935
  • 博客等级: 上尉
  • 技术积分: 491
  • 用 户 组: 普通用户
  • 注册时间: 2010-07-29 00:59
文章分类

全部博文(48)

文章存档

2011年(1)

2010年(47)

我的朋友

分类: Python/Ruby

2010-09-21 00:39:38

[python]抓取新浪微博
2010-03-02 20:11
说明：本脚本抓取的内容不包含“我的首页”里的信息。
#encoding=cp936
from urllib import urlencode,urlopen
from urllib2 import build_opener,HTTPCookieProcessor
from cookielib import CookieJar
from re import findall

class crawler(object):
    """Small cookie-aware HTTP fetcher that remembers the last page it read.

    Attributes:
        opener: urllib2-style opener with an attached cookie jar; replaced by
            a fresh one on every call to post().
        data: raw body of the page most recently downloaded by getWebcode()
            or getwebcode(); an empty string until a page has been fetched.
    """

    def __init__(self):
        # Create both attributes up front so getWebcode(), fetchall() and
        # webcode() no longer raise AttributeError when they are called
        # before post() or before any page has been downloaded.
        self.opener = build_opener(HTTPCookieProcessor(CookieJar()))
        self.data = ""

    def post(self, url, headers, body):
        """POST url-encoded `body` to `url` and return the response body.

        A new cookie jar and opener are created on each call, so cookies
        from an earlier post() are discarded. `headers` is a list of
        (name, value) pairs installed as the opener's default headers.
        """
        cj = CookieJar()
        self.opener = build_opener(HTTPCookieProcessor(cj))
        self.opener.addheaders = headers
        return self.opener.open(url, urlencode(body)).read()

    def getWebcode(self, url):
        """Fetch `url` through the cookie-aware opener and store it in data."""
        self.data = self.opener.open(url).read()

    def getwebcode(self, url):
        """Fetch `url` with plain urlopen (no cookies) and store it in data."""
        self.data = urlopen(url).read()

    def fetchall(self, patt):
        """Return re.findall(patt, data) for the most recently stored page."""
        return findall(patt, self.data)

    def webcode(self):
        """Return the most recently stored page body."""
        return self.data

# listit(url, c, f): dump one microblog page to `f`, then follow the
# next-page link recursively.
#
#   url -- address of the page to download.
#   c   -- object providing getWebcode(), fetchall() and webcode(); the
#          script's entry point passes a crawler instance.
#   f   -- writable file-like object; the entry point passes the opened
#          "microblog.htm" file.
#
# Steps visible in the code:
#   1. c.getWebcode(url) downloads the page into c.
#   2. The first match of a capture pattern is decoded from UTF-8 and
#      re-encoded as gb2312; if anything raises, the decoded unicode text is
#      used instead.
#   3. The text is split into items. For each item the type="N" attribute is
#      read, a date is extracted (type "3" has its own branch), printed and
#      written to f followed by ":\n"; then the message texts and pictures
#      found in the item are printed and written to f.
#   4. Any exception raised while handling an item is swallowed by the bare
#      `except: pass`, so a malformed item is skipped silently.
#   5. The page (re-encoded to gbk) is searched for a next-page link; when
#      one is found, listit calls itself with that link.
#
# NOTE(review): several string/regex literals below look truncated or empty
# (e.g. split(""), the date and picture patterns) -- presumably the HTML
# markup inside them was stripped when this post was extracted. They must be
# restored from the original script before this function can run; confirm
# against the author's source.
def listit(url,c,f):
...c.getWebcode(url)
...try:
......webcode=unicode(c.fetchall(r"""
    ([\S\s]+)
""")[0],"utf8").encode("gb2312")
...except:
......webcode=unicode(c.fetchall(r"""
    ([\S\s]+)
""")[0],"utf8")
...for item in webcode.split(""):
......try:
.........typex=findall("""type=\"(\d+)\"""",item)[0]
.........if typex!="3":
............datex=findall(""" ............date=datex.split(">")[1]
.........else:
............datex=findall("""
............date=datex.split(">")[1]
.........print date
.........try:
............try:
...............f.write(date.encode("gb2312"))
............except:
...............f.write(date.encode("gbk"))
.........except:
............f.write(unicode(date,"gb2312").encode("gb2312"))
.........f.write(":\n")
.........sms=findall("""

.........if sms!=[]:
............for sm in sms:
...............data=sm.split("\" type=\"",1)[1][3:]
...............print "

"+data
...............f.write("

")
...............try:
..................try:
.....................f.write(data.encode("gb2312"))
..................except:
.....................f.write(data.encode("gbk"))
...............except:
..................f.write(unicode(data,"gb2312").encode("gb2312"))
...............f.write("
\n")
.........pics=findall("""""",item)
.........if pics!=[]:
............for pic in pics:
...............print pic
...............f.write("")
...............f.write("
\n")
......except:
.........pass

...next_page=findall(r"""
下一 页""",unicode(c.webcode(),"utf8").encode("gbk"))
...if next_page!=[]:
......print "next:",next_page[0]
......listit(next_page[0],c,f)

if __name__=="__main__":
    # Interactive entry point: ask for credentials and the microblog address,
    # log in through a crawler, then append the crawled posts to
    # "microblog.htm" in the current directory.
    account = raw_input("Account:")
    # NOTE(review): raw_input echoes the password to the terminal; consider
    # getpass.getpass if that matters for your environment.
    password = raw_input("Password:")
    mic = raw_input("Microblog:")

    c = crawler()
    # NOTE(review): the Referer value and the login URL passed to c.post()
    # below have no scheme/host -- presumably truncated when the post was
    # extracted; confirm against the author's original script.
    headers = [
        ("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"),
        ("Referer","login.php?url=http%3A%2F%2Ft.sina.com.cn%2Fi55m411"),
    ]
    # Form fields sent with the login request.
    body = (
        ("service","miniblog"),
        ("client","ssologin.js(v1.3.5)"),
        ("entry","miniblog"),
        ("encoding","utf-8"),
        ("gateway","1"),
        ("savestate","7"),
        ("from",""),
        ("useticket","0"),
        ("username",account),
        ("password",password),
        ("url","ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"),
        ("returntype","META"),
    )
    c.post("(v1.3.5)",headers,body)

    f = open("microblog.htm","a+")
    try:
        listit(mic,c,f)
    finally:
        # Close the output file even when the crawl fails part-way, so the
        # handle is not leaked and already-written posts are flushed.
        f.close()
    # Keep the console window open until the user presses Enter.
    raw_input("All done!")

阅读(2882) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~