【Python】统计个人新浪微博词频并给出相应的柱状图-LaoLiulaoliu-ChinaUnix博客

miraclemiracle.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

laoliulaoliu

博客访问： 4663513
博文数量： 1214
博客积分： 13195
博客等级：上将
技术积分： 9105
用户组：普通用户
注册时间： 2007-01-19 14:41

个人简介

C++,python,热爱算法和机器学习

文章分类

全部博文（1214）

cloud（3）
operation（9）
tornado（4）
mac_os（1）
golang（4）
架构（13）
git（4）
security（29）
shell（1）
macbook（1）
ruby（13）
javascript（15）
design（3）
testing（1）
mac（1）
bigdata（69）
nosql（46）
R（9）
gcj/acm（6）
NLP（10）
小说（3）
matlab（4）
web（44）
java（66）
product（7）
c#（1）
language（4）
machine learning（76）
science（4）
opencourse（2）
windows（3）
search（33）
algorithm（65）
database（51）
compiler（11）
ACE（5）
poem（1）
programming（29）
python（140）
assembly（1）
linux（49）
C++（16）
book（2）
cate（1）
phliosophy（3）
mental（30）
Science fiction（1）
Software（5）
c（23）
network（65）
CS（15）
thinking（10）
BSD（13）
solaris10（2）
life（57）
Debian（16）
economy（7）
Mathematics（57）
OS（8）
ibm（2）
gentoo（32）
未分配的博文（8）

文章存档

2021年（13）

2020年（49）

2019年（14）

2018年（27）

2017年（69）

2016年（100）

2015年（106）

2014年（240）

2013年（5）

2012年（193）

2011年（155）

2010年（93）

2009年（62）

2008年（51）

2007年（37）

我的朋友

相关博文

【Python】统计个人新浪微博词频并给出相应的柱状图

分类： Python/Ruby

2014-01-19 13:30:02

文章来源：http://blog.sina.com.cn/s/blog_a6c0d4220101c0ge.html
本文介绍如何进行个人新浪微博词频统计，并给出相应的柱状图分析，编程环境为Python 2.7。该文主要包括三个部分：新浪微博API的使用、文本过滤及分词和词频统计。

一、新浪微博API的使用

首先在新浪微博开放平台http://open.weibo.com/development/上申请开发者账号，获取个人APP_KEY和APP_SECRET，下载并安装Python SDK。本文介绍的方法无需每次验证，直接运行即可。

# -*- coding: UTF-8 -*-

from weibo import APIClient

from re import split

import urllib,httplib

import webbrowser

import operator

import numpy as np

import matplotlib.pyplot as plt

class iWInsightor(object):
    """Small convenience wrapper around the Sina Weibo ``APIClient``.

    Creating an instance logs in immediately: ``__init__`` builds the API
    client, posts the account/password to the OAuth2 authorize endpoint and
    stores the resulting access token on ``self.client``.  The remaining
    methods are thin helpers that query the API and either return plain
    Python values, append text to a file, or draw a matplotlib chart.
    """

    def __init__(self, ID, PW):
        """Store the credentials, build the API client and authorize it.

        Args:
            ID: Weibo account name used for the login form.
            PW: Weibo password used for the login form.
        """
        self.ACCOUNT = ID
        self.PASSWORD = PW
        self.CALLBACK_URL = ''
        self.APP_KEY = 'XXXXXXX'  # replace with your own application key
        self.APP_SECRET = 'XXXXXX'  # replace with your own application secret
        self.client = APIClient(app_key=self.APP_KEY, app_secret=self.APP_SECRET, redirect_uri=self.CALLBACK_URL)
        self.url = self.client.get_authorize_url()
        self.get_Authorization()

    def get_code(self):
        """Submit the authorize form and return the OAuth2 authorization code.

        The code is read from the ``code`` query parameter of the
        ``location`` header of the response.

        Returns:
            The authorization code string.

        Raises:
            ValueError: if the response has no ``location`` header or the
                header carries no ``code`` parameter (for example because
                the login was rejected).
        """
        postdata = urllib.urlencode({'client_id':self.APP_KEY,'response_type':'code','redirect_uri':self.CALLBACK_URL,'action':'submit','userId':self.ACCOUNT,'passwd':self.PASSWORD,'isLoginSina':0,'from':'','regCallback':'','state':'','ticket':'','withOfficalFlag':0})
        headers = {'Referer':self.url,'Content-Type': 'application/x-www-form-urlencoded'}
        conn = httplib.HTTPSConnection('api.weibo.com')
        try:
            conn.request('POST', '/oauth2/authorize', postdata, headers)
            location = conn.getresponse().getheader('location')
        finally:
            # Always release the connection, even when the request fails.
            conn.close()
        if location:
            # Pick the "code" parameter by name instead of assuming it is
            # the first "=" in the URL.
            for pair in location.partition('?')[2].split('&'):
                name, _, value = pair.partition('=')
                if name == 'code' and value:
                    return value
        raise ValueError('authorization failed: no code in redirect location %r' % (location,))

    def get_Authorization(self):
        """Exchange the authorization code for an access token and install it on the client."""
        code = self.get_code()
        r = self.client.request_access_token(code)
        self.client.set_access_token(r.access_token, r.expires_in)

    # Post a status update.
    def post_weibo(self, message):
        """Publish ``message`` as a new status.

        Args:
            message: byte string that is decoded as GBK before being sent.
        """
        self.client.post.statuses__update(status=message.decode('gbk'))

    # Fetch the id of the currently authorized user.
    def getCurrentUid(self):
        """Return the uid of the authorized user, or ``None`` if the lookup fails.

        Any failure is reported on stdout as 'get userid failed'.
        """
        try:
            return self.client.account.get_uid.get()['uid']
        except Exception:
            # Best effort by design: report the failure and return None.
            print('get userid failed')
            return None

    # Fetch the accounts a user follows.
    def getFocus(self, userid):
        """Return ``(screen_name, gender)`` tuples for the accounts ``userid`` follows.

        Up to 200 accounts are requested.  If an entry lacks one of the two
        fields, 'get focus failed' is printed and ``None`` is returned.
        """
        focuses = self.client.get.friendships__friends(uid=userid, count=200)
        Resfocus = []
        for focus in focuses["users"]:
            try:
                Resfocus.append((focus["screen_name"], focus["gender"]))
            except (KeyError, TypeError):
                print('get focus failed')
                return None
        return Resfocus

    # Fetch a user's tags.
    def getTags(self, userid):
        """Return the tag values of ``userid`` ordered by descending weight.

        Each tag is a mapping holding a ``'weight'`` entry plus the tag
        itself; every value except the weight is collected.  Returns
        ``None`` (after printing 'get tags failed') if the API call fails.
        """
        try:
            tags = self.client.tags.get(uid=userid)
        except Exception:
            # Best effort by design: report the failure and return None.
            print('get tags failed')
            return None
        # Tags are indexed like dicts below, so sort by key lookup as well;
        # this also works for plain dicts without attribute access.
        sortedT = sorted(tags, key=operator.itemgetter('weight'), reverse=True)
        userTags = []
        for tag in sortedT:
            userTags.extend(tag[item] for item in tag if item != 'weight')
        return userTags

    # Fetch the statuses a user has posted.
    def getWeibo(self, uesrid, infile):
        """Append the text of up to 100 statuses of a user to ``infile``.

        One status is written per line, encoded as UTF-8.  A status that
        cannot be written is skipped, and a file error stops the export; in
        both cases 'get text failed' is printed instead of raising.

        Args:
            uesrid: uid of the user whose timeline is read (the parameter
                name keeps its historical spelling for compatibility).
            infile: path of the file the text is appended to.
        """
        import io  # local import: gives an encoding-aware open() on Python 2 too

        contents = self.client.get.statuses__user_timeline(uid=uesrid, count=100)
        try:
            # Open once with an explicit encoding, so non-ASCII text does
            # not depend on the interpreter's default codec.
            with io.open(infile, 'a', encoding='utf-8') as f:
                for content in contents.statuses:
                    try:
                        text = content.text
                        if isinstance(text, bytes):
                            text = text.decode('utf-8')
                        f.write(text)
                        f.write(u'\n')
                    except (AttributeError, TypeError, UnicodeError):
                        print('get text failed')
        except EnvironmentError:
            print('get text failed')

    def autolabel(self, rects):
        """Write each bar's height as a float label slightly above the bar."""
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x()+rect.get_width()/2., 1.03*height, '%s' % float(height))

    # Plot the gender distribution of the accounts a user follows.
    def getSexplot(self, userid, m, f, n):
        """Show a bar chart of male / female / unknown counts.

        Args:
            userid: uid whose screen name is shown in the legend.
            m: number of followed accounts with gender male.
            f: number of followed accounts with gender female.
            n: number of followed accounts with unknown gender.
        """
        res = self.client.get.users__show(uid=userid)
        ind = np.arange(1, 4)
        width = 0.25
        plt.subplot(111)
        # Positional arguments work whether the first parameter of bar() is
        # named ``left`` (older matplotlib) or ``x`` (newer matplotlib).
        rects1 = plt.bar(ind, (m, f, n), width, align='center')
        plt.ylabel('The Focus Number')
        plt.title('Sex Analysis(effective samples:%d)' % (m+f+n))
        plt.xticks(ind, ("Male","Female","Unknown"))
        self.autolabel(rects1)
        plt.legend((rects1,), ("User:%s" % res["screen_name"],))
        plt.show()

if __name__ == '__main__':
    # Script entry point: log in, then append the user's own posts to a file.
    import getpass  # local import: only this script section needs it

    usrID = raw_input('请输入新浪微博用户名：')
    # Read the password without echoing it to the terminal.
    usrPW = getpass.getpass('请输入新浪微博密码:')
    AppClient = iWInsightor(usrID, usrPW)
    userid = AppClient.getCurrentUid()
    if userid is None:
        # getCurrentUid has already reported the failure; there is no user
        # to download posts for.
        raise SystemExit(1)
    infile = "E://data/weibo.dat"  # path and file name the posts are saved to
    AppClient.getWeibo(userid, infile)

    # Optional, disabled by the author: count the gender of the followed
    # accounts and plot the distribution.
    #Focus = AppClient.getFocus(userid)
    #m = 0
    #f = 0
    #n = 0
    #for i in Focus:
    #    if i[1] == "m":
    #        m = m+1
    #    elif i[1] == "f":
    #        f = f+1
    #    else:
    #        n = n+1
    #AppClient.getSexplot(userid,m,f,n)

二、文本过滤及分词

微博中常常含有一些对词频统计没有任何作用的字符，例如英文字母、数字、汉语标点符号以及其他个性符号，这些我们需要在分词前将其滤除。此外，你还可以添加自己想滤除的符号或者字词。

中文与英文句子比较而言，有一个非常有趣的现象，那就是英文单词之间是有空格的，而中文则不然。因此，分词也成了中文信息处理中的一个基本步骤。我用的是结巴分词，可以添加自定义词典（因为分词字典很多词可能没涉及到），下载地址为 https://github.com/fxsjy/jieba 。

# -*- coding: UTF-8-*-

import string

import jieba

# Path of the user-defined dictionary that is loaded into jieba below.
extra_dict = 'F://NLP/iWInsightor/jieba/mydict.dict'#user-defined dictionary

# Register the extra dictionary with jieba at import time, before any text is segmented.
jieba.load_userdict(extra_dict)

# Characters dropped by filter_str: ASCII punctuation, the space, digits and
# ASCII letters, plus common Chinese punctuation.  The Chinese part is a
# unicode literal so that membership tests against decoded text never
# trigger an implicit ASCII decode on Python 2.
_FILTERED_CHARS = frozenset(
    string.punctuation + ' ' + string.digits + string.ascii_letters
).union(u'，。《》【】（）！？★”“、：…')


def filter_str(instr):
    """Return ``instr`` with letters, digits, spaces and punctuation removed.

    Args:
        instr: the text to clean, either UTF-8 encoded bytes or an already
            decoded unicode string.

    Returns:
        A unicode string holding the characters of ``instr`` that are not
        in the filtered set; line breaks are kept.
    """
    if isinstance(instr, bytes):
        instr = instr.decode('utf-8')
    return u''.join(char for char in instr if char not in _FILTERED_CHARS)

import io  # encoding-aware open(); imported here because only this section needs it

# Clean and segment every line of the raw post file and append the
# space-separated words to the filtered file.
# - The input is opened read-only in binary mode; filter_str decodes it.
# - The output is opened with an explicit UTF-8 encoding, so the unicode
#   words are not pushed through the default ASCII codec.
# - The output is opened in append mode, so re-running the script adds to
#   an existing file.
with open('F://NLP/iWInsightor/weibo.dat', 'rb') as fp_in, \
        io.open('F://NLP/iWInsightor/weibo_filter.dat', 'a', encoding='utf-8') as fp_out:
    for line in fp_in:
        str_delete = filter_str(line)
        seg_list = jieba.cut(str_delete, cut_all=True)
        fp_out.write(u' '.join(seg_list))

三、词频统计

词频统计就是指统计出某个文本中各个词出现的次数，利用Python中的字典（dict）数据结构很容易实现。我用的是matplotlib画柱状图，画出top-K个高频词。这里需要注意的是图中的中文显示问题，在使用之前，需要修改相应的设置，具体方法不妨去google一下，我就不详细介绍了。

# -*- coding: UTF-8-*-

import string

import numpy

import pylab

def getstr(word, count):
    """Return ``word`` and ``count`` joined by a comma, e.g. ``'abc,3'``."""
    return ','.join((word, str(count)))

def get_wordlist(infile):
    """Read a space-separated word file and return its words as a list.

    Words of length one or less are dropped.  The line terminator is
    stripped before splitting, so the last word of a line is not stored
    with a trailing newline.

    Args:
        infile: path of the file to read.

    Returns:
        A list of words in file order, duplicates included.
    """
    wordlist = []
    with open(infile) as src:
        for line in src:
            wordlist.extend(
                word for word in line.rstrip('\r\n').split(' ') if len(word) > 1
            )
    return wordlist

def get_wordcount(wordlist, outfile):
    """Count the words in ``wordlist`` and write the ranking to ``outfile``.

    The file gets one ``word,count`` line per distinct word, ordered by
    descending count and encoded as GBK.  Characters that GBK cannot
    represent are replaced instead of aborting the export.

    Args:
        wordlist: iterable of words, either UTF-8 byte strings or unicode.
        outfile: path of the result file, overwritten on every call.

    Returns:
        A dict mapping each word, exactly as given, to its count.
    """
    import io  # local import: encoding-aware open() on Python 2 as well

    wordcnt = {}
    for word in wordlist:
        wordcnt[word] = wordcnt.get(word, 0) + 1

    ranked = sorted(wordcnt.items(), key=lambda item: -item[1])
    with io.open(outfile, 'w', encoding='gbk', errors='replace') as out:
        for word, cnt in ranked:
            # Decode byte strings explicitly: calling .encode('gbk') on
            # UTF-8 bytes would first decode them as ASCII and fail.
            if isinstance(word, bytes):
                text = word.decode('utf-8', 'replace')
            else:
                text = word
            out.write(getstr(text, cnt) + u'\n')
    return wordcnt

def barGraph(wcDict):
    """Show a bar chart of the words in ``wcDict`` that pass the filter.

    Only entries with a count above 5 and a key longer than 3 are drawn.
    Keys are decoded as UTF-8 for the tick labels and the bars are ordered
    by the decoded word.

    Args:
        wcDict: mapping of word to count.
    """
    selected = sorted(
        (term.decode('utf-8'), freq)
        for term, freq in wcDict.items()
        if freq > 5 and len(term) > 3
    )
    labels = [pair[0] for pair in selected]
    heights = [pair[1] for pair in selected]

    bar_width = 0.5
    positions = numpy.arange(len(labels))
    # Offset the tick labels by half a bar width relative to the bar positions.
    pylab.xticks(positions + bar_width / 2.0, labels, rotation=45)
    pylab.bar(positions, heights, width=bar_width, color='y')
    pylab.title(u'微博词频分析图')
    pylab.show()

if __name__ == '__main__':
    # Pipeline: read the segmented words, write the ranked counts to the
    # result file, then plot the counts.
    filtered_path = 'F://NLP/iWInsightor/weibo_filter.dat'
    result_path = 'F://NLP/iWInsightor/result.dat'
    counts = get_wordcount(get_wordlist(filtered_path), result_path)
    barGraph(counts)

至此，我们的工作就完成了。下面是我的微博词频的一个柱状图。这些仅是业余时间之作，尚有诸多不足之处。

阅读(3013) | 评论(0) | 转发(0) |

上一篇：python @property

下一篇：Python图表绘制：matplotlib绘图库入门

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6