# 分类 (Category): Python/Ruby
# 2021-10-08 16:19:58
# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang
import requests
import json
import re
import pprint
import pandas as pd
def comments_singlePage_crawl(url, headers, comments_info, id):
    """
    Crawl a single page of comments and append the rows to comments_info.

    :param url: comments-API url for one page
    :param headers: request headers (cookie / user-agent)
    :param comments_info: dict of lists with keys "id", "date", "text";
        mutated in place
    :param id: id of the blog post the comments belong to
    :raises KeyError/TypeError: when the response carries no comment data
        (the caller relies on this to detect the last page)
    """
    # Fetch the page html. The original passed headers positionally, which
    # binds them to requests.get's `params` argument and never sends the
    # cookie — pass by keyword instead.
    html = requests.get(url, headers=headers).text
    # Parse the JSON payload.
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]
    for comment in comments_data:
        comments_info["id"].append(id)
        comments_info["date"].append(comment["created_at"])
        # Strip embedded HTML markup from the comment text.
        # NOTE(review): the original two patterns were garbled to "" (no-ops);
        # presumably they removed tags — confirm against the real payload.
        text = re.sub(r"<[^>]*>", "", comment["text"])
        comments_info["text"].append(text)
def weibo_bowen_singelPage_crawl(url, headers, mblog_info, comments_info):
    """
    Crawl one page of blog posts and, for each post, all of its comments.

    :param url: page url of the weibo post list
    :param headers: request headers (cookie / user-agent)
    :param mblog_info: dict of lists collecting post id/date/text; mutated
    :param comments_info: dict of lists collecting comment id/date/text; mutated
    """
    # Fetch the page html; headers must be passed by keyword (positionally
    # they would be taken as requests.get's `params`).
    html = requests.get(url, headers=headers).text
    # Parse the JSON payload.
    html_dict = json.loads(html)
    users = html_dict["data"]["cards"]
    # Store each post on the page.
    for user in users:
        mblog = user["mblog"]
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # Strip embedded HTML markup from the post text.
        # NOTE(review): the original two patterns were garbled to "" (no-ops);
        # presumably they removed tags — confirm against the real payload.
        text = re.sub(r"<[^>]*>", "", mblog["text"])
        mblog_info["text"].append(text)
        # Build the comments url for this post.
        # NOTE(review): the original base string was truncated to just
        # "%s&mid=%s&max_id_type=" (not a valid url, so every fetch raised
        # and the loop below exited at once) — restored to the m.weibo.cn
        # hotflow endpoint; confirm against the site.
        comments_url = (
            "https://m.weibo.cn/comments/hotflow?id=%s&mid=%s&max_id_type="
            % (mblog["id"], mblog["id"])
        )
        # Fetch comment pages (max_id_type = 0, 1, ...) until the API stops
        # returning data, which surfaces as an exception in the crawler.
        i = 0
        while True:
            try:
                comments_singlePage_crawl(comments_url + str(i), headers,
                                          comments_info, mblog["id"])
                i += 1
            except Exception:  # narrowed from bare except:; still the loop exit
                break
        pprint.pprint(comments_info)
def weibo_bowen_data_crawl(url, headers):
    """
    Crawl post and comment information for pages 1-9 of the weibo list.

    :param url: base url; the page number is appended for each request
    :param headers: request headers (cookie / user-agent)
    :return: (mblog_info, comments_info) — two dicts of lists, each with
        keys "id", "date" and "text"
    """
    # Accumulators for posts and for their comments.
    mblog_info = {key: [] for key in ("id", "date", "text")}
    comments_info = {key: [] for key in ("id", "date", "text")}
    # Crawl pages 1 through 9 of the post list.
    for page in range(1, 10):
        weibo_bowen_singelPage_crawl(url + str(page), headers,
                                     mblog_info, comments_info)
    return mblog_info, comments_info
def bowen_data_store(mblog_info, comments_info):
    """
    Arrange the crawled data into tables and save them as excel files.

    :param mblog_info: dict of lists with post "id", "date", "text"
    :param comments_info: dict of lists with comment "id", "date", "text"
    :return: None; writes bowen_data.xlsx and bowen_comments_data.xlsx
        to the current working directory
    """
    # Table 1: blog posts, numbered from 1 and tagged with the search keyword.
    data = pd.DataFrame(mblog_info)
    data["num"] = data.index + 1
    # Scalar assignment broadcasts to every row — no need for the original
    # ["..."] * len(...) list multiplication.
    data["keyword"] = "Film Changjin Lake"
    df = data.loc[:, ["num", "keyword", "id", "date", "text"]]
    df.to_excel("bowen_data.xlsx", sheet_name="Sheet1")
    # Table 2: comments, numbered from 1.
    comments_data = pd.DataFrame(comments_info)
    comments_data["num"] = comments_data.index + 1
    df_c = comments_data.loc[:, ["num", "id", "date", "text"]]
    df_c.to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")
if __name__ == '__main__':
    # Weibo url for the "Changjin Lake" account (url-encoded containerid
    # query; the crawler appends the page number).
    # NOTE(review): this literal looks truncated — it has no scheme/host
    # (starts mid-query with "%3D1%26..."); restore the full
    # m.weibo.cn container endpoint before running.
    url = "%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # Request headers; the placeholder values ("自行添加" = "add your own")
    # must be replaced with a real cookie and user-agent.
    headers = {
        "cookie":"自行添加",
        "user-agent":"自行添加"
    }
    # Crawl posts + comments, then persist both tables to excel.
    mblog_info,comments_info = weibo_bowen_data_crawl(url,headers)
    bowen_data_store(mblog_info,comments_info)