Category: Python/Ruby

2017-02-18 01:49:10

My partner works at an investment bank. The firm is planning to invest in the film and TV industry and asked my partner to handle the data analysis, so I, the programmer of the household, stepped in. (⊙﹏⊙) The script below pulls the daily play-count rankings for five content categories, writes each category to its own sheet of an .xls workbook with xlwt, and mails the report out.


#!/usr/bin/env python
# coding: utf-8

'''Export data.'''

# Patch the standard library before anything that touches sockets (requests)
# is imported, so gevent's cooperative sockets are actually used.
from gevent import monkey
monkey.patch_all()  # noqa

import math
from urllib import quote  # Python 2; on Python 3 this lives in urllib.parse
from datetime import datetime, date, timedelta

import xlwt
import gevent
import requests
from gevent.pool import Pool

import config
from mail import mail_multipart

# Desktop browser User-Agent sent with every request.
UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) '
      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 '
      'Safari/537.36')

# Category index -> API query parameter.
category2param = {
    0: 'NETWORK_DRAMA',
    1: 'NETWORK_MOVIE',
    2: 'NETWORK_VARIETY',
    3: 'TV_DRAMA',
    4: 'ANIME',
}

# Category index -> sheet name in the exported workbook.
category2sheet_name = {
    0: u'网络剧',
    1: u'网络大电影',
    2: u'网络综艺',
    3: u'电视剧',
    4: u'网络动漫',
}


class HttpClient(object):
    '''Thin wrapper around requests.Session with simple retry support.'''

    def __init__(self, tries=5):
        assert 1 <= tries < 10
        self.session = requests.Session()
        self.tries = tries

    def fetch(self, url, method='GET', data=None, **options):
        req = requests.Request(
            method,
            url,
            data=data,
            headers=options.get('headers'),
            cookies=options.get('cookies'))
        req = self.session.prepare_request(req)
        # The Date header claims GMT, so build it from UTC rather than local time.
        req.headers['Date'] = datetime.utcnow().strftime(
            '%a, %d %b %Y %H:%M:%S GMT')
        req.headers['User-Agent'] = UA
        for _ in range(self.tries):
            try:
                resp = self.session.send(req)
            except requests.RequestException:
                # Network error: back off briefly and retry.
                gevent.sleep(0.5)
                continue
            if resp:  # truthy only when resp.ok (status code < 400)
                break
        else:
            raise StandardError(u'网络请求失败.')  # all retries failed
        return resp


def export_data(report_date):
    '''Build one workbook with a sheet per category and return its file name.'''
    xls_name = u'%s 影视统计分析表.xls' % str(report_date)
    workbook = xlwt.Workbook(encoding='utf-8')
    # Fetch the five categories concurrently on a small gevent pool.
    pool = Pool(5)
    for i in range(5):
        pool.spawn(process_items, workbook, report_date, i)
    pool.join()
    workbook.save(xls_name)
    return xls_name


def standardizing(play_count_text):
    '''Convert a count such as u'1.2亿' or u'3500万' into an integer number
    of 万 (e.g. u'1.2亿' -> 12000). Returns 'unknown' when no count is available.'''
    if play_count_text is None:
        return 'unknown'
    last_idx = 0
    play_count = 0.0
    if u'亿' in play_count_text:
        idx = play_count_text.index(u'亿')
        # 1 亿 = 10000 万, so scale the 亿 part accordingly.
        play_count += float(play_count_text[:idx]) * 10000
        last_idx = idx + 1
    if u'万' in play_count_text:
        idx = play_count_text.index(u'万')
        play_count += float(play_count_text[last_idx:idx])
    return int(play_count)


def process_items(workbook, report_date, category):
    '''Fetch the ranking list for one category and write it to its own sheet.'''
    assert 0 <= category <= 4
    param = category2param[category]
    sheet_name = category2sheet_name[category]
    client = HttpClient()
    # The API base URL appears redacted in the original post; only the query part remains.
    api_url = '%s&date=%s' % (param, report_date)  # noqa
    items = client.fetch(api_url).json()
    if not isinstance(items, list):
        raise StandardError(u'数据未更新')  # data for this date not published yet
    sheet = workbook.add_sheet(sheet_name)
    headers = [
        u'排名', u'剧名', u'平台', u'播放量(万)',
        u'累计播放量(万)', u'上线天数', u'名次变动',
    ]
    for i, header in enumerate(headers):
        sheet.write(0, i, header)
    for i, item in enumerate(items, 1):
        name = item['name']
        # The detail URL likewise appears redacted; the title is URL-quoted into it.
        detail_url = '%s' % quote(name.encode('utf-8'))
        try:
            detail_info = client.fetch(detail_url)
            play_count = detail_info.json()['total_play_count']
        except KeyError:
            play_count = None
        increase_count = int(item['increaseCount'])
        rise, rise_text = int(item['rise']), ''
        # Only report rank changes of three places or more.
        if rise >= 3:
            rise_text = u'上升%d' % rise
        elif rise <= -3:
            rise_text = u'下降%d' % abs(rise)
        sheet.write(i, 0, str(i))
        sheet.write(i, 1, name)
        sheet.write(i, 2, item['platformName'])
        # Daily plays, rounded up to the nearest 万.
        sheet.write(i, 3, int(math.ceil(increase_count / 10000.0)))
        sheet.write(i, 4, str(standardizing(play_count)))
        sheet.write(i, 5, item['days'])
        sheet.write(i, 6, rise_text)

if __name__ == '__main__':
    # Export yesterday's figures and mail the workbook as an attachment.
    yesterday = date.today() - timedelta(days=1)
    xls_name = export_data(yesterday)
    mail = dict()
    mail['to'] = config.RECEIPTS
    mail['attachment'] = [xls_name]
    mail['subject'] = u'影视剧分析统计邮件'
    mail_multipart(mail)
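The script also leans on a local `config` module and a `mail_multipart` helper that the post does not show. Purely as an assumption about their shape, here is a minimal sketch of what `mail.py` might look like; `SMTP_HOST`, `SMTP_USER` and `SMTP_PASSWORD` are hypothetical settings I'm assuming live in `config.py` alongside the `RECEIPTS` list the main block references.

    # mail.py -- hypothetical sketch; the real helper is not shown in the post.
    # Assumes config defines SMTP_HOST, SMTP_USER, SMTP_PASSWORD and RECEIPTS.
    # coding: utf-8
    import smtplib
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email.mime.application import MIMEApplication

    import config


    def mail_multipart(mail):
        '''Send a multipart mail described by a dict with to/subject/attachment keys.'''
        msg = MIMEMultipart()
        msg['Subject'] = mail['subject']
        msg['From'] = config.SMTP_USER
        msg['To'] = ', '.join(mail['to'])
        msg.attach(MIMEText(mail.get('body', u''), 'plain', 'utf-8'))
        # Attach each file (here, the generated .xls report) as application data.
        for path in mail.get('attachment', []):
            with open(path, 'rb') as f:
                part = MIMEApplication(f.read())
            part.add_header('Content-Disposition', 'attachment', filename=path)
            msg.attach(part)
        server = smtplib.SMTP(config.SMTP_HOST)
        server.login(config.SMTP_USER, config.SMTP_PASSWORD)
        server.sendmail(config.SMTP_USER, mail['to'], msg.as_string())
        server.quit()

With something like this in place, the whole script can be dropped into a daily cron job so yesterday's report lands in the recipients' inboxes every morning.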
