Chinaunix首页 | 论坛 | 博客
  • 博客访问: 56224
  • 博文数量: 18
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 145
  • 用 户 组: 普通用户
  • 注册时间: 2017-02-03 22:58
文章分类

全部博文(18)

文章存档

2017年(18)

我的朋友

分类: Python/Ruby

2017-02-17 01:59:49

测试

点击(此处)折叠或打开

  1. #coding:utf-8

  2. from bs4 import BeautifulSoup
  3. import urllib2
  4. import re
  5. import sys

  6. HomePage=""
  7. '''
  8.     获取网站的菜单列表地址
  9. '''
  10. def getMenuList():
  11.     menulist={}
  12.     webdata=urllib2.urlopen(HomePage).read()
  13.     soup=BeautifulSoup(webdata,'lxml')
  14.     menu=soup.find_all(id='menu')
  15. # print urls
  16.     for m in menu:
  17.         url=m.find_all('a')
  18.         for u in url:
  19.             href=u.get('href')
  20.             title=u.get_text()
  21.             if not re.match(r'http:', href):
  22.                 href=HomePage+href
  23. # print title,href
  24.             menulist[title]=href
  25.     return menulist

  26. '''
  27.     获取实际视频文件的地址
  28. '''
  29. def getFuliMp4Url(url):
  30.     webdata=urllib2.urlopen(url).read()
  31.     soup=BeautifulSoup(webdata,'lxml')
  32.     urls=soup.find_all(type="text/javascript")

  33.     for u in urls:
  34.         return soup.title.get_text().split('-')[1],u.get_text().split('"')[1]
  35.         
  36. '''
  37.     获取视频播发器地址
  38. '''
  39. def getFuliPlayer(url):
  40.     webdata=urllib2.urlopen(url).read()
  41.     soup=BeautifulSoup(webdata,'lxml')
  42.     urls=soup.find_all(title='第1集')
  43.     for u in urls:
  44.         href=u.get('href')
  45.         if not re.match(r'http', href):
  46.             href=HomePage+href
  47.         return href
  48.     
  49. '''
  50.     获取当前页所有播发器页面地址
  51. '''
  52. def getFuliCurrPageUrls(url):
  53.     player_onepage=[]
  54.     webdata=urllib2.urlopen(url).read()
  55.     soup=BeautifulSoup(webdata,'lxml')
  56.     content=soup.find_all(id="content")
  57.     for c in content:
  58.         us=c.find_all('a')
  59.         for u in us:
  60.             addr=u.get('href')
  61.             if re.match(r'http', addr):
  62. # print addr
  63.                 player_onepage.append(addr)
  64.     return player_onepage
  65. '''
  66.     获取福利片页面索引
  67. '''
  68. def getFuliPageIndex():
  69.     allpage_url=[]
  70.     #从主页获取 '福利片'页面url
  71.     menu=getMenuList()
  72.     curr_page=menu.get(u'福利片')
  73.     allpage_url.append(curr_page)
  74.     while True:
  75.         
  76.         webdata=urllib2.urlopen(curr_page).read()
  77.         soup=BeautifulSoup(webdata,'lxml')
  78.         urls=soup.find_all('a')
  79.         for u in urls:
  80.             if u.get_text()==u'下一页':
  81.                 next_page_url=u.get('href')
  82.     # print next_page_url
  83.         if not re.match(r'http:', next_page_url):
  84.             next_page_url=HomePage+next_page_url
  85.         if next_page_url:
  86.             print "current"+curr_page
  87.             if next_page_url in allpage_url:
  88.                 return allpage_url
  89.             allpage_url.append(next_page_url)
  90.             curr_page=next_page_url

  91.         else:
  92.             return allpage_url
  93.         
  94.         #测试1个页面
  95. # return allpage_url
  96.             
  97.                    

  98.     
  99.     
  100. if __name__=="__main__":

  101.     import sys
  102.     reload(sys)
  103.     sys.setdefaultencoding('utf8')

  104.     fd=open('fuli_allmp4.txt','w')
  105.     fd.close()
  106.     fd=open('fuli_allmp4.txt','a+')

  107.     fd_notitle=open('fuli_allmp4_notitle.txt','w')
  108.     fd_notitle.close()
  109.     fd_notitle=open('fuli_allmp4_notitle.txt','a+')
  110.     try:
  111.         #获取福利片所有页面索引
  112.         pageindex=getFuliPageIndex()
  113.         #遍历所有页面
  114.         for page in pageindex:
  115.             print page
  116.             #获取当前页面所有视频指向地址
  117.             curr_urls=getFuliCurrPageUrls(page)
  118.             #遍历当前页所有视频播发器地址
  119.             for url in curr_urls:
  120.                 #获取当前视频播发器地址
  121.                 player_url=getFuliPlayer(url)
  122.                 #获取实际视频地址
  123.                 mp4title,mp4_url=getFuliMp4Url(player_url)
  124.                 print mp4title,mp4_url
  125.                 fd.write(mp4title)
  126.                 fd.write('|'+mp4_url+'\n')
  127.                 fd_notitle.write(mp4_url+'\n')
  128.                  
  129.     except Exception,err:
  130.         print err
  131. # continue
  132.     fd.close()
  133.     fd_notitle.close()



得到地址后使用迅雷打包下载,比较快速。
阅读(1279) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~