Chinaunix首页 | 论坛 | 博客
  • 博客访问: 56211
  • 博文数量: 18
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 145
  • 用 户 组: 普通用户
  • 注册时间: 2017-02-03 22:58
文章分类

全部博文(18)

文章存档

2017年(18)

我的朋友

分类: Python/Ruby

2017-02-14 22:12:36

爬取某视频网站数据

找到规律,通过特征值过滤得到视频url,通过迅雷批量下载。

点击(此处)折叠或打开

  1. #coding:utf-8
  2. '''
  3. Created on 2017年2月11日

  4. @author: erain
  5. 牛牛视频网络URL提取
  6.     
  7. '''
  8. import urllib2
  9. import os
  10. from nt import mkdir, chdir

  11. WebMain=""
  12. AV_Lab={
  13.         'fuli':'list',
  14.         }
  15. AV_Type={
  16.          'fuli':'index1',
  17.          'dianshiju':'index2',
  18.          'zongyi':'index3',
  19.          'lunli':'index5',
  20.          }
  21. # [
  22. # 'index1', #福利 http://www.niump4.com/list/index1.html
  23. # #第二页 格式http://www.niump4.com/list/index1_2.html
  24. # 'index2', #电视剧 http://www.niump4.com/list/index2.html
  25. # 'index3', #综艺 http://www.niump4.com/list/index3.html
  26. # 'index4', #未知 http://www.niump4.com/list/index4.html
  27. # 'index5', #伦理 http://www.niump4.com/list/index5.html
  28. # 'index6', #视频 http://www.niump4.com/list/index6.html

  29. # ]


  30. #从 http://www.niump4.com/list/index4.html 获取VideoURL
  31. def AV_fuli_UrlPageN(page=0):
  32.     av_tag='fuli'
  33.     
  34.     try:
  35.         mkdir(av_tag)
  36.     except Exception,err:
  37.         pass
  38.     chdir(av_tag)
  39.     
  40.     
  41.     #1 获取网页数据
  42.     if page<2:
  43.         pageurl=WebMain+AV_Lab.get(av_tag)+'/'+AV_Type.get(av_tag)+'.html'
  44.     else:
  45.         pageurl=WebMain+AV_Lab.get(av_tag)+'/'+AV_Type.get(av_tag)+'_%d'%page+'.html'
  46.     #打开连接
  47.     webdata=urllib2.urlopen(pageurl).read()
  48.     #保存网页
  49.     f=open("web_tmp",'w')
  50.     f.write(webdata)
  51.     f.close()
  52.     
  53.     
  54.     #读取网页到lines
  55.     f=open("web_tmp",'r')
  56.     lines=f.readlines()
  57.     f.close()
  58.     
  59.     print "AV Fili From Page :"+pageurl
  60.     
  61.     #解析网页,根据特征值提取视频URL
  62.     #<li><a href="/v/9062.html" target="_blank"><img src=""><h3><font color="">48-筱崎爱</font></h3><span class="movie_date"><!--[if lt IE 9 ]><span class="bg_top">
  63.     
  64.     av_video_savfile=av_tag+'_videourl%d.txt'%page #videourl 文件存在
  65.     mp4_url_file=av_tag+"mp4_url_page%d.txt"%page
  66.     mp4_url_file_noname=av_tag+"mp4_url_name%d.txt"%page
  67.     cap_value='movie_date' #特征值过滤
  68.     
  69.     #删除文件
  70.     if av_video_savfile in os.listdir(os.getcwd()):
  71.         os.remove(av_video_savfile)
  72.     if mp4_url_file in os.listdir(os.getcwd()):
  73.         os.remove(mp4_url_file)
  74.     if mp4_url_file_noname in os.listdir(os.getcwd()):
  75.         os.remove(mp4_url_file_noname)
  76.     f=open(av_video_savfile,'a')#追加方式打开
  77.     mp4_f=open(mp4_url_file,'a')
  78.     mp4_f_noname=open(mp4_url_file_noname,'a')
  79.     for line in lines:
  80. # print line
  81.         if line.find(cap_value)>0 :
  82. # print line
  83. # print line.split('"')[1]
  84.             vd_url=line.split('"')[1] #本页面所有视频连接 http://www.niump4.com//v/9073.html
  85.             wdat=urllib2.urlopen(WebMain+vd_url).read()
  86.             if 1:
  87.                 #获取单个视频连接
  88.                 tmp_file=open("video_url_tmp",'w')
  89.                 tmp_file.write(wdat)
  90.                 tmp_file.close()
  91.                 tmp_file=open("video_url_tmp",'r')
  92.                 data_all=tmp_file.readlines()
  93.                 for d in data_all:
  94. # print d
  95.                     if d.find("href")>0 and d.find("video")>0:#<ul><li><a title='??1??' href='/video/?9264-0-0.html' target="_blank">??1??</a></li></ul>
  96.                         url=WebMain+d.split("'")[3]
  97. # print url #http://www.niump4.com//video/?9157-0-0.html
  98.                         f.write(url+'\n')
  99.                 tmp_file.close()
  100.                 if 1: #获取MP4 uRL
  101.                     '''
  102.                      var VideoInfoList=unescape('mp4%24%24%u7B2C1%u96C6%24http%3A//z.syasn.com/3a/3a16.mp4%3Fend%3D180%24mp4'); var urlinfo='http://'+document.domain+'/video/?8745-<from>-<pos>.html' var
  103. ????????????????????
  104. ????????????????????========>z.syasn.com/3a/3a16.mp4
  105. ????????????????????
  106. ????????????????????'''
  107. ????????????????????mp4web=urllib2.urlopen(url).read()
  108. ????????????????????tmp_file=open("mp4_url_tmp",'w')
  109. ????????????????????tmp_file.write(mp4web)
  110. ????????????????????tmp_file.close()
  111. ????????????????????tmp_file=open("mp4_url_tmp",'r')
  112. ????????????????????mp4_webdata=tmp_file.readlines()
  113. ????????????????????for mp4url in mp4_webdata:
  114. ????????????????????????if(mp4url.find('xTitle')>0):
  115. # <script>var sitePath='';var play_vid='158';var xTitle='女演员大作战';//play_vid变量:智能报错用到</script>
  116. ????????????????????????????mp4_name=mp4url.split('=')[3].split("'")[1].decode('gbk')
  117. # print mp4_name
  118. ????????????????????????if mp4url.find("unescape")>0:
  119. # print mp4url
  120. ????????????????????????????try:
  121. ????????????????????????????????mp4=mp4url.split('(')[1].split(')')[0].split("//")[1].split("%")[0]
  122. ???????????????????????????????
  123. # print 'http://'+mp4 #http://z.syasn.com/m/m283.mp4
  124. ????????????????????????????????mp4_f.write(mp4_name.encode('utf-8'))
  125. ????????????????????????????????mp4_f.write('|'+'http://'+mp4+'\n')
  126. ????????????????????????????????mp4_f_noname.write('http://'+mp4+'\n')
  127. ????????????????????????????????print mp4_name+'|'+mp4
  128. ????????????????????????????except Exception,err:
  129. ????????????????????????????????print err
  130. ????????????????????????????????continue
  131. ????????????????????????????
  132. ????????????????????????
  133. ????????????????????tmp_file.close()
  134. ????????????
  135. ????????????
  136. ????f.close()
  137. ????mp4_f.close()
  138. ????mp4_f_noname.close()
  139. ????
  140. ????chdir('../')
  141. ????
  142. ????

  143. if __name__=="__main__":
  144. ????for page in range(15,20):
  145. ????????AV_fuli_UrlPageN(page)
  146. ????pass

阅读(1288) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~