爬取某视频网站数据
找到规律,通过特征值过滤得到视频url,通过迅雷批量下载。
-
#coding:utf-8
'''
Created on 2017年2月11日

@author: erain

牛牛视频网络URL提取
'''
-
import urllib2
-
import os
-
from nt import mkdir, chdir
-
-
# Base URL prepended to every request path. It is empty here and has to be
# filled in with the site root before any page can be fetched.
WebMain = ""

# Category tag -> first path segment of that category's list pages.
AV_Lab = {
    'fuli': 'list',
}

# Category tag -> file stem of the category's first list page; later pages
# append "_<page>" to the stem.
AV_Type = {
    'fuli': 'index1',
    'dianshiju': 'index2',
    'zongyi': 'index3',
    'lunli': 'index5',
}
-
# [
# 'index1', #福利 http://www.niump4.com/list/index1.html
# #第二页 格式http://www.niump4.com/list/index1_2.html
# 'index2', #电视剧 http://www.niump4.com/list/index2.html
# 'index3', #综艺 http://www.niump4.com/list/index3.html
# 'index4', #未知 http://www.niump4.com/list/index4.html
# 'index5', #伦理 http://www.niump4.com/list/index5.html
# 'index6', #视频 http://www.niump4.com/list/index6.html
# ]
-
-
-
#从 http://www.niump4.com/list/index1.html 获取VideoURL
-
def _fetch_lines(url):
    """Download ``url`` and return the response body split on newlines."""
    response = urllib2.urlopen(url)
    try:
        return response.read().split('\n')
    finally:
        response.close()


def _field(text, separator, index):
    """Return ``text.split(separator)[index]``, or None if that field is absent."""
    parts = text.split(separator)
    return parts[index] if index < len(parts) else None


def _find_play_urls(detail_lines):
    """Return the player-page URLs linked from the lines of a video detail page."""
    play_urls = []
    for text in detail_lines:
        # Expected shape:
        # <ul><li><a title='...' href='/video/?9264-0-0.html' target="_blank">
        if 'href' in text and 'video' in text:
            href = _field(text, "'", 3)
            if href is not None:
                play_urls.append(WebMain + href)
    return play_urls


def _parse_title(text):
    """Return the title from a ``...;var xTitle='<title>';`` line, or None."""
    try:
        raw = text.split('=')[3].split("'")[1]
    except IndexError:
        return None
    # The page text is decoded as GBK; undecodable bytes are replaced rather
    # than aborting the whole crawl.
    return raw.decode('gbk', 'replace')


def _parse_mp4_location(text):
    """Return ``host/path.mp4`` from an ``unescape('...//host/path.mp4%3F...')`` line.

    The value is the text between the first ``//`` inside the parentheses and
    the next ``%``. Returns None when the line does not have that shape.
    """
    try:
        return text.split('(')[1].split(')')[0].split('//')[1].split('%')[0]
    except IndexError:
        return None


def _record_mp4_urls(play_url, named_out, plain_out):
    """Fetch one player page and append every mp4 address found to both files."""
    # Reset per player page so that a page without an xTitle line can never
    # inherit the title of the previously processed video.
    title = u''
    for text in _fetch_lines(play_url):
        if 'xTitle' in text:
            parsed = _parse_title(text)
            if parsed is not None:
                title = parsed
        if 'unescape' in text:
            location = _parse_mp4_location(text)
            if location is None:
                print('no mp4 address found in unescape() line of ' + play_url)
                continue
            mp4 = 'http://' + location
            named_out.write(title.encode('utf-8'))
            named_out.write('|' + mp4 + '\n')
            plain_out.write(mp4 + '\n')
            try:
                print(title + '|' + location)
            except UnicodeError:
                # The console cannot display the title; the data is already
                # in the files, so only the progress message is degraded.
                print(location)


def AV_fuli_UrlPageN(page=0):
    """Crawl one list page of the 'fuli' category and record its video URLs.

    Every list-page line containing the ``movie_date`` marker is treated as
    one video entry. Its detail page is fetched, all player-page links on it
    are recorded, and the last of those player pages is fetched to extract
    the title and the mp4 address.

    Three files are (re)written inside the ``fuli`` directory, which is
    created under the current working directory when missing:

    * ``fuli_videourl<page>.txt``     - player-page URLs, one per line
    * ``fulimp4_url_page<page>.txt``  - ``<title>|http://<mp4 address>``
    * ``fulimp4_url_name<page>.txt``  - ``http://<mp4 address>`` only

    Args:
        page: List page number. Values below 2 select the first page, whose
            URL carries no numeric suffix.

    Raises:
        OSError: The output directory cannot be created.
        Exception: Whatever ``urllib2.urlopen`` raises for a failed download
            is propagated; the output files are closed before it leaves.
    """
    av_tag = 'fuli'
    cap_value = 'movie_date'  # marker identifying a video entry on the list page

    try:
        os.mkdir(av_tag)
    except OSError:
        # An already existing directory is fine; anything else is a real error.
        if not os.path.isdir(av_tag):
            raise

    listing = WebMain + AV_Lab[av_tag] + '/' + AV_Type[av_tag]
    if page < 2:
        pageurl = listing + '.html'
    else:
        pageurl = listing + '_%d' % page + '.html'

    lines = _fetch_lines(pageurl)
    print("AV Fili From Page :" + pageurl)

    # Paths are built explicitly instead of chdir()-ing into the directory,
    # so the process working directory is never left changed after an error.
    play_urls_path = os.path.join(av_tag, av_tag + '_videourl%d.txt' % page)
    named_path = os.path.join(av_tag, av_tag + "mp4_url_page%d.txt" % page)
    plain_path = os.path.join(av_tag, av_tag + "mp4_url_name%d.txt" % page)

    # Mode 'w' truncates, which replaces the old remove-then-append sequence.
    with open(play_urls_path, 'w') as play_out, \
            open(named_path, 'w') as named_out, \
            open(plain_path, 'w') as plain_out:
        for line in lines:
            if cap_value not in line:
                continue
            # Expected shape: <li><a href="/v/9062.html" target="_blank">...
            detail_path = _field(line, '"', 1)
            if detail_path is None:
                continue

            play_urls = _find_play_urls(_fetch_lines(WebMain + detail_path))
            for play_url in play_urls:
                play_out.write(play_url + '\n')
            if not play_urls:
                continue

            # Only the last player link of a detail page is followed.
            _record_mp4_urls(play_urls[-1], named_out, plain_out)
-
????
-
????
-
-
if __name__ == "__main__":
    # Crawl list pages 15 through 19 of the category.
    for page in range(15, 20):
        AV_fuli_UrlPageN(page)