爬虫Ubuntu Packages-angrad-ChinaUnix博客

Linux爱好者angrad.blog.chinaunix.net

首页　| 　博文目录　| 　关于我

angrad

博客访问： 487971
博文数量： 59
博客积分： 345
博客等级：二等列兵
技术积分： 1380
用户组：普通用户
注册时间： 2011-06-18 22:44

个人简介

to be myself

文章分类

全部博文（59）

正则表达式（1）
python（2）
问题集锦（1）
MCU（1）
GUI（3）
嵌入式（4）
其他（2）
算法&数据结构（6）
ACM_OJ（22）
linux设备驱动（9）
uboot（4）
开发环境搭建（2）
随笔（2）
未分配的博文（0）

文章存档

2017年（5）

2013年（47）

2012年（3）

2011年（4）

我的朋友

最新代码位置

点击(此处)折叠或打开

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Angrad
import urllib.request
import re
import os
# Root of the Ubuntu package index; every site-relative link found while
# crawling is resolved against it (see FullUrl).
resource_base='https://packages.ubuntu.com'
search_base=resource_base+'/search?keywords='
# Default package name; replaced by the user's input before the crawl starts.
keywords='sendmail'
# The last parameter is "&section=all"; "&sect" had been rendered as the
# section sign when the listing was published.
search_suffix='&searchon=names&suite=zesty&section=all'
search_url = search_base+keywords+search_suffix
# Names of dependency packages already visited (last path segment of their URL).
allres=[]
# Collected .deb download URLs, each terminated by "\n".
all_res_list=[]
# Architecture to pick when a package offers several builds, e.g. 'amd64'.
arch='i386'
def Init():
    """Recompute the module-level ``search_url`` from the current ``keywords``."""
    global search_url
    search_url = search_base+keywords+search_suffix
def GetDoc(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The URL is echoed to stdout before the request is made.  Errors raised by
    ``urllib.request.urlopen`` or by the decoding propagate to the caller.
    """
    print("url: "+url)
    # The context manager closes the connection even if read() raises,
    # which a trailing close() call does not guarantee.
    with urllib.request.urlopen(url) as con:
        raw = con.read()
    return raw.decode('utf-8')
def FullUrl(url):
    """Turn a site-relative link into an absolute one by prefixing ``resource_base``."""
    return resource_base+url
def ResAlreadyExists(res):
    """Report whether the resource named by *res* was seen before.

    Only the text after the last '/' of *res* is compared.  An unseen name is
    recorded in ``allres`` as a side effect, so a second call with the same
    name returns True.
    """
    name = res.split('/')[-1]
    if name not in allres:
        allres.append(name)
        return False
    return True
def SaveFile():
    """Write every collected download URL to ``<keywords>_res_url.txt``.

    Entries of ``all_res_list`` are written as-is, without adding separators.
    The file is opened in 'w' mode, so an existing file is overwritten.
    """
    # `with` closes the file even if the write fails.
    with open(keywords+'_res_url.txt', 'w') as file_object:
        file_object.writelines(all_res_list)
def Download():
    """Download every URL in ``all_res_list`` into ``<keywords>_download``.

    The directory is created when missing; each file keeps the last path
    segment of its URL as its name.
    """
    print("start to download all res")
    target_dir = keywords + "_download"
    os.makedirs(target_dir, exist_ok=True)
    for res in all_res_list:
        res = res.strip("\n")
        # os.path.join picks the platform's separator; a hard-coded "\\"
        # only yields a path inside the directory on Windows.
        target = os.path.join(target_dir, res.split('/')[-1])
        print("download "+target)
        urllib.request.urlretrieve(res, target)
def GetArchUrl(res_str_list):
    """Return the index of the entry in *res_str_list* built for ``arch``.

    Each entry is a path such as '/zesty/arm64/libc6/download'; its third
    '/'-separated field is compared with the module-level ``arch``.  A
    one-element list is treated as the architecture-independent ("all")
    build and index 0 is returned without inspecting it.  When no entry
    matches, the list length is returned.
    """
    index = 0
    #ARCH all
    if len(res_str_list) == 1:
        return index
    for res in res_str_list:
        parts = res.split('/')
        #parts[2] amd64
        #['', 'zesty', 'arm64', 'libc6', 'download']
        if parts[2] == arch:
            return index
        index = index+1
    return index
def GetRes(url):
    """Collect the .deb URL of the package page *url* and of its dependencies.

    The package page is searched for download-page links, the one selected by
    GetArchUrl is opened, and the first mirror link found there is appended to
    ``all_res_list``.  Every "dep:" link on the package page that has not been
    visited yet is then processed recursively.

    Raises IndexError when the package page yields no usable download link or
    the download page lists no mirror.
    """
    doc_orig = GetDoc(url)

    # Download-page links found on the package page.
    # The HTML fragments of the three patterns in this function are the ones
    # the author listed in the follow-up comment under the post.
    self_res_pattern=r'<th><a href=\"([a-zA-Z0-9\.\/\-]+)\"'
    self_res_str = re.findall(self_res_pattern, doc_orig)
    index = GetArchUrl(self_res_str)
    self_res_str = FullUrl(self_res_str[index])
    #self res:https://packages.ubuntu.com/zesty/all/sendmail/download
    print("self res:"+self_res_str)
    print("\n")

    # Mirror links on the download page; the first one is kept.
    self_deb_pattern=r'<li><a href=\"([a-zA-Z0-9\.\/\-:_+]+)\"'
    doc = GetDoc(self_res_str)
    self_deb_str = re.findall(self_deb_pattern, doc)
    all_res_list.append(self_deb_str[0]+"\n")
    #deb url:http://mirrors.kernel.org/ubuntu/pool/universe/s/sendmail/sendmail_8.15.2-8ubuntu1_all.deb
    print("deb url:"+self_deb_str[0])
    print("\n")

    # The pattern has two groups, so each match is a tuple whose second item
    # (r[1]) is the dependency's link.
    dep_res_pattern=r'(?<=(dep:</span>))[\s]*<a href=\"([a-zA-Z0-9\.\/\-]+)\"'
    dep_res_str = re.findall(dep_res_pattern, doc_orig)
    for r in dep_res_str:
        if ResAlreadyExists(r[1]):
            # Skip only this dependency; returning here would drop every
            # dependency listed after an already-visited one.
            continue
        dep_url = FullUrl(r[1])
        #dep res:https://packages.ubuntu.com/zesty/sendmail-base
        print("dep res:"+dep_url)
        GetRes(dep_url)
    print("\n")
# Script entry: ask for a package name, locate its page through the site
# search, crawl it together with its dependencies, then save and download.
keywords = input("Input key word: ")
Init()
#search_url: https://packages.ubuntu.com/search?keywords=sendmail&searchon=names&suite=zesty&section=all
doc = GetDoc(search_url)

# The first search hit is used.
# NOTE(review): the published listing lost this pattern's HTML fragment; the
# one below assumes result links are rendered as
# <a class="resultlink" href="..."> -- confirm against the live search page.
first_page_pattern = r'<a class=\"resultlink\" href=\"([a-zA-Z0-9\.\/\-]+)\"'
first_page_str = re.findall(first_page_pattern, doc)
if len(first_page_str) < 1:
    print("no such res: " + keywords)
    os._exit(0)

first_page_str = FullUrl(first_page_str[0])
#first page:https://packages.ubuntu.com/zesty/sendmail
print("first page:"+first_page_str)
print("\n")

GetRes(first_page_str)
SaveFile()
Download()

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Angrad

import urllib.request
import re
import os

# Root of the Ubuntu package index; every site-relative link found while
# crawling is resolved against it (see FullUrl).
resource_base='https://packages.ubuntu.com'
search_base=resource_base+'/search?keywords='
# Default package name; replaced by the user's input before the crawl starts.
keywords='sendmail'
# The last parameter is "&section=all"; "&sect" had been rendered as the
# section sign when the listing was published.
search_suffix='&searchon=names&suite=zesty&section=all'
search_url = search_base+keywords+search_suffix

# Names of dependency packages already visited (last path segment of their URL).
allres=[]
# Collected .deb download URLs, each terminated by "\n".
all_res_list=[]
# Architecture to pick when a package offers several builds, e.g. 'amd64'.
arch='i386'

def Init():
    """Recompute the module-level ``search_url`` from the current ``keywords``."""
    global search_url
    search_url = ''.join((search_base, keywords, search_suffix))

def GetDoc(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The URL is echoed to stdout before the request is made.  Errors raised by
    ``urllib.request.urlopen`` or by the decoding propagate to the caller.
    """
    print("url: "+url)
    # The context manager closes the connection even if read() raises,
    # which the explicit close() call did not guarantee.
    with urllib.request.urlopen(url) as con:
        raw = con.read()
    return raw.decode('utf-8')

def FullUrl(url):
    """Turn a site-relative link into an absolute one by prefixing ``resource_base``."""
    absolute = resource_base + url
    return absolute

def ResAlreadyExists(res):
    """Report whether the resource named by *res* was seen before.

    Only the text after the last '/' of *res* is compared.  An unseen name is
    recorded in ``allres`` as a side effect, so a second call with the same
    name returns True.
    """
    name = res.rsplit('/', 1)[-1]
    if name in allres:
        return True
    allres.append(name)
    return False

def SaveFile():
    """Write every collected download URL to ``<keywords>_res_url.txt``.

    Entries of ``all_res_list`` already end with a newline (GetRes appends
    it), so they are written as-is.  The file is opened in 'w' mode, so an
    existing file is overwritten.
    """
    # `with` closes the file even if the write fails.
    with open(keywords+'_res_url.txt', 'w') as file_object:
        file_object.writelines(all_res_list)

def Download():
    """Download every URL in ``all_res_list`` into ``<keywords>_download``.

    The directory is created when missing; each file keeps the last path
    segment of its URL as its name.
    """
    print("start to download all res")
    target_dir = keywords + "_download"
    os.makedirs(target_dir, exist_ok=True)
    for res in all_res_list:
        res = res.strip("\n")
        # os.path.join picks the platform's separator; the hard-coded "\\"
        # only yielded a path inside the directory on Windows.
        target = os.path.join(target_dir, res.split('/')[-1])
        print("download "+target)
        urllib.request.urlretrieve(res, target)

def GetArchUrl(res_str_list):
    """Return the index of the entry in *res_str_list* built for ``arch``.

    Each entry is a path such as '/zesty/arm64/libc6/download'; its third
    '/'-separated field is compared with the module-level ``arch``.  A
    one-element list is treated as the architecture-independent ("all")
    build and index 0 is returned without inspecting it.  When no entry
    matches, the list length is returned.
    """
    # ARCH all
    if len(res_str_list) == 1:
        return 0

    for position, link in enumerate(res_str_list):
        # '/zesty/arm64/libc6/download'.split('/')
        #   -> ['', 'zesty', 'arm64', 'libc6', 'download']
        if link.split('/')[2] == arch:
            return position
    return len(res_str_list)

def GetRes(url):
    """Collect the .deb URL of the package page *url* and of its dependencies.

    The package page is searched for download-page links, the one selected by
    GetArchUrl is opened, and the first mirror link found there is appended to
    ``all_res_list``.  Every "dep:" link on the package page that has not been
    visited yet is then processed recursively.

    Raises IndexError when the package page yields no usable download link or
    the download page lists no mirror.
    """
    doc_orig = GetDoc(url)

    # Download-page links found on the package page.
    # The HTML fragments of the three patterns in this function are the ones
    # the author listed in the follow-up comment under the post.
    self_res_pattern=r'<th><a href=\"([a-zA-Z0-9\.\/\-]+)\"'
    self_res_str = re.findall(self_res_pattern, doc_orig)
    index = GetArchUrl(self_res_str)
    self_res_str = FullUrl(self_res_str[index])
    #self res:https://packages.ubuntu.com/zesty/all/sendmail/download
    print("self res:"+self_res_str)
    print("\n")

    # Mirror links on the download page; the first one is kept.
    self_deb_pattern=r'<li><a href=\"([a-zA-Z0-9\.\/\-:_+]+)\"'
    doc = GetDoc(self_res_str)
    self_deb_str = re.findall(self_deb_pattern, doc)
    all_res_list.append(self_deb_str[0]+"\n")
    #deb url:http://mirrors.kernel.org/ubuntu/pool/universe/s/sendmail/sendmail_8.15.2-8ubuntu1_all.deb
    print("deb url:"+self_deb_str[0])
    print("\n")

    # The pattern has two groups, so each match is a tuple whose second item
    # (r[1]) is the dependency's link.
    dep_res_pattern=r'(?<=(dep:</span>))[\s]*<a href=\"([a-zA-Z0-9\.\/\-]+)\"'
    dep_res_str = re.findall(dep_res_pattern, doc_orig)
    for r in dep_res_str:
        if ResAlreadyExists(r[1]):
            # Skip only this dependency; returning here would drop every
            # dependency listed after an already-visited one.
            continue
        dep_url = FullUrl(r[1])
        #dep res:https://packages.ubuntu.com/zesty/sendmail-base
        print("dep res:"+dep_url)
        GetRes(dep_url)
    print("\n")

# Script entry: ask for a package name, locate its page through the site
# search, crawl it together with its dependencies, then save and download.
keywords = input("Input key word: ")
Init()
#search_url: https://packages.ubuntu.com/search?keywords=sendmail&searchon=names&suite=zesty&section=all
doc = GetDoc(search_url)

# The first search hit is used.
# NOTE(review): the published listing lost this pattern's HTML fragment; the
# one below assumes result links are rendered as
# <a class="resultlink" href="..."> -- confirm against the live search page.
first_page_pattern = r'<a class=\"resultlink\" href=\"([a-zA-Z0-9\.\/\-]+)\"'
first_page_str = re.findall(first_page_pattern, doc)
if len(first_page_str) < 1:
    print("no such res: " + keywords)
    os._exit(0)

first_page_str = FullUrl(first_page_str[0])
#first page:https://packages.ubuntu.com/zesty/sendmail
print("first page:"+first_page_str)
print("\n")

GetRes(first_page_str)
SaveFile()
Download()

阅读(2050) | 评论(2) | 转发(0) |

上一篇：AStyle 选项

下一篇：没有了

给主人留下些什么吧！~~

angrad2017-09-21 11:22:01

angrad：Line 59:  self_res_pattern=r'<th><a href=\"([a-zA-Z0-9\.\/\-]+)\"'
Line 66:  self_deb_pattern=r'<li><a href=\"([a-zA-Z0-9\.\/\-:_+]+)\"'
Line 74:  dep_res_pattern=r'(?<=(dep:</span>))[\s]*<a href=\"([a-zA-Z0-9\.\/\-]+)\"'

确保上面三个pattern是这些字符

回复 | 举报

angrad2017-09-21 11:21:45

Line 59:  self_res_pattern=r'<th><a href=\"([a-zA-Z0-9\.\/\-]+)\"'
Line 66:  self_deb_pattern=r'<li><a href=\"([a-zA-Z0-9\.\/\-:_+]+)\"'
Line 74:  dep_res_pattern=r'(?<=(dep:</span>))[\s]*<a href=\"([a-zA-Z0-9\.\/\-]+)\"'

回复 | 举报

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6