Chinaunix首页 | 论坛 | 博客
  • 博客访问: 75383
  • 博文数量: 27
  • 博客积分: 2120
  • 博客等级: 大尉
  • 技术积分: 270
  • 用 户 组: 普通用户
  • 注册时间: 2009-10-25 20:51
文章分类
文章存档

2011年(1)

2010年(10)

2009年(16)

我的朋友

分类: Python/Ruby

2010-05-02 19:55:27


获取博客和论坛帖子链接:

import urllib
from HTMLParser import HTMLParser
import re

class HtmlParserTool():
    def __init__(self):
        self.urlcontent = ''

    def set_url(self, url):
        self.urlcontent = urllib.urlopen(url).read()
        if 0 == len(self.urlcontent):
            print 'HtmlParserTool:read url',url,'error\n'

class HtmlParserGetUrl(HTMLParser):
    """Collects the href value of every <a> tag fed to the parser.

    After feed(), self.url holds one entry per anchor, in document
    order ('' when the anchor carried no href attribute).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.url = []      # href of each <a> seen
        self.urltag = 0    # 1 while inside an <a> element

    def handle_starttag(self, tag, attrs):
        """Record the href of each anchor; '' when the attribute is absent."""
        if tag != 'a':
            return
        href = dict(attrs).get('href', "")
        self.url.append(href)
        self.urltag = 1

    def handle_data(self, data):
        """Text content is ignored by this parser."""
        pass

    def handle_endtag(self, tag):
        """The first closing tag after an anchor clears the anchor flag."""
        if self.urltag == 1:
            self.urltag = 0

class HtmlSavePage():
    """Downloads a page and rewrites its relative resource links to
    absolute ones before saving it to disk."""

    def __init__(self):
        # Parser used to enumerate every href present in a page.
        self.Parser = HtmlParserGetUrl()

    def urlset(self, subsite, page):
        """Prefix relative hrefs found in page with subsite.

        NOTE(review): index is taken from page[index:].find(...) (slice
        relative) but is then used as an absolute offset in page[0:index];
        the bookkeeping looks inconsistent -- verify before relying on
        this method.
        """
        self.Parser.feed(page)
        for i in range(0, len(self.Parser.url)):
            # Absolute links are left untouched.
            if -1 != self.Parser.url[i].find('http://'):
                continue
            index = 0
            find = 0
            for j in range(0, len(self.Parser.url)):
                index = page[index:len(page)].find(self.Parser.url[i])
                if -1 != index:
                    # Find the attribute name that precedes the match.
                    index2 = page[0:index].rfind('href')
                    if -1 == index2:
                        index2 = page[0:index].rfind('HREF')
                    if -1 != index2:
                        if index - index2 > 2:
                            # Not directly after href=; skip past this match.
                            index = index + len(self.Parser.url[i])
                            continue
                        else:
                            find = 1
                else:
                    find = 0
                    break
            if 1 == find:
                # Splice the sub-site prefix in front of the relative link.
                page = page[0:index] + subsite + page[index:len(page)]

    def save(self, path, url, site):
        """Fetch url, absolutize src/href/background references via regex
        rewriting, and write the result to path."""
        print path,'\n',url
        webtool = HtmlParserTool()
        webtool.set_url(url)
        page = webtool.urlcontent

        #replace '/xxx' (leading slash -> site root)
        page = re.sub('(src|SRC)="//*', 'src="' + site + '/', page)
        page = re.sub('(href|HREF)="//*', 'href="' + site + '/', page)
        page = re.sub('(background|BACKGROUND)="//*', 'background="' + site + '/', page)

        #replace 'xxx/xxx' (bare relative path -> relative to the page dir)
        index = url.rfind('/')

        if -1 != index:
            subsite = url[0:index]
            page = re.sub('(src|SRC)="(?!/|http|\.\.)', 'src="' + subsite + '/', page)
            page = re.sub('(href|HREF)="(?!/|http|\.\.)', 'href="' + subsite + '/', page)
            page = re.sub('(background|BACKGROUND)="(?!/|http|\.\.)', 'background="' + subsite + '/', page)

        #replace '../xxx' (two directory levels up from the page URL)
        index = url[0:index].rfind('/')
        if -1 != index:
            index = url[0:index].rfind('/')
            if -1 != index:
                url = url[0:index]
                print 'siteurl:',url
                page = re.sub('(src|SRC)="\.\./\.\./', 'src="' + url + '/', page)
                page = re.sub('(href|HREF)="\.\./\.\./', 'href="' + url + '/', page)
                page = re.sub('(background|BACKGROUND)="\.\./\.\./', 'background="' + url + '/', page)
        savefile = open(path, 'w')
        savefile.write(page)
        savefile.close()

class HtmlParserTP1(HTMLParser):
    """Extracts blog-article links (hrefs containing 'showart') and their
    anchor text from a ChinaUnix blog index page.

    Anchors that carry a title attribute are skipped, each href is
    recorded at most once, and self.url / self.urltitle grow in step.
    """

    def __init__(self):
        self.title = ''
        self.url = []       # unique article hrefs, in first-seen order
        self.urltitle = []  # anchor text captured for each recorded href
        self.urltag = 0     # 1 while inside an anchor we decided to keep
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Decide whether an opening <a> is a new article link."""
        if tag.lower() != 'a':
            return
        self.urlTP1 = 1  # legacy flag kept for compatibility
        href = ""
        title = ""
        for key, val in attrs:
            if key == 'href':
                href = val
            elif key == 'title':
                title = val
        # Anchors with an explicit title attribute are not article links here.
        if title:
            return
        if 'showart' not in href:
            return
        if href in self.url:
            return  # already recorded this article
        self.url.append(href)
        self.urltag = 1

    def handle_data(self, data):
        """Capture the text inside a recorded anchor as its title."""
        if self.urltag == 1:
            self.urltitle.append(data)

    def handle_endtag(self, tag):
        """Any closing tag ends title capture."""
        if self.urltag == 1:
            self.urltag = 0

    def set_url(self, url):
        """Fetch url; the body is read and discarded (legacy behavior)."""
        urllib.urlopen(url).read()

class HtmlParserTP2(HTMLParser):
    """Extracts forum thread links matching 'thread-<id>-1-1.html' and the
    anchor text of each, skipping duplicates and titles of <= 2 chars."""

    def __init__(self):
        self.title = ''
        self.url = []       # unique thread hrefs
        self.urltitle = []  # title text paired with each href
        self.urltag = 0     # 1 while inside a matching anchor
        self.tmp1 = ''      # href of the anchor currently being processed
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Arm title capture when an anchor's href looks like a thread link."""
        if tag.lower() != 'a':
            return
        self.urlTP1 = 1  # legacy flag kept for compatibility
        href = ""
        for key, val in attrs:
            if key == 'href':
                href = val
        if re.match('thread-[0-9]*-1-1.html', href):
            self.tmp1 = href
            self.urltag = 1

    def handle_data(self, data):
        """Record href/title once per thread; titles must exceed 2 chars."""
        if self.urltag != 1:
            return
        if len(data) <= 2:
            return
        if self.tmp1 in self.url:
            return  # duplicate thread link
        self.url.append(self.tmp1)
        self.urltitle.append(data)

    def handle_endtag(self, tag):
        """Any closing tag disarms title capture."""
        if self.urltag == 1:
            self.urltag = 0

    def set_url(self, url):
        """Fetch url; the body is read and discarded (legacy behavior)."""
        urllib.urlopen(url).read()

class HtmlTool():
    """Driver: fetches an index page, extracts article/thread links with a
    TP1 or TP2 parser, and writes 'url <- title' lines to a log file,
    optionally saving every linked page via HtmlSavePage."""

    def __init__(self):
        self.HtmlTp = ''            # parser selector: 'TP1' (blog) or 'TP2' (forum)
        self.urlfile = ''           # index page URL
        self.site = ''              # scheme+host portion of urlfile
        self.subsite = ''           # urlfile minus its last path component
        self.savepath = ''          # directory prefix for saved pages
        self.needsavepage = 1       # 1 = save each linked page, 0 = log only
        self.UrlTool = HtmlParserTool()

    def set_no_save_page(self):
        """Only log links; do not download the linked pages."""
        self.needsavepage = 0

    def set_savepath(self, path):
        """Set the directory prefix for saved pages."""
        self.savepath = path

    def get_site(self):
        """Derive self.site and self.subsite from self.urlfile.

        Assumes urlfile starts with 'http://' -- TODO confirm callers
        never pass another scheme.
        """
        urlhead = 'http://'
        index = self.urlfile[len(urlhead) : len(self.urlfile)].find('/')
        self.site = self.urlfile[0 : index + len(urlhead)]
        index = self.urlfile.rfind('/')
        self.subsite = self.urlfile[0:index]
        print 'site:',self.site
        print 'sub site:',self.subsite

    def set_tp(self, tp):
        """Choose the parser type ('TP1' or 'TP2')."""
        self.HtmlTp = tp

    def open_file(self, file):
        """Open the log file that receives 'url <- title' lines."""
        self.file = open(file, 'w+')

    def close_file(self):
        """Close the log file."""
        self.file.close()

    def set_url(self, url):
        """Fetch url, run the selected parser, and log all extracted links.

        code stays 1 on the TP1 path (titles are raw gbk bytes) and is set
        to 0 on the TP2 path (page decoded to unicode before parsing).
        """
        self.urlfile = url
        self.get_site()

        code = 1

        if('TP1' == self.HtmlTp):
            self.HtmlParser = HtmlParserTP1()
            self.UrlTool.set_url(url)
            self.HtmlParser.feed(self.UrlTool.urlcontent)

        if('TP2' == self.HtmlTp):
            self.HtmlParser = HtmlParserTP2()
            self.UrlTool.set_url(url)
            self.HtmlParser.feed(self.UrlTool.urlcontent.decode('gbk'))
            code = 0

        if len(self.HtmlParser.url) > 0:
            print len(self.HtmlParser.url),len(self.HtmlParser.urltitle)
            for i in range(0, len(self.HtmlParser.url)):
                fileurl = self.subsite + '/' + self.HtmlParser.url[i]
                self.file.write(fileurl)
                self.file.write(' <- ')
                if 0 == code:
                    # Title is already unicode; write as UTF-8.
                    self.file.write(self.HtmlParser.urltitle[i].encode('utf-8'))
                else:
                    # Title is raw gbk bytes; transcode to UTF-8.
                    self.file.write(self.HtmlParser.urltitle[i].decode('gbk').encode('utf-8'))
                #print fileurl
                self.file.write("\r\n")

                if 1 == self.needsavepage:
                    savetool = HtmlSavePage()
                    savetool.save(self.savepath + self.HtmlParser.url[i], fileurl, self.site)

                self.file.write("\r\n")


#test: exercise the regex patterns used by HtmlSavePage.save()
subject = ''
result = re.sub('(src|SRC)="(?!(/|http|\.\.))', 'src="http:/bbx/', subject)
print result
result = re.sub('src="\.\./\.\./', 'src="http:/xxv/', result)
print result
if re.match('thread-[0-9]*-1-1.html', 'thread-1698004-1-1.html'):
    print 'ok'


#switch: 1 = grab blog posts (TP1), 2 = grab forum threads (TP2)
s = 1

#get blog from CU
if 1 == s:
    webtool = HtmlTool()
    webtool.set_tp('TP1')
    webtool.set_savepath('./web/')
    webtool.open_file('web.txt')
    #webtool.set_url('http://blog.chinaunix.net/u1/44067/index.php')
    webtool.set_url('http://blog.chinaunix.net/u2/62235/index.html')
    webtool.close_file()

#get forum from CU
if 2 == s:
    webtool = HtmlTool()
    webtool.set_tp('TP2')
    webtool.set_no_save_page()
    webtool.open_file('web.txt')
    # NOTE(review): the forum URL is left empty here -- fill one in before
    # running mode 2.
    webtool.set_url('')
    webtool.close_file()
    

    


获取精华贴,不过速度比较慢


import urllib
from HTMLParser import HTMLParser
import re

class HtmlParserTool():
    def __init__(self):
        self.urlcontent = ''
        return

    def set_url(self, url):
        self.urlcontent = urllib.urlopen(url).read()
        if 0 == len(self.urlcontent):
            print 'HtmlParserTool:read url',url,'error\n'
        return

class HtmlParserGetUrl(HTMLParser):
    """Accumulates the href attribute of every anchor tag that is fed in."""

    def __init__(self):
        HTMLParser.__init__(self)
        # One entry per <a> encountered; '' if the anchor had no href.
        self.url = []
        # Flag: 1 while parsing inside an anchor element.
        self.urltag = 0

    def handle_starttag(self, tag, attrs):
        """On every <a>, store its href ('' when missing) and set the flag."""
        if tag == 'a':
            found = ""
            for attr in attrs:
                if attr[0] == 'href':
                    found = attr[1]
            self.url.append(found)
            self.urltag = 1

    def handle_data(self, data):
        """Character data is not used by this parser."""
        pass

    def handle_endtag(self, tag):
        """Clear the anchor flag at the first closing tag after an anchor."""
        if self.urltag:
            self.urltag = 0

class HtmlSavePage():
    def __init__(self):
        self.Parser = HtmlParserGetUrl()
        return

    def urlset(self, subsite, page):
        self.Parser.feed(page)
        for i in range(0, len(self.Parser.url)):
            if -1 != self.Parser.url[i].find('http://'):
                continue
            index = 0
            find = 0
            for j in range(0, len(self.Parser.url)):
                index = page[index:len(page)].find(self.Parser.url[i])
                if -1 != index:
                    index2 = page[0:index].rfind('href')
                    if -1 == index2:
                        index2 = page[0:index].rfind('HREF')
                    if -1 != index2:
                        if index - index2 > 2:
                            index = index + len(self.Parser.url[i])
                            continue
                        else:
                            find = 1
                else:
                    find = 0
                    break
            if 1 == find:
                page = page[0:index] + subsite + page[index:len(page)]
            return

    def save(self, path, url, site):
        print path,'\n',url
        webtool = HtmlParserTool()
        webtool.set_url(url)
        page = webtool.urlcontent

        #replace '/xxx'
        page = re.sub('(src|SRC)="//*', 'src="' + site + '/', page)
        page = re.sub('(href|HREF)="//*', 'href="' + site + '/', page)
        page = re.sub('(background|BACKGROUND)="//*', 'background="' + site + '/', page)

        #replace 'xxx/xxx'
        index = url.rfind('/')

        if -1 != index:
            subsite = url[0:index]
            page = re.sub('(src|SRC)="(?!/|http|\.\.)', 'src="' + subsite + '/', page)
            page = re.sub('(href|HREF)="(?!/|http|\.\.)', 'href="' + subsite + '/', page)
            page = re.sub('(background|BACKGROUND)="(?!/|http|\.\.)', 'background="' + subsite + '/', page)

        #replace '../xxx'
        index = url[0:index].rfind('/')
        if -1 != index:
            index = url[0:index].rfind('/')
            if -1 != index:
                url = url[0:index]
                print 'siteurl:',url
                page = re.sub('(src|SRC)="\.\./\.\./', 'src="' + url + '/', page)
                page = re.sub('(href|HREF)="\.\./\.\./', 'href="' + url + '/', page)
                page = re.sub('(background|BACKGROUND)="\.\./\.\./', 'background="' + url + '/', page)
        savefile = open(path, 'w')
        savefile.write(page)
        savefile.close()
        return

class HtmlParserUrlTp1(HTMLParser):
    """Generic link extractor: records hrefs matching a caller-supplied
    regex together with the anchor text that follows them.

    Call set_urlregex() before feed(); matching hrefs land in self.url
    and their text in self.urltitle (duplicates and titles of <= 2
    characters are dropped).
    """

    def __init__(self):
        self.url = []        # matched hrefs, first-seen order, unique
        self.urltitle = []   # anchor text paired with each href
        self.urltag = 0      # 1 while inside an anchor whose href matched
        self.urlregex = ''   # pattern supplied via set_urlregex()
        self.urltmp = ''     # href of the anchor currently under inspection
        HTMLParser.__init__(self)

    def set_urlregex(self, regex):
        """Set the regex an href must match (anchored at the string start)."""
        self.urlregex = regex

    def handle_starttag(self, tag, attrs):
        """Arm capture when an anchor's href matches the configured regex."""
        if tag != 'a':
            return
        for key, val in attrs:
            if key == 'href':
                self.urltmp = val
        # NOTE(review): urltmp is not cleared first, so an href-less <a>
        # re-tests the previous anchor's href -- behavior preserved as-is.
        if re.match(self.urlregex, self.urltmp):
            self.urltag = 1
        else:
            self.urltmp = ''

    def handle_data(self, data):
        """Store the pending href and its text, once, for real titles."""
        if self.urltag != 1 or len(data) <= 2:
            return
        if self.urltmp in self.url:
            self.urltmp = ''
            return
        self.url.append(self.urltmp)
        self.urltitle.append(data)
        self.urltmp = ''

    def handle_endtag(self, tag):
        """Disarm capture at the next closing tag."""
        if self.urltag == 1:
            self.urltag = 0

    def set_url(self, url):
        """Fetch url; the response body is read and discarded."""
        urllib.urlopen(url).read()


class HtmlTool():
    def __init__(self):
        self.HtmlTp = ''
        self.urlfile = ''
        self.site = ''
        self.subsite = ''
        self.savepath = ''
        self.needsavepage = 1 #do you want to save page, 0 is not save and 1 is save
        self.code = '' #set html string code,such as gbk,utf-8,etc...
        self.regex = '' #set url parser's regex
        self.UrlTool = HtmlParserTool()
        return

    def set_no_save_page(self):
        self.needsavepage = 0
        return

    def set_code(self, code):
        self.code = code
        return

    def set_urlregex(self, regex):
        self.regex = regex
        return

    def set_savepath(self, path):
        self.savepath = path
        return

    def get_site(self):
        urlhead = '
http://'
        index = self.urlfile[len(urlhead) : len(self.urlfile)].find('
/')
        self.site = self.urlfile[0 : index + len(urlhead)]
        index = self.urlfile.rfind('
/')
        self.subsite = self.urlfile[0:index]
        print '
site:',self.site
        print '
sub site:',self.subsite
        return

    def set_tp(self, tp):
        self.HtmlTp = tp
        return

    def open_file(self, file):
        self.file = open(file, '
w')
        return

    def close_file(self):
        self.file.close()
        return

    def set_url(self, url):
        self.urlfile = url
        self.get_site()

        if '
URL_TP1' == self.HtmlTp:
            self.HtmlParser = HtmlParserUrlTp1()
            self.UrlTool.set_url(url)
            
        if '
' == self.UrlTool.urlcontent:
            print('
get ',url,' error')
            return
        else:
            urlfile = open('
urlcontent.txt', 'w')
            urlfile.write(self.UrlTool.urlcontent)
            urlfile.close()
            
        if '
' != self.regex:
            self.HtmlParser.set_urlregex(self.regex)

        if '
' != self.code:
            self.HtmlParser.feed(self.UrlTool.urlcontent.decode(self.code))
        else:
            self.HtmlParser.feed(self.UrlTool.urlcontent)

        if 0 == len(self.HtmlParser.url):
            print('
get url error')
            return

        #print len(self.HtmlParser.url),len(self.HtmlParser.urltitle)
        for i in range(0, len(self.HtmlParser.url)):
            fileurl = self.subsite + '
/' + self.HtmlParser.url[i]
            self.file.write(fileurl)
            self.file.write('
<- ')
            if 0 == len(self.code):
                self.file.write(self.HtmlParser.urltitle[i])
            else:
                print len(self.code)
                self.file.write(self.HtmlParser.urltitle[i].encode('
utf-8'))
            #print fileurl
            self.file.write("\r\n")

            if 1 == self.needsavepage:
                savetool = HtmlSavePage()
                savetool.save(self.savepath + self.HtmlParser.url[i], fileurl, self.site)

            self.file.write("\r\n")
        print('
run end\n')
        return


#switch
s = 2

#get blog from CU
if 1 == s:
    webtool = HtmlTool()
    webtool.set_tp('
URL_TP1')
    #webtool.set_no_save_page()
    webtool.set_savepath('
./web/')
    webtool.open_file('
web.txt')
    #webtool.set_code('
gbk')
    webtool.set_urlregex('
showart_[0-9]*.html')
    webtool.set_url('
http://blog.chinaunix.net/u3/105068/index.html')
    webtool.close_file()

#showart

if 2 == s:
    webtool = HtmlTool()
    webtool.set_tp('
URL_TP1')
    webtool.set_no_save_page()
    webtool.open_file('
web.txt')
    webtool.set_code('
gbk')
    webtool.set_urlregex('
thread-[0-9]*-1-1.html')
    webtool.set_url('
http://linux.chinaunix.net/bbs/forum-8-1.html')
    webtool.close_file()
    print len(webtool.HtmlParser.url)
    for i in range(0, len(webtool.HtmlParser.url)):
        webfiltertool = HtmlParserTool()
        webfiltertool.set_url('
http://linux.chinaunix.net/bbs/'+webtool.HtmlParser.url[i])
        if -1 != webfiltertool.urlcontent.find('
001.gif'):
            print '
find:(',webtool.HtmlParser.url[i],') -> ',webtool.HtmlParser.urltitle[i].encode('utf-8'),'\n'







#test
subject = '
<a SRC="aaaaa"><a src=""><a src="/aaaa"><a src="../../aaa">'
result = re.sub('
(src|SRC)="(?!(/|http|\.\.))', 'src="http:/bbx/', subject)
print result
result = re.sub('
src="\.\./\.\./', 'src="http:/xxv/', result)
print result
if re.match('
showart_[0-9]*.html', 'showart_2106707.html'):
    print '
ok


阅读(737) | 评论(1) | 转发(0) |
给主人留下些什么吧!~~

pikyshen2010-05-03 21:55:41

修改版: import urllib from HTMLParser import HTMLParser import re class HtmlParserTool(): def __init__(self): self.urlcontent = '' def set_url(self, url): self.urlcontent = urllib.urlopen(url).read() if 0 == len(self.urlcontent): print 'HtmlParserTool:read url',url,'error\n' class HtmlParserGetUrl(HTMLParser): def __init__(self): self.url = [] self.urltag = 0 HTMLParser.__init__(self) def handl