Chinaunix首页 | 论坛 | 博客
  • 博客访问: 77063
  • 博文数量: 27
  • 博客积分: 2120
  • 博客等级: 大尉
  • 技术积分: 270
  • 用 户 组: 普通用户
  • 注册时间: 2009-10-25 20:51





分类: Python/Ruby

2010-05-02 19:55:27


import urllib
from HTMLParser import HTMLParser
import re

class HtmlParserTool():
    def __init__(self):
        self.urlcontent = ''

    def set_url(self, url):
        self.urlcontent = urllib.urlopen(url).read()
        if 0 == len(self.urlcontent):
            print 'HtmlParserTool:read url',url,'error\n'

class HtmlParserGetUrl(HTMLParser):
    def __init__(self):
        self.url = []
        self.urltag = 0

    def handle_starttag(self, tag, attrs):
        """Mark that an ``<a>`` start tag has been seen.

        ``attrs`` is iterated as (name, value) pairs and the value of an
        ``href`` attribute is picked out into ``hrefv``.
        """
        if 'a' == tag:
            hrefv = ""
            for name,value in attrs:
                if name == 'href':
                    hrefv = value
            # NOTE(review): hrefv is never used after this loop in this
            # listing -- a statement storing it (presumably into self.url)
            # appears to be missing; confirm against the original script.
            self.urltag = 1

    # NOTE(review): the body of handle_data is missing from this listing, so
    # the class does not parse as shown; restore it from the original script.
    def handle_data(self, data):

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0

class HtmlSavePage():
    def __init__(self):
        """Create the link parser that urlset() reads its URL list from."""
        self.Parser = HtmlParserGetUrl()

    def urlset(self, subsite, page):
        """Search *page* for each URL in ``self.Parser.url`` and splice in *subsite*.

        NOTE(review): this listing is incomplete -- the first ``if`` below
        has no body, so the method does not parse as shown.  As listed,
        ``page`` is only rebound locally and nothing is returned.
        """
        for i in range(0, len(self.Parser.url)):
            # NOTE(review): body missing in this listing (presumably a
            # ``continue`` to skip absolute URLs -- confirm).
            if -1 != self.Parser.url[i].find('http://'):
            index = 0
            find = 0
            for j in range(0, len(self.Parser.url)):
                # The search runs on a slice, so the returned index is
                # relative to the slice start, not to the whole page.
                index = page[index:len(page)].find(self.Parser.url[i])
                if -1 != index:
                    # Look backwards for the attribute name preceding the URL.
                    index2 = page[0:index].rfind('href')
                    if -1 == index2:
                        index2 = page[0:index].rfind('HREF')
                    if -1 != index2:
                        if index - index2 > 2:
                            index = index + len(self.Parser.url[i])
                            find = 1
                    # NOTE(review): as indented here this resets find on
                    # every hit, so the insertion below never runs; confirm
                    # the intended indentation.
                    find = 0
            if 1 == find:
                page = page[0:index] + subsite + page[index:len(page)]

    def save(self, path, url, site):
        print path,'\n',url
        webtool = HtmlParserTool()
        page = webtool.urlcontent

        #replace '/xxx'
        page = re.sub('(src|SRC)="//*', 'src="' + site + '/', page)
        page = re.sub('(href|HREF)="//*', 'href="' + site + '/', page)
        page = re.sub('(background|BACKGROUND)="//*', 'background="' + site + '/', page)

        #replace 'xxx/xxx'
        index = url.rfind('/')

        if -1 != index:
            subsite = url[0:index]
            page = re.sub('(src|SRC)="(?!/|http|\.\.)', 'src="' + subsite + '/', page)
            page = re.sub('(href|HREF)="(?!/|http|\.\.)', 'href="' + subsite + '/', page)
            page = re.sub('(background|BACKGROUND)="(?!/|http|\.\.)', 'background="' + subsite + '/', page)

        #replace '../xxx'
        index = url[0:index].rfind('/')
        if -1 != index:
            index = url[0:index].rfind('/')
            if -1 != index:
                url = url[0:index]
                print 'siteurl:',url
                page = re.sub('(src|SRC)="\.\./\.\./', 'src="' + url + '/', page)
                page = re.sub('(href|HREF)="\.\./\.\./', 'href="' + url + '/', page)
                page = re.sub('(background|BACKGROUND)="\.\./\.\./', 'background="' + url + '/', page)
        savefile = open(path, 'w')

class HtmlParserTP1(HTMLParser):
    def __init__(self):
        self.title = ''
        self.url = []
        self.urltitle = []
        self.urltag = 0

    def handle_starttag(self, tag, attrs):
        """Inspect ``<a>`` tags and flag links whose href contains 'showart'.

        NOTE(review): this listing is incomplete -- the ``if`` statements
        marked below have lost their bodies, so the method does not parse
        as shown.
        """
        if 'a' == tag.lower():
            self.urlTP1 = 1
            titlev = ""
            hrefv = ""
            # Pick out the href and title attribute values, if present.
            for name,value in attrs:
                if name == 'href':
                    hrefv = value
                if name == 'title':
                    titlev = value
            # NOTE(review): body missing in this listing.
            if len(titlev) > 0:

            if -1 != hrefv.find('showart'):
                for i in range(0, len(self.url)):
                    # NOTE(review): body missing in this listing; the loop
                    # compares hrefv with every stored URL, presumably to
                    # skip duplicates -- confirm.
                    if self.url[i] == hrefv:
                self.urltag = 1

    def handle_data(self, data):
        """Handle text seen while ``self.urltag`` is 1."""
        # NOTE(review): the body of this ``if`` is missing from this listing.
        if 1 == self.urltag:

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0

    # NOTE(review): the body of set_url is missing from this listing, so the
    # class does not parse as shown; restore it from the original script.
    def set_url(self, url):

class HtmlParserTP2(HTMLParser):
    def __init__(self):
        self.title = ''
        self.url = []
        self.urltitle = []
        self.urltag = 0
        self.tmp1 = ''

    def handle_starttag(self, tag, attrs):
        if 'a' == tag.lower():
            self.urlTP1 = 1
            titlev = ""
            hrefv = ""
            for name,value in attrs:
                if name == 'href':
                    hrefv = value

            if re.match('thread-[0-9]*-1-1.html', hrefv):
            #if -1 != hrefv.find('thread-') and -1 != hrefv.find('.html') :
                self.tmp1 = hrefv
                self.urltag = 1

    def handle_data(self, data):
        """Handle text seen while a matching link is open (``urltag == 1``).

        NOTE(review): this listing is incomplete -- the innermost ``if``
        has lost its body, so the method does not parse as shown.
        """
        if 1 == self.urltag:
            # Only text longer than two characters is considered.
            if len(data) > 2:
                for i in range(0, len(self.url)):
                    # NOTE(review): body missing; the loop compares the
                    # remembered href with every stored URL, presumably to
                    # skip duplicates -- confirm.
                    if self.url[i] == self.tmp1:

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0

    # NOTE(review): the body of set_url is missing from this listing, so the
    # class does not parse as shown; restore it from the original script.
    def set_url(self, url):

class HtmlTool():
    def __init__(self):
        self.HtmlTp = ''
        self.urlfile = '' = ''
        self.subsite = ''
        self.savepath = ''
        self.needsavepage = 1
        self.UrlTool = HtmlParserTool()

    def set_no_save_page(self):
        """Disable page saving by setting ``needsavepage`` to 0."""
        self.needsavepage = 0

    def set_savepath(self, path):
        """Store *path* in ``self.savepath``."""
        self.savepath = path

    def get_site(self):
        urlhead = 'http://'
        index = self.urlfile[len(urlhead) : len(self.urlfile)].find('/') = self.urlfile[0 : index + len(urlhead)]
        index = self.urlfile.rfind('/')
        self.subsite = self.urlfile[0:index]
        print 'site:',
        print 'sub site:',self.subsite

    def set_tp(self, tp):
        """Select the parser template; set_url() compares it with 'TP1' and 'TP2'."""
        self.HtmlTp = tp

    def open_file(self, file):
        """Open *file* in 'w+' mode and keep the handle in ``self.file``."""
        # 'w+' truncates an existing file.
        self.file = open(file, 'w+')

    def close_file(self):

    # NOTE(review): several statements of set_url are missing from this
    # listing (an ``if`` without a body and a dangling continuation line at
    # the end), so the method does not parse as shown.
    def set_url(self, url):
        self.urlfile = url

        code = 1

        # Pick the parser class matching the template set via set_tp().
        if('TP1' == self.HtmlTp):
            self.HtmlParser = HtmlParserTP1()

        if('TP2' == self.HtmlTp):
            self.HtmlParser = HtmlParserTP2()
            code = 0
        # NOTE(review): nothing visible here fetches the page or feeds it to
        # the parser, so self.HtmlParser.url is still empty at this point.
        if len(self.HtmlParser.url) > 0:
            print len(self.HtmlParser.url),len(self.HtmlParser.urltitle)
            for i in range(0, len(self.HtmlParser.url)):
                fileurl = self.subsite + '/' + self.HtmlParser.url[i]
                self.file.write(' <- ')
                # NOTE(review): body missing in this listing.
                if 0 == code:
                #print fileurl

                if 1 == self.needsavepage:
                    savetool = HtmlSavePage()
           + self.HtmlParser.url[i], fileurl,

# Scratch checks for the regular expressions used by HtmlSavePage.save().
subject = ''
# Prefix src values that are neither root-relative, http nor '..'-relative.
result = re.sub('(src|SRC)="(?!(/|http|\.\.))', 'src="http:/bbx/', subject)
print result
# Rewrite src values that start with '../../'.
result = re.sub('src="\.\./\.\./', 'src="http:/xxv/', result)
print result
# The pattern HtmlParserTP2 applies to hrefs.
if re.match('thread-[0-9]*-1-1.html', 'thread-1698004-1-1.html'):
    print 'ok'

# Selects which of the runs below is executed.
s = 1

#get blog from CU
# NOTE(review): only the constructor calls are present in this listing; the
# calls that configure and start each run appear to be missing.
if 1 == s:
    webtool = HtmlTool()

#get forum from CU
if 2 == s:
    webtool = HtmlTool()



import urllib
from HTMLParser import HTMLParser
import re

class HtmlParserTool():
    def __init__(self):
        self.urlcontent = ''

    def set_url(self, url):
        self.urlcontent = urllib.urlopen(url).read()
        if 0 == len(self.urlcontent):
            print 'HtmlParserTool:read url',url,'error\n'

class HtmlParserGetUrl(HTMLParser):
    def __init__(self):
        self.url = []
        self.urltag = 0

    def handle_starttag(self, tag, attrs):
        """Mark that an ``<a>`` start tag has been seen.

        ``attrs`` is iterated as (name, value) pairs and the value of an
        ``href`` attribute is picked out into ``hrefv``.
        """
        if 'a' == tag:
            hrefv = ""
            for name,value in attrs:
                if name == 'href':
                    hrefv = value
            # NOTE(review): hrefv is never used after this loop in this
            # listing -- a statement storing it (presumably into self.url)
            # appears to be missing; confirm against the original script.
            self.urltag = 1

    # NOTE(review): the body of handle_data is missing from this listing, so
    # the class does not parse as shown; restore it from the original script.
    def handle_data(self, data):

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0

class HtmlSavePage():
    def __init__(self):
        """Create the link parser that urlset() reads its URL list from."""
        self.Parser = HtmlParserGetUrl()

    def urlset(self, subsite, page):
        """Search *page* for each URL in ``self.Parser.url`` and splice in *subsite*.

        NOTE(review): this listing is incomplete -- the first ``if`` below
        has no body, so the method does not parse as shown.  As listed,
        ``page`` is only rebound locally and nothing is returned.
        """
        for i in range(0, len(self.Parser.url)):
            # NOTE(review): body missing in this listing (presumably a
            # ``continue`` to skip absolute URLs -- confirm).
            if -1 != self.Parser.url[i].find('http://'):
            index = 0
            find = 0
            for j in range(0, len(self.Parser.url)):
                # The search runs on a slice, so the returned index is
                # relative to the slice start, not to the whole page.
                index = page[index:len(page)].find(self.Parser.url[i])
                if -1 != index:
                    # Look backwards for the attribute name preceding the URL.
                    index2 = page[0:index].rfind('href')
                    if -1 == index2:
                        index2 = page[0:index].rfind('HREF')
                    if -1 != index2:
                        if index - index2 > 2:
                            index = index + len(self.Parser.url[i])
                            find = 1
                    # NOTE(review): as indented here this resets find on
                    # every hit, so the insertion below never runs; confirm
                    # the intended indentation.
                    find = 0
            if 1 == find:
                page = page[0:index] + subsite + page[index:len(page)]

    def save(self, path, url, site):
        print path,'\n',url
        webtool = HtmlParserTool()
        page = webtool.urlcontent

        #replace '/xxx'
        page = re.sub('(src|SRC)="//*', 'src="' + site + '/', page)
        page = re.sub('(href|HREF)="//*', 'href="' + site + '/', page)
        page = re.sub('(background|BACKGROUND)="//*', 'background="' + site + '/', page)

        #replace 'xxx/xxx'
        index = url.rfind('/')

        if -1 != index:
            subsite = url[0:index]
            page = re.sub('(src|SRC)="(?!/|http|\.\.)', 'src="' + subsite + '/', page)
            page = re.sub('(href|HREF)="(?!/|http|\.\.)', 'href="' + subsite + '/', page)
            page = re.sub('(background|BACKGROUND)="(?!/|http|\.\.)', 'background="' + subsite + '/', page)

        #replace '../xxx'
        index = url[0:index].rfind('/')
        if -1 != index:
            index = url[0:index].rfind('/')
            if -1 != index:
                url = url[0:index]
                print 'siteurl:',url
                page = re.sub('(src|SRC)="\.\./\.\./', 'src="' + url + '/', page)
                page = re.sub('(href|HREF)="\.\./\.\./', 'href="' + url + '/', page)
                page = re.sub('(background|BACKGROUND)="\.\./\.\./', 'background="' + url + '/', page)
        savefile = open(path, 'w')

class HtmlParserUrlTp1(HTMLParser):
    def __init__(self):
        self.url = []
        self.urltitle = []
        self.urltag = 0
        self.urlregex = ''
        self.urltmp = ''

    def set_urlregex(self, regex):
        """Set the pattern that handle_starttag() matches hrefs against."""
        self.urlregex = regex

    def handle_starttag(self, tag, attrs):
        """Flag ``<a>`` tags whose href matches ``self.urlregex``."""
        if 'a' == tag:
            for name,value in attrs:
                if name == 'href':
                    self.urltmp = value

            #print self.urltmp,'\n'
            # re.match anchors the pattern at the start of the href.
            if re.match(self.urlregex, self.urltmp):
                self.urltag = 1
                # NOTE(review): as listed, the href is cleared right after a
                # successful match, so handle_data() only ever sees ''.  A
                # line (possibly an ``else:``) may be missing -- confirm.
                self.urltmp = ''

    def handle_data(self, data):
        """Handle text seen while a matching link is open (``urltag == 1``).

        NOTE(review): as listed, nothing is ever added to ``self.url`` or
        ``self.urltitle`` here; statements appear to be missing from this
        listing -- confirm against the original script.
        """
        if 1 == self.urltag:
            # Only text longer than two characters is considered.
            if len(data) > 2:
                # Compare the pending href with every stored URL.
                for i in range(0, len(self.url)):
                    if self.url[i] == self.urltmp:
                        self.urltmp = ''
                self.urltmp = ''

    def handle_endtag(self, tag):
        if 1 == self.urltag:
            self.urltag = 0

    # NOTE(review): the body of set_url is missing from this listing, so the
    # class does not parse as shown; restore it from the original script.
    def set_url(self, url):

class HtmlTool():
    def __init__(self):
        self.HtmlTp = ''
        self.urlfile = '' = ''
        self.subsite = ''
        self.savepath = ''
        self.needsavepage = 1 #do you want to save page, 0 is not save and 1 is save
        self.code = '' #set html string code,such as gbk,utf-8,etc...
        self.regex = '' #set url parser's regex
        self.UrlTool = HtmlParserTool()

    def set_no_save_page(self):
        """Disable page saving by setting ``needsavepage`` to 0."""
        self.needsavepage = 0

    def set_code(self, code):
        """Store the HTML character encoding name in ``self.code``."""
        self.code = code

    def set_urlregex(self, regex):
        """Store the URL-matching regex in ``self.regex``."""
        self.regex = regex

    def set_savepath(self, path):
        """Store *path* in ``self.savepath``."""
        self.savepath = path

    def get_site(self):
        urlhead = '
        index = self.urlfile[len(urlhead) : len(self.urlfile)].find('
/') = self.urlfile[0 : index + len(urlhead)]
        index = self.urlfile.rfind('
        self.subsite = self.urlfile[0:index]
        print '
        print '
sub site:',self.subsite

    def set_tp(self, tp):
        """Store the parser template name that set_url() checks."""
        self.HtmlTp = tp

    def open_file(self, file):
        """Open *file* and keep the handle in ``self.file``."""
        # NOTE(review): the mode string is cut off in this listing (the
        # earlier listing in this file uses 'w+') -- confirm and restore.
        self.file = open(file, '

    def close_file(self):

    # NOTE(review): this method is badly damaged in this listing -- string
    # literals are cut off after their opening quote and several statements
    # are missing, so it does not parse as shown.  Restore it from the
    # original script before use.
    def set_url(self, url):
        self.urlfile = url

        if '
URL_TP1' == self.HtmlTp:
            self.HtmlParser = HtmlParserUrlTp1()
        if '
' == self.UrlTool.urlcontent:
get ',url,' error')
            urlfile = open('
urlcontent.txt', 'w')
        if '
' != self.regex:

        if '
' != self.code:

        if 0 == len(self.HtmlParser.url):
get url error')

        #print len(self.HtmlParser.url),len(self.HtmlParser.urltitle)
        for i in range(0, len(self.HtmlParser.url)):
            fileurl = self.subsite + '
/' + self.HtmlParser.url[i]
<- ')
            if 0 == len(self.code):
                print len(self.code)
            #print fileurl

            if 1 == self.needsavepage:
                savetool = HtmlSavePage()
       + self.HtmlParser.url[i], fileurl,

run end\n')

# NOTE(review): the driver script below is damaged in this listing -- string
# literals are cut off after their opening quote and the calls that
# configure each run are missing.  Restore from the original before use.
# s selects which of the runs below is executed.
s = 2

#get blog from CU
if 1 == s:
    webtool = HtmlTool()


if 2 == s:
    webtool = HtmlTool()
    print len(webtool.HtmlParser.url)
    for i in range(0, len(webtool.HtmlParser.url)):
        webfiltertool = HtmlParserTool()
        if -1 != webfiltertool.urlcontent.find('
            print '
find:(',webtool.HtmlParser.url[i],') -> ',webtool.HtmlParser.urltitle[i].encode('utf-8'),'\n'

subject = '
<a SRC="aaaaa"><a src=""><a src="/aaaa"><a src="../../aaa">'
result = re.sub('
(src|SRC)="(?!(/|http|\.\.))', 'src="http:/bbx/', subject)
print result
result = re.sub('
src="\.\./\.\./', 'src="http:/xxv/', result)
print result
if re.match('
showart_[0-9]*.html', 'showart_2106707.html'):
    print '

阅读(790) | 评论(1) | 转发(0) |

pikyshen2010-05-03 21:55:41

修改版: import urllib from HTMLParser import HTMLParser import re class HtmlParserTool(): def __init__(self): self.urlcontent = '' def set_url(self, url): self.urlcontent = urllib.urlopen(url).read() if 0 == len(self.urlcontent): print 'HtmlParserTool:read url',url,'error\n' class HtmlParserGetUrl(HTMLParser): def __init__(self): self.url = [] self.urltag = 0 HTMLParser.__init__(self) def handl