from sgmllib import SGMLParser
class URLLister(SGMLParser):
    """SGML parser that gathers ``href`` attribute values into ``self.urls``.

    SGMLParser routes a start tag through ``handle_starttag`` only when a
    ``start_<tag>`` or ``do_<tag>`` method exists for it, so the handler
    methods defined on this class decide which tags are inspected.  Only
    ``start_a`` is active here, i.e. only ``<a>`` tags contribute URLs.
    """

    def reset(self):
        """Reset the base parser and start over with empty result lists."""
        SGMLParser.reset(self)
        self.urls = []
        # Not read or written anywhere else in this class.
        self.url = []

    def handle_starttag(self, tag, method, attributes):
        """Record every ``href`` value found on a handled start tag.

        ``attributes`` is iterated as (name, value) pairs.  ``tag`` and
        ``method`` are ignored: the tag-specific handler is deliberately not
        invoked, only the attributes are examined.
        """
        for name, value in attributes:
            if name == 'href':
                self.urls.append(value)

    def start_a(self, attributes):
        """No-op handler; it exists so that ``<a>`` tags are dispatched."""
        pass

    # Disabled handlers kept from the original listing.  Enabling one would
    # make the corresponding tag go through handle_starttag as well.
    #def start_td(self,attrs):
    # pass
    #def start_div(self,attrs):
    # pass
    #def do_table(self,attrs):
    # pass
import urllib
import mp3parser
from sgmllib import SGMLParser
import re
class HtmlParser():
    """Download an HTML page and extract URLs from it."""

    def __init__(self):
        pass

    def readHtml(self, url):
        """Fetch *url* and return the complete response body.

        Args:
            url: Address handed unchanged to ``urllib.urlopen``.

        Returns:
            Whatever ``read()`` on the opened resource returns.

        Raises:
            Any exception raised by ``urllib.urlopen`` or ``read()`` is
            propagated to the caller.
        """
        sock = urllib.urlopen(url)
        try:
            return sock.read()
        finally:
            # Close even when read() fails so the connection is not leaked.
            sock.close()

    def parserHtml(self, html):
        """Feed *html* to an ``mp3parser.URLLister`` and return its ``urls``.

        Args:
            html: HTML source text, e.g. the value returned by ``readHtml``.

        Returns:
            The ``urls`` list accumulated by the parser.
        """
        parser = mp3parser.URLLister()
        parser.feed(html)
        # close() makes the parser process any data it is still buffering.
        parser.close()
        return parser.urls
if __name__ == "__main__":
    # The address is empty in this listing -- presumably a placeholder to be
    # replaced with the page that should be scanned.
    url = ""
    page_parser = HtmlParser()
    # Download the page and extract its links in one step.  To inspect the
    # raw HTML while debugging, print the readHtml() result before parsing.
    urls = page_parser.parserHtml(page_parser.readHtml(url))
# Blog page footer: Views (603) | Comments (0) | Reposts (0) |