''' Function: get_links(url) parameter: url urlparse.urlparse: parse a URL into six components, returning a 6-tuple (scheme, netloc, path, params, query and fragment); please refer to the urlparse documentation for more info about urlparse. HTTPConnection.request(method, url[, body[, headers]]) ''' import urllib, urllister import urlparse import httplib import time import urllib2
def get_links(url): usock=urllib.urlopen(url) parser=urllister.URLLister() #Create a instance parser.feed(usock.read()) #Put the resource(html) into parser,and get the relevent segments from the resource. usock.close() parser.close() uhost=urlparse.urlparse(url) for url in parser.urls: print url up=urlparse.urlparse(url)
if up.netloc=="": #Some link may not contain 'http:'(called absolute path') conn=httplib.HTTPConnection(uhost.netloc) conn.request("GET","/"+up.path+"?"+up.params+up.query+up.fragment) res=conn.getresponse() status=res.status reason=res.reason #data=res.read() conn.close() else: conn=httplib.HTTPConnection(uhost.netloc) conn.request("GET",up.path+"?"+up.params+up.query+up.fragment) res=conn.getresponse() status=res.status reason=res.reason #data=res.read() conn.close()
print url,status,reason
if __name__ == '__main__':
    # Interactive entry point: prompt for a site URL, then report the
    # status of every link found on that page.
    target = raw_input("Please enter the url you want to check:\n")
    get_links(target)
|
This program can be used to check the status of the links in a given web site. It will return the status of each link.
urllister.py
from sgmllib import SGMLParser
class URLLister(SGMLParser): def reset(self): SGMLParser.reset(self) self.urls=[]
def start_a(self,attars): href=[v for k,v in attars if k=='href'] if href: self.urls.extend(href)
|
阅读(449) | 评论(0) | 转发(0) |