|
查找建设银行的挂马网站,通过 Google 的搜索结果来查找。先通过建设银行首页的 title 进行检索,然后进行人工排除。
#!/usr/bin/env python
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 merged urllib2 into urllib.request; keep the old name so the
    # rest of the script can use the same calls on either version.
    import urllib.request as urllib2
# Google advanced-search query: pages whose *title* (as_occt=title) contains
# the percent-encoded phrase "欢迎访问中国建设银行网站" (the China Construction
# Bank home-page title), 100 results per page (num=100).
URL = "http://www.google.cn/search?as_q=%E6%AC%A2%E8%BF%8E%E8%AE%BF%E9%97%AE%E4%B8%AD%E5%9B%BD%E5%BB%BA%E8%AE%BE%E9%93%B6%E8%A1%8C%E7%BD%91%E7%AB%99&complete=1&hl=zh-CN&newwindow=1&num=100&btnG=Google+%E6%90%9C%E7%B4%A2&as_epq=&as_oq=&as_eq=&lr=&cr=&as_ft=i&as_filetype=&as_qdr=all&as_occt=title&as_dt=i&as_sitesearch=&as_rights="

# Captures the href of every link that directly follows an `<h2 class=r>` tag.
# NOTE(review): presumably this is the result-title markup Google served when
# the script was written -- confirm against a current result page.
RESULT_LINK_RE = re.compile(r'<h2 class=r><a href="(.+?)" target')

# Captures the text between the last usable "http://" in an href and the next
# "/", i.e. the host name (the greedy ".*" prefers the right-most match).
HOST_RE = re.compile(r'.*http://(.+?)/')

# File the de-duplicated host names are written to, one per line.
OUTPUT_PATH = 'blacklist.txt'


def fetch_html(url):
    """Download *url* with a browser-like User-agent and return the page text.

    The connection is closed even if reading fails.  When the response body
    is not already a ``str`` (Python 3 returns bytes) it is decoded as UTF-8
    with undecodable bytes replaced, so the regexes can run on it.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    sock = opener.open(url)
    try:
        html = sock.read()
    finally:
        sock.close()
    if not isinstance(html, str):
        html = html.decode('utf-8', 'replace')
    return html


def extract_hosts(html):
    """Return the distinct host names of the result links found in *html*.

    Hosts are returned in first-seen order.  Links whose href contains no
    ``http://<host>/`` part (for example https or relative links) are
    skipped rather than aborting the whole run.
    """
    hosts = []
    seen = set()
    for href in RESULT_LINK_RE.findall(html):
        match = HOST_RE.search(href)
        if match is None:
            # Not an http://host/ link: nothing to blacklist.
            continue
        host = match.group(1)
        if host not in seen:
            seen.add(host)
            hosts.append(host)
    return hosts


def write_blacklist(hosts, path=OUTPUT_PATH):
    """Write each host in *hosts* to *path*, one per line, replacing the file."""
    with open(path, 'w') as outfile:
        outfile.writelines(host + '\n' for host in hosts)


def main():
    """Fetch the search results and dump the unique hosts to the blacklist."""
    write_blacklist(extract_hosts(fetch_html(URL)))


if __name__ == '__main__':
    main()
|