#coding:utf-8
import re
import urllib
from distutils.filelist import findall

from bs4 import BeautifulSoup
from bs4 import Comment
# Fetch a page and print the text of every <li> inside <div class="wrap">.
# NOTE(review): the URL is an empty placeholder in this file; fill it in
# before running.
page_url = ''
page = urllib.urlopen(page_url)
try:
    contents = page.read()
finally:
    # Release the connection even if the read fails.
    page.close()
soup = BeautifulSoup(contents, "html.parser")
tag = soup.find('div', class_='wrap')
# find() returns None when no matching <div> exists; treat that as
# "no items" instead of failing with AttributeError on None.
tag3 = tag.find_all('li') if tag is not None else []
for v in tag3:
    # A parenthesised single-argument print is valid in Python 2 and 3.
    print(v.text.encode("utf8"))
# Filtering step: strip comments, scripts and nested <div>s from the
# element with id="contentText".
# NOTE(review): get_http and url are not defined in the visible part of this
# file; the .encoding/.text usage suggests a requests-style response object
# -- confirm against the real helper.
c = get_http(url)
c.encoding = "gb2312"  # set this when the page is served as gb2312
# c.text is presumably already-decoded text, so from_encoding is not passed:
# BeautifulSoup ignores from_encoding (with a warning) for Unicode markup.
soup = BeautifulSoup(c.text, 'lxml')
# Remove HTML comments.
comments = soup.find_all(text=lambda text: isinstance(text, Comment))
for comment in comments:
    comment.extract()
sff = soup.find('div', id="contentText")
# find() returns None when the container is missing; skip the cleanup then
# instead of failing with "NoneType is not callable".
if sff is not None:
    # Remove embedded <script> elements.
    for script in sff('script'):
        script.extract()
    # Remove nested <div> elements.
    for inner_div in sff('div'):
        inner_div.extract()
# Blog page footer (not code): Views (6321) | Comments (0) | Reposts (0)