HTMLParser 是个好东西，颇有相见恨晚之感，也比较容易上手。它对很多功能都进行了封装，做信息采集的朋友应该尽快用它来做抓取工作。下面给出一个完整的例子：
这是一个用来抽取某个网站中产品评论的类，有兴趣的朋友可以仔细看一下，欢迎交流！
package com;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
class WrapperComment implements CatchCommentInterface{
String url="";//define a visit url
StringBuffer sb=new StringBuffer();//to sava the replaced content
WrapperComment(String newsLink){//define a constructor
this.url=newsLink;
}
public String getPageContent(){
String pageContent="";
try{
Parser parser=new Parser(url);//contructor
parser.setEncoding("GB2312");//set encode
TagNameFilter bodyFilter=new TagNameFilter("body");//get the body content
NodeList nodelist=parser.extractAllNodesThatMatch(bodyFilter);
pageContent=nodelist.toHtml();//output pagecontent
}catch(Exception e){
e.printStackTrace();
}
return pageContent;
}
public NodeList getTableContent(String content){
NodeList nodelist=null;
NodeFilter[] nodeFilter=new NodeFilter[3];
try{
Parser parser=new Parser(content);//contructor
parser.setEncoding("GB2312");//set encode
TagNameFilter tableFilter=new TagNameFilter("table");//get the table content
HasAttributeFilter tableAttribute=new HasAttributeFilter("bgcolor","#cfe0fc");//hava the attribute "bgcolor"
HasAttributeFilter tableAttribute2=new HasAttributeFilter("width","100%");//hava the attribute "width"
nodeFilter[0]=tableFilter;
nodeFilter[1]=tableAttribute;
nodeFilter[2]=tableAttribute2;
AndFilter andFilter=new AndFilter(nodeFilter);//to link the three filter that above together
nodelist=parser.extractAllNodesThatMatch(andFilter);//get the result that fit for the filter
nodelist.remove(nodelist.size()-1);//to remove the last element
}catch(Exception e){
e.printStackTrace();
}
return nodelist;
}
public void getCommentDetail(NodeList nodelist){
int nodesize=nodelist.size();
String tableContent="";
for(int i=0;i tableContent=nodelist.elementAt(i).toHtml();
System.out.println(getCommentTime(tableContent));
System.out.println(getCommentText(tableContent));
System.out.println("----------------------------------");
}
}
public String getCommentTime(String content){
String commentTime="";
String tempText="";
try{
Parser parser=new Parser(content);
parser.setEncoding("GB2312");
TagNameFilter trFilter=new TagNameFilter("td");
TagNameFilter bFilter=new TagNameFilter("b");
HasChildFilter childFilter=new HasChildFilter(bFilter);
AndFilter andFilter=new AndFilter(trFilter,childFilter);
NodeList nodelist=parser.extractAllNodesThatMatch(andFilter);
TableColumn td=(TableColumn)nodelist.elementAt(0);
tempText=td.toHtml();
String regx="2006-[0-9]{1,2}-[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}";
Pattern pattern=Pattern.compile(regx);
Matcher matcher=pattern.matcher(tempText);
if(matcher.find()){
commentTime=matcher.group();
}else{
commentTime="no find";
}
}catch(Exception e){
e.printStackTrace();
}
return commentTime;
}
public String getCommentText(String content){
String commentText="";
int nodesize;
try{
Parser parser=new Parser(content);
parser.setEncoding("GB2312");
NodeClassFilter span=new NodeClassFilter(Span.class);
HasAttributeFilter idFilter=new HasAttributeFilter("id");
AndFilter andFilter=new AndFilter(span,idFilter);
NodeList nodelist=parser.extractAllNodesThatMatch(andFilter);
nodesize=nodelist.size();
for(int i=0;i Span sp=(Span)nodelist.elementAt(i);
commentText=commentText+sp.toPlainTextString();
}
}catch(Exception e){
e.printStackTrace();
}
return commentText;
}
}