Chinaunix首页 | 论坛 | 博客
  • 博客访问: 2501484
  • 博文数量: 709
  • 博客积分: 12251
  • 博客等级: 上将
  • 技术积分: 7905
  • 用 户 组: 普通用户
  • 注册时间: 2005-07-17 00:00
个人简介

实现有价值的IT服务

文章存档

2012年(7)

2011年(147)

2009年(3)

2008年(5)

2007年(74)

2006年(431)

2005年(42)

分类: IT职场

2007-02-13 11:48:35

利用htmlparser进行网页信息的抽取
2006-08-29 21:54

htmlparser是个好东西，让人有相见恨晚之感，也比较容易上手，对很多功能都进行了封装，做信息采集的朋友应该尽快使用它来做抓取工作。下面给出一个完整的例子：

这是一个用来抽取某个网站中产品评论的类,有兴趣的朋友可以仔细看下,欢迎交流!

package com;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;

class WrapperComment implements CatchCommentInterface{
 String url="";//define a visit url
 StringBuffer sb=new StringBuffer();//to sava the replaced content
 WrapperComment(String newsLink){//define a constructor
  this.url=newsLink;
 }
 
 public String getPageContent(){
  String pageContent="";
  try{
   Parser parser=new Parser(url);//contructor
   parser.setEncoding("GB2312");//set encode
   TagNameFilter bodyFilter=new TagNameFilter("body");//get the body content
   NodeList nodelist=parser.extractAllNodesThatMatch(bodyFilter);
   pageContent=nodelist.toHtml();//output pagecontent
  }catch(Exception e){
   e.printStackTrace();
  }
  return pageContent;
 }
 
 public NodeList getTableContent(String content){
  NodeList nodelist=null;
  NodeFilter[] nodeFilter=new NodeFilter[3];
  try{
   Parser parser=new Parser(content);//contructor
   parser.setEncoding("GB2312");//set encode
   TagNameFilter tableFilter=new TagNameFilter("table");//get the table content
   HasAttributeFilter tableAttribute=new HasAttributeFilter("bgcolor","#cfe0fc");//hava the attribute "bgcolor"
   HasAttributeFilter tableAttribute2=new HasAttributeFilter("width","100%");//hava the attribute "width"
   nodeFilter[0]=tableFilter;
   nodeFilter[1]=tableAttribute;
   nodeFilter[2]=tableAttribute2;
   AndFilter andFilter=new AndFilter(nodeFilter);//to link the three filter that above together
   nodelist=parser.extractAllNodesThatMatch(andFilter);//get the result that fit for the filter
   nodelist.remove(nodelist.size()-1);//to remove the last element
  }catch(Exception e){
   e.printStackTrace();
  }
  return nodelist;
 }
 
 public void getCommentDetail(NodeList nodelist){
  int nodesize=nodelist.size();
  String tableContent="";
  for(int i=0;i   tableContent=nodelist.elementAt(i).toHtml();
   System.out.println(getCommentTime(tableContent));
   System.out.println(getCommentText(tableContent));
   System.out.println("----------------------------------");
  }
 }
 
 public String getCommentTime(String content){
  String commentTime="";
  String tempText="";
  try{
   Parser parser=new Parser(content);
   parser.setEncoding("GB2312");
   TagNameFilter trFilter=new TagNameFilter("td");
   TagNameFilter bFilter=new TagNameFilter("b");
   HasChildFilter childFilter=new HasChildFilter(bFilter);
   AndFilter andFilter=new AndFilter(trFilter,childFilter);
   NodeList nodelist=parser.extractAllNodesThatMatch(andFilter);
   TableColumn td=(TableColumn)nodelist.elementAt(0);
   tempText=td.toHtml();
   String regx="2006-[0-9]{1,2}-[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}";
   Pattern pattern=Pattern.compile(regx);
   Matcher matcher=pattern.matcher(tempText);
   if(matcher.find()){
    commentTime=matcher.group();
   }else{
    commentTime="no find";
   }
  }catch(Exception e){
   e.printStackTrace();
  }
  return commentTime;
 }
 
 public String getCommentText(String content){
  String commentText="";
  int nodesize;
  try{
   Parser parser=new Parser(content);
   parser.setEncoding("GB2312");
   NodeClassFilter span=new NodeClassFilter(Span.class);
   HasAttributeFilter idFilter=new HasAttributeFilter("id");
   AndFilter andFilter=new AndFilter(span,idFilter);
   NodeList nodelist=parser.extractAllNodesThatMatch(andFilter);
   nodesize=nodelist.size();
   for(int i=0;i    Span sp=(Span)nodelist.elementAt(i);
    commentText=commentText+sp.toPlainTextString();
   }
  }catch(Exception e){
   e.printStackTrace();
  }
  return commentText;
 }
}

阅读(2492) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~