文章来源:http://www.blogjava.net/wangdei/archive/2008/01/17/175938.html
http://www.blogjava.net/wangdei/archive/2008/01/24/177569.html
HTML Parser 简述:这是一个在 SourceForge.net 上比较活跃的项目之一,目前的最新版本是 1.6 发行版, (我现在用在 的也是1.6).他是一个对现有的 HTML 进行分析的快速实时的解析器,事实上在应用过程中你更为惊叹于 HTML Parser 给你带来一些周到的处理。他主要用在这几个方面:
文本信息抽取,
链接提取,用于自动给页面的链接文本加上链接的标签
资源提取,例如对一些图片、声音的资源的处理
链接检查,用于检查HTML中的链接是否有效
页面内容的监控
呵呵.废话少说:)上代码.
public class BaseAction {
public static final Logger logger = Logger.getLogger(BaseAction.class);
public String keyWords = "姚明|姚明NBA";
public static NodeList getAllNodeList(String urlOrfile, NodeFilter filter) {
if (logger.isDebugEnabled())
logger.debug("BaseAction getAllNodeList(" + urlOrfile + ")");
Parser parser;
try {
parser = new Parser(urlOrfile);
parser.setEncoding(Constent.Encode);
NodeList list = parser.parse(filter);
return list;
} catch (ParserException e) {
// TODO Auto-generated catch block
e.printStackTrace();
return null;
}
}
/**
* 取link 与textnode 返回的是href连接
*
* @param file
* @param filter
* @return
*/
public List<String> parseLink(String file, NodeFilter filter) {
if (logger.isDebugEnabled())
logger.debug("BaseAction parseLink(" + file + ")");
List<String> hrefList = new ArrayList<String>();
try {
NodeList nodelist = getAllNodeList(file, filter);
if(nodelist==null)
return null;
Node[] nodes = nodelist.toNodeArray();
String line = "";
for (int i = 0; i < nodes.length; i++) {
Node node = nodes[i];
if (node instanceof TextNode) {
TextNode textnode = (TextNode) node;
line = textnode.getText();
logger.debug("textnode=" + line);
} else if (node instanceof LinkTag) {
LinkTag link = (LinkTag) node;
line = link.getLink();
logger.debug("link=" + line);
}
if (HttpParserUtil.isTrimEmpty(line))
continue;
hrefList.add(line);
}
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
return hrefList;
}
/**
* 取link 与textnode 返回的是href连接
*
* @param file
* @param filter
* @return
*/
public Map<String, String> parseLinkWithText(String file,
NodeFilter filter, Pattern pHtml, Pattern pPhp) {
if (logger.isDebugEnabled())
logger.debug("SinaAction parseLinkWithText(" + file + ")");
Map<String, String> map = new HashMap<String, String>();
List<String> list = new ArrayList<String>();
try {
NodeList nodelist = getAllNodeList(file, filter);
if(nodelist==null)
return null;
Node[] nodes = nodelist.toNodeArray();
String line = "";
for (int i = 0; i < nodes.length; i++) {
Node node = nodes[i];
if (node instanceof TextNode) {
TextNode textnode = (TextNode) node;
line = textnode.getText();
if (HttpParserUtil.isTrimEmpty(line))
continue;
if (logger.isDebugEnabled())
logger.debug("textnode=" + line);
list.add(line);
} else if (node instanceof LinkTag) {
LinkTag link = (LinkTag) node;
line = link.getLink();
if (HttpParserUtil.isTrimEmpty(line))
continue;
if (logger.isDebugEnabled())
logger.debug("link=" + line);
list.add(line);
}
}
int endPostion = list.size();
for (int i = 0; i < endPostion; i++) {
String getCurr = list.get(i);
Matcher mHtml = pHtml.matcher(getCurr);
Matcher mPhp = pPhp.matcher(getCurr);
if ((mHtml.matches() == true || mPhp.matches() == true)
&& i < (endPostion - 1)) {
String getNext = list.get(i + 1);
Matcher mHtmlNext = pHtml.matcher(getNext);
Matcher mPhpNext = pPhp.matcher(getNext);
if ((mHtml.matches() == true && mHtmlNext.matches() == false)
|| (mPhp.matches() == true && mPhpNext.matches() == false)) {
map.put(getCurr, getNext);
i = i + 1;
} else {
}
}
}
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
return map;
}
/**
* 分析内容
*
* @param list
* @return
*/
public String parserContent(NodeList list) {
return parserContent(list, false);
}
public String parserContent(NodeList list, boolean isCreateFile) {
return parserContent(list,isCreateFile,list.size()+1);
}
public String parserContent(NodeList list, int listIndex) {
return parserContent(list,false,listIndex);
}
public String parserContent(NodeList list, boolean isCreateFile,int listIndex) {
if (logger.isDebugEnabled())
logger.debug("BaseAction parserContent()");
StringBuffer content = new StringBuffer();
if(list.size() < listIndex){//说明是整个取出进行rex
for (int i = 0; i < list.size(); i++) {
Node node = list.elementAt(i);
NodeList sublist = node.getChildren();
if (sublist == null)
continue;
Node[] listNode = sublist.toNodeArray();
for (Node inNode : listNode) {
if (HttpParserUtil.isTrimEmpty(inNode.getText()))
continue;
logger.debug(inNode.toHtml());
content.append(inNode.toHtml());
if (isCreateFile)
content.append("\n");
}
}
}else{
Node node = list.elementAt(listIndex);
if (node == null){
logger.warn("the listIndex may is wrong! please do it");
return null;
}
NodeList sublist = node.getChildren();
if (sublist == null){
logger.warn("the listIndex may is wrong! please do it");
return null;
}
Node[] listNode = sublist.toNodeArray();
if (listNode == null){
logger.warn("the listIndex may is wrong! please do it");
return null;
}
for (Node inNode : listNode) {
if (HttpParserUtil.isTrimEmpty(inNode.getText()))
continue;
logger.debug(inNode.toHtml());
content.append(inNode.toHtml());
if (isCreateFile)
content.append("\n");
}
}
if (content.toString() == null) {
logger.warn("you get the text is null");
}
return content.toString();
}
/**
* 抓取meta标签
* @param list
* @return
*/
public MetaModel getMetaInfo(NodeList list){
MetaModel metaModel = new MetaModel();
for (int index = 0; index < list.size(); index++) {
Node firstNode = list.elementAt(index);
if(!(firstNode instanceof Html))
continue;
NodeList htmlList = firstNode.getChildren();
for (int i = 0; i < htmlList.size(); i++) {
Node htmlNode = htmlList.elementAt(i);
if (!(htmlNode instanceof HeadTag))
continue;
NodeList headList = htmlNode.getChildren();
for(int j = 0; j < headList.size(); j++){
Node headNode = headList.elementAt(j);
if(headNode instanceof TitleTag){
TitleTag titleTag = (TitleTag) headNode;
metaModel.setTitle(titleTag.getTitle());
}
if (!(headNode instanceof MetaTag))
continue;
MetaTag it = (MetaTag) headNode;
if(it.getMetaTagName()==null)
continue;
String keywords = it.getMetaTagName().toLowerCase();
if ("keywords".equals(keywords)) {
metaModel.setKeywords(it.getMetaContent().replaceAll("Hoopchina", keyWords));
} else if ("description".equals(keywords)) {
metaModel.setDescription(it.getMetaContent()
.replaceAll("Hoopchina", keyWords)
.replaceAll("虎扑体育论坛", keyWords));
}
}//end headList
}//end htmlList
break;
}//end
return metaModel;
}
/**
*
* @param currDate
* 刚抓取的时间
* @param lastDate
* 数据库中最新时间
* @return
*/
public static boolean isDateAfter(String currDate, String lastDate) {
try {
DateFormat df = new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss",
Locale.US);
return df.parse(currDate).after(df.parse(lastDate));
} catch (ParseException e) {
e.printStackTrace();
return false;
}
}
/**
* X-Powered-By=[mod_xlayout_jh/0.0.1vhs.markII.remix]
* ETag=["16be25-1cc81-150d9280"] null=[HTTP/1.0 200 OK] Date=[Mon, 17 Dec
* 2007 07:10:23 GMT] Content-Type=[text/html] Cache-Control=[max-age=60]
* Connection=[close] Expires=[Mon, 17 Dec 2007 07:11:23 GMT]
* Accept-Ranges=[bytes] X-Cache=[HIT from sh-14.sina.com.cn]
* Server=[Apache/2.0.59 (Unix)] Last-Modified=[Mon, 17 Dec 2007 07:08:42
* GMT] Vary=[Accept-Encoding]
*/
public String getUrlDate(String urlAddr) {
// DateFormat df = new SimpleDateFormat("E, dd MMM yyyy
// HH:mm:ss",Locale.US);
String date = null;
try {
if (logger.isInfoEnabled())
logger.info("now open the " + urlAddr);
URL url = new URL(urlAddr);
URLConnection conn = url.openConnection();
conn.connect();
conn.setConnectTimeout(10*1000);
Map<String, List<String>> map = conn.getHeaderFields();
Set<String> set = map.keySet();
for (String key : set) {
if (logger.isDebugEnabled())
logger.debug(key + "=" + map.get(key));
}
if (conn.getHeaderFields().toString().indexOf("200 OK") == -1) {
logger.warn(urlAddr + "can't connect!");
return null;
}
date = conn.getHeaderField("Date");
// Date=[Mon, 17 Dec 2007 07:10:23 GMT]
// df.parse(date);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return date;
}
/**
* X-Powered-By=[mod_xlayout_jh/0.0.1vhs.markII.remix]
* ETag=["16be25-1cc81-150d9280"] null=[HTTP/1.0 200 OK] Date=[Mon, 17 Dec
* 2007 07:10:23 GMT] Content-Type=[text/html] Cache-Control=[max-age=60]
* Connection=[close] Expires=[Mon, 17 Dec 2007 07:11:23 GMT]
* Accept-Ranges=[bytes] X-Cache=[HIT from sh-14.sina.com.cn]
* Server=[Apache/2.0.59 (Unix)] Last-Modified=[Mon, 17 Dec 2007 07:08:42
* GMT] Vary=[Accept-Encoding]
*/
public HttpHeads getHttpHeads(String urlAddr) {
// DateFormat df = new SimpleDateFormat("E, dd MMM yyyy
// HH:mm:ss",Locale.US);
String date = null;
try {
if (logger.isInfoEnabled())
logger.info("now open the " + urlAddr);
URL url = new URL(urlAddr);
URLConnection conn = url.openConnection();
conn.connect();
conn.setConnectTimeout(10*1000);
Map<String, List<String>> map = conn.getHeaderFields();
Set<String> set = map.keySet();
for (String key : set) {
if (logger.isDebugEnabled())
logger.debug(key + "=" + map.get(key));
}
logger.debug("contentLength()=" + conn.getContentLength());
if (conn.getHeaderFields().toString().indexOf("200 OK") == -1) {
logger.warn(urlAddr + "can't connect!");
return null;
}
HttpHeads httpHeads = new HttpHeads();
date = conn.getHeaderField("Date");
httpHeads.setDate(date);
httpHeads.setAccept_Ranges(conn.getHeaderField("Accept-Ranges"));
httpHeads.setCache_Control(conn.getHeaderField("Cache-Control"));
httpHeads.setConnection(conn.getHeaderField("Connection"));
httpHeads.setContent_Type(conn.getHeaderField("Content_Type"));
httpHeads.setETag(conn.getHeaderField("ETag"));
httpHeads.setExpires(conn.getHeaderField("Expires"));
httpHeads.setLast_Modified(conn.getHeaderField("Last_Modified"));
httpHeads.setServer(conn.getHeaderField("Server"));
httpHeads.setVary(conn.getHeaderField("Vary"));
httpHeads.setX_Cache(conn.getHeaderField("Cache"));
httpHeads.setX_Powered_By(conn.getHeaderField("Powered_By"));
httpHeads.setContentLength(conn.getContentLength());
return httpHeads;
// Date=[Mon, 17 Dec 2007 07:10:23 GMT]
// df.parse(date);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
public BaseAction() {
super();
}
}
这个不知大家明白不?
参考资料
HtmlParser给我们提供的Tag比较多.但是像这样的好
像都是不行哦.因为HtmlParser只提供到 TableColumn, TableHeader, TableRow, TableTag,
没有直接提供tr,td这种一看就明白的Tag,那我们如何解决这个问题.方法就是继承HtmlParser的Tag.呵呵.其实继承他很简单.看代码.
public class TrTag extends CompositeTag {
/**
* The set of names handled by this tag.
*/
private static final String[] mIds = new String[] { "TR" };
/**
* The set of end tag names that indicate the end of this tag.
*/
private static final String[] mEndTagEnders = new String[] { "TABLE" };
/**
* Create a new div tag.
*/
public TrTag() {
}
/**
* Return the set of names handled by this tag.
*
* @return The names to be matched that create tags of this type.
*/
public String[] getIds() {
return (mIds);
}
/**
* Return the set of end tag names that cause this tag to finish.
*
* @return The names of following end tags that stop further scanning.
*/
public String[] getEndTagEnders() {
return (mEndTagEnders);
}
}
呵呵.有了这一招,对待这样的易如反掌了.当然了对待其他标签也一样的做法.
阅读(779) | 评论(0) | 转发(0) |