Chinaunix首页 | 论坛 | 博客
  • 博客访问: 218162
  • 博文数量: 67
  • 博客积分: 2510
  • 博客等级: 少校
  • 技术积分: 890
  • 用 户 组: 普通用户
  • 注册时间: 2008-01-17 09:42
文章分类

全部博文(67)

文章存档

2010年(5)

2009年(7)

2008年(55)

我的朋友

分类: WINDOWS

2009-05-22 09:57:31

 

目前,Java解析HTML的类库还是比较多的,但是都停留在静态解析阶段,只能提取静态HTML页面里面的各种标签元素。但是现在的HTML页面里面充斥着大量的脚本,比如下面这种情况下(图片的src属性由JavaScript在运行时动态设置),

就很难用目前的Java HTML解析库来提取图片来源,因为它要进一步执行JavaScript后才能确定src里面的内容。因此需要一种等JavaScript执行完毕后再来提取HTML元素的组件。
在网上用Google、百度搜索了好久后,终于发现了一种Java WebBrowser组件JRex,该组件封装了Mozilla的Gecko库,提供Java类库给应用程序调用。利用JRex,可以写一个Java应用程序访问某个页面,然后等该页面下载完毕并且执行了JavaScript后,JRex浏览器引擎把文档转换成DOM模型,重新结构化该HTML页面,比如上面这个例子,src属性就被设置为abc.jpg。
下面是一个例子
import org.mozilla.jrex.* ;
import org.mozilla.jrex.ui.* ;
import org.mozilla.jrex.window.* ;
import org.mozilla.jrex.navigation.* ;
import org.mozilla.jrex.event.progress.* ;
import org.w3c.dom.* ;
import java.lang.Exception.* ;
import javax.swing.*;
import java.net.*;

public class Render
implements org.mozilla.jrex.event.progress.ProgressListener {

String url ; // The page to be processed.

// These variables can be used in subclasses and are created from
// url. baseURL can be used to construct the absolute URL of the
// relative URL's in the page. hostBase is just the
// part of the URL and can be used to construct the full URL of
// URLs in the page that are site relative, e.g., "/xyzzy.jpg".
// Variable host is set to the host part of url, e.g., host.com.

String baseURL ;
String hostBase ;
String host ;

// The JRexCanvas is the main browser component. The WebNavigator
// is used to access the DOM.

JRexCanvas canvas = null ;
WebNavigation navigation = null ;

// An event handler sets "done" to true when the document is loaded.

boolean done = false ;

/**
Create a Render object with a target URL.
*/

public Render(String URL) {
url = URL ;
}

/** Load the given URL in Gecko. When the page is loaded,
recurse on the DOM and call doElement()/doTagEnd() for
each Element node. Execution can hang if the page causes a
window to be popped up. Return false on error.
*/

public boolean parsePage() {

// Parse the URL and build baseURL and hostURL for use by doElement()
// and doTagEnd().

URI uri = null ;
try {
uri = new URI(url) ;
}
catch(Exception e) {
System.out.println(e) ;
return false ;
}

String path = uri.getPath() ;
baseURL = " + uri.getHost() + path + "/" ;
hostBase = " + uri.getHost() ;
host = uri.getHost() ;

// Start up JRex/Gecko.

try {
JRexFactory.getInstance().startEngine();
}
catch (Exception e) {
System.err.println("Unable to start up JRex Engine.");
e.printStackTrace();
return false ;
}

// Get a window manager and put the browser in a Swing frame.
// Based on Dietrich Kappe's code.

JRexWindowManager winManager=(JRexWindowManager)
JRexFactory.getInstance().getImplInstance(JRexFactory.WINDOW_MANAGER);
winManager.create(JRexWindowManager.SINGLE_WINDOW_MODE);
JPanel panel = new JPanel();
JFrame frame = new JFrame();
frame.getContentPane().add(panel);
winManager.init(panel);

// Get the JRexCanvas, set Render to handle progress events so
// we can determine when the page is loaded, and get the
// WebNavigator object.

canvas = (JRexCanvas) winManager.getBrowserForParent(panel);
canvas.addProgressListener(this) ;
navigation = canvas.getNavigator() ;

// Load and process the page.

try {
navigation.loadURI(url, WebNavigationConstants.LOAD_FLAGS_NONE,
null, null, null);

// Swing magic.

frame.setSize(640, 480);
frame.setVisible(false);

// Check if the DOM has loaded every two seconds.

while(!done) {
Thread.sleep(2000) ;
}

// Get the DOM and recurse on its nodes.

Document doc = navigation.getDocument() ;
Element ex = doc.getDocumentElement() ;
doTree((Node) ex) ;
}
catch(Exception e) {
System.out.println("Trouble walking DOM: " + e) ;
return false ;
}

return true ;
}

/**
Recurse the DOM starting with Node node. For each Node of
type Element, call doElement() with it and recurse over its
children. The Elements refer to the HTML tags, and the children
are tags contained inside the parent tag.
*/

public void doTree(Node node) {
if(node instanceof Element) {
Element element = (Element) node ;

// Visit tag.

doElement(element) ;

// Visit all the children, i.e., tags contained in this tag.

NodeList nl = element.getChildNodes() ;
if(nl == null) return ;
int num = nl.getLength() ;
for(int i=0; idoTree(nl.item(i)) ;

// Process the end of this tag.

doTagEnd(element) ;
}
}

/**
Simple doElement() to print the tag name of the Element. Override
to do something real.
*/

public void doElement(Element element) {
System.out.println("<" + element.getTagName() + ">") ;
}

/**
Simple doTagEnd() to print the closing tag of the Element.
Override to do something real.
*/

public void doTagEnd(Element element) {
System.out.println("") ;
}

// org.mozilla.jrex.event.progress.ProgressListener methods.
// onStateChange() seems the best place to watch for the
// completion of the loading of the DOM.

/** Noop */
public void onLinkStatusChange(ProgressEvent event) { }
/** Noop */
public void onLocationChange(ProgressEvent event) { }
/** Noop */
public void onProgressChange(ProgressEvent event) { }
/** Noop */
public void onSecurityChange(ProgressEvent event) { }

/** onStateChange is invoked several times when DOM loading is
complete. Set the done flag the first time.
*/

public void onStateChange(ProgressEvent event) {
if(!event.isLoadingDocument()) {
if(done) return ;
done = true ;
}
}

/** Noop */
public void onStatusChange(ProgressEvent event) { }

/**
Main: java com.benjysbrain.htmlgrab.Render [url]. Run
JRex on the given page, wait for the page to load, and
traverse the DOM, printing tag names only.
*/

public static void main(String[] args) {
String url =" ;
if(args.length == 1) url = args[0] ;
Render p = new Render(url) ;
p.parsePage() ;
System.exit(0) ;
}
}

运行程序需要设置好类路径,并在启动JVM时指定以下系统属性:
-Djrex.dom.enable=true
-Djrex.gre.path=%JREX_GRE_PATH%

阅读(1164) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~