分类:
2008-10-15 16:31:14
HTMLPage类:
HTMLPage类主要有几种用途:从页面中抓取图片是其中非常重要的一个功能,此外还有超链接和表单。在HTMLPage类的内置类Parser中,大部分工作都是由handleSimpleTag(简单标签)和handleStartTag(起始标签)方法来完成的。
详细代码清单如下:
package com.heaton.bot;
import java.util.*;
import com.heaton.bot.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
* The HTMLPage class is used to parse an HTML page and store
* that page, in a parsed form, in memory.
*/
public class HTMLPage {
/**
* A list of images on this page. Starts out as an empty Vector;
* the code that fills it is outside this part of the file.
* NOTE(review): element type is not visible here - confirm
* against the parser callback before relying on it.
*/
protected Vector images = new Vector();
/**
* A list of links on this page. Starts out as an empty Vector;
* the code that fills it is outside this part of the file.
*/
protected Vector links = new Vector();
/**
* A list of forms on this page. Starts out as an empty Vector;
* the code that fills it is outside this part of the file.
*/
protected Vector forms = new Vector();
/**
* The underlying HTTP object for this page. Assigned once by the
* constructor; open() sends the request through it and
* processPage() reads the downloaded body from it.
*/
protected HTTP http;
/**
* The base URL to resolve relative URL's. Set by open() to the
* URL that was requested; it is null until open() has been called.
*/
protected String base;
/**
* Creates an HTMLPage bound to the given HTTP object. The
* reference is stored as-is; nothing is downloaded or parsed
* until open() is called.
*
* @param http The HTTP object (or subclass) that this page will
* use to download pages.
*/
public HTMLPage(HTTP http) {
this.http = http;
}
/**
* Entry point of the HTMLPage class: opens a page and reads it in.
* The URL is requested through the underlying HTTP object, then
* recorded as the base URL of this page, and finally the page is
* handed to processPage() for parsing.
*
* If null is specified for the callback, the built in callback is
* used, and the other methods in this class may then be used to
* look at images, links and forms.
*
* @param url The URL to read.
* @param callback A callback class to handle the parse, or null
* to use the built in one.
* @exception java.io.IOException
* @exception javax.swing.text.BadLocationException
*/
public void open(String url, HTMLEditorKit.ParserCallback callback)
throws IOException, BadLocationException {
// Issue the request first; the base URL is only recorded once
// the send has returned normally.
this.http.send(url, null);
this.base = url;
// Parse whatever the HTTP object downloaded.
this.processPage(callback);
}
/**
* Internal function called to start the parse.
*
* @param callback The callback object to use.
* @exception java.io.IOException
*/
protected void processPage(HTMLEditorKit.ParserCallback callback)
throws IOException
{
/*
* 创建一个字符串阅读器。
*/
StringReader r = new StringReader(http.getBody());
/*
* 创建新的解析器。
*/
HTMLEditorKit.Parser parse = new HTMLParse().getParser();
/*
* 程序检查是否提供了定制的回调类。如果提供了回调类,则
* 使用该回调类,从而结束了HTMLPage类的工作。如果没有提
* 供回调类,HTMLPage则使用内置的回调类,该回调类为Parser。
*/
[1]