Chinaunix首页 | 论坛 | 博客
  • 博客访问: 821062
  • 博文数量: 754
  • 博客积分: 7000
  • 博客等级: 少将
  • 技术积分: 5005
  • 用 户 组: 普通用户
  • 注册时间: 2008-09-12 12:54
文章分类

全部博文(754)

文章存档

2011年(1)

2008年(753)

我的朋友
最近访客

分类:

2008-09-12 13:09:46

    package com.tag;

 

    import java.net.MalformedURLException;

    import java.net.URL;

    import org.apache.commons.httpclient.Header;

    import org.apache.commons.httpclient.HeaderElement;

    import org.apache.commons.httpclient.HttpClient;

    import org.apache.commons.httpclient.NameValuePair;

    import org.apache.commons.httpclient.methods.GetMethod;

    import toptrack.tools.JQueryBase;

 

    /**

     * 得到网页编码格式

     * @author dl

     */

    public class JHtmlUpdateCheck {

        /**文本内容编码识别类*/

        private static cpdetector.io.CodepageDetectorProxy detector = cpdetector.io.CodepageDetectorProxy.getInstance();

        static {

            detector.add(new cpdetector.io.HTMLCodepageDetector(false));

            detector.add(cpdetector.io.JChardetFacade.getInstance());

        }

 

        /**

         *
方法说明:得到网页编码格式

         *
输入参数:strUrl 网页链接; timeout 超时设置

         *
返回类型:网页编码

         */

        public static String getEncoding(String strUrl, int timeout) {

            String strEncoding = null;

            HttpClient client = new HttpClient();

            client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);

            GetMethod method = new GetMethod(strUrl);

            method.setFollowRedirects( true );

            int statusCode;

            try {

                statusCode = client.executeMethod(method);

                if( statusCode != -1) {

                    //从http头得到网页编码

                    strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));

                    if (strEncoding != null) {

                        method.releaseConnection();

                        return strEncoding;

                    }

                    //通过解析meta得到网页编码

                    String strHtml = method.getResponseBodyAsString().toLowerCase();

                    StringBuffer strBuffer = new StringBuffer();

                    int pos = JQueryBase.getTagText(strHtml, "", strBuffer, false, 0);

                    while (strBuffer.length() > 0) {

                        StringBuffer strEncodingBuffer = new StringBuffer();

                        JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);

                        if (strEncodingBuffer.length() > 0) {

                            strEncoding = strEncodingBuffer.toString();

                            method.releaseConnection();

                            return strEncoding;

                        }

                        strBuffer = new StringBuffer();

                        pos = JQueryBase.getTagText(strHtml, "", strBuffer, false, pos);

                    }

                    //分析字节得到网页编码

                    strEncoding = getFileEncoding(strUrl, timeout);

                    //设置默认网页字符编码

                    if (strEncoding == null)

                        strEncoding = "GBK";

                }

                method.releaseConnection();

            } catch (Exception e) {

                // TODO Auto-generated catch block

                System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");

                return null;

            }

 

            return strEncoding;

        }

 

        /**

         *
方法说明:通过http头得到网页编码信息

         *
输入参数:contentheade rhttp头

         *
返回类型:网页编码

         */

        protected static String getContentCharSet(Header contentheader) {

            String charset = null;

            if (contentheader != null) {

                HeaderElement values[] = contentheader.getElements();

                if (values.length == 1) {

                    NameValuePair param = values[0].getParameterByName("charset");

                    if (param != null) {

                        charset = param.getValue();

                    }

                }

            }

            return charset;

        }

 

 

[1]  

【责编:landy】

--------------------next---------------------

阅读(793) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~