Chinaunix首页 | 论坛 | 博客
  • 博客访问: 545932
  • 博文数量: 298
  • 博客积分: 0
  • 博客等级: 民兵
  • 技术积分: 3077
  • 用 户 组: 普通用户
  • 注册时间: 2019-06-17 10:57
文章分类

全部博文(298)

文章存档

2022年(96)

2021年(201)

2019年(1)

我的朋友

分类: Java

2021-09-24 12:28:56


点击(此处)折叠或打开


  1. package com.fh.util;

  2. import java.io.BufferedReader;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.io.InputStreamReader;
  6. import java.net.URL;
  7. import java.net.URLConnection;
  8. import java.util.ArrayList;
  9. import java.util.List;
  10. import java.util.regex.Matcher;
  11. import java.util.regex.Pattern;

  /**
   * Utility for crawling a web page: fetches its HTML source and extracts the
   * page title and the URLs of the images it references.
   *
   * Author: FH Admin
   * from: fhadmin.cn
   */
  public class GetWeb {

      /** Charset used to decode every fetched page. */
      private static final String PAGE_CHARSET = "utf-8";

      /**
       * Matches an img tag whose src/src2 attribute ends in a known image
       * extension. Group 3 is the optional opening quote, group 4 the raw
       * attribute value. Compiled once because the expression never changes.
       */
      private static final Pattern IMG_PATTERN = Pattern.compile(
              "<(img|IMG)\\b[^>]*\\b(src|SRC|src2|SRC2)\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*>");

      /**
       * Matches the title element in any letter case; group 1 is its text.
       * DOTALL lets a title that spans several source lines match.
       */
      private static final Pattern TITLE_PATTERN = Pattern.compile(
              "<title\\b[^>]*>(.*?)</title\\s*>",
              Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

      /**
       * Downloads the source of a page, decoded as UTF-8.
       *
       * @param httpUrl
       *            address of the page
       * @return the page source; line terminators are normalised to '\n' and
       *         no terminator is appended after the last line
       * @throws IOException
       *             if the URL is malformed or the page cannot be read
       */
      public static String getHtmlCode(String httpUrl) throws IOException {
          InputStream stream = new URL(httpUrl).openStream();
          try {
              BufferedReader reader = new BufferedReader(
                      new InputStreamReader(stream, PAGE_CHARSET));
              StringBuilder content = new StringBuilder();
              boolean firstLine = true;
              String line;
              while ((line = reader.readLine()) != null) {
                  // Keep the line break: joining lines with nothing in between
                  // would fuse tokens such as "<img" + "src=..." into "<imgsrc=...".
                  if (!firstLine) {
                      content.append('\n');
                  }
                  content.append(line);
                  firstLine = false;
              }
              return content.toString();
          } finally {
              // Closed here so the connection is released even when reading fails.
              stream.close();
          }
      }

      /**
       * Collects the URLs of all images referenced by img tags in a page.
       * Values that already start with http:// or https:// are returned
       * unchanged; every other value is resolved against the page URL.
       *
       * @param httpUrl
       *            address of the page to crawl
       * @return the image URLs in document order (possibly empty, never null)
       * @throws IOException
       *             if the URL is malformed or the page cannot be read
       */
      public static List<String> getImagePathList(String httpUrl)
              throws IOException {
          String content = getHtmlCode(httpUrl);
          URL pageUrl = new URL(httpUrl);
          List<String> imgList = new ArrayList<String>();

          Matcher matcher = IMG_PATTERN.matcher(content);
          while (matcher.find()) {
              String quote = matcher.group(3);
              String imgsrc = matcher.group(4);
              if (quote == null || quote.trim().length() == 0) {
                  // An unquoted value ends at the first whitespace; the pattern
                  // may have run on into the attributes that follow it.
                  imgsrc = imgsrc.split("\\s+")[0];
              }
              imgList.add(toAbsoluteUrl(pageUrl, imgsrc));
          }
          return imgList;
      }

      /**
       * Turns an image reference into an absolute URL by resolving it against
       * the URL of the page that contains it.
       */
      private static String toAbsoluteUrl(URL pageUrl, String imgsrc) {
          if (imgsrc.startsWith("http://") || imgsrc.startsWith("https://")) {
              return imgsrc;
          }
          try {
              // Standard relative-reference resolution; no network access needed.
              return new URL(pageUrl, imgsrc).toString();
          } catch (IOException e) {
              // Only a malformed reference (e.g. unknown scheme) lands here:
              // keep the raw value rather than dropping the image.
              return imgsrc;
          }
      }

      /**
       * Extracts the text of the page's title element. The misspelt method
       * name is kept so existing callers continue to compile.
       *
       * @param httpUrl
       *            address of the page to crawl
       * @return the trimmed title text, or null when the page has no title
       *         element or could not be fetched
       */
      public static String getTilte(String httpUrl) {
          try {
              Matcher matcher = TITLE_PATTERN.matcher(getHtmlCode(httpUrl));
              if (matcher.find()) {
                  return matcher.group(1).trim();
              }
          } catch (IOException e) {
              // Best effort: report the failure and fall through to null.
              e.printStackTrace();
          }
          return null;
      }

      /**
       * Checks whether a network resource exists by trying to open it.
       *
       * @param strUrl
       *            address of the resource
       * @return true if an input stream could be opened, false if the URL is
       *         malformed or opening it failed
       */
      public static boolean isNetFileAvailable(String strUrl) {
          InputStream netFileInputStream = null;
          try {
              URLConnection urlConn = new URL(strUrl).openConnection();
              netFileInputStream = urlConn.getInputStream();
              return netFileInputStream != null;
          } catch (IOException e) {
              return false;
          } finally {
              if (netFileInputStream != null) {
                  try {
                      netFileInputStream.close();
                  } catch (IOException ignored) {
                      // The answer is already known; a failed close cannot change it.
                  }
              }
          }
      }
  }



阅读(873) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~