Chinaunix首页 | 论坛 | 博客
  • 博客访问: 21515
  • 博文数量: 29
  • 博客积分: 10
  • 博客等级: 民兵
  • 技术积分: 300
  • 用 户 组: 普通用户
  • 注册时间: 2011-03-14 22:37
个人简介

------------

文章分类

全部博文(29)

分类: LINUX

2014-01-17 19:48:00

一念行者 的blog,内容很好,保存之,脚本如下:
这个脚本把所有的页面html文件取下来。

点击(此处)折叠或打开

  #!/bin/bash
  # Crawl a blog starting from one page: every "blog_*.html" and
  # "articlelist_*.html" link found is queued, downloaded into the current
  # directory with wget, and scanned for further links.
  #
  # Usage: bash <this-script> START_PAGE_URL
  # Requires bash 4+ (associative arrays).
  #set -x

  # URL of the first page to fetch; empty when no argument was supplied.
  START_PAGE=${1:-}

  # File name -> URL for "blog*" pages.
  declare -A G_MAP_BLOG_NAME_ADDR=()
  # File name -> URL for "arti*" (article list) pages.
  declare -A G_MAP_PAGE_NAME_ADDR=()
  # File name -> processing state; a new entry starts as "NOT_PARSED".
  declare -A G_MAP_NAME_PARSE_STAT=()
  # File name -> position in the work queue.
  declare -A G_MAP_NAME_IDX=()
  # Work queue: position -> file name (inverse of G_MAP_NAME_IDX).
  declare -a G_MAP_IDX_NAME=()

  # Number of entries in the work queue.
  G_NUM_HTML=0
  # Number of entries queued during the current crawl pass.
  G_THIS_INC=0
  # Whitespace-separated links found by the most recent page scan.
  G_LINKS_IN_THIS_PAGE=

  #######################################
  # Collect the blog-article and article-list links contained in a saved page.
  # Globals:
  #   G_LINKS_IN_THIS_PAGE (written) - reset to empty, then set to
  #     "<article URLs> <list URLs>"; the URLs are whitespace separated.
  # Arguments:
  #   $1 - path of the HTML file to scan
  # Outputs:
  #   An error message on stdout when the file does not exist.
  # Returns:
  #   0 when the file was scanned (even if it holds no link), 1 when the
  #   file is missing.
  #######################################
  function get_links_in_page() {
    local page=${1:-}
    local articles pagelinks

    G_LINKS_IN_THIS_PAGE=
    if [ ! -e "$page" ]; then
      echo "File '$page' not exist"
      return 1
    fi

    # grep -o prints each matching attribute on its own line, so several
    # links on one source line are all found; sed then strips the href="..."
    # wrapper and leaves the bare URL.
    articles=$(grep -Eo -- 'href="http://blog\.sina\.com\.cn/s/blog_[a-z0-9]+\.html"' "$page" \
      | sed -e 's/^href="//' -e 's/"$//')

    pagelinks=$(grep -Eo -- 'href="http://blog\.sina\.com\.cn/s/articlelist_[0-9a-z_]+\.html"' "$page" \
      | sed -e 's/^href="//' -e 's/"$//')

    G_LINKS_IN_THIS_PAGE="$articles $pagelinks"
    return 0
  }

  #######################################
  # Queue a page for download unless it is already on disk or already queued.
  # Globals:
  #   G_MAP_PAGE_NAME_ADDR, G_MAP_BLOG_NAME_ADDR (written) - file name -> URL
  #   G_MAP_NAME_PARSE_STAT (written) - new entry is set to NOT_PARSED
  #   G_MAP_NAME_IDX, G_MAP_IDX_NAME (written) - queue position bookkeeping
  #   G_NUM_HTML, G_THIS_INC (incremented on success)
  # Arguments:
  #   $1 - URL whose last path component ends in "html" and starts with
  #        "arti" or "blog"
  # Outputs:
  #   An error message on stdout for a rejected argument.
  # Returns:
  #   0   when the page was queued
  #   1   when the argument is missing or not an acceptable page
  #   231 when the page is already downloaded or queued (the original code
  #       said "return 999", which bash truncates to 231)
  #######################################
  function push_html() {
    # Locals, so the caller's own html_link variable is not overwritten.
    local html_link=${1:-}
    local html_file

    if [ -z "$html_link" ]; then
      echo "Must provide a parameter for 'push_html()'"
      return 1
    fi

    html_file=${html_link##*/}
    if [[ $html_file != *html ]]; then
      echo "Invalid parameter(not html) for 'push_html()': $html_link"
      return 1
    fi

    # Skip pages that were fetched earlier, and pages that are queued but not
    # fetched yet; without the second check a link seen on several pages is
    # registered several times.
    if [ -e "$html_file" ] || [[ -n ${G_MAP_NAME_IDX[$html_file]+set} ]]; then
      return 231
    fi

    case $html_file in
      arti*)
        G_MAP_PAGE_NAME_ADDR[$html_file]=$html_link
        ;;
      blog*)
        G_MAP_BLOG_NAME_ADDR[$html_file]=$html_link
        ;;
      *)
        echo "Invlid parameter(not blog or article) for 'push_html()': $html_link"
        return 1
        ;;
    esac

    G_MAP_NAME_PARSE_STAT[$html_file]=NOT_PARSED
    G_MAP_NAME_IDX[$html_file]=$G_NUM_HTML
    G_MAP_IDX_NAME[$G_NUM_HTML]=$html_file

    # Plain assignments instead of (( x ++ )): the arithmetic command returns
    # status 1 when the old value is 0, which used to leak out as the
    # function's status on success.
    G_NUM_HTML=$(( G_NUM_HTML + 1 ))
    G_THIS_INC=$(( G_THIS_INC + 1 ))
    return 0
  }
  #######################################
  # Tell whether a downloaded page belongs to the blog being crawled, by
  # looking for the blog-title markup that links to the blog's home URL.
  #
  # NOTE(review): the search pattern in the published script was cut off by
  # HTML extraction (only 'blogtitle">' survived, next to the URL
  # http://blog.sina.com.cn/xnfm). The pattern used here,
  # 'blogtitle"><a href="<blog url>', is a reconstruction - confirm it
  # against a real downloaded page.
  #
  # Arguments:
  #   $1 - path of the downloaded HTML file
  #   $2 - (optional) home URL of the blog; defaults to
  #        http://blog.sina.com.cn/xnfm
  # Outputs:
  #   An error message on stdout when $1 is missing or not a file on disk.
  # Returns (note the inverted convention, which callers rely on):
  #   1 when the page belongs to the blog
  #   0 otherwise, including a missing argument or missing file
  #######################################
  function is_local_link() {
    local page=${1:-}
    local blog_url=${2:-http://blog.sina.com.cn/xnfm}

    if [ -z "$page" ]; then
      echo "In 'is_local_link()', you must provide a parameter"
      return 0
    fi
    if [ ! -e "$page" ]; then
      echo "In 'is_local_link()', file '$page' not exist"
      return 0
    fi

    # -F: the URL is matched literally; -q: only the status matters, the
    # matching line is not printed.
    if grep -qF -- "blogtitle\"><a href=\"${blog_url}" "$page"; then
      return 1
    fi
    return 0
  }

  #######################################
  # Walk the work queue once: download every entry still marked NOT_PARSED
  # and, when the page belongs to the crawled blog, queue the links it holds.
  # Entries queued during the walk are processed in the same walk, because
  # the loop bound G_NUM_HTML is re-read on every iteration.
  #
  # Every attempted entry leaves the NOT_PARSED state (PARSED, FETCH_FAILED,
  # NOT_LOCAL or PARSE_FAILED), so a later call does not download it again.
  #
  # Globals:
  #   G_NUM_HTML, G_MAP_IDX_NAME, G_MAP_BLOG_NAME_ADDR,
  #   G_MAP_PAGE_NAME_ADDR (read)
  #   G_MAP_NAME_PARSE_STAT (read and written)
  #   G_LINKS_IN_THIS_PAGE (read; filled by get_links_in_page)
  # Arguments:
  #   None
  # Outputs:
  #   Diagnostics on stdout; wget writes the pages to the current directory.
  #######################################
  function parse_html() {
    local idx html_name html_link link

    for (( idx = 0; idx < G_NUM_HTML; idx++ )); do
      html_name=${G_MAP_IDX_NAME[idx]:-}
      # An empty slot ends the walk.
      [ -n "$html_name" ] || break

      [ "${G_MAP_NAME_PARSE_STAT[$html_name]}" == "NOT_PARSED" ] || continue

      case $html_name in
        blog*)
          html_link=${G_MAP_BLOG_NAME_ADDR[$html_name]}
          ;;
        arti*)
          html_link=${G_MAP_PAGE_NAME_ADDR[$html_name]}
          ;;
        *)
          echo "Something must be error, get invalid html name from G_MAP_IDX_NAME"
          echo "idx: $idx; name: $html_name"
          # Skip the entry; falling through would fetch whatever URL was
          # left over from the previous iteration.
          continue
          ;;
      esac

      if ! wget -- "$html_link"; then
        printf '%s\n' "get '$html_link' fail"
        G_MAP_NAME_PARSE_STAT[$html_name]="FETCH_FAILED"
        continue
      fi

      # is_local_link uses an inverted convention: status 1 means the page
      # belongs to the crawled blog.
      is_local_link "$html_name"
      if [ $? -ne 1 ]; then
        G_MAP_NAME_PARSE_STAT[$html_name]="NOT_LOCAL"
        continue
      fi

      if get_links_in_page "$html_name"; then
        G_MAP_NAME_PARSE_STAT[$html_name]="PARSED"
        # Deliberately unquoted: the value is a whitespace-separated URL list.
        # shellcheck disable=SC2086
        for link in $G_LINKS_IN_THIS_PAGE; do
          push_html "$link"
        done
      else
        echo "Fail 'get_links_in_page $html_name'"
        G_MAP_NAME_PARSE_STAT[$html_name]="PARSE_FAILED"
      fi
    done
  }

  # Seed the work queue with the start page given on the command line.
  push_html "$START_PAGE"

  # Repeat crawl passes until one of them queues no new page.
  while :; do
    G_THIS_INC=0
    parse_html
    if [ "$G_THIS_INC" -eq 0 ]; then
      echo "No more file"
      break
    fi
  done

  # Clean up: the article-list pages were only needed to discover links.
  # -f keeps rm quiet when no such file was downloaded.
  rm -f -- article*html

  # Drop every remaining page that does not belong to the crawled blog.
  # (The published script read "for ff in $.html"; the glob was evidently
  # garbled, since "$.html" names a single literal file.)
  for ff in *.html; do
    # With no match the pattern stays literal; skip it.
    [ -e "$ff" ] || continue
    is_local_link "$ff"
    # is_local_link returns 0 for "not local". Delete only on that explicit
    # answer, so an unexpected status never removes a file.
    if [ $? -eq 0 ]; then
      rm -- "$ff"
    fi
  done

阅读(329) | 评论(0) | 转发(0) |
0

上一篇:没有了

下一篇:ubuntu 14.04 中配置 LAMP

给主人留下些什么吧!~~