Versions 0.8.0 and 0.9.0
Place in the bin sub-directory within your Nutch install and run.
CALL THE SCRIPT USING THE FULL PATH TO THE SCRIPT OR IT WON'T WORK
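For example, assuming Nutch is installed at /usr/local/nutch (the path used in the example below), installing the script is just a copy plus an execute bit:

cp recrawl /usr/local/nutch/bin/recrawl
chmod +x /usr/local/nutch/bin/recrawl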
Example Usage
/usr/local/nutch/bin/recrawl /usr/local/tomcat/webapps/ROOT /usr/local/nutch/crawl 10 31
Setting adddays to 31 causes all pages to be recrawled.
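This works because Nutch puts a page on a fetchlist only after its fetch interval has expired (db.default.fetch.interval, which defaults to 30 days); adding 31 days to the clock at fetchlist-generation time makes every page look due for a refetch. The script simply forwards the value to the generate command, roughly (paths taken from the example above):

/usr/local/nutch/bin/nutch generate /usr/local/nutch/crawl/crawldb /usr/local/nutch/crawl/segments -adddays 31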
Changes for 0.9.0
No changes necessary for this to run with Nutch 0.9.0.
Code
#!/bin/bash
# Nutch recrawl script.
# Based on the 0.7.2 script at http://today.java.net/pub/a/today/2006/02/16/introduction-to-nutch-2.html
#
# The script merges all the new segments into one segment to prevent redundant
# data. However, if your crawl/segments directory is becoming very large, I
# would suggest you delete it completely and generate a new crawl. This probably
# needs to be done every 6 months.
#
# Modified by Matthew Holt
# mholt at elon dot edu
if [ -n "$1" ] then tomcat_dir=$1 else echo "Usage: recrawl servlet_path crawl_dir depth adddays [topN]" echo "servlet_path - Path of the nutch servlet (full path, ie: /usr/local/tomc at/webapps/ROOT)" echo "crawl_dir - Path of the directory the crawl is located in. (full path, i e: /home/user/nutch/crawl)" echo "depth - The link depth from the root page that should be crawled." echo "adddays - Advance the clock # of days for fetchlist generation. [0 for n one]" echo "[topN] - Optional: Selects the top # ranking URLS to be crawled." exit 1 fi
if [ -n "$2" ] then crawl_dir=$2 else echo "Usage: recrawl servlet_path crawl_dir depth adddays [topN]" echo "servlet_path - Path of the nutch servlet (full path, ie: /usr/local/tomc at/webapps/ROOT)" echo "crawl_dir - Path of the directory the crawl is located in. (full path, i e: /home/user/nutch/crawl)" echo "depth - The link depth from the root page that should be crawled." echo "adddays - Advance the clock # of days for fetchlist generation. [0 for n one]" echo "[topN] - Optional: Selects the top # ranking URLS to be crawled." exit 1 fi
if [ -n "$3" ] then depth=$3 else echo "Usage: recrawl servlet_path crawl_dir depth adddays [topN]" echo "servlet_path - Path of the nutch servlet (full path, ie: /usr/local/tomc at/webapps/ROOT)" echo "crawl_dir - Path of the directory the crawl is located in. (full path, i e: /home/user/nutch/crawl)" echo "depth - The link depth from the root page that should be crawled." echo "adddays - Advance the clock # of days for fetchlist generation. [0 for n one]" echo "[topN] - Optional: Selects the top # ranking URLS to be crawled." exit 1 fi
if [ -n "$4" ] then adddays=$4 else echo "Usage: recrawl servlet_path crawl_dir depth adddays [topN]" echo "servlet_path - Path of the nutch servlet (full path, ie: /usr/local/tomcat/webapps/ROOT)" echo "crawl_dir - Path of the directory the crawl is located in. (full path, ie: /home/user/nutch/crawl)" echo "depth - The link depth from the root page that should be crawled." echo "adddays - Advance the clock # of days for fetchlist generation. [0 for n one]" echo "[topN] - Optional: Selects the top # ranking URLS to be crawled." exit 1 fi
if [ -n "$5" ] then topn="-topN $5" else topn="" fi
# Sets the path to bin (this is why the script must be called by its full
# path: `dirname $0` is used to locate the nutch binary)
nutch_dir=`dirname $0`
# Only change if your crawl subdirectories are named something different
webdb_dir=$crawl_dir/crawldb
segments_dir=$crawl_dir/segments
linkdb_dir=$crawl_dir/linkdb
index_dir=$crawl_dir/index
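# For reference, a crawl directory produced by Nutch 0.8/0.9's "bin/nutch crawl"
# normally contains (assuming the default names used above):
#   crawldb/  - fetch state of every known URL
#   linkdb/   - inverted link database (inlinks/anchors)
#   segments/ - one timestamped segment per generate/fetch/update pass
#   index/    - the merged Lucene index served by the search webapp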
# The generate/fetch/update cycle
for ((i=1; i <= depth ; i++))
do
  $nutch_dir/nutch generate $webdb_dir $segments_dir $topn -adddays $adddays
  segment=`ls -d $segments_dir/* | tail -1`
  $nutch_dir/nutch fetch $segment
  $nutch_dir/nutch updatedb $webdb_dir $segment
done
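After the fetch cycle, the script (per its header comment) merges the new segments into a single segment and rebuilds the index so the search webapp picks up the fresh data. Below is a minimal sketch of those remaining steps; it assumes the standard Nutch 0.8/0.9 commands (mergesegs, invertlinks, index, dedup, merge), and the scratch directories MERGEDsegments, newindexes, and index-new are hypothetical names, not part of the original script.

# Merge all segments into one to avoid redundant data
# (MERGEDsegments is a hypothetical scratch directory)
$nutch_dir/nutch mergesegs $crawl_dir/MERGEDsegments -dir $segments_dir
rm -rf $segments_dir/*
mv $crawl_dir/MERGEDsegments/* $segments_dir
rmdir $crawl_dir/MERGEDsegments

# Rebuild the link database from the merged segment
$nutch_dir/nutch invertlinks $linkdb_dir -dir $segments_dir

# Index the merged segment and remove duplicate documents
# (newindexes is a hypothetical scratch directory)
$nutch_dir/nutch index $crawl_dir/newindexes $webdb_dir $linkdb_dir $segments_dir/*
$nutch_dir/nutch dedup $crawl_dir/newindexes

# Merge into a fresh index and swap it into place
$nutch_dir/nutch merge $crawl_dir/index-new $crawl_dir/newindexes
rm -rf $index_dir
mv $crawl_dir/index-new $index_dir
rm -rf $crawl_dir/newindexes

# Touch web.xml so Tomcat reloads the search webapp with the new index
touch $tomcat_dir/WEB-INF/web.xml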