此信息采集程序的实现原理:
首先,给定一个文章列表页面的URL,通过crawler.php抓取该页面中各篇文章(article)的URL;
然后,对同一个web应用的每一篇文章而言,其title、content在页面中的位置都是固定的,因此可以用起始标记和结束标记定位,并通过匹配取出其中的内容。
此程序包含两个页面,保存为*.php后可以运行在SAE平台下,以下是详细代码:
crawler.php
<h2>Please input a valid url to CRAWLING</h2>
<form action="" method="post">
<input type="text" name="site" value="" style="width:200px;" />
<input type="submit" name="submit" value="CRAWLING" />
</form>
<?php
// crawler.php: fetch the submitted page and list the href of every <a> tag in it.
//
// NOTE(review): the HTML-looking parts of the string literals below (the "<a>"
// allow-list, the "<a(" regex prefixes, the replacement strings, "&lt;" and the
// <pre> wrapper) were missing from the published listing and have been
// reconstructed from the surrounding code -- confirm against the original.
if (isset($_POST['submit'])) {
    $link_to_dig = isset($_POST['site']) ? trim($_POST['site']) : '';

    // Characters escaped before anything is written into the page.
    $html_from = array('&', '<', '>', '"');
    $html_to   = array('&amp;', '&lt;', '&gt;', '&quot;');
    $error     = "Error loading " . str_replace($html_from, $html_to, $link_to_dig);

    // The URL comes straight from the form, so accept absolute http(s) URLs only.
    // NOTE(review): the server will still fetch any host the visitor names (SSRF);
    // add a host allow-list if this page is reachable by untrusted users.
    $path_info = parse_url($link_to_dig);
    if (!is_array($path_info)
        || !isset($path_info['scheme'], $path_info['host'])
        || !in_array(strtolower($path_info['scheme']), array('http', 'https'), true)
    ) {
        die($error);
    }

    // SaeFetchurl is not defined in this file; presumably the SAE platform's
    // URL-fetch class -- verify. Off SAE, file_get_contents() is the stand-in.
    $f = new SaeFetchurl();
    $original_file = $f->fetch($link_to_dig);
    if (!$original_file) {
        die($error);
    }

    // scheme://host[:port] is prepended to root-relative links.
    $base = $path_info['scheme'] . "://" . $path_info['host'];
    if (isset($path_info['port'])) {
        $base .= ':' . $path_info['port'];
    }
    $path = isset($path_info['path']) ? $path_info['path'] : '/';

    // "\" and "$" are special inside a preg_replace() replacement string.
    $safe_base = addcslashes($base, '\\$');
    $safe_path = addcslashes($path, '\\$');

    // Keep only the anchor tags, then make href="/..." and href="?..." absolute.
    $stripped_file = strip_tags($original_file, "<a>");
    $fixed_file = preg_replace("/<a([^>]*)href=\"\//is", '<a$1href="' . $safe_base . '/', $stripped_file);
    $fixed_file = preg_replace("/<a([^>]*)href=\"\?/is", '<a$1href="' . $safe_base . $safe_path . '?', $fixed_file);

    // $matches[0] holds the complete <a> tags, $matches[1] only their href values.
    preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $fixed_file, $matches);

    // Escape the dump so the captured tags are displayed instead of rendered.
    $result = print_r($matches, true);
    $result = str_replace($html_from, $html_to, $result);
    print "<pre>" . $result . "</pre>";
}
?>
gatherinformation.php
<html>
<head>
<title>Gather information</title>
</head>
<body>
<h2>Gather information</h2>
<form action="" method="post">
<table style="width:800px;">
<tr><th>URL:</th><td><input type="text" name="url" value="" style="width:400px;" />increase tid if you like</td></tr>
<?php
// NOTE(review): the example markers printed next to the Start/End fields were
// missing from the published listing (which left the End line with an
// unterminated string). The two values below are placeholders -- replace them
// with the markers of the site being scraped.
?>
<tr><th>Start:</th><td><input type="text" name="start" value="" style="width:400px;" />Input: <?php print htmlentities('<div class="content">'); ?></td></tr>
<tr><th>End:</th><td><input type="text" name="end" value="" style="width:400px;" />Input: <?php print htmlentities('</div>'); ?></td></tr>
<tr><th></th><td><input type="submit" name="submit" value="Grasp" /></td></tr>
</table>
<div>
<?php
// On submit: download the page at "url" and print whatever lies between the
// "start" and "end" markers.
if (isset($_POST['submit'])) {
    $url   = isset($_POST['url'])   ? trim($_POST['url']) : '';
    $start = isset($_POST['start']) ? $_POST['start']     : '';
    $end   = isset($_POST['end'])   ? $_POST['end']       : '';

    // file_get_contents() also opens local paths and php:// streams, so a raw
    // form value must not reach it: only absolute http(s) URLs are fetched.
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
    if ($scheme === 'http' || $scheme === 'https') {
        $source = file_get_contents($url);
        if ($source) {
            // NOTE(review): the fragment is echoed unescaped so that it renders
            // as HTML; any script in the remote page therefore runs in this page.
            echo extract_content($source, $start, $end);
        }
    }
}
?>
</div>
</form>
</body>
</html>
-
<?php
-
-
/**
 * Return the text that lies between two marker strings.
 *
 * Finds the first occurrence of $start in $string, then the first occurrence
 * of $end after it, and returns what is in between with surrounding
 * whitespace trimmed. Both lookups use stripos(), so letter case is ignored.
 *
 * @param string $string Text to search, e.g. a downloaded HTML page.
 * @param string $start  Marker that precedes the wanted content; an empty
 *                       string means "from the beginning of $string".
 * @param string $end    Marker that follows the wanted content.
 * @return string The trimmed content, or '' when a marker is missing.
 */
function extract_content($string,$start,$end)
{
    $string = (string) $string;
    $start  = (string) $start;
    $end    = (string) $end;

    if ($start === '') {
        $content_offset = 0;
    } else {
        $start_pos = stripos($string, $start);
        if ($start_pos === false) {
            // Without this guard false would be used as offset 0 and the
            // function would return unrelated text from the top of $string.
            return '';
        }
        $content_offset = $start_pos + strlen($start);
    }

    if ($end === '') {
        return '';
    }

    // Search for the end marker only after the start marker.
    $end_pos = stripos($string, $end, $content_offset);
    if ($end_pos === false) {
        return '';
    }

    return trim((string) substr($string, $content_offset, $end_pos - $content_offset));
}
-
/**
 * Collect the text content of every element with the given tag name.
 *
 * @param string $tag    Tag name to look for, e.g. 'a' or 'title'.
 * @param string $html   Markup to parse.
 * @param int    $strict 1 parses $html as XML (loadXML); any other value
 *                       parses it as HTML (loadHTML).
 * @return array List of nodeValue strings in document order; empty when
 *               $html is empty, cannot be parsed, or has no such element.
 */
function getTextBetweenTags($tag, $html, $strict=0)
{
    $out = array();

    $html = (string) $html;
    if ($html === '') {
        // loadHTML()/loadXML() reject an empty document.
        return $out;
    }

    $dom = new DOMDocument('1.0', 'utf-8');

    /*** discard white space ***/
    // The parser reads this flag while loading, so it has to be set first.
    $dom->preserveWhiteSpace = false;

    // Real-world pages are rarely well-formed: collect libxml's complaints
    // internally instead of printing a warning for each one, then restore
    // the caller's setting.
    $previous = libxml_use_internal_errors(true);
    if ($strict == 1) {
        $loaded = $dom->loadXML($html);
    } else {
        $loaded = $dom->loadHTML($html);
    }
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$loaded) {
        return $out;
    }

    foreach ($dom->getElementsByTagName($tag) as $item) {
        $out[] = $item->nodeValue;
    }

    return $out;
}
-
-
/**
 * Return the text enclosed by "<$start>" and "</$end>".
 *
 * Despite the name, the pattern matches tag-style delimiters. The match is
 * non-greedy and "." does not cross line breaks, so the opening delimiter,
 * the content and the closing delimiter must all be on one line.
 *
 * @param string $string Text to search.
 * @param string $start  Name inside the opening delimiter, taken literally.
 * @param string $end    Name inside the closing delimiter, taken literally.
 * @return string|null The enclosed text, or null when nothing matches.
 */
function getTextBetweenComments($string, $start,$end)
{
    // Quote both names so that "/" or regex metacharacters in them cannot
    // break or alter the pattern.
    $pattern = '/<' . preg_quote($start, '/') . '>(.*?)<\/' . preg_quote($end, '/') . '>/';

    if (preg_match($pattern, $string, $matches) !== 1) {
        return null;
    }

    return $matches[1];
}
-
-
/**
 * Return up to $count bytes of $source that follow the first $start.
 *
 * @param string $source Text to search.
 * @param string $start  Marker to look for (case-sensitive); an empty string
 *                       means "from the beginning of $source".
 * @param int    $count  Length passed to substr(); cast to int.
 * @return string The text after the marker, or '' when the marker is absent
 *                or nothing follows it.
 */
function search($source,$start,$count){
    $source = (string) $source;
    $start  = (string) $start;

    if ($start === '') {
        $marker_pos = 0;
    } else {
        $marker_pos = strpos($source, $start);
        if ($marker_pos === false) {
            // false + strlen($start) would otherwise be used as an offset
            // and unrelated text would be returned.
            return '';
        }
    }

    // The cast covers PHP versions where substr() returns false past the end.
    return (string) substr($source, $marker_pos + strlen($start), (int)$count);
}
-
-
-
/**
 * Demo: split a "street - city" listing into a street => city map and print it.
 *
 * NOTE(review): the published listing had lost the markup in the sample
 * string, the explode() delimiter (it was empty, which explode() rejects)
 * and the <pre> wrapper. The "<br />" separators and <pre> tags below are a
 * reconstruction -- confirm against the original.
 *
 * @return void Prints the resulting array with print_r().
 */
function test(){
    $string = 'Street23 -Paris<br />Street 33- Berlin <br />Street 453- London';
    $desired_array = array();

    foreach (explode("<br />", $string) as $value) {
        $value = trim(strip_tags($value));
        if (!$value) {
            continue;
        }

        // Split on the first "-" only; skip entries that have no city part.
        $parts = explode("-", $value, 2);
        if (count($parts) < 2) {
            continue;
        }

        $desired_array[trim($parts[0])] = trim($parts[1]);
    }

    echo "<pre>";
    print_r($desired_array);
    echo "</pre>";
}
-
-
?>
阅读(2010) | 评论(0) | 转发(0) |