分类:
2009-12-30 17:57:48
/**
* Defines the classes that every gather (scraper) plug-in needs to use:
* Main (the main class)
* Log
* Content
* Status
*/
require_once('Gather_Func.php');
abstract class Gather_Base{
//Values supplied from outside (constructor and set())
private $gl_ID; // identifier passed to set(); becomes part of the log file name
private $OS; // platform flag passed to set(); the value 'Win' enables log/snapshot writing
protected $RMT_name; // site (RMT) name given to the constructor; used in log and snapshot paths
private $Encoding; // source encoding of the site's pages, used when converting to UTF-8
//Internal state of the class
private $timeStart; // time() captured in set()
private $timeEnd; // time() captured in ending()
private $eclapse; // timeEnd - timeStart, computed in ending() (spelling kept: it is also a result key)
private $readPages; // incremented once per page successfully fetched by get2()
private $log_file; // path of the log file, built in set()
private $retStatus; // error entries appended by status()
private $db; // database handle passed to set(); isNew() calls Single() and Query() on it
protected $areasSell=Array(); // returned by ending() under 'Sell'; presumably filled by subclasses
protected $areasBuy=Array(); // returned by ending() under 'Buy'; presumably filled by subclasses
/**
 * Constructor.
 *
 * Declared as __construct() because PHP 4-style constructors (a method
 * named after the class) are no longer treated as constructors by PHP 8.
 *
 * @param string $RMT_name RMT name of the site; must match the name used in the management system
 * @param string $Encoding character encoding of the site's pages
 */
public function __construct($RMT_name,$Encoding){
    $this->RMT_name=$RMT_name;
    $this->Encoding=$Encoding;
}
/**
 * Legacy constructor name, kept so that existing calls such as
 * parent::Gather_Base(...) keep working. Declared after __construct()
 * so older PHP versions do not report a redefined constructor.
 *
 * @param string $RMT_name see __construct()
 * @param string $Encoding see __construct()
 */
public function Gather_Base($RMT_name,$Encoding){
    self::__construct($RMT_name,$Encoding);
}
/**
 * Lets subclasses check whether the content fetched from a URL differs from
 * what was recorded last time, and stores the new fingerprint.
 *
 * The MD5 of $doc is compared with the row kept in rmt_gather_md5 for $url.
 * A missing row is inserted, a differing row is updated.
 *
 * @param string $url URL the content came from
 * @param string $doc page content
 * @return bool true when the URL is new or its content changed, false when the stored hash matches
 */
protected final function isNew($url,$doc){
    $md5=md5($doc);
    // NOTE(review): $url is concatenated into the SQL text unescaped. The
    // database wrapper's escaping/binding API is not visible here; switch to
    // it once confirmed, since a URL containing a quote breaks these queries.
    $sql="select count(*) from rmt_gather_md5 where url='".$url."'";
    $count=$this->db->Single($sql);
    if(!$count){
        $sql="insert into rmt_gather_md5(url,md5,updated) values('$url','$md5',".time().");";
        $this->db->Query($sql);
        return true;
    }
    $sql="select md5 from rmt_gather_md5 where url='".$url."'";
    $lastMd5=$this->db->Single($sql);
    // Compare as strings: with == two different digests that both look like
    // "0e<digits>" are treated as the number 0 and would compare equal.
    if((string)$lastMd5===$md5){
        return false;
    }
    $sql="update rmt_gather_md5 set md5='".$md5."' ,updated=".time()." where url='".$url."'";
    $this->db->Query($sql);
    return true;
}
/**
 * Called by the main program to inject run-time settings and start the timer.
 *
 * When $OS is 'Win' the ./log and ./log/<RMT_name> directories are created
 * if they do not exist yet. The log file path is always computed.
 *
 * @param mixed  $gl_ID identifier of this run; appended to the log file name
 * @param string $OS    platform flag; 'Win' enables directory creation here
 * @param object $db    database handle stored for later queries
 */
public final function set($gl_ID,$OS,$db){
    $this->gl_ID=$gl_ID;
    $this->OS=$OS;
    $this->timeStart=time();
    $siteLogDir='./log/'.$this->RMT_name;
    if($OS=='Win'){
        foreach(array('./log',$siteLogDir) as $dir){
            if(!file_exists($dir)){
                mkdir($dir);
            }
        }
    }
    $this->log_file=$siteLogDir.'/'.date('Ymd_His_').$gl_ID.'.txt';
    $this->db=$db;
}
/**
 * Called by the main program when the run is over.
 *
 * Stamps the end time, computes the elapsed seconds and hands back
 * everything the main program needs.
 *
 * @return array keys: RMT_name, Buy, Sell, Status, timeStart, timeEnd, eclapse, readPages
 */
public final function ending(){
    $finishedAt=time();
    $this->timeEnd=$finishedAt;
    $this->eclapse=$finishedAt - $this->timeStart;

    $summary=array();
    $summary['RMT_name']=$this->RMT_name;
    $summary['Buy']=$this->areasBuy;
    $summary['Sell']=$this->areasSell;
    $summary['Status']=$this->retStatus;
    $summary['timeStart']=$this->timeStart;
    $summary['timeEnd']=$this->timeEnd;
    $summary['eclapse']=$this->eclapse;
    $summary['readPages']=$this->readPages;
    return $summary; // end of the run
}
/**
 * preg_match_all() with error checking (results in PREG_SET_ORDER).
 *
 * A regex failure is reported through status() with GC_MATCH_WRONG.
 * Zero matches are reported with GC_MATCH_NONE unless $enableN is true.
 *
 * @param string $reg     regular expression
 * @param string $doc     document to search
 * @param bool   $enableN whether "no match" is acceptable
 * @return array|false match sets, or false when an error was reported
 */
protected final function pregMatchAll($reg,$doc,$enableN){
    $matchs=array();
    $r=preg_match_all($reg,$doc,$matchs,PREG_SET_ORDER);
    if($r===false){
        $this->status(array(
            'code'=>_T::GC_MATCH_WRONG,
            'content'=>"$reg \n$doc "
        ));
        return false;
    }
    // The flag is the parameter $enableN. The previous body tested an
    // undefined $enableNone, so an empty result was always an error.
    if(!$enableN&&$r==0){
        $this->status(array(
            'code'=>_T::GC_MATCH_NONE,
            'content'=>"$reg \n$doc "
        ));
        return false;
    }
    return $matchs;
}
/**
 * Extracts the cell texts of the first table in $doc, with inner HTML tags removed.
 *
 * NOTE(review): the delimiter literals of this method were lost in the copy
 * this file was taken from (HTML-like text was stripped, leaving code that
 * does not parse). They are reconstructed here as '<table'/'</table>',
 * '<tr'/'</tr>' and '<td'/'</td>' — confirm against the original source.
 * Matching is literal and lower-case, which fits documents returned by get().
 *
 * @param string $doc HTML document or fragment
 * @return array|false list of rows, each a list of cell strings; false when no table or rows are found
 */
function getTableData($doc){
    // Isolate the table, then drop the rest of its opening tag.
    $doc=$this->getMiddle($doc,'<table','</table>',false);
    if(!$doc)return false;
    $doc=$this->getMiddle($doc,'>',null,false);
    if(!$doc)return false;
    $trs=$this->getMiddles($doc,'<tr','</tr>');
    if(!$trs)return false;
    $Table=array();
    foreach($trs as $tr){
        // Skip the attributes of the <tr ...> tag.
        $tr=$this->getMiddle($tr,'>',null,false);
        if($tr===false)continue;
        $tds=$this->getMiddles($tr,'<td','</td>');
        if(!$tds)continue;
        $Row=array();
        foreach($tds as $td){
            // Skip the attributes of the <td ...> tag, then strip nested tags.
            $td=$this->getMiddle($td,'>',null,false);
            $Row[]=preg_replace('/(<[^>]*>)/','',(string)$td);
        }
        $Table[]=$Row;
    }
    return $Table;
}
/**
 * preg_match() with error checking.
 *
 * A regex failure is reported through status() with GC_MATCH_WRONG.
 * No match is reported with GC_MATCH_NONE unless $enableN is true.
 *
 * @param string $reg     regular expression
 * @param string $doc     document to search
 * @param bool   $enableN whether "no match" is acceptable
 * @return array|false match array, or false when an error was reported
 */
protected final function pregMatch($reg,$doc,$enableN){
    $matchs=array();
    $r=preg_match($reg,$doc,$matchs);
    if($r===false){
        $this->status(array(
            'code'=>_T::GC_MATCH_WRONG,
            'content'=>"$reg\n$doc "
        ));
        return false;
    }
    // The flag is the parameter $enableN. The previous body tested an
    // undefined $enableNone, so "no match" was always an error.
    if(!$enableN&&$r==0){
        $this->status(array(
            'code'=>_T::GC_MATCH_NONE,
            'content'=>"$reg \n$doc "
        ));
        return false;
    }
    return $matchs;
}
/**
 * Returns the part of $str between a begin marker and an end marker
 * (markers excluded). Marker matching is case-insensitive.
 *
 * @param string      $str     string to cut
 * @param string|null $begin   begin marker; null means "from the start of the string"
 * @param string|null $end     end marker; null means "to the end of the string"
 * @param bool        $enableN when true, a missing marker is not an error: a missing
 *                             begin yields '', a missing end yields the rest of the string
 * @return string|false the text in between, or false after the problem was reported via status()
 */
protected final function getMiddle($str,$begin=null,$end=null,$enableN=false){
    // $enableN now defaults to false so the one- to three-argument calls used
    // in this class are valid. The body previously read an undefined
    // $enableNone, which made the flag ineffective.
    if($begin!==null){
        // stripos() gives the documented case-insensitive match; lower-casing
        // only the marker (as before) failed on mixed-case documents.
        $i=stripos($str,$begin);
        if($i===false){
            if($enableN)return '';
            $this->status(array(
                'code'=>_T::GC_EXPLODE_NO_BEGIN,
                'content'=>"$begin \n$str \n"
            ));
            return false;
        }
        $str=substr($str,$i+strlen($begin));
        // Only older PHP versions return false from substr() when the marker
        // ends the string; newer ones return '' and fall through.
        if($str===false){
            if($enableN)return '';
            $this->status(array(
                'code'=>_T::GC_EXPLODE_NO_MIDDLE,
                'content'=>"$begin \n$end \n$str \n"
            ));
            return false;
        }
    }
    if($end!==null){
        $j=stripos($str,$end);
        if($j===false){
            if($enableN)return $str;
            $this->status(array(
                'code'=>_T::GC_EXPLODE_NO_END,
                'content'=>"$end \n$str "
            ));
            return false;
        }
        $str=substr($str,0,$j);
        if($str===false){
            if($enableN)return '';
            $this->status(array(
                'code'=>_T::GC_EXPLODE_NO_MIDDLE,
                'content'=>"$begin \n$end \n$str "
            ));
            return false;
        }
    }
    return $str;
}
//Removes table markup (<table>, <tr>, <td> tags), onclick/class attributes,
//all whitespace and commas from a document.
//NOTE(review): in the copy this file was taken from, the function header had
//been merged into the comment line and the patterns for the opening
//<table/<tr/<td tags and for onclick="..." had lost their leading text. They
//are reconstructed below from the surviving closing-tag and single-quote
//variants — confirm against the original source.
protected final function eraseTable($doc){
    $doc=preg_replace('/(<table[^>]*>)/i','',$doc);
    $doc=preg_replace('/(<\/table>)/i','',$doc);
    $doc=preg_replace('/(<tr[^>]*>)/i','',$doc);
    $doc=preg_replace('/(<\/tr>)/i','',$doc);
    $doc=preg_replace('/(<td[^>]*>)/i','',$doc);
    $doc=preg_replace('/(<\/td>)/i','',$doc);
    $doc=preg_replace('/(onclick="[^"]*")/i','',$doc);
    $doc=preg_replace("/(onclick='[^']*')/i",'',$doc);
    $doc=preg_replace("/(class='[^']*')/i",'',$doc);
    $doc=preg_replace('/(class="[^"]*")/i','',$doc);
    // Removes every run of whitespace.
    $doc=preg_replace('/\s*/i','',$doc);
    // NOTE(review): the source showed a literal line break as the search
    // string here. After the whitespace removal above this is a no-op, so the
    // original literal may have been markup that was stripped — confirm.
    $doc=str_replace("\n",'',$doc);
    $doc=str_replace(',','',$doc);
    return $doc;
}
//Collects every segment of $doc that lies between $start and the next $end.
//Matching is literal and case-sensitive. The search for the following
//segment resumes at the position of the end marker just found.
//Returns the (possibly empty) list of segments.
protected final function getMiddles($doc,$start,$end){
    $segments=array();
    $startLen=strlen($start);
    $cursor=0;
    for(;;){
        $open=strpos($doc,$start,$cursor);
        if($open===false){
            break;
        }
        $bodyAt=$open+$startLen;
        $close=strpos($doc,$end,$bodyAt);
        if($close===false){
            break;
        }
        $segments[]=substr($doc,$bodyAt,$close-$bodyAt);
        $cursor=$close;
    }
    return $segments;
}
/**
 * Fetches the content of a URL and converts it to UTF-8.
 *
 * The download is attempted up to three times with a two-second pause
 * between attempts. A failed download is reported through status() with
 * GC_ACCESS_DENY. When the OS flag is 'Win', a snapshot of the converted
 * page is written under ./snap/<RMT_name>/.
 *
 * @param string $url address to fetch
 * @return string|false page content converted from the configured encoding, or false on failure
 */
protected final function get2($url){
    $this->log(_T::readFile,$url);
    $doc=file_get_contents($url);
    // Two retries, same sequence as the former nested ifs.
    for($retry=0;!$doc&&$retry<2;$retry++){
        sleep(2);
        echo "\nretry\n";
        $doc=file_get_contents($url);
    }
    if(!$doc){
        // Double quotes so the newlines are real; single quotes printed "\n" literally.
        echo "\nAbort\n";
    }
    if($doc===false) {
        // NOTE(review): the source showed "[url=$url]url:$url[/url]" here, which
        // does not parse ("$url[" starts an array offset) and looks like forum
        // markup added to the copy. Reduced to the plain message — confirm.
        $this->status(array('code'=>_T::GC_ACCESS_DENY,'content'=>"url:$url"));
        return false;
    }
    $doc=mb_convert_encoding($doc,"utf-8",$this->Encoding);
    $this->readPages ++;
    // On Windows, keep a snapshot of the page.
    if($this->OS=='Win'){
        if(!file_exists("./snap")){mkdir("./snap");}
        if(!file_exists("./snap/".$this->RMT_name)){mkdir("./snap/".$this->RMT_name);}
        // Build a file name from the URL: drop scheme, site name and leading
        // slash, then replace path/query separators with underscores.
        if(substr($url,0,7)=='http://')$url=substr($url,7);
        if(substr($url,0,strlen($this->RMT_name))==$this->RMT_name)
            $url=substr($url,strlen($this->RMT_name));
        if(substr($url,0,1)=='/')$url=substr($url,1);
        $url=str_replace(array('/',':','?','='),'_',$url);
        $filename='./snap/'.$this->RMT_name.'/'.date('Ymd_His_').$url;
        if(!preg_match('/\.html?$/i',$filename))
            $filename=$filename.".html";
        file_put_contents($filename,$doc);
    }
    return $doc;
}
/**
 * Fetches a page through get() while temporarily using another source encoding.
 *
 * The instance encoding is swapped for the duration of the call and put
 * back afterwards.
 *
 * @param string $url      address to fetch
 * @param string $encoding source encoding to use for this one request
 * @return string|false whatever get() returns
 */
protected final function getWithEncoding($url,$encoding){
    $savedEncoding=$this->Encoding;

    $this->Encoding=$encoding;
    $page=$this->get($url);
    $this->Encoding=$savedEncoding;

    return $page;
}
/**
 * Fetches a page via get2(), lower-cases it and removes HTML comments and scripts.
 *
 * NOTE(review): in the copy this file was taken from, both removal loops had
 * lost their marker literals and middle statements (the text between
 * '<!--' and '-->' and between '<script' and '</script>' was stripped, leaving
 * undefined $j/$partHead/$partTail). They are reconstructed from the surviving
 * statements and the +3 / +9 offsets, which equal the lengths of '-->' and
 * '</script>' — confirm against the original source.
 *
 * @param string $url address to fetch
 * @return string|false cleaned page, or false when get2() failed
 */
protected final function get($url){
    $doc=$this->get2($url);
    if($doc===false)return $doc; // pass the error up to the caller
    // Lower-case, then strip comments and scripts.
    $doc=strtolower($doc);
    while(true){
        $i=strpos($doc,'<!--');
        if($i===false)break;
        $partHead=substr($doc,0,$i);
        $partTail=substr($doc,$i);
        $j=strpos($partTail,'-->');
        // Unterminated comment: keep only what precedes it.
        if($j===false) return $partHead;
        $doc=$partHead.substr($partTail,$j+3);
    }
    while(true){
        $i=strpos($doc,'<script');
        if($i===false)break;
        $partHead=substr($doc,0,$i);
        $partTail=substr($doc,$i);
        $j=strpos($partTail,'</script>');
        // Unterminated script: keep only what precedes it.
        if($j===false) return $partHead;
        $doc=$partHead.substr($partTail,$j+9);
    }
    return $doc;
}
/**
 * Records an error result.
 *
 * Anything that is not an array carrying a 'code' entry counts as "normal"
 * and yields true straight away. Otherwise the entry is appended to
 * retStatus, written to the log, and false is returned.
 *
 * @param mixed $err usually another routine's result; may carry 'code' and 'content'
 * @return bool true when the status is normal, false when an error was recorded
 */
private final function status($err){
    $isError=is_array($err)&&isset($err['code']);
    if(!$isError){
        return true;
    }
    $this->retStatus[]=$err;
    $this->log($err['code'],$err['content']);
    return false;
}
/**
 * Appends one entry (timestamp, name, content) to the log file.
 * Nothing is written unless the OS flag is 'Win'.
 *
 * NOTE(review): the timestamp line did not parse in the copy this file was
 * taken from (a quote between "\t" and date() was missing), and the bare "\n"
 * lines suggest wrapper markup was stripped from these strings. Only the
 * syntax is repaired here; restore the original markup if it is known.
 *
 * @param string $name    name of the log item
 * @param string $content content of the log item
 */
protected final function log($name,$content){
    // Skip building the entry when it would not be written anyway.
    if($this->OS!='Win'){
        return;
    }
    $str="\n" ;
    $str.="\t".date('Y-m-d H:i:s')."\n";
    $str.="\t$name \n";
    $str.="\t$content \n";
    $str.="\n";
    file_put_contents($this->log_file,$str,FILE_APPEND);
}
/**
 * Extracts numbers from a string. Commas inside a number are treated as
 * digit-group separators and removed; full-width digits are accepted.
 * Example: "123,55x12qq155,12312312" gives "12355" as the first number.
 *
 * @param string $str   input text (UTF-8 for full-width digits to be recognised)
 * @param bool   $first true (default) returns only the first number, false returns all
 * @return string|array the first number as a digit string, or the list of digit strings;
 *                      when the text holds no digits the (normalised) input is returned
 */
protected final function str_to_int($str,$first=true){
    // Map full-width digits (U+FF10..U+FF19) to ASCII. The table in the copy
    // this file was taken from had been normalised to ASCII on both sides,
    // which made the replacement a no-op. Byte escapes keep this independent
    // of the encoding the file itself is saved in.
    $fullWidth=array();
    $ascii=array();
    for($d=0;$d<=9;$d++){
        $fullWidth[]="\xEF\xBC".chr(0x90+$d);
        $ascii[]=(string)$d;
    }
    $str=str_replace($fullWidth,$ascii,$str);

    $result=array();
    preg_match_all("'[^0-9]*([0-9,]+)[^0-9]*'",$str,$result,PREG_SET_ORDER);
    $numbers=array();
    foreach($result as $value){
        $digits=str_replace(',','',$value[1]);
        // The pattern can capture a lone comma; that is not a number.
        if($digits!==''){
            $numbers[]=$digits;
        }
    }
    if(!count($numbers)){
        return $str;
    }
    if($first){
        return $numbers[0];
    }
    return $numbers;
}
/**
 * Returns the text that follows the first $start marker, up to the next
 * $start or $end marker, whichever comes first.
 *
 * @param string $start begin marker
 * @param string $end   end marker
 * @param string $str   text to search
 * @return string|null the text in between; '' when $start does not occur;
 *                     null when either marker is empty
 */
function get_sub_content($start,$end,$str){
    if ( $start == '' || $end == '' ){
        return null;
    }
    $parts = explode($start, $str);
    // Without the start marker there is no second piece; return '' directly
    // instead of reading an undefined offset.
    if ( !isset($parts[1]) ){
        return '';
    }
    $parts = explode($end, $parts[1]);
    return $parts[0];
}
/**
 * Lists the placeholder names of a template, preceded by "all".
 *
 * A placeholder is written {{{{name}}}}. The leading "all" entry lines up
 * with index 0 of a regex match (the full match).
 *
 * @param string $ereg template text containing {{{{name}}}} placeholders
 * @return array "all" followed by the placeholder names in order of appearance
 */
function getKeyWord($ereg){
    $found=array();
    preg_match_all("'{{{{([^{}]*)}}}}'si",$ereg,$found);
    return array_merge(array("all"),$found[1]);
}
/**
 * Re-keys every row of $data with the names in $keyword.
 *
 * For each row, the value at position/key k is stored under $keyword[k].
 * Row keys of $data are preserved.
 *
 * @param array $keyword names indexed like the columns of a row (e.g. from getKeyWord())
 * @param array $data    list of rows (e.g. PREG_SET_ORDER match sets)
 * @return array rows with named columns; empty array for empty input
 */
function makeArray($keyword,$data) {
    $result = array();
    if(empty($data))
        return $result;
    // The former unused "$count = count($data[0])" was dropped: it raised an
    // error whenever $data had no element at index 0.
    foreach ($data as $rowKey=>$row){
        foreach ($row as $column=>$cell){
            $result[$rowKey][$keyword[$column]]=$cell;
        }
    }
    return $result;
}
/**
 * Turns an HTML template with {{{{name}}}} placeholders into a PCRE pattern.
 *
 * The characters ? | . * [ ] are escaped, every placeholder becomes the
 * capture group ([^<>]+), and the result is wrapped in ' delimiters with
 * the s modifier.
 *
 * NOTE(review): a ' in the template is not escaped although ' is the
 * delimiter; left as-is because callers may already pre-escape it.
 *
 * @param string $html template text
 * @return string regular expression usable with the preg_* functions
 */
function filterEreg($html){
    $ereg = str_replace(
        array('?', '|', '.', '*', '[', ']'),
        array('\\?', '\\|', '\\.', '\\*', '\\[', '\\]'),
        $html
    );
    // preg_replace() instead of ereg_replace(), which no longer exists in
    // PHP 7+. The braces are escaped so they are always taken literally.
    $ereg = preg_replace('/\{\{\{\{[^{}]*\}\}\}\}/',"([^<>]+)",$ereg);
    $ereg = "'".$ereg."'s";
    return $ereg;
}
}
阅读(367) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~