Chinaunix首页 | 论坛 | 博客
  • 博客访问: 335678
  • 博文数量: 20
  • 博客积分: 5010
  • 博客等级: 大校
  • 技术积分: 680
  • 用 户 组: 普通用户
  • 注册时间: 2006-07-03 08:02
文章存档

2013年(9)

2009年(6)

2008年(5)

我的朋友

分类: 项目管理

2008-12-26 01:36:17

说明:在线程池控制本线程时,同时使用下面3个全局变量 
  catch_urllist: TList;   待抓取的URL结构列表
  matching_urllist: TList;  符合内容的URL结构列表
  error_urllist: TStrings;  不符合内容的URL

*******************************************************************************
unit threadspider;
interface

uses
  Classes, Windows, Messages, IdHTTP, ComCtrls, SysUtils, PerlRegEx, SyncObjs;

const
  WM_THREADSTART = WM_USER + 100;   { 线程启动 }
  WM_THREADSTOP = WM_USER + 101;    { 一次抓取完毕 }
  WM_THREADGETSTOP = WM_USER + 102; { 线程结束 }

type
  TUrlInfo = record
    url: string[100];   //网址
    count: Integer;     //累计次数
  end;

  PMatchInfo = ^TMatchInfo;
  TMatchInfo = record
    url: array [0..100] of Char;    //网址
    Title: array [0..100] of Char;  //标题
    count: Integer;                 //累计次数
  end;

  TThreadSpider = class(TThread)
  private
    FHandle: THandle;
    Fhttp: TIdHTTP;
    FTimeOut: Integer;
    FLock: TCriticalSection; //数据共享保护
    procedure SetName;
    function GetHtml(url: string): string;
    procedure LinksFromHtml(html: string);
    function GetPageTitle(html: string): string;
    function ExistFilter(html: string): Boolean;
    function ExistInclude(html: string): Boolean;
    { ********************* }
    function GetUrlInfo(var UrlInfo: TUrlInfo): Boolean;
    function ExistMacthList(url: string): Boolean;
    function ExistErrorList(url: string): Boolean;
    function ExistCatchList(url: string): Boolean;
    procedure AddToMacthList(matchinfo: TMatchInfo);
    procedure AddToErrorList(urlinfo: TUrlInfo);
    procedure AddToCatchList(url: string);
    { ********************* }
  protected
    procedure Execute; override;
  public
    constructor Create(Handle: THandle; TimeOut: Integer);
    destructor Destroy; override;
  end;

implementation

uses variable;
type
  TThreadNameInfo = record
    FType: LongWord;     // must be 0x1000
    FName: PChar;        // pointer to name (in user address space)
    FThreadID: LongWord; // thread ID (-1 indicates caller thread)
    FFlags: LongWord;    // reserved for future use, must be zero
  end;

{ TThreadSpider }

procedure TThreadSpider.SetName;
var
  ThreadNameInfo: TThreadNameInfo;
begin
  ThreadNameInfo.FType := $1000;
  ThreadNameInfo.FName := 'ThreadSpider';
  ThreadNameInfo.FThreadID := $FFFFFFFF;
  ThreadNameInfo.FFlags := 0;
  try
    RaiseException( $406D1388, 0, sizeof(ThreadNameInfo) div sizeof(LongWord), @ThreadNameInfo );
  except
  end;
end;

procedure TThreadSpider.Execute;
var
  html: string;
  urlinfo: TUrlInfo;
  matchinfo: TMatchInfo;
begin
  SetName;
  while not Terminated do
  begin
    //已没有新的URL可采集,则退出,销毁线程
    if not GetUrlInfo(urlinfo) then Exit;
    html := GetHtml(urlinfo.url);
    //用户停止线程
    if Terminated then Exit;
    if html = '' then
    begin
      //说明页面无法访问
      AddToErrorList(urlinfo);
    end
    //判断过滤条件
    else if (not ExistFilter(html)) and ExistInclude(html) then
    begin
      //说明此URL符合要求,加至符合要求的LIST中
      FillChar(matchinfo, SizeOf(Tmatchinfo), #0);
      StrPCopy(matchinfo.url, urlinfo.url);
      StrPCopy(matchinfo.Title, GetPageTitle(html));
      matchinfo.count := urlinfo.count;
      AddToMacthList(matchinfo);
      //取该连接下面的URL
      LinksFromHtml(html);
    end else
    begin
      //说明页面中不存在相关的内容,废弃
      AddToErrorList(urlinfo);
    end;
    //一次抓取完毕。
    SendMessage(FHandle, WM_THREADGETSTOP, 0, 0);
  end;
  { Place thread code here }
end;

constructor TThreadSpider.Create(Handle: THandle; TimeOut: Integer);
begin
  inherited Create(True);
  Fhttp := TIdHTTP.Create(nil);
  FLock := TCriticalSection.Create;
  Fhttp.ReadTimeout := TimeOut * 1000; //抓取超时
  FHandle := Handle;
  FTimeOut := TimeOut;
  FreeOnTerminate := True;
  SendMessage(FHandle, WM_THREADSTART, 0, 0);
end;

destructor TThreadSpider.Destroy;
begin
  Fhttp.Disconnect;
  Fhttp.Free;
  FLock.Free;
  SendMessage(FHandle, WM_THREADSTOP, Integer(Self), 0);
  inherited Destroy;
end;

function TThreadSpider.GetHtml(url: string): string;
var
  html: string;
begin
  try
    html := Fhttp.Get(url);
  except
    html := '';
  end;
  Result := html;
end;

{ 取HTML页面内的URL地址 }
procedure TThreadSpider.LinksFromHtml(html: string);
var
  MyRegex: TPerlRegEx;
  List: TStrings;
  url: string;
  I: Integer;
begin
  MyRegex := TPerlRegEx.Create(nil);
  List := TStringList.Create;
  try
    MyRegex.RegEx := 'http://(?P[a-z-A-Z0-9.]+)';
    MyRegex.Subject := Html;
    if MyRegex.Match then
    begin
      repeat
        url := 'http://' + MyRegex.SubExpressions[1] + '/';
        if List.IndexOf(url)<0 then List.Add(url);
      until not MyRegex.MatchAgain;
    end;
    //开始判断URL是否存在和所在区域(已过滤,已匹配)
    for i := 0 to List.Count -1 do
    begin
      url := List.Strings[i];
      if ExistMacthList(url) then Continue; // 已匹配
      if ExistErrorList(url) then Continue; // 已过滤
      { 如果不存在,就添加到待采集的LIST中 }
      if not ExistCatchList(url) then AddToCatchList(url);
    end;
  finally
    List.Free;
    MyRegex.Free;
  end;
end;

{ 取网页TITLE }
function TThreadSpider.GetPageTitle(html: string): string;
var
  MyRegex: TPerlRegEx;
begin
  MyRegex := TPerlRegEx.Create(nil);
  try
    MyRegex.RegEx := '[s ]*(.*?)[s ]*';
    MyRegex.Subject := Html;
    if MyRegex.Match then
    begin
      Result := MyRegex.SubExpressions[1];
    end;
  finally
    MyRegex.Free;
  end;
end;

//取欲获取HTML的URL结构
function TThreadSpider.GetUrlInfo(var UrlInfo: TUrlInfo): Boolean;
begin
  Result := False;
  try
    FLock.Enter;
    if catch_urllist.Count > 0 then
    begin
      //取最前面的
      urlinfo := TUrlInfo(catch_urllist.Items[0]^);
      //取得后删除,
      FreeMem(catch_urllist.Items[0]);
      catch_urllist.Delete(0);
      Result := True;
    end;
  finally
    FLock.Leave;
  end;
end;

{ 过滤不允许的关键字 }
function TThreadSpider.ExistFilter(html: string): Boolean;
var
  i: Integer;
begin
  Result := False;
  try
    FLock.Enter;
    for i := 0 to Word_Filter.Count -1 do
    begin
      if Pos(LowerCase(Word_Filter.Strings[i]), LowerCase(html)) > 0 then {Word_Filter 外部TStringList}
        Result := True;
        Break;
    end;
  finally
    FLock.Leave;
  end;
end;

{ 查询包含的关键字 }
function TThreadSpider.ExistInclude(html: string): Boolean;
var
  i: Integer;
begin
  Result := False;
  try
    FLock.Enter;
    for i := 0 to Word_Include.Count -1 do
    begin
      if Pos(LowerCase(Word_Include.Strings[i]), LowerCase(html)) > 0 then {Word_Include 外部TStringList}
        Result := True;
        Break;
    end;
  finally
    FLock.Leave;
  end;
end;

{ 判断是否存在于已符合要求的URL LIST中 }
function TThreadSpider.ExistMacthList(url: string): Boolean;
var
  i, count: Integer;
  MatchInfo: TMatchInfo;
begin
  Result := False;
  try
    FLock.Enter;
    for i := 0 to matching_urllist.Count -1 do
    begin
      MatchInfo := TMatchInfo(matching_urllist.Items[i]^);
      if LowerCase(Trim(MatchInfo.url)) = LowerCase(Trim(url)) then
      begin
        count := MatchInfo.count + 1;
        TMatchInfo(matching_urllist.Items[i]^).count := count;
        Result := True;
        Break;
      end;
    end;
  finally
    FLock.Leave;
  end;
end;

{ 判断是否存在于不符合要求的URL LIST中 }
function TThreadSpider.ExistErrorList(url: string): Boolean;
var
  i: Integer;
begin
  Result := False;
  try
    FLock.Enter;
    for i := 0 to error_urllist.Count -1 do
    begin
      if LowerCase(error_urllist.Strings[i]) = LowerCase(url) then
      begin
        Result := True;
        Break;
      end;
    end;
  finally
    FLock.Leave;
  end;
end;

{ 判断是否存在于待采集的URL LIST中 }
function TThreadSpider.ExistCatchList(url: string): Boolean;
var
  i, count: Integer;
  urlinfo: TUrlInfo;
begin
  Result := False;
  try
    FLock.Enter;
    for i := 0 to catch_urllist.Count -1 do
    begin
      urlinfo := TUrlInfo(catch_urllist.Items[i]^);
      if LowerCase(urlinfo.url) = LowerCase(url) then
      begin
        count := urlinfo.count + 1;
        TUrlInfo(catch_urllist.Items[i]^).count := count;
        Result := True;
        Break;
      end;
    end;
  finally
    FLock.Leave;
  end;
end;

procedure TThreadSpider.AddToMacthList(matchinfo: TMatchInfo);
var
  info: PMatchInfo;
begin
  info := AllocMem(SizeOf(TMatchInfo));
  info.url := matchinfo.url;
  info.Title := matchinfo.Title;
  info.count := matchinfo.count;
  try
    FLock.Enter;
    matching_urllist.Add(info);
  finally
    FLock.Leave;
  end;
end;

procedure TThreadSpider.AddToErrorList(urlinfo: TUrlInfo);
begin
  try
    FLock.Enter;
    error_urllist.Add(urlinfo.url);
  finally
    FLock.Leave;
  end;
end;

{ 添加一个新的URL }
procedure TThreadSpider.AddToCatchList(url: string);
var
  info: ^TUrlInfo;
begin
  info := AllocMem(SizeOf(TUrlInfo));
  info^.url := Trim(url);
  info^.count := 1; //新的,默认为 1
  try
    FLock.Enter;
    catch_urllist.Add(info);
  finally
    FLock.Leave;
  end;
end;

end.

阅读(1646) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~