多线程内容匹配抓取-davidxueer-ChinaUnix博客

说明：在线程池控制本线程时，同时使用下面3个全局变量
catch_urllist: TList; 待抓取的URL结构列表
matching_urllist: TList; 符合内容的URL结构列表
error_urllist: TStrings; 不符合内容的URL

*******************************************************************************
unit threadspider;
interface

uses
Classes, Windows, Messages, IdHTTP, ComCtrls, SysUtils, PerlRegEx, SyncObjs;

const
WM_THREADSTART = WM_USER + 100; { 线程启动 }
WM_THREADSTOP = WM_USER + 101; { 一次抓取完毕 }
WM_THREADGETSTOP = WM_USER + 102; { 线程结束 }

type
TUrlInfo = record
url: string[100]; //网址
count: Integer; //累计次数
end;

PMatchInfo = ^TMatchInfo;
TMatchInfo = record
    url: array [0..100] of Char;    //网址
    Title: array [0..100] of Char; //标题
    count: Integer;                 //累计次数
end;

TThreadSpider = class(TThread)
private
    FHandle: THandle;
    Fhttp: TIdHTTP;
    FTimeOut: Integer;
    FLock: TCriticalSection; //数据共享保护
    procedure SetName;
    function GetHtml(url: string): string;
    procedure LinksFromHtml(html: string);
    function GetPageTitle(html: string): string;
    function ExistFilter(html: string): Boolean;
    function ExistInclude(html: string): Boolean;
    { ********************* }
    function GetUrlInfo(var UrlInfo: TUrlInfo): Boolean;
    function ExistMacthList(url: string): Boolean;
    function ExistErrorList(url: string): Boolean;
    function ExistCatchList(url: string): Boolean;
    procedure AddToMacthList(matchinfo: TMatchInfo);
    procedure AddToErrorList(urlinfo: TUrlInfo);
    procedure AddToCatchList(url: string);
    { ********************* }
protected
    procedure Execute; override;
public
    constructor Create(Handle: THandle; TimeOut: Integer);
    destructor Destroy; override;
end;

implementation

uses variable;
type
TThreadNameInfo = record
    FType: LongWord;     // must be 0x1000
    FName: PChar;        // pointer to name (in user address space)
    FThreadID: LongWord; // thread ID (-1 indicates caller thread)
    FFlags: LongWord;    // reserved for future use, must be zero
end;

{ TThreadSpider }

procedure TThreadSpider.SetName;
var
ThreadNameInfo: TThreadNameInfo;
begin
ThreadNameInfo.FType := $1000;
ThreadNameInfo.FName := 'ThreadSpider';
ThreadNameInfo.FThreadID := $FFFFFFFF;
ThreadNameInfo.FFlags := 0;
try
RaiseException( $406D1388, 0, sizeof(ThreadNameInfo) div sizeof(LongWord), @ThreadNameInfo );
except
end;
end;

procedure TThreadSpider.Execute;
var
html: string;
urlinfo: TUrlInfo;
matchinfo: TMatchInfo;
begin
SetName;
while not Terminated do
begin
    //已没有新的URL可采集，则退出，销毁线程
    if not GetUrlInfo(urlinfo) then Exit;
    html := GetHtml(urlinfo.url);
    //用户停止线程
    if Terminated then Exit;
    if html = '' then
    begin
      //说明页面无法访问
      AddToErrorList(urlinfo);
    end
    //判断过滤条件
    else if (not ExistFilter(html)) and ExistInclude(html) then
    begin
      //说明此URL符合要求，加至符合要求的LIST中
      FillChar(matchinfo, SizeOf(Tmatchinfo), #0);
      StrPCopy(matchinfo.url, urlinfo.url);
      StrPCopy(matchinfo.Title, GetPageTitle(html));
      matchinfo.count := urlinfo.count;
      AddToMacthList(matchinfo);
      //取该连接下面的URL
      LinksFromHtml(html);
    end else
    begin
      //说明页面中不存在相关的内容，废弃
      AddToErrorList(urlinfo);
    end;
    //一次抓取完毕。
    SendMessage(FHandle, WM_THREADGETSTOP, 0, 0);
end;
{ Place thread code here }
end;

constructor TThreadSpider.Create(Handle: THandle; TimeOut: Integer);
begin
inherited Create(True);
Fhttp := TIdHTTP.Create(nil);
FLock := TCriticalSection.Create;
Fhttp.ReadTimeout := TimeOut * 1000; //抓取超时
FHandle := Handle;
FTimeOut := TimeOut;
FreeOnTerminate := True;
SendMessage(FHandle, WM_THREADSTART, 0, 0);
end;

destructor TThreadSpider.Destroy;
begin
Fhttp.Disconnect;
Fhttp.Free;
FLock.Free;
SendMessage(FHandle, WM_THREADSTOP, Integer(Self), 0);
inherited Destroy;
end;

function TThreadSpider.GetHtml(url: string): string;
var
html: string;
begin
try
html := Fhttp.Get(url);
except
html := '';
end;
Result := html;
end;

{ 取HTML页面内的URL地址 }
procedure TThreadSpider.LinksFromHtml(html: string);
var
MyRegex: TPerlRegEx;
List: TStrings;
url: string;
I: Integer;
begin
MyRegex := TPerlRegEx.Create(nil);
List := TStringList.Create;
try
    MyRegex.RegEx := 'http://(?P[a-z-A-Z0-9.]+)';
    MyRegex.Subject := Html;
    if MyRegex.Match then
    begin
      repeat
        url := 'http://' + MyRegex.SubExpressions[1] + '/';
        if List.IndexOf(url)<0 then List.Add(url);
      until not MyRegex.MatchAgain;
    end;
    //开始判断URL是否存在和所在区域（已过滤，已匹配）
    for i := 0 to List.Count -1 do
    begin
      url := List.Strings[i];
      if ExistMacthList(url) then Continue; // 已匹配
      if ExistErrorList(url) then Continue; // 已过滤
      { 如果不存在，就添加到待采集的LIST中 }
      if not ExistCatchList(url) then AddToCatchList(url);
    end;
finally
    List.Free;
    MyRegex.Free;
end;
end;

{ 取网页TITLE }
function TThreadSpider.GetPageTitle(html: string): string;
var
MyRegex: TPerlRegEx;
begin
MyRegex := TPerlRegEx.Create(nil);
try
    MyRegex.RegEx := '[s ]*(.*?)[s ]*';
    MyRegex.Subject := Html;
    if MyRegex.Match then
    begin
      Result := MyRegex.SubExpressions[1];
    end;
finally
    MyRegex.Free;
end;
end;

//取欲获取HTML的URL结构
function TThreadSpider.GetUrlInfo(var UrlInfo: TUrlInfo): Boolean;
begin
Result := False;
try
    FLock.Enter;
    if catch_urllist.Count > 0 then
    begin
      //取最前面的
      urlinfo := TUrlInfo(catch_urllist.Items[0]^);
      //取得后删除，
      FreeMem(catch_urllist.Items[0]);
      catch_urllist.Delete(0);
      Result := True;
    end;
finally
    FLock.Leave;
end;
end;

{ 过滤不允许的关键字 }
function TThreadSpider.ExistFilter(html: string): Boolean;
var
i: Integer;
begin
Result := False;
try
    FLock.Enter;
    for i := 0 to Word_Filter.Count -1 do
    begin
      if Pos(LowerCase(Word_Filter.Strings[i]), LowerCase(html)) > 0 then {Word_Filter 外部TStringList}
        Result := True;
        Break;
    end;
finally
    FLock.Leave;
end;
end;

{ 查询包含的关键字 }
function TThreadSpider.ExistInclude(html: string): Boolean;
var
i: Integer;
begin
Result := False;
try
    FLock.Enter;
    for i := 0 to Word_Include.Count -1 do
    begin
      if Pos(LowerCase(Word_Include.Strings[i]), LowerCase(html)) > 0 then {Word_Include 外部TStringList}
        Result := True;
        Break;
    end;
finally
    FLock.Leave;
end;
end;

{ 判断是否存在于已符合要求的URL LIST中 }
function TThreadSpider.ExistMacthList(url: string): Boolean;
var
i, count: Integer;
MatchInfo: TMatchInfo;
begin
Result := False;
try
    FLock.Enter;
    for i := 0 to matching_urllist.Count -1 do
    begin
      MatchInfo := TMatchInfo(matching_urllist.Items[i]^);
      if LowerCase(Trim(MatchInfo.url)) = LowerCase(Trim(url)) then
      begin
        count := MatchInfo.count + 1;
        TMatchInfo(matching_urllist.Items[i]^).count := count;
        Result := True;
        Break;
      end;
    end;
finally
    FLock.Leave;
end;
end;

{ 判断是否存在于不符合要求的URL LIST中 }
function TThreadSpider.ExistErrorList(url: string): Boolean;
var
i: Integer;
begin
Result := False;
try
    FLock.Enter;
    for i := 0 to error_urllist.Count -1 do
    begin
      if LowerCase(error_urllist.Strings[i]) = LowerCase(url) then
      begin
        Result := True;
        Break;
      end;
    end;
finally
    FLock.Leave;
end;
end;

{ 判断是否存在于待采集的URL LIST中 }
function TThreadSpider.ExistCatchList(url: string): Boolean;
var
i, count: Integer;
urlinfo: TUrlInfo;
begin
Result := False;
try
    FLock.Enter;
    for i := 0 to catch_urllist.Count -1 do
    begin
      urlinfo := TUrlInfo(catch_urllist.Items[i]^);
      if LowerCase(urlinfo.url) = LowerCase(url) then
      begin
        count := urlinfo.count + 1;
        TUrlInfo(catch_urllist.Items[i]^).count := count;
        Result := True;
        Break;
      end;
    end;
finally
    FLock.Leave;
end;
end;

procedure TThreadSpider.AddToMacthList(matchinfo: TMatchInfo);
var
info: PMatchInfo;
begin
info := AllocMem(SizeOf(TMatchInfo));
info.url := matchinfo.url;
info.Title := matchinfo.Title;
info.count := matchinfo.count;
try
    FLock.Enter;
    matching_urllist.Add(info);
finally
    FLock.Leave;
end;
end;

procedure TThreadSpider.AddToErrorList(urlinfo: TUrlInfo);
begin
try
    FLock.Enter;
    error_urllist.Add(urlinfo.url);
finally
    FLock.Leave;
end;
end;

{ 添加一个新的URL }
procedure TThreadSpider.AddToCatchList(url: string);
var
info: ^TUrlInfo;
begin
info := AllocMem(SizeOf(TUrlInfo));
info^.url := Trim(url);
info^.count := 1; //新的，默认为 1
try
    FLock.Enter;
    catch_urllist.Add(info);
finally
    FLock.Leave;
end;
end;

end.