Does anyone understand web spiders? Help! (100 points)

  • Thread starter: 浮生

浮生

Unregistered / Unconfirmed
GUEST, unregistered user!
I'm building an e-commerce platform whose back end needs a search engine, and that requires a "web spider" (crawler).
I know nothing about how one works, only that it automatically crawls the web, finds pages that match the given
criteria and stores them in a database. I found a "web spider" on a source-code site, but I'm so unfamiliar with network programming that I can't follow it.
Could some expert explain the principle behind it to me?
Thanks
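
In outline, a spider keeps a queue of URLs to visit: it downloads a page, pulls the href="..." links out of the HTML, queues the new links for the next layer, and hands the page content to whatever does the indexing, repeating until it reaches a depth limit. Below is a minimal single-threaded sketch of that loop, assuming a pre-Unicode Delphi on Windows and only the standard WinInet unit; MiniSpider, FetchPage, ExtractLinks and Crawl are illustrative names and are not part of the component posted after it.

program MiniSpider;

{$APPTYPE CONSOLE}

uses
  Windows, SysUtils, Classes, WinInet;

// Download one URL into a string (returns '' on any failure).
function FetchPage(const Url: string): string;
var
  hInet, hPage: HINTERNET;
  Buf: array[0..4095] of Char;
  BytesRead: DWORD;
  Chunk: string;
begin
  Result := '';
  hInet := InternetOpen('MiniSpider', INTERNET_OPEN_TYPE_PRECONFIG, nil, nil, 0);
  if hInet = nil then Exit;
  try
    hPage := InternetOpenUrl(hInet, PChar(Url), nil, 0, INTERNET_FLAG_RELOAD, 0);
    if hPage = nil then Exit;
    try
      while InternetReadFile(hPage, @Buf, SizeOf(Buf), BytesRead) and (BytesRead > 0) do
      begin
        SetString(Chunk, Buf, BytesRead);
        Result := Result + Chunk;
      end;
    finally
      InternetCloseHandle(hPage);
    end;
  finally
    InternetCloseHandle(hInet);
  end;
end;

// Pull every href="..." value out of a page, skipping images (very naive,
// like the component below; a real spider would also resolve relative URLs).
procedure ExtractLinks(const Html: string; Links: TStrings);
var
  S, Link: string;
  P, Q: Integer;
begin
  S := LowerCase(Html);
  P := Pos('href="', S);
  while P > 0 do
  begin
    Delete(S, 1, P + 5);              // drop everything up to the opening quote
    Q := Pos('"', S);
    if Q = 0 then Break;
    Link := Copy(S, 1, Q - 1);
    if (Link <> '') and (Pos('.gif', Link) = 0) and (Pos('.jpg', Link) = 0) then
      Links.Add(Link);
    Delete(S, 1, Q);
    P := Pos('href="', S);
  end;
end;

// Breadth-first crawl: fetch every URL of the current layer, collect the links
// found as the next layer, stop after MaxLayers (the same idea as TSpider below).
procedure Crawl(const Seed: string; MaxLayers: Integer);
var
  Current, Next: TStringList;
  Layer, I: Integer;
  Html: string;
begin
  Current := TStringList.Create;
  Next := TStringList.Create;
  try
    Current.Add(Seed);
    for Layer := 1 to MaxLayers do
    begin
      Next.Clear;
      for I := 0 to Current.Count - 1 do
      begin
        Html := FetchPage(Current[I]);
        if Html = '' then Continue;
        WriteLn('[layer ', Layer, '] ', Current[I], ' (', Length(Html), ' bytes)');
        // A real search engine would index/store the page in its database here.
        ExtractLinks(Html, Next);
      end;
      Current.Assign(Next);
    end;
  finally
    Next.Free;
    Current.Free;
  end;
end;

begin
  Crawl('http://www.example.com/', 2);
end.

The spider.pas unit below implements the same idea as a multi-threaded component: TSpiderThread workers fetch pages with TGhClient, Parse_URL splits links into server/domain/path, and per-layer string lists hold the URLs still to be visited.
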
unit spider;

interface

uses
Windows, SysUtils, Classes, GhSock, hyperstr, dialogs;

const
Max_LAYERS = 15;

type
TURL = record
Proto: string;
Server: string;
Domain: string;
Path: string;
Filename: string;
end;
TOnBeforeConnect = procedure(ThreadNum, Layer: Integer; URL: TUrl) of object;
TOnFailConnect = procedure(ThreadNum, Layer: Integer; URL: TUrl) of object;
TOnAfterConnect = procedure(ThreadNum, Layer: Integer; URL: TUrl) of object;
TOnLinkFound = procedure(ThreadNum, Layer: Integer; URL: TUrl; var searchThisLink: Boolean) of object;
TOnPageLoaded = procedure(ThreadNum, Layer: Integer; URL: TUrl; Header, HTMLBody, TextBody: string) of object;
TOnfoundEmail = procedure(ThreadNum, Layer: integer; EmailAddress: string) of object;
TOnGetNextUrlSeed = procedure(ThreadNum: Integer; var UrlToSearch: string) of object;
TOnError = procedure(ThreadNum: Integer; ErrorMessage: string) of object;

type
TSpider = class;
TSpiderThread = class(TThread)
private
TempUrlList: tstringlist;
LayersURLSToSearch: array[1..max_layers] of tstringlist;
Fabort: Boolean;
ThreadNumber: Integer;
MainSpiderObj: TSpider;
Http1: TGhClient;
procedure FOnTerminate(Sender: TObject);
protected

public
constructor Create(Sender: Tspider; ThreadNum: integer);
destructor Destroy; override;
procedure Execute; override;
procedure Abort;
published

end;

Tspider = class(Tcomponent)
private
{ Private declarations }
FProxyAddr: string;
FMaxLayers: Integer;
FThreadsToUse: Integer;
ThreadList: array[1..100] of TspiderThread;
FOnBeforeConnect: TOnBeforeConnect;
FOnAfterConnect: TOnAfterConnect;
FOnFailConnect: TOnFailConnect;
FOnLinkFound: TOnLinkfound;
FOnPageLoaded: TOnPageLoaded;
FOnEmailfound: TOnfoundEmail;
FOnGetNextUrlSeed: TOnGetNextUrlSeed;
FOnError: TOnError;
function Running: Boolean;
protected
{ Protected declarations }
public
{ Public declarations }
work_CS: TRTLCriticalSection;
procedure Run;
constructor Create(Aowner: Tcomponent); override;
destructor Destroy; override;
procedure Abort;
published
property ProxyAddr: string read FProxyAddr write FProxyAddr;
property MaxLayers: integer read fmaxlayers write fmaxlayers;
property ThreadsToUse: integer read FThreadsToUse write FThreadsToUse;
property OnAfterConnect: TOnAfterConnect read FOnAfterConnect write FOnAfterConnect;
property OnBeforeConnect: TOnBeforeConnect read FOnBeforeConnect write FOnBeforeConnect;
property OnFailConnect: TOnFailConnect read FOnFailConnect write FOnFailConnect;
property OnLinkFound: TOnLinkfound read FOnLinkFound write FOnLinkFound;
property OnPageLoaded: TOnPageLoaded read FOnPageLoaded write FOnPageLoaded;
property OnEmailFound: TOnfoundEmail read FOnEmailFound write FOnEmailFound;
property OnGetNextUrlSeed: TOnGetNextUrlSeed read FOnGetNextUrlSeed write FOnGetNextUrlSeed;
property OnError: TOnError read FOnError write FOnError;
{ Published declarations }
end;


procedure Register;

implementation

function IsIpAddress(IP: string): Boolean;
begin
Result := False;
if (CountF(IP, '.', 1) = 3) and IsNum(IP[1]) and IsNum(Copy(ip, length(ip), 1)) then
Result := True;
end;

function Parse_Url(InString: string): TURL;
var
Tempstr: string;
TempInt: Integer;
TempUrl: TUrl;
begin

// default ret vals.
TempUrl.proto := '';
TempUrl.server := '';
TempUrl.domain := '';
TempUrl.path := '/';
TempUrl.filename := '';

TempStr := Instring;

{If CountF(Tempstr, '.', 1)=1 Then
Begin
If copy(
TempUrl.Filename:=Tempstr;
tempstr:='';
End;}


if pos('://', tempstr) > 0 then
begin
TempUrl.proto := copy(tempstr, 1, pos('://', tempstr) - 1);
tempstr := copy(tempstr, pos('://', tempstr) + 3, 5000);
end;

/// grab host name
if pos('/', tempstr) > 0 then
begin
TempUrl.Domain := copy(tempstr, 1, pos('/', tempstr) - 1);
// grab the balance of the Url line.
Tempstr := copy(tempstr, pos('/', tempstr), 5000); // use the balance of the string to gat path and file
end
else
if CountF(tempstr, '.', 1) > 0 then // theres at least one period
begin
TempUrl.Domain := tempstr; // it must be a domain
Tempstr := ''; // no more to do after domain.

end;

Tempint := CountF(TempUrl.Domain, '.', 1); // find out how many periods there are..

if TempInt > 1 then // is there a second period in the domain, then there is a server..
begin
if not IsIpAddress(Tempurl.Domain) then
begin
// now get the period that separtates server from Domain ..
if (Pos('.COM.', Uppercase(TempUrl.Domain)) > 0) or
(Pos('.EDU.', Uppercase(TempUrl.Domain)) > 0) or
(Pos('.GOV.', Uppercase(TempUrl.Domain)) > 0) or
(Pos('.NET.', Uppercase(TempUrl.Domain)) > 0) then
TempInt := ScanCC(TempUrl.Domain, '.', tempint - 2)
else
TempInt := ScanCC(TempUrl.Domain, '.', tempint - 1);

TempUrl.Server := Copy(TempUrl.Domain, 1, tempint);
TempUrl.Domain := Copy(TempUrl.Domain, tempint + 1, 5000);
end;
end;

if Length(Tempstr) = 0 then
begin
Result := TempUrl; // nothing left to parse, return what we have so far
exit; // no more to do.
end;

// scan for '?'
Tempint := Pos('?', tempstr);

if TempInt > 0 then // there are some url variables..
begin
TempUrl.filename := copy(tempstr, tempint, 1000); // grab the url vars..
TempUrl.path := copy(tempstr, 1, tempint - 1);
Tempstr := TempUrl.path;
end;


Tempint := ScanB(Tempstr, '/', 0); // scan backwards for a slash
if Tempint > 0 then
begin
TempUrl.filename := copy(Tempstr, Tempint + 1, 1000) + TempUrl.filename; // add the vars back in..
TempUrl.path := copy(Tempstr, 1, Tempint);
end;

// this might not be nessarry..
if length(TempUrl.path) = 0 then
TempUrl.path := '/';

Result := TempUrl;



end;

function stripHtml(HTMLString: string): string;
var
workstring, token: string;
var
I: integer;
begin
SetLength(workstring, 100000);
workstring := htmlstring;
I := 1;
while I > 0 do
begin
Token := ParseTag(workstring, '<A', '/A>', I);
if I > 0 then
begin
Delete(workstring, I - (length(token) + 4), length(token) + 4);
I := 1;
end;
end;

I := 1;
while I > 0 do
begin
Token := ParseTag(workstring, '<', '>', I);
if I > 0 then
begin
Delete(workstring, I - (length(token) + 2), length(token) + 2);
I := 1;
end;
end;

// take off the &nbsp
while pos('&nbsp', workstring) > 0 do
begin
Delete(workstring, pos('&nbsp', workstring), 5);
end;

result := workstring;
end;



constructor Tspider.create(Aowner: Tcomponent);
begin
inherited create(Aowner);
MaxLayers := 1;
ThreadsToUse := 1;
InitializeCriticalSection(Work_CS); // initialize my Critical section.
end;

destructor Tspider.Destroy;
begin
inherited destroy;
end;

procedure Tspider.Run;
var
x: Integer;
begin
if running then
begin
Abort;
sleep(2000);
end;
begin
for x := 1 to FThreadsToUse do
begin
ThreadList[x] := TspiderThread.create(self, x);
end;
end;
end;


procedure Tspider.Abort;
var
x: Integer;
begin
for x := 1 to FThreadsToUse do
begin
if Threadlist[x] <> nil then
ThreadList[x].abort;
end;
end;

function Tspider.Running: Boolean;
var
x: Integer;
begin
Result := False;
for x := 1 to FThreadsToUse do
begin
if Threadlist[x] <> nil then
begin
Result := True;
Exit; // at least one worker thread is still active
end;
end;
end;


constructor TSpiderThread.Create(Sender: TSpider; ThreadNum: Integer);
begin
Threadnumber := Threadnum;
Self.MainSpiderObj := Sender;
http1 := nil;
FreeOnTerminate := True; // the thread frees itself once Execute returns
Onterminate := FOnTerminate;
inherited Create(False);
end;

destructor TSpiderThread.Destroy;
begin
inherited;
end;

procedure TSpiderThread.Abort;
begin
Http1.Disconnect;
Fabort := True;
end;

procedure TSpiderThread.FOnTerminate(Sender: TObject);
begin
// clear our slot so TSpider.Running and TSpider.Abort no longer touch this thread
MainSpiderObj.ThreadList[ThreadNumber] := nil;
end;

procedure TSpiderThread.Execute;
var
tempstr, workstr, temp_result: string;
x, MailPos, which_layer, which_layer_item, BytesSoFar, I, ContentLength: integer; Temp_url: Turl; frameset: boolean;
var
PageHeader, Body: string; headerlist: Tstringlist; requestheader: string; AddToSearchLater: boolean;
var
Textonly: string; Url_Info: TURL;

begin
//FreeonTerminate := True;
Fabort := false;
Setlength(workstr, 100000); //??
Http1 := Tghclient.create(nil);
for x := 1 to MAX_LAYERS do
LayersURLSToSearch[x] := tstringlist.create;

TempUrlList := tstringlist.create;
with MainSpiderObj do
try
try

while not Fabort do
begin

for x := 1 to MAX_LAYERS do
LayersURLSToSearch[x].clear;

TempStr := '';
// add Seed address ..
EnterCriticalSection(work_CS);
try
if assigned(FOnGetNextUrlSeed) then
FOnGetNextUrlSeed(Threadnumber, TempStr);
finally
LeaveCriticalSection(Work_CS);
end;

if Length(Tempstr) <= 0 then // done, no more to do..
Exit;


LayersURLSToSearch[1].add(lowercase(TempStr));

for Which_layer := 1 to FMaxLayers do
begin

if Fabort then break;

for which_layer_item := 0 to LayersURLSToSearch[Which_layer].count - 1 do
begin
if Fabort then break;
Url_Info := Parse_URL(LayersURLSToSearch[Which_layer].strings[which_layer_item]);
if Length(Url_Info.domain) <= 0 then
continue;

FrameSet := False;

try
headerlist := Tstringlist.create;
try
headerList.add('Host: ' + Url_info.server + Url_info.domain);
headerList.add('Accept: */*');
headerList.add('Accept-Language: en-us');
headerList.add('User-Agent: Mozilla/4.0 ');
Requestheader := headerlist.text + #13#10;
finally
Headerlist.free;
end;

if length(FProxyAddr) > 0 then
begin
http1.host := FProxyAddr;
tempstr := 'GET Http://' + Url_info.Server + Url_info.Domain + Url_info.Path + Url_info.Filename + ' HTTP/1.0' + #13#10 + requestheader;
end
else
begin
Http1.host := Url_Info.Server + Url_Info.Domain;
tempstr := 'GET ' + Url_Info.Path + Url_Info.FileName + ' HTTP/1.0' + #13#10 + requestheader;
end;

EnterCriticalSection(work_CS);
try
if assigned(FOnBeforeConnect) then
FOnBeforeConnect(Threadnumber, Which_layer, Url_Info);
finally
LeaveCriticalsection(work_cs);
end;

/// showmessage('Looking up'+tempstr);
Http1.Port := 80;
try
Http1.connect;
except
EnterCriticalSection(work_CS);
try
if assigned(FOnFailConnect) then
FOnFailConnect(Threadnumber, Which_layer, Url_Info);
finally
LeaveCriticalsection(work_cs);
end;
Continue; // could not connect: skip this URL and move on
end;

EnterCriticalSection(work_CS);
try
if assigned(FOnAfterConnect) then
FOnAfterConnect(Threadnumber, Which_layer, Url_Info);
finally
LeaveCriticalsection(work_cs);
end;




// send request to web server..
http1.SocketIO.WriteFromString(TempStr);

/// get page back..

// Get header first..
pageHeader := '';
http1.SocketIO.ReadTimeout := 2;
while true do
begin
if Fabort then break;
tempstr := http1.SocketIO.ReadlnToString;
Pageheader := Pageheader + tempstr;
if tempstr = #13#10 then // a bare CRLF marks the end of the HTTP header
Break;
end;

/// check the header to see if this is a TEXT/HTML type, if not, quit now..
if (length(PageHeader) <= 0) or not (Pos('TEXT/HTML', UpperCase(PageHeader)) > 0) then
begin
http1.Disconnect;
Continue;
end;



ContentLength := 0;
tempstr := '';
// look for the length..
if pos('CONTENT-LENGTH', UPPERCASE(Pageheader)) > 0 then
begin
Tempstr := Copy(Pageheader, pos('CONTENT-LENGTH', UPPERCASE(Pageheader)) + 15, 10);
ContentLength := strtointdef(Copy(Tempstr, 1, pos(#10, Tempstr) - 2), 0);
end;

// Now get body ...
BytesSoFar := 0;
Body := '';
TextOnly := '';
Tempstr := '';
while True do
begin
if Fabort then break;
Http1.SocketIO.ReadTimeout := 6;
tempstr := Http1.SocketIO.ReadToString(-1); // read everything in buffer.
Body := Body + tempstr;
BytesSoFar := BytesSoFar + length(tempstr);
// if there is a content length and we have recieved enough, then break.
if (contentlength > 0) and (BytesSoFar >= ContentLength) then
break;
end;


except
on EReadFail do ;
on EReadTimeout do ;
on e: exception do
begin
http1.disconnect;
EnterCriticalSection(work_CS);
try
if assigned(FOnError) then
FOnError(Threadnumber, E.message + LayersURLSToSearch[Which_layer].strings[which_layer_item]);
finally
LeaveCriticalsection(work_cs);
end;
end;
end;

Textonly := stripHtml(Body);
EnterCriticalSection(work_CS);
try
if assigned(FOnPageLoaded) then
FOnPageLoaded(Threadnumber, Which_Layer, Url_Info, Pageheader,
Body, textonly);
finally
LeaveCriticalSection(work_cs);
end;
// now look for emails and links...


workstr := lowercase(Body);
if (Which_layer + 1 <= FMaxLayers) then // only gather links if there is a next layer to fill
begin
// seee if this is a frame page..
if (pos('</frameset>', workstr) > 0) then // frame def file..
Frameset := True;

TempUrlList.clear;

I := 1;
while I > 0 do
begin
if Fabort then break;
tempstr := ParseTag(workstr, 'href="', '"', I);
// put it into our list only if it's not a gif or jpg
if (length(tempstr) > 0) and not ((pos('.gif', Tempstr) > 0) or (pos('.jpg', Tempstr) > 0)) then
begin
TempUrlList.add(tempstr);
end;
end;

if frameset then
begin
I := 1;
while I > 0 do
begin
tempstr := ParseTag(workstr, 'src="', '"', I);
if (length(tempstr) > 0) and not ((pos('.gif', Tempstr) > 0) or (pos('.jpg', Tempstr) > 0)) then
TempUrlList.add(tempstr);
end;
end;

///Now decide whether to keep these to search later..
for X := 0 to TempURLList.count - 1 do
begin
if Fabort then break;
if pos('mailto:', tempURlList.Strings[x]) > 0 then // ignore mail addresses here..
continue;

Temp_Result := '';
if (CountF(tempURlList.Strings[x], '.', 1) = 1) or (copy(tempURlList.Strings[x], 1, 3) = '../') or (copy(tempURlList.Strings[x], 1, 1) = '/') then
begin
if Copy(tempURlList.Strings[x], 1, 1) = '/' then
tempURlList.Strings[x] := Url_info.Server + Url_Info.Domain + tempURlList.Strings[x]
else
tempURlList.Strings[x] := Url_info.Server + Url_Info.Domain + '/' + tempURlList.Strings[x];
end;

Temp_Url := Parse_URL(tempURlList.Strings[x]);

if (Length(Temp_url.domain) > 0) then
begin
AddToSearchLater := True;
EnterCriticalSection(work_CS);
try
if assigned(FOnLinkFound) then
FOnLinkFound(Threadnumber, which_Layer, temp_URL, AddToSearchLater);
finally
LeaveCriticalSection(Work_cs);
end;
if AddToSearchLater then
LayersURLSToSearch[Which_layer + 1].add(temp_Url.Server + Temp_url.Domain + Temp_url.Path + Temp_url.Filename);
end;
end;

Tempstr := '';
for X := 0 to TempURLList.count - 1 do
begin
if Fabort then break;
MailPos := pos('mailto:', tempURlList.Strings[x]); // look only at mail addresses here..
if (MailPos > 0) and (pos('@', tempURlList.Strings[x]) > 0) then
begin
Tempstr := copy(tempURlList.Strings[x], mailpos + 7, 500);
EnterCriticalSection(work_CS);
try
if assigned(FOnEmailfound) then
FOnEmailfound(Threadnumber, which_Layer, tempstr);
finally
LeaveCriticalSection(work_cs);
end;
end;
end;
end;

end; // If (Which_layer + 1 <= FMaxLayers) then

if Fabort then break;
end;
end;
except
on E: exception do
begin
if assigned(FOnError) then
FOnError(Threadnumber, E.message);
end;
end;
finally
TempUrlList.free;
for x := 1 to MAX_LAYERS do
LayersURLSToSearch[x].Free;
Http1.free;
http1 := nil;
end;
end;


procedure Register;
begin
RegisterComponents('Internet', [Tspider]);
end;

initialization


end.
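
To answer the original question about how this component would actually be used: here is a hedged usage sketch based only on the properties and events declared above. It assumes spider.pas (and its GhSock/hyperstr dependencies) compiles and is installed; TForm1, Spider1, SeedList, PagesSeen and the handler names are illustrative, not part of the original post.

// Fragment of a form unit that drives TSpider.
uses
  Classes, Forms, spider;

type
  TForm1 = class(TForm)
    procedure FormCreate(Sender: TObject);
  private
    Spider1: TSpider;
    SeedList, PagesSeen: TStringList;
    procedure SpiderGetNextUrlSeed(ThreadNum: Integer; var UrlToSearch: string);
    procedure SpiderPageLoaded(ThreadNum, Layer: Integer; URL: TURL;
      Header, HTMLBody, TextBody: string);
    procedure SpiderLinkFound(ThreadNum, Layer: Integer; URL: TURL;
      var searchThisLink: Boolean);
  end;

procedure TForm1.FormCreate(Sender: TObject);
begin
  SeedList := TStringList.Create;
  SeedList.Add('http://www.example.com/');
  PagesSeen := TStringList.Create;

  Spider1 := TSpider.Create(Self);
  Spider1.MaxLayers := 3;      // follow links up to 3 layers deep (<= MAX_LAYERS)
  Spider1.ThreadsToUse := 2;   // number of TSpiderThread workers
  Spider1.OnGetNextUrlSeed := SpiderGetNextUrlSeed;
  Spider1.OnPageLoaded := SpiderPageLoaded;
  Spider1.OnLinkFound := SpiderLinkFound;
  Spider1.Run;
end;

// Each worker thread keeps asking for a seed URL; an empty string tells it to stop.
procedure TForm1.SpiderGetNextUrlSeed(ThreadNum: Integer; var UrlToSearch: string);
begin
  if SeedList.Count > 0 then
  begin
    UrlToSearch := SeedList[0];
    SeedList.Delete(0);
  end
  else
    UrlToSearch := '';
end;

// This is where a search-engine back end would index Header/HTMLBody/TextBody.
procedure TForm1.SpiderPageLoaded(ThreadNum, Layer: Integer; URL: TURL;
  Header, HTMLBody, TextBody: string);
begin
  PagesSeen.Add(URL.Server + URL.Domain + URL.Path + URL.Filename);
end;

// Decide whether a discovered link should be queued for the next layer.
procedure TForm1.SpiderLinkFound(ThreadNum, Layer: Integer; URL: TURL;
  var searchThisLink: Boolean);
begin
  searchThisLink := Pos('example.com', URL.Domain) > 0;  // stay on one site
end;

Note that TSpider fires its events from the worker threads and only serialises them with the work_CS critical section, so handlers that must update the UI should hand the data to the main thread (for example with Synchronize or PostMessage) rather than touch VCL controls directly.
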
 
Why can't I install it??
hyperstr.pas cannot be found
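
hyperstr.pas (like GhSock) is a third-party unit, so the package will not install without it. If you cannot find the original, here is a hedged, minimal stand-in covering only the routines spider.pas actually calls. Their behaviour is inferred purely from how the code above uses them, so the real hyperstr may differ; ParseTag in particular is a guess (assumed to search S from position I, return the text between StartTag and EndTag, advance I past the end tag, and set I to 0 when nothing more is found). You still need the real GhSock unit for TGhClient.

unit hyperstr;
// Hedged stand-in for the missing third-party hyperstr.pas. The behaviour of
// each routine is inferred only from how spider.pas calls it; the real unit
// may behave differently, so treat this as a sketch, not the original library.

interface

function CountF(const S, Sub: string; StartPos: Integer): Integer;
function IsNum(const S: string): Boolean;
function ScanCC(const S: string; C: Char; N: Integer): Integer;
function ScanB(const S: string; C: Char; Unused: Integer): Integer;
function ParseTag(const S, StartTag, EndTag: string; var I: Integer): string;

implementation

// Number of occurrences of Sub in S, counting from StartPos.
function CountF(const S, Sub: string; StartPos: Integer): Integer;
var Rest: string; P: Integer;
begin
  Result := 0;
  Rest := Copy(S, StartPos, MaxInt);
  P := Pos(Sub, Rest);
  while P > 0 do
  begin
    Inc(Result);
    Delete(Rest, 1, P + Length(Sub) - 1);
    P := Pos(Sub, Rest);
  end;
end;

// True when S consists only of decimal digits (spider.pas passes 1-char values).
function IsNum(const S: string): Boolean;
var I: Integer;
begin
  Result := S <> '';
  for I := 1 to Length(S) do
    if not (S[I] in ['0'..'9']) then Result := False;
end;

// Position of the N-th occurrence of character C in S, or 0.
function ScanCC(const S: string; C: Char; N: Integer): Integer;
var I, Found: Integer;
begin
  Result := 0;
  Found := 0;
  for I := 1 to Length(S) do
    if S[I] = C then
    begin
      Inc(Found);
      if Found = N then begin Result := I; Exit; end;
    end;
end;

// Position of the last occurrence of C in S ("scan backwards"), or 0.
// The third argument is ignored; spider.pas always passes 0.
function ScanB(const S: string; C: Char; Unused: Integer): Integer;
var I: Integer;
begin
  Result := 0;
  for I := Length(S) downto 1 do
    if S[I] = C then begin Result := I; Exit; end;
end;

// Assumed contract: search S from position I for StartTag ... EndTag, return
// the text between them and advance I to just past EndTag; on failure return
// '' and set I to 0. This matches how the loops in spider.pas use it.
function ParseTag(const S, StartTag, EndTag: string; var I: Integer): string;
var Rest: string; PStart, PEnd: Integer;
begin
  Result := '';
  if (I <= 0) or (I > Length(S)) then begin I := 0; Exit; end;
  Rest := Copy(S, I, MaxInt);
  PStart := Pos(StartTag, Rest);
  if PStart = 0 then begin I := 0; Exit; end;
  PEnd := Pos(EndTag, Copy(Rest, PStart + Length(StartTag), MaxInt));
  if PEnd = 0 then begin I := 0; Exit; end;
  Result := Copy(Rest, PStart + Length(StartTag), PEnd - 1);
  I := I + PStart + Length(StartTag) + PEnd + Length(EndTag) - 2;
end;

end.
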
 
I don't know either
 
Using httpscan would be better.
 
Answers from several people were accepted.
 
