How to quickly fetch the hyperlinks in a web page, and then the hyperlinks of those links, like a recursion over links (100 points)

  • Thread starter: Helix (unregistered guest)
I can get all the links on the current page, but fetching the next layer of links quickly is where it gets slow.
Given a starting page, the goal is to take it as the root and expand all of its links downward as a tree.
How should I go about this?
 
A web spider.
 
Then may I ask, is there any source code?
 
There is. Go look for it yourself. Here is one:
unit spider;

interface

uses
Windows, SysUtils, Classes, GhSock,HYPERSTR,dialogs;

const
Max_LAYERS = 15;

type
TURL = record
Proto: string;
Server: string;
Domain: string;
Path: string;
Filename: string;
end;
TOnBeforeConnect = procedure(ThreadNum, Layer: Integer; URL: TUrl) of object;
TOnFailConnect = procedure(ThreadNum, Layer: Integer; URL: TUrl) of object;
TOnAfterConnect = procedure(ThreadNum, Layer: Integer; URL: TUrl) of object;
TOnLinkFound = procedure(ThreadNum, Layer: Integer; URL: TUrl; var searchThisLink: Boolean) of object;
TOnPageLoaded = procedure(ThreadNum, Layer: Integer; URL: TUrl; Header, HTMLBody, TextBody: string) of object;
TOnfoundEmail = procedure(ThreadNum, Layer: integer; EmailAddress: string) of object;
TOnGetNextUrlSeed = procedure(ThreadNum: Integer; var UrlToSearch: string) of object;
TOnError = procedure(ThreadNum: Integer; ErrorMessage: string) of object;

type
TSpider = class;
TSpiderThread = class(TThread)
private
TempUrlList: tstringlist;
LayersURLSToSearch: array[1..max_layers] of tstringlist;
Fabort: Boolean;
ThreadNumber: Integer;
MainSpiderObj: TSpider;
Http1: TGhClient;
procedure FOnTerminate(Sender: TObject);
protected

public
constructor Create(Sender: Tspider; ThreadNum: integer);
destructor Destroy; override;
procedure Execute; override;
procedure Abort;
published

end;

Tspider = class(Tcomponent)
private
{ Private declarations }
FProxyAddr: string;
FMaxLayers: Integer;
FThreadsToUse: Integer;
ThreadList: array[1..100] of TspiderThread;
FOnBeforeConnect: TOnBeforeConnect;
FOnAfterConnect: TOnAfterConnect;
FOnFailConnect: TOnFailConnect;
FOnLinkFound: TOnLinkfound;
FOnPageLoaded: TOnPageLoaded;
FOnEmailfound: TOnfoundEmail;
FOnGetNextUrlSeed: TOnGetNextUrlSeed;
FOnError: TOnError;
function Running: Boolean;
protected
{ Protected declarations }
public
{ Public declarations }
work_CS: TRTLCriticalSection;
procedure Run;
constructor Create(Aowner: Tcomponent); override;
destructor Destroy; override;
procedure Abort;
published
property ProxyAddr: string read FProxyAddr write FProxyAddr;
property MaxLayers: integer read fmaxlayers write fmaxlayers;
property ThreadsToUse: integer read FThreadsToUse write FThreadsToUse;
property OnAfterConnect: TOnAfterConnect read FOnAfterConnect write FOnAfterConnect;
property OnBeforeConnect: TOnBeforeConnect read FOnBeforeConnect write FOnBeforeConnect;
property OnFailConnect: TOnFailConnect read FOnFailConnect write FOnFailConnect;
property OnLinkFound: TOnLinkfound read FOnLinkFound write FOnLinkFound;
property OnPageLoaded: TOnPageLoaded read FOnPageLoaded write FOnPageLoaded;
property OnEmailFound: TOnfoundEmail read FOnEmailFound write FOnEmailFound;
property OnGetNextUrlSeed: TOnGetNextUrlSeed read FOnGetNextUrlSeed write FOnGetNextUrlSeed;
property OnError: TOnError read FOnError write FOnError;
{ Published declarations }
end;


procedure Register;

implementation

function IsIpAddress(IP: string): Boolean;
begin
Result := False;
if (CountF(IP, '.', 1) = 3) and IsNum(IP[1]) and IsNum(Copy(ip, length(ip), 1)) then
Result := True;
end;

function Parse_Url(InString: string): TURL;
var
Tempstr: string;
TempInt: Integer;
TempUrl: TUrl;
begin

// default ret vals.
TempUrl.proto := '';
TempUrl.server := '';
TempUrl.domain := '';
TempUrl.path := '/';
TempUrl.filename := '';

TempStr := Instring;

if pos('://', tempstr) > 0 then
begin
TempUrl.proto := copy(tempstr, 1, pos('://', tempstr) - 1);
tempstr := copy(tempstr, pos('://', tempstr) + 3, 5000);
end;

/// grab host name
if pos('/', tempstr) > 0 then
begin
TempUrl.Domain := copy(tempstr, 1, pos('/', tempstr) - 1);
// grab the balance of the Url line.
Tempstr := copy(tempstr, pos('/', tempstr), 5000); // use the rest of the string to get path and file
end
else
if CountF(tempstr, '.', 1) > 0 then // there's at least one period
begin
TempUrl.Domain := tempstr; // it must be a domain
Tempstr := ''; // no more to do after the domain.
end;

Tempint := CountF(TempUrl.Domain, '.', 1); // find out how many periods there are..

if TempInt > 1 then // is there a second period in the domain, then there is a server..
begin
if not IsIpAddress(Tempurl.Domain) then
begin
// now get the period that separates server from Domain ..
if (Pos('.COM.', Uppercase(TempUrl.Domain)) > 0) or
(Pos('.EDU.', Uppercase(TempUrl.Domain)) > 0) or
(Pos('.GOV.', Uppercase(TempUrl.Domain)) > 0) or
(Pos('.NET.', Uppercase(TempUrl.Domain)) > 0) then
TempInt := ScanCC(TempUrl.Domain, '.', tempint - 2)
else
TempInt := ScanCC(TempUrl.Domain, '.', tempint - 1);

TempUrl.Server := Copy(TempUrl.Domain, 1, tempint);
TempUrl.Domain := Copy(TempUrl.Domain, tempint + 1, 5000);
end;
end;

if Length(Tempstr) = 0 then
begin
Result := TempUrl; // nothing after the domain, return what we have
exit; // no more to do.
end;

// scan for '?' (start of the query string)
Tempint := pos('?', tempstr);

if TempInt > 0 then // there are some url variables..
begin
TempUrl.filename := copy(tempstr, tempint, 1000); // grab the url vars..
TempUrl.path := copy(tempstr, 1, tempint - 1);
Tempstr := TempUrl.path;
end;


Tempint := ScanB(Tempstr, '/', 0); // scan backwards for a slash
if Tempint > 0 then
begin
TempUrl.filename := copy(Tempstr, Tempint + 1, 1000) + TempUrl.filename; // add the vars back in..
TempUrl.path := copy(Tempstr, 1, Tempint);
end;

// default to the root path if none was found
if length(TempUrl.path) = 0 then
TempUrl.path := '/';

Result := TempUrl;
end;

function stripHtml(HTMLString: string): string;
var
workstring, token: string;
I: integer;
begin
workstring := HTMLString; // work on a copy
I := 1;
while I > 0 do
begin
Token := ParseTag(workstring, '<A', '/A>', I);
if I > 0 then
begin
Delete(workstring, I - (length(token) + 4), length(token) + 4);
I := 1;
end;
end;

I := 1;
while I > 0 do
begin
Token := ParseTag(workstring, '<', '>', I);
if I > 0 then
begin
Delete(workstring, I - (length(token) + 2), length(token) + 2);
I := 1;
end;
end;

// take off the &nbsp
while pos('&nbsp', workstring) > 0 do
begin
Delete(workstring, pos('&nbsp', workstring), 5);
end;

result := workstring;
end;



constructor Tspider.create(Aowner: Tcomponent);
begin
inherited create(Aowner);
MaxLayers := 1;
ThreadsToUse := 1;
InitializeCriticalSection(Work_CS); // initialize my Critical section.
end;

destructor Tspider.Destroy;
begin
DeleteCriticalSection(Work_CS); // release the critical section we created
inherited destroy;
end;

procedure Tspider.Run;
var
x: Integer;
begin
if running then
begin
Abort;
sleep(2000); // give the running threads a moment to shut down
end;
for x := 1 to FThreadsToUse do
begin
ThreadList[x] := TspiderThread.create(self, x);
end;
end;


procedure Tspider.Abort;
var
x: Integer;
begin
for x := 1 to FThreadsToUse do
begin
if Threadlist[x] <> nil then
ThreadList[x].abort;
end;
end;

function Tspider.Running: Boolean;
var
x: Integer;
begin
Result := False;
for x := 1 to FThreadsToUse do
begin
if Threadlist[x] <> nil then
begin
Result := True; // at least one thread slot is still occupied
Exit;
end;
end;
end;


constructor TSpiderThread.Create(Sender: TSpider; ThreadNum: Integer);
begin
Threadnumber := Threadnum;
Self.MainSpiderObj := Sender;
http1 := nil;
Onterminate := FOnTerminate;
FreeOnTerminate := True; // the thread frees itself once Execute returns
inherited Create(False);
end;

destructor TSpiderThread.Destroy;
begin
inherited;
end;

procedure TSpiderThread.Abort;
begin
Fabort := True;
if Http1 <> nil then
Http1.Disconnect; // drop any connection in progress
end;

procedure TSpiderThread.FOnTerminate(Sender: TObject);
begin
// clear our slot so TSpider.Running reports correctly
MainSpiderObj.ThreadList[ThreadNumber] := nil;
end;

procedure TSpiderThread.Execute;
var
tempstr, workstr, temp_result: string;
x, MailPos, which_layer, which_layer_item, BytesSoFar, I, ContentLength: integer;
Temp_url: Turl;
frameset: boolean;
PageHeader, Body, requestheader, Textonly: string;
headerlist: Tstringlist;
AddToSearchLater: boolean;
Url_Info: TURL;

begin
Fabort := false;
Http1 := Tghclient.create(nil);
for x := 1 to MAX_LAYERS do
LayersURLSToSearch[x] := tstringlist.create;

TempUrlList := tstringlist.create;
with MainSpiderObj do
try
try

while not Fabort do
begin

for x := 1 to MAX_LAYERS do
LayersURLSToSearch[x].clear;

TempStr := '';
// add Seed address ..
EnterCriticalSection(work_CS);
try
if assigned(FOnGetNextUrlSeed) then
FOnGetNextUrlSeed(Threadnumber, TempStr);
finally
LeaveCriticalSection(Work_CS);
end;

if Length(Tempstr) <= 0 then // done, no more to do..
Exit;


LayersURLSToSearch[1].add(lowercase(TempStr));

for Which_layer := 1 to FMaxLayers do
begin

if Fabort then break;

for which_layer_item := 0 to LayersURLSToSearch[Which_layer].count - 1 do
begin
if Fabort then break;
Url_Info := Parse_URL(LayersURLSToSearch[Which_layer].strings[which_layer_item]);
if Length(Url_Info.domain) <= 0 then
continue;

FrameSet := False;

try
headerlist := Tstringlist.create;
try
headerList.add('Host: ' + Url_info.server + Url_info.domain);
headerList.add('Accept: */*');
headerList.add('Accept-Language: en-us');
headerList.add('User-Agent: Mozilla/4.0 ');
Requestheader := headerlist.text + #13#10;
finally
Headerlist.free;
end;

if length(FProxyAddr) > 0 then
begin
http1.host := FProxyAddr;
tempstr := 'GET Http://' + Url_info.Server + Url_info.Domain + Url_info.Path + Url_info.Filename + ' HTTP/1.0' + #13#10 + requestheader;
end
else
begin
Http1.host := Url_Info.Server + Url_Info.Domain;
tempstr := 'GET ' + Url_Info.Path + Url_Info.FileName + ' HTTP/1.0' + #13#10 + requestheader;
end;

EnterCriticalSection(work_CS);
try
if assigned(FOnBeforeConnect) then
FOnBeforeConnect(Threadnumber, Which_layer, Url_Info);
finally
LeaveCriticalsection(work_cs);
end;

/// showmessage('Looking up'+tempstr);
Http1.Port := 80;
try
Http1.connect;
except
EnterCriticalSection(work_CS);
try
if assigned(FOnFailConnect) then
FOnFailConnect(Threadnumber, Which_layer, Url_Info);
finally
LeaveCriticalsection(work_cs);
end;
Continue; // the failure was reported above; move on to the next URL
end;

EnterCriticalSection(work_CS);
try
if assigned(FOnAfterConnect) then
FOnAfterConnect(Threadnumber, Which_layer, Url_Info);
finally
LeaveCriticalsection(work_cs);
end;




// send request to web server..
http1.SocketIO.WriteFromString(TempStr);

/// get page back..

// Get header first..
pageHeader := '';
http1.SocketIO.ReadTimeout := 2;
while true do
begin
if Fabort then break;
tempstr := http1.SocketIO.ReadlnToString;
Pageheader := Pageheader + tempstr;
if tempstr = #13#10 then // a blank line terminates the HTTP header
Break;
end;

/// check the header to see if this is a TEXT/HTML type, if not, quit now..
if (length(PageHeader) <= 0) or not (Pos('TEXT/HTML', UpperCase(PageHeader)) > 0) then
begin
http1.Disconnect;
Continue;
end;



ContentLength := 0;
tempstr := '';
// look for the length..
if pos('CONTENT-LENGTH', UPPERCASE(Pageheader)) > 0 then
begin
Tempstr := Copy(Pageheader, pos('CONTENT-LENGTH', UPPERCASE(Pageheader)) + 15, 10);
ContentLength := strtointdef(Copy(Tempstr, 1, pos(#10, Tempstr) - 2), 0);
end;

// Now get body ...
BytesSoFar := 0;
Body := '';
TextOnly := '';
Tempstr := '';
while True do
begin
if Fabort then break;
Http1.SocketIO.ReadTimeout := 6;
tempstr := Http1.SocketIO.ReadToString(-1); // read everything in buffer.
Body := Body + tempstr;
BytesSoFar := BytesSoFar + length(tempstr);
// if there is a content length and we have received enough, then break.
if (contentlength > 0) and (BytesSoFar >= ContentLength) then
break;
end;


except
on EReadFail do ;
on EReadTimeout do ;
on e: exception do
begin
http1.disconnect;
EnterCriticalSection(work_CS);
try
if assigned(FOnError) then
FOnError(Threadnumber, E.message + LayersURLSToSearch[Which_layer].strings[which_layer_item]);
finally
LeaveCriticalsection(work_cs);
end;
end;
end;

Textonly := stripHtml(Body);
EnterCriticalSection(work_CS);
try
if assigned(FOnPageLoaded) then
FOnPageLoaded(Threadnumber, Which_Layer, Url_Info, Pageheader,
Body, textonly);
finally
LeaveCriticalSection(work_cs);
end;
// now look for emails and links...


workstr := lowercase(Body);
if (Which_layer <= FMaxLayers) then
begin
// see if this is a frame page..
if (pos('</frameset>', workstr) > 0) then // frame definition file..
Frameset := True;

TempUrlList.clear;

I := 1;
while I > 0 do
begin
if Fabort then break;
tempstr := ParseTag(workstr, 'href="', '"', I);
// put it into our list only if it's not a gif or jpg
if (length(tempstr) > 0) and not ((pos('.gif', Tempstr) > 0) or (pos('.jpg', Tempstr) > 0)) then
begin
TempUrlList.add(tempstr);
end;
end;

if frameset then
begin
I := 1;
while I > 0 do
begin
tempstr := ParseTag(workstr, 'src="', '"', I);
if (length(tempstr) > 0) and not ((pos('.gif', Tempstr) > 0) or (pos('.jpg', Tempstr) > 0)) then
TempUrlList.add(tempstr);
end;
end;

///Now decide whether to keep these to search later..
for X := 0 to TempURLList.count - 1 do
begin
if Fabort then break;
if pos('mailto:', tempURlList.Strings[x]) > 0 then // ignore mail addresses here..
continue;

Temp_Result := '';
if (CountF(tempURlList.Strings[x], '.', 1) = 1) or (copy(tempURlList.Strings[x], 1, 3) = '../') or (copy(tempURlList.Strings[x], 1, 1) = '/') then
begin
if Copy(tempURlList.Strings[x], 1, 1) = '/' then
tempURlList.Strings[x] := Url_info.Server + Url_Info.Domain + tempURlList.Strings[x]
else
tempURlList.Strings[x] := Url_info.Server + Url_Info.Domain + '/' + tempURlList.Strings[x];
end;

Temp_Url := Parse_URL(tempURlList.Strings[x]);

if (Length(Temp_url.domain) > 0) then
begin
AddToSearchLater := True;
EnterCriticalSection(work_CS);
try
if assigned(FOnLinkFound) then
FOnLinkFound(Threadnumber, which_Layer, temp_URL, AddToSearchLater);
finally
LeaveCriticalSection(Work_cs);
end;
if AddToSearchLater and (Which_layer < MAX_LAYERS) then // stay inside the layer array
LayersURLSToSearch[Which_layer + 1].add(temp_Url.Server + Temp_url.Domain + Temp_url.Path + Temp_url.Filename);
end;
end;

Tempstr := '';
for X := 0 to TempURLList.count - 1 do
begin
if Fabort then break;
MailPos := pos('mailto:', tempURlList.Strings[x]); // look only at mail addresses here..
if (MailPos > 0) and (pos('@', tempURlList.Strings[x]) > 0) then
begin
Tempstr := copy(tempURlList.Strings[x], mailpos + 7, 500);
EnterCriticalSection(work_CS);
try
if assigned(FOnEmailfound) then
FOnEmailfound(Threadnumber, which_Layer, tempstr);
finally
LeaveCriticalSection(work_cs);
end;
end;
end;
end;

end; // if (Which_layer <= FMaxLayers) then

if Fabort then break;
end;
end;
except
on E: exception do
begin
if assigned(FOnError) then
FOnError(Threadnumber, E.message);
end;
end;
finally
TempUrlList.free;
for x := 1 to MAX_LAYERS do
LayersURLSToSearch[x].Free;
Http1.free;
http1 := nil;
end;
end;


procedure Register;
begin
RegisterComponents('Internet', [Tspider]);
end;

end.
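
For anyone wiring the component up: here is a minimal usage sketch of mine, assuming the unit compiles against your GhSock/HYPERSTR units. The form, memo, FSeed field, and handler names are placeholders, not part of the unit, and note the events fire from worker threads (serialized by work_CS), so real UI code should go through Synchronize:

uses spider;

// FSeed: string; a field on TForm1 holding the one seed URL
procedure TForm1.SpiderGetSeed(ThreadNum: Integer; var UrlToSearch: string);
begin
UrlToSearch := FSeed; // hand out the start URL once...
FSeed := ''; // ...then return '' so the thread finishes
end;

procedure TForm1.SpiderLinkFound(ThreadNum, Layer: Integer; URL: TUrl; var searchThisLink: Boolean);
begin
Memo1.Lines.Add(URL.Server + URL.Domain + URL.Path + URL.Filename);
searchThisLink := True; // let the spider queue it for the next layer
end;

procedure TForm1.Button1Click(Sender: TObject);
var
Spider1: TSpider;
begin
FSeed := 'http://www.example.com/';
Spider1 := TSpider.Create(Self);
Spider1.MaxLayers := 3; // must stay within Max_LAYERS = 15
Spider1.ThreadsToUse := 2;
Spider1.OnGetNextUrlSeed := SpiderGetSeed;
Spider1.OnLinkFound := SpiderLinkFound;
Spider1.Run; // threads free themselves when the seed runs out
end;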

 
Are you trying to build an SE (search engine)?
 
Right, so how would one build an SE?
 
What is SE?
 
There is a component, httpscan, that can do this. It is very powerful.
 
Why go to all that trouble!
Just use httpscan and you're done!!!!!!!
 
Where can I download the httpscan component?
 
httpscan can be downloaded from playicq;
torry.net has it too.
 
Use recursion.
Hyperlinks are marked by the keyword "<a href=". Find the hyperlinks in the page source you have fetched, then fetch each linked page and read it the same way, taking care to limit how many layers deep you read. A single procedure or function should be enough to do the whole job. If you need source code, I can write it for you.
 
参宿四 above, could you write that code for me? Heh, I'm a bit of a newbie. Please add comments as you write so I can follow it, thanks! r.s.violet@163.com
Also, could the links be extracted in this form:
XXXXX (link text) http://XXX.XXX.XXX
 
You need an integer global variable, MaxLayer, to limit the recursion depth.
Call it like this: GetLink('http://nstudio.myrice.com', 1);
Note that if an error such as a 404 occurs, the Get will raise an exception and abort the whole call unless you guard it.
Also, I'm using Delphi 6, so the code is a bit bloated. As I recall, D7 has a PosEx function that can search for a substring from a given start position; with it, a big chunk of the code could be slimmed down, such as the part that "destroys" already-scanned <A> tags (a sketch of that follows the routine).
If it is still too slow, you can run several threads.

procedure TForm1.GetLink(url: string; layer: integer);
var
source: string;
link: string;
linkText: string;
linkDir: string;
symEnd: char;
i, j: integer;
begin
try
source := IdHTTP1.Get(url); // fetch the page source
except
Exit; // a page that fails to load (e.g. 404) is simply skipped
end;
while Pos('<A', uppercase(source)) > 0 do
begin
i := Pos('<A', uppercase(source)); // position of the <A> tag
j := Pos('</A>', uppercase(source));
// overwrite any stale </A> that appears before the current <A>
while (j > 0) and (j < i) do
begin
source[j] := 'x';
j := Pos('</A>', uppercase(source));
end;
if j = 0 then Break; // no matching </A> left

source[i] := 'x'; // destroy the current <A> tag so it is not found again
// skip anchors shown verbatim inside <pre> blocks
if (i > Pos('<PRE>', uppercase(source))) and (i < Pos('</PRE>', uppercase(source))) then
continue;
// overwrite any HREF= that belongs to an earlier, already-handled tag
while (Pos('HREF=', uppercase(source)) > 0) and (Pos('HREF=', uppercase(source)) < i) do
source[Pos('HREF=', uppercase(source))] := 'x';
// walk i forward to the '>' that closes the <A ...> tag
while source[i] <> '>' do
i := i + 1;
// if there is no HREF=, or the first one lies beyond this tag, skip it
if (Pos('HREF=', uppercase(source)) = 0) or (i < Pos('HREF=', uppercase(source))) then
continue;
// the condition holds, so read the Link and the Text
// 1. read the Link
i := Pos('HREF=', uppercase(source)) + 5;
link := '';
if (source[i] = '"') or (source[i] = '''') then
begin
symEnd := source[i]; // quoted value: remember the closing quote
i := i + 1;
end
else symEnd := ' '; // unquoted value ends at a space
while (source[i] <> symEnd) and (source[i] <> '>') do
begin
link := link + source[i];
i := i + 1;
end;
// 2. read the Text: everything between the tag's '>' and the </A>
while source[i] <> '>' do
i := i + 1;
i := i + 1;
linkText := '';
while i < j do
begin
linkText := linkText + source[i];
i := i + 1;
end;
// insert your code to save Link and Text here, e.g.
// Memo1.Lines.Add(linkText + ' ' + link);

//***************************
// recursive call
if layer < MaxLayer then
begin
if Pos('MAILTO:', uppercase(link)) = 0 then
begin // decide whether this link should be followed
if Pos('HTTP://', uppercase(link)) > 0 then GetLink(link, layer + 1)
else
begin
// relative link: trim the current url back to its directory
linkDir := url;
while linkDir[length(linkDir)] <> '/' do
Delete(linkDir, length(linkDir), 1);
GetLink(linkDir + link, layer + 1);
end;
end;
end;
end;
end;
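
On the PosEx point: under Delphi 7 (unit StrUtils), the scan can simply resume after the last hit instead of overwriting tags that were already handled. A minimal sketch, with ScanAnchors as a hypothetical name:

uses
SysUtils, StrUtils;

procedure ScanAnchors(const Source: string);
var
upper: string;
p: Integer;
begin
upper := UpperCase(Source); // uppercase once instead of on every pass
p := PosEx('<A', upper, 1);
while p > 0 do
begin
// ...extract the HREF and link text starting at position p...
p := PosEx('<A', upper, p + 1); // resume after the last match, so there is
// no need to 'destroy' tags already seen
end;
end;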
 
devecom:
  httpscan can fetch the links, but I need the link text at the same time. Can that be done? If not, how should it be changed? Thanks!
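
I can't speak for httpscan's API, but if it only hands back URLs, you can pull the text and the URL out of the raw HTML together. A rough Delphi 7 sketch (ExtractAnchors is my own name; it assumes double-quoted href values and no tags nested inside the anchor text):

uses
Classes, SysUtils, StrUtils;

// collect every <a href="...">text</a> pair as a 'text http://...' line
procedure ExtractAnchors(const Html: string; Dest: TStrings);
var
upper: string;
pA, pHref, pQ2, pGt, pEndA: Integer;
begin
upper := UpperCase(Html);
pA := PosEx('<A', upper, 1);
while pA > 0 do
begin
pHref := PosEx('HREF="', upper, pA);
pGt := PosEx('>', upper, pA);
pEndA := PosEx('</A>', upper, pA);
if (pGt = 0) or (pEndA = 0) then Break; // malformed tail, stop
if (pHref > 0) and (pHref < pGt) then // href is inside this tag
begin
pQ2 := PosEx('"', upper, pHref + 6); // closing quote of the href value
Dest.Add(Copy(Html, pGt + 1, pEndA - pGt - 1) + ' ' +
Copy(Html, pHref + 6, pQ2 - pHref - 6));
end;
pA := PosEx('<A', upper, pEndA); // continue after this anchor
end;
end;

Called as, say, ExtractAnchors(IdHTTP1.Get(url), Memo1.Lines), this yields lines in the 'XXXXX (link text) http://XXX.XXX.XXX' shape asked for above.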
 