如何抓取超链接!!~~~(100分)

  • 主题发起人 主题发起人 糟老头
  • 开始时间 开始时间

糟老头

Unregistered / Unconfirmed
GUEST, unregistred user!
现在在做一个软件
实现功能是能向服务器 发请求后 且能在服务器传回的超文本文件中 抓取超链接
是基于JSP的
现在急需知道怎么抓取超链接在传回的超文本文件中
想问下 要看DELPHI关于什么方面的资料 还有要用到什么控件
很急 谢谢 各位~~~
 
这是前段写的解析HTML的URL的,写的还不完整,大概给你个思路

//ParseURL.pas
unit parseURL;

interface

uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, ComCtrls;

type
PURLData = ^TURLData;
TURLData = record
URL: string;
Title: string;
end;

TURL = class
private
FList: TList;
FStream: TMemoryStream;
FFileName: string;
function GetURL(const Index: Integer): TURLData;
function GetURLCount: Integer;
procedure SetFileName(const Value: string);
public
constructor Create;
destructor Destroy; override;
procedure ClearURL;
procedure ExtractURL;

property URL[const Index: Integer]: TURLData read GetURL;
property URLCount: Integer read GetURLCount;
property FileName: string read FFileName write SetFileName;
end;

TForm1 = class(TForm)
Button1: TButton;
ListView1: TListView;
procedure Button1Click(Sender: TObject);
procedure ListView1Data(Sender: TObject; Item: TListItem);
private
FURL: TURL;
public
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
end;

var
Form1: TForm1;

implementation

{$R *.dfm}

{ TURL }

procedure TURL.ClearURL;
var
I: Integer;
begin
for I := 0 to FList.Count - 1 do
FreeMem(FList);
FList.Clear;
end;

constructor TURL.Create;
begin
FList := TList.Create;
FStream := TMemoryStream.Create;
end;

destructor TURL.Destroy;
begin
ClearURL;
FStream.Free;
inherited Destroy;
end;

procedure TURL.SetFileName(const Value: string);
var
C: Char;
begin
if Value = FFileName then Exit;
ClearURL;
FFileName := Value;
FStream.LoadFromFile(FFileName);
C := #0;
FStream.Position := FStream.Size;
FStream.Write(C, 1);
ExtractURL;
end;

procedure TURL.ExtractURL;
const
HREF: array [0..3] of Char = ('H', 'R', 'E', 'F');
var
P, S: PChar;
Data: TURLData;

function CompHREF: Boolean;
var
T: PChar;
I: Integer;
begin
T := P;
for I := 0 to SizeOf(HREF) - 1 do
begin
Result := UpCase(T^) = HREF;
if not Result then
break;
Inc(T);
end;
end;

procedure SkipBlanks;
begin
while P^ <> #0 do
begin
if P^ in [#33..#255, #10] then
break;
Inc(P);
end;
end;

procedure AddURL;
var
D: PURLData;
begin
New(D);
D^.URL := Data.URL;
D^.Title := Data.Title;
FList.Add(D);
end;

procedure GetURL;
var
Len: Integer;
begin
SkipBlanks;
if CompHREF then
begin
Inc(P, SizeOf(HREF));
SkipBlanks;
if P^ = '=' then Inc(P);
SkipBlanks;
case P^ of
'"', '''':
begin
{ URL }
Inc(P);
S := P;
while not (P^ in ['"', '''']) do Inc(P);
Len := P - S;
SetLength(Data.URL, Len);
Move(S^, Data.URL[1], Len);

{ GetTitle }
Inc(P);
SkipBlanks;
while P^ <> '>' do Inc(P);
Inc(P);
SkipBlanks;

S := P;
while True do
case P^ of
'<':
begin
if P = S then
begin
Inc(P);
Continue;
end else
begin
Inc(P);
case P^ of
'/':
begin
Len := P - S - 1;
SetLength(Data.Title, Len);
Move(S^, Data.Title[1], Len);
AddURL;
Exit;
end;
end;
end;
end;
'>':
begin
Inc(P);
S := P;
end;
else
Inc(P);
end;
end;
end;
end;
end;

begin
P := FStream.Memory;
SkipBlanks;
while True do
begin
case P^ of
'<':
begin
Inc(P);
SkipBlanks;
case P^ of
'a', 'A':
begin
Inc(P);
GetURL;
end;
end;
end;
#0: break;
end;
Inc(P);
end;
end;

function TURL.GetURL(const Index: Integer): TURLData;
begin
if Index in [0..(FList.Count - 1)] then
Result := PURLData(FList[Index])^
else
raise Exception.Create('Error');
end;

function TURL.GetURLCount: Integer;
begin
Result := FList.Count;
end;

constructor TForm1.Create(AOwner: TComponent);
begin
inherited Create(AOwner);
FURL := TURL.Create;
end;

destructor TForm1.Destroy;
begin
FURL.Free;
inherited Destroy;
end;

procedure TForm1.Button1Click(Sender: TObject);
begin
FURL.FileName := 'a.txt'; //将HTML文件给FileName,进行解析
ListView1.Items.Count := FURL.URLCount;
end;

procedure TForm1.ListView1Data(Sender: TObject; Item: TListItem);
var
Data: TURLData;
begin
Data := FURL.URL[Item.Index];
Item.Caption := Data.Title;
Item.SubItems.Add(Data.URL)
end;

end.

//ParseURL.dfm
object Form1: TForm1
Left = 192
Top = 106
Width = 544
Height = 375
Caption = 'Form1'
Color = clBtnFace
Font.Charset = DEFAULT_CHARSET
Font.Color = clWindowText
Font.Height = -11
Font.Name = 'MS Sans Serif'
Font.Style = []
OldCreateOrder = False
PixelsPerInch = 96
TextHeight = 13
object Button1: TButton
Left = 448
Top = 24
Width = 75
Height = 25
Caption = 'Button1'
TabOrder = 0
OnClick = Button1Click
end
object ListView1: TListView
Left = 8
Top = 16
Width = 433
Height = 321
Columns = <
item
Caption = 'Title'
Width = 100
end
item
AutoSize = True
Caption = 'URL'
end>
MultiSelect = True
OwnerData = True
TabOrder = 1
ViewStyle = vsReport
OnData = ListView1Data
end
end
 
代码那忘了Free New指针,你加一下。。。
 
unit HTMLPars;

interface

uses
Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs,
ExtCtrls, StdCtrls, Buttons;

type THTMLParam = class
private
fRaw:string;
fKey:string;
fValue:string;
procedure SetKey(Key:string);
public
constructor Create;
destructor Destroy; override;
published
property Key:string read fKey write SetKey;
property Value:string read fValue;
property Raw:string read fRaw;
end;

type THTMLTag = class
private
fName:string;
fRaw:string;
procedure SetName(Name:string);
public
Params:TList;
constructor Create;
destructor Destroy; override;
published
property Name:string read fName write SetName; // uppercased TAG (without <>)
property Raw:string read fRaw; // raw TAG (parameters included) as read from input file (without<>)
end;

type THTMLText = class
private
fLine:String;
fRawLine:string;
procedure SetLine(Line:string);
public
constructor Create;
destructor Destroy; override;
published
property Line:string read fLine write SetLine; // HTML3.2 Entities and Western Latin-1 Font converted Text
property Raw:string read fRawLine; // raw text line as read from input file
end;

type THTMLParser = class(TObject)
private
Text:string;
Tag:string;
isTag:boolean;
procedure AddText;
procedure AddTag;
public
parsed:TList;
Lines:TStringlist;
constructor Create;
destructor Destroy; override;
procedure Execute;
published
end;

implementation



constructor THTMLParser.Create;
begin
inherited Create;
Lines:=TStringlist.Create;
Parsed:=TList.Create;
end;

destructor THTMLParser.Destroy;
begin
Lines.Free;
Parsed.Free;
inherited Destroy;
end;


procedure THTMLParser.AddText;
var HTMLText:THTMLText;

begin
if not isTag then
if Text<>'' then
begin
HTMLText:=THTMLText.Create;
HTMLText.Line:=Text;
Text:='';
parsed.Add(HTMLText);
end;
end;


procedure THTMLParser.AddTag;
var HTMLTag:THTMLTag;
begin
isTag:=false;
HTMLTag:=THTMLTag.Create;
HTMLTag.Name:=Tag;
Tag:='';
parsed.Add(HTMLTag);
end;



procedure THTMLParser.Execute;
var i:integer;
s:string;
begin
Text:='';
Tag:='';
isTag:=false;
for i:= 1 to Lines.Count do
begin
s:=Lines[i-1];
while Length(s)>0 do
begin
if s[1]='<' then begin AddText;isTag:=true;end
else
if s[1]='>' then AddTag
else
if isTag then Tag:=Tag+s[1]
else Text:=Text+s[1];
delete(s,1,1);
end;
if (not isTag) and (Text<>'') then Text:=Text+#10;
end;
if (isTag) and (Tag<>'') then AddTag;
if (not isTag) and (Text<>'') then AddText;
end;




constructor THTMLTag.Create;
begin
inherited Create;
Params:=Tlist.Create;
end;


destructor THTMLTag.Destroy;
var i:integer;
begin
for i:= Params.Count downto 1 do
begin
THTMLparam(Params[i-1]).Free;
Params.delete(i-1);
end;
Params.Free;
inherited Destroy;
end;



procedure THTMLTag.SetName(Name:string);
var Tag:string;
param:string;
HTMLParam:THTMLParam;
isQuote:boolean;
begin
fRaw:=Name;
Params.clear;

while (Length(Name)>0) and (Name[1]<>' ') do
begin
Tag:=Tag+Name[1];
Delete(Name,1,1);
end;

fName:=uppercase(Tag);

while (Length(Name)>0) do
begin
param:='';
isQuote:=false;
while (Length(Name)>0) and ( not ((Name[1]=' ') and (isQuote=false))) do
begin
if Name[1]='"' then
IsQuote:=not(IsQuote);
param:=param+Name[1];
Delete(Name,1,1);
end;

if (Length(Name)>0) and (Name[1]=' ') then Delete(Name,1,1);
if param<>'' then
begin
HTMLParam:=THTMLParam.Create;
HTMLParam.key:=param;
params.add(HTMLParam);
end;
end;

end;


{$i latin1.pas}

procedure THTMLText.SetLine(Line:string);
var j,i:integer;
isEntity:boolean;
Entity:string;
EnLen,EnPos:integer;
d,c:integer;
begin
fRawLine:=Line;
while pos(#10,Line)>0 do Line[Pos(#10,Line)]:=' ';
while pos(' ',Line)>0 do delete(Line,pos(' ',Line),1);

i:=1;
isEntity:=false;
EnPos:=0;
while (i<=Length(Line)) do
begin
if Line='&' then begin EnPos:=i;isEntity:=true;Entity:='';end;
if isEntity then Entity:=Entity+Line;
if isEntity then
if (Line=';') or (Line=' ') then begin
EnLen:=Length(Entity);

// charset encoded entity
if (EnLen>2) and (Entity[2]='#') then
begin
delete(Entity,EnLen,1); //delete the ;
delete(Entity,1,2); // delete the &#
if uppercase(Entity[1])='X' then Entity[1]:='$'; // it's hex (but not supported!!!)
if (Length(Entity)<=3) then // we cant convert e.g. cyrillic/chinise capitals
begin
val(Entity,d,c);
if c=0 then // conversion successful
begin
delete(Line,EnPos,EnLen);
insert(Charset[d],Line,EnPos);
i:=EnPos; // set new start
end;
end;
end
else
begin // its an entity
j:=1;
while (j<=100) do
begin
if Entity=(Entities[j,1]) then
begin
delete(Line,EnPos,EnLen);
insert(Entities[j,2],Line,Enpos);
j:=102; // stop searching
end;
j:=j+1;
end;
// reset Line
if j=103 then i:=EnPos-1
else i:=EnPos;
end;

IsEntity:=false;
end;
i:=i+1;
end;

fLine:=Line;
end;


procedure THTMLParam.SetKey(Key:string);
begin
fValue:='';
fRaw:=Key;
if pos('=',key)<>0 then
begin
fValue:=Key;
delete(fValue,1,pos('=',key));
key:=copy(Key,1,pos('=',key)-1);

if Length(fValue)>1 then
if (fValue[1]='"') and (fValue[Length(fValue)]='"') then
begin
delete(fValue,1,1);
delete(fValue,Length(fValue),1);
end;
end;
fKey:=uppercase(key);
end;

constructor THTMLParam.Create;
begin
inherited Create;
end;

destructor THTMLParam.Destroy;
begin
inherited Destroy;
end;

constructor THTMLText.Create;
begin
inherited Create;
end;

destructor THTMLText.Destroy;
begin
inherited Destroy;
end;
end.
 
用控件吧。
 
oliver99 用什么控件合适呢??
 
多人接受答案了。
 
后退
顶部