如何提取页面文本? (50分)

  • 主题发起人 主题发起人 nau
  • 开始时间 开始时间
N

nau

Unregistered / Unconfirmed
GUEST, unregistred user!
代码:
请问:给定一个url,如何直接提取该页的页面文本?
 
如果有人回答了给我一份,谢谢!!
 
要是有个“机器人”就好了!
 
N久前写过,不过代码丢了。只有些类似的。。。

这里有个CB的,类似吧
http://www.delphibbs.com/delphibbs/dispq.asp?lid=1543113
 
unit Main;

interface

uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, OleCtrls, SHDocVw, IdBaseComponent, IdComponent,
IdTCPConnection, IdTCPClient, IdFTP;

type
TMainForm = class(TForm)
WebBrowser: TWebBrowser;
Edit: TEdit;
Button: TButton;
Memo: TMemo;
procedure ButtonClick(Sender: TObject);
procedure WebBrowserDocumentComplete(Sender: TObject;
const pDisp: IDispatch; var URL: OleVariant);
private
{ Private declarations }
public
{ Public declarations }
end;

var
MainForm: TMainForm;

implementation

uses
MSHTML_TLB;

{$R *.dfm}

procedure TMainForm.ButtonClick(Sender: TObject);
begin
Memo.Clear;
WebBrowser.Navigate(WideString(Edit.Text))
end;

procedure TMainForm.WebBrowserDocumentComplete(Sender: TObject;
const pDisp: IDispatch; var URL: OleVariant);
var
Doc: IHTMLDocument2;
Index: OleVariant;
Frame: IDispatch;
Window: IHTMLWindow2;
IP: string;
p: Integer;
begin
WebBrowser.Document.QueryInterface(IID_IHTMLDocument, Doc);
Index:=2;
Frame:=IDispatch(Doc.frames.item(Index));
Frame.QueryInterface(IID_IHTMLWindow2, Window);
try
Memo.Text:=Window.document.body.innerHTML
except
end
end.
 
procedure Tcomm_Form.Web_dummy;
begin
if WebBrowser1.Document=nil then exit;
WebBrowser1.ExecWB(OLECMDID_selectall,OLECMDEXECOPT_DODEFAULT);
WebBrowser1.ExecWB(OLECMDID_COPY,OLECMDEXECOPT_DODEFAULT);
end;
然后将粘贴板的内容copy到你的文件中
 
uses WinInet;

function DownloadFile(const AURL: string): string;

procedure Add(Buf: PChar; Count: Integer);
var
Len: Integer;
begin
Len := Length(Result);
SetLength(Result, Len + Count);
Move(Buf^, Result[Len + 1], Count);
end;

var
BytesRead: DWORD;
Session, Connection: HINTERNET;
Buffer: array[1..1024] of Char;
begin
Result := '';
Session := InternetOpen(nil, INTERNET_OPEN_TYPE_DIRECT, nil, nil, 0);
if Assigned(Session) then
try
Connection := InternetOpenUrl(Session, PChar(AURL), nil, 0,
INTERNET_FLAG_RAW_DATA, {INTERNET_FLAG_RELOAD, }0);
if Assigned(Connection) then
try
repeat
FillChar(Buffer, SizeOf(Buffer), 0);
InternetReadFile(Connection, @Buffer, SizeOf(Buffer), BytesRead);
if BytesRead > 0 then
Add(@Buffer, BytesRead);
Application.ProcessMessages;
until BytesRead = 0;
finally
InternetCloseHandle(Connection);
end;
finally
InternetCloseHandle(Session);
end;
end;

procedure TForm1.Button1Click(Sender: TObject);
begin
Memo1.Text := DownloadFile('http://www.delphibbs.com');
end;
 
这是完整的程序代码,但还是取不到页面文本,怎么办啊?我难受啊!
unit Unit1;

interface

uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls;

type
TForm1 = class(TForm)
Button1: TButton;
Memo1: TMemo;
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;

var
Form1: TForm1;
implementation
uses WinInet;
{$R *.dfm}

function DownloadFile(const AURL: string): string;

procedure Add(Buf: PChar; Count: Integer);
var
Len: Integer;
begin
Len := Length(Result);
SetLength(Result, Len + Count);
Move(Buf^,Result[Len + 1],Count);
end;

var
BytesRead: DWORD;
Session, Connection: HINTERNET;
Buffer: array[1..1024] of Char;
begin
Result := '';
Session := InternetOpen(nil, INTERNET_OPEN_TYPE_DIRECT, nil, nil, 0);
if Assigned(Session) then
try
Connection := InternetOpenUrl(Session, PChar(AURL), nil,INTERNET_FLAG_RAW_DATA,INTERNET_FLAG_RELOAD,0);
if Assigned(Connection) then
try
repeat
FillChar(Buffer, SizeOf(Buffer), 0);
InternetReadFile(Connection, @Buffer, SizeOf(Buffer), BytesRead);
if BytesRead > 0 then
Add(@Buffer, BytesRead);
Application.ProcessMessages;
until BytesRead = 0;
finally
InternetCloseHandle(Connection);
end;
finally
InternetCloseHandle(Session);
end;
end;
procedure TForm1.Button1Click(Sender: TObject);
begin
Memo1.Text := DownloadFile('http://www.sina.com.cn');
end;

end.
 
后退
顶部