如何在Twebbrowser中取得全部源代码 ( 积分: 200 )

  • 主题发起人 主题发起人 hackering
  • 开始时间 开始时间
H

hackering

Unregistered / Unconfirmed
GUEST, unregistered user!
难点:
1:目标网页有多层镶嵌的frame时同样要自动取下所有的子frame的源代码。
[由于frame之间存在互动,要一次浏览即可读出全部代码,而不是一个个frame查看url]
2:需要delphi可编译的源程序(C++的不要,网上流传的那个C++代码转成delphi没用)
 
难点:
1:目标网页有多层镶嵌的frame时同样要自动取下所有的子frame的源代码。
[由于frame之间存在互动,要一次浏览即可读出全部代码,而不是一个个frame查看url]
2:需要delphi可编译的源程序(C++的不要,网上流传的那个C++代码转成delphi没用)
 
网上资料:
=========================================================================
FAQ:两种方法访问多层嵌套的frame
问========================================

在您的网站拜读了关于TWebBrowser的使用方法,但是一直有一个问题困扰我,就是如何取得frame嵌套frame的HTML的原码,我只是知道单个frame如何取得源码,但是多个frame嵌套就没有办法,请教一下!

2004-09-29 23:41:28


答========================================

要得到源代码,必须先得到frame。访问frame一般说来有两种方法:

1、通过WebBrowser的文档接口得到frame的集合,再逐一访问。

HRESULT IHTMLDocument2::get_frames(IHTMLFramesCollection2 **p);

由IHTMLFramesCollection2接口的item方法,可以以frame的索引号(从0开始)或frame的名称来访问相应的frame,pvarResult则返回一个IDispatch接口(或一个IDispatch接口的数组,多层嵌套的情况).

HRESULT item(
VARIANT *pvarIndex,
VARIANT *pvarResult
);

例子如下,假设pWin是一个指向主窗口的有效的IHTMLWindow2接口指针。

......
VARIANT frameRequested;
VARIANT frameOut;
IHTMLFramesCollection2* pFramesCol;
IHTMLWindow2* pRightFrameWindow;
IHTMLDocument2* pRightDoc;

frameRequested.vt = VT_BSTR;//若为VT_I4则以索引号来访问
frameRequested.bstrVal = L"rightframe";//以名称来访问
//frameRequested.vt = VT_I4;
//frameRequested.bstrVal = (BSTR)0;

hr = pWin->get_frames(&pFramesCol);
hr = pFramesCol->item(&frameRequested, &frameOut);

hr = frameOut.pdispVal->QueryInterface(IID_IHTMLWindow2, (void**)&pRightFrameWindow);
hr = pRightFrameWindow->get_document(&pRightDoc);
......


2、通过IOleContainer枚举嵌入对象的方式来访问WebBrowser对象。

void CMyHtmlView::RefreshFrames()
{
// 取得文档的IDispatch指针
LPDISPATCH lpDisp = NULL;
lpDisp = GetHtmlDocument();

if (lpDisp)
{
IOleContainer* pContainer;
HRESULT hr = lpDisp->QueryInterface(IID_IOleContainer, (void**)&pContainer);
lpDisp->Release();
if (FAILED(hr))
return hr;

IEnumUnknown* pEnumerator;
// 获得枚举器
hr = pContainer->EnumObjects(OLECONTF_EMBEDDINGS, &pEnumerator);
pContainer->Release();
if (FAILED(hr))
return hr;

IUnknown* pUnk;
ULONG uFetched;
// 枚举并刷新所有frame
for (UINT i = 0; S_OK == pEnumerator->Next(1, &pUnk, &uFetched); i++)
{
IWebBrowser2* pBrowser;

hr = pUnk->QueryInterface(IID_IWebBrowser2, (void**)&pBrowser);
pUnk->Release();
if (SUCCEEDED(hr))
{
pBrowser->Refresh();
pBrowser->Release();
}
}
pEnumerator->Release();
}


3、访问多层嵌套的frame
注意每个frame又可以包含自己的frame,而上面所说的方法则是针对一个WebBrowser的窗口实现的,并不会涉及到深层的frame。要实现多层嵌套frame的访问,只需要加入一点递归的操作就行了。如对1中的pRightFrameWindow和2中的pBrowser,将函数稍加修改,在得到两个指针后作递归调用即可。


4、访问源代码
下面的方法来自CHtmlView,是比较正规的方法(能够保持网页的原始格式)。


// Copies the HTML source of the current document into refString.
//
// The document is asked (via IPersistStreamInit) to save itself into a
// memory stream backed by an HGLOBAL; the bytes are then assigned to
// refString.
//
// Returns TRUE when the source was copied, FALSE on any failure (no
// document, interface not supported, allocation/stream/save failure, or an
// exception while assigning the string).
BOOL CHtmlView::GetSource(CString& refString)
{
    CComPtr<IDispatch> spDisp = GetHtmlDocument();
    if (spDisp == NULL)
        return FALSE;

    // Query the interface before allocating so there is nothing to clean up
    // when the document cannot be persisted to a stream.
    CComQIPtr<IPersistStreamInit> spPersistStream = spDisp;
    if (spPersistStream == NULL)
        return FALSE;

    HGLOBAL hMemory = GlobalAlloc(GMEM_MOVEABLE, 0);
    if (hMemory == NULL)
        return FALSE;

    CComPtr<IStream> spStream;
    if (FAILED(CreateStreamOnHGlobal(hMemory, TRUE, &spStream)))
    {
        // The stream never took ownership, so the block is still ours.
        GlobalFree(hMemory);
        return FALSE;
    }

    // From here on the stream owns hMemory (fDeleteOnRelease == TRUE) and
    // frees it when spStream goes out of scope; it must NOT be GlobalFree'd
    // manually.
    if (FAILED(spPersistStream->Save(spStream, FALSE)))
        return FALSE;

    // The saved bytes carry no terminator. Append two zero bytes so the
    // buffer can be read safely as a C string.
    static const BYTE terminator[2] = { 0, 0 };
    if (FAILED(spStream->Write(terminator, sizeof(terminator), NULL)))
        return FALSE;

    // Stream is always ANSI, but CString assignment operator will convert
    // implicitly - hence LPCSTR rather than LPCTSTR, which would misread the
    // bytes in a Unicode build.
    LPCSTR pstr = static_cast<LPCSTR>(GlobalLock(hMemory));
    if (pstr == NULL)
        return FALSE;

    BOOL bRetVal = TRUE;
    TRY
    {
        refString = pstr;
    }
    CATCH_ALL(e)
    {
        bRetVal = FALSE;
        DELETE_EXCEPTION(e);
    }
    END_CATCH_ALL

    // Always balance the GlobalLock, whether or not the copy succeeded.
    GlobalUnlock(hMemory);

    return bRetVal;
}
 
我翻译成delphi后,当出现3层frame镶嵌时,QueryInterface接口转换即无效了。
 
经过一个晚上的摸索。
问题已解决,提前结束
 
晚了
// Code 1

uses
ActiveX, MSHTML_TLB, ComCtrls, ComObj;

function GetBrowserForFrame(Doc: IHTMLDocument2; nFrame: Integer): IWebBrowser2;
//Thanks to Rik Barker
// Returns the IWebBrowser2 interface of the nFrame-th (zero-based) object
// embedded in Doc, or nil when Doc is nil, is not an OLE container, has no
// such object, or the object does not expose IWebBrowser2.
var
  Container: IOleContainer;
  Enumerator: ActiveX.IEnumUnknown;
  UnkFrame: IUnknown;
begin
  Result := nil;
  if (Doc = nil) or (nFrame < 0) then
    Exit;

  // Treat the page as an OLE container. Interface variables are reference
  // counted by the compiler, so no manual _Release must ever be issued.
  if not Supports(Doc, IOleContainer, Container) then
    Exit;

  // Get an enumerator for the objects embedded in the page
  if (Container.EnumObjects(OLECONTF_EMBEDDINGS or OLECONTF_OTHERS, Enumerator) <> S_OK)
    or (Enumerator = nil) then
    Exit;

  // Skip to the requested object; anything but S_OK means there are fewer
  // objects than requested.
  if (nFrame > 0) and (Enumerator.Skip(nFrame) <> S_OK) then
    Exit;

  // Fetch it as IUnknown (the fetched-count pointer may be nil when only
  // one element is requested).
  if (Enumerator.Next(1, UnkFrame, nil) <> S_OK) or (UnkFrame = nil) then
    Exit;

  // Finally ask the object for its WebBrowser interface
  if not Supports(UnkFrame, IWebBrowser2, Result) then
    Result := nil;
end;

function GetFrameSource(WebDoc: iHTMLDocument2): string;
// Walks the document's "all" collection and returns the concatenated
// outerHTML of every element whose tag name is 'HTML'.
// Yields an empty string when WebDoc is not assigned.
var
  Elements: IHTMLElementCollection;
  Element: IHTMLElement;
  Idx, Count: Integer;
begin
  Result := '';
  if not Assigned(WebDoc) then
    Exit;

  Elements := WebDoc.Get_all;
  Count := Elements.Length;
  Idx := 0;
  while Idx < Count do
  begin
    Element := Elements.Item(Idx, 0) as IHTMLElement;
    if Element.tagName = 'HTML' then
      Result := Result + Element.outerHTML;
    Inc(Idx);
  end;
end;

function WB_SaveFrameToFile(HTMLDocument: IHTMLDocument2;
  const FileName: TFileName): Boolean;
// Saves the given document to FileName through its IPersistFile interface.
// Returns True when the Save call reports success, False when it fails or
// when HTMLDocument is nil. Raises an interface cast error if the document
// does not support IPersistFile (unchanged from the original behaviour).
var
  PersistFile: IPersistFile;
  WideName: WideString;
begin
  Result := False;
  if HTMLDocument = nil then
    Exit;

  PersistFile := HTMLDocument as IPersistFile;

  // A managed WideString is used instead of StringToOleStr, whose returned
  // string was never freed.
  WideName := FileName;
  Result := Succeeded(PersistFile.Save(PWideChar(WideName), System.True));
end;


function SaveWBFrames(WebBrowser1: TWebBrowser): string;
// Returns the source of the main page followed by the source of each of its
// direct frames (separated by a line break), and writes every frame to
// 'c:/Frame<index>.html'.
// Frames whose browser or document cannot be obtained are skipped.
// Any exception results in the string 'No Source Available'.
var
  MainDoc, FrameDoc: IHTMLDocument2;
  FrameBrowser: IWebBrowser2;
  FramesCol: IHTMLFramesCollection2;
  FrameIdx: Integer;
begin
  try
    MainDoc := WebBrowser1.Document as IHTMLDocument2;
    Result := GetFrameSource(MainDoc);

    // At this point Result could be stored in a file, or the main page
    // could be written with
    // WB_SaveFrameToFile(MainDoc,'c:/MainPage.html');

    //Handle multiple or single frames
    FramesCol := MainDoc.Get_frames;
    for FrameIdx := 0 to FramesCol.Get_length - 1 do
    begin
      FrameBrowser := GetBrowserForFrame(MainDoc, FrameIdx);
      // A missing frame browser must not discard what was collected so far.
      if FrameBrowser = nil then
        Continue;
      if not Supports(FrameBrowser.Document, IHTMLDocument2, FrameDoc) then
        Continue;

      // Append instead of overwrite so that the result really contains the
      // source of all frames.
      Result := Result + #13#10 + GetFrameSource(FrameDoc);
      WB_SaveFrameToFile(FrameDoc, 'c:/Frame' + IntToStr(FrameIdx) + '.html');
    end;
  except
    Result := 'No Source Available';
  end;
end;

// Test:

procedure TForm1.Button1Click(Sender: TObject);
// Test handler: calls SaveWBFrames for Webbrowser1.
// The string returned by SaveWBFrames is discarded here.
begin
SaveWBFrames(Webbrowser1);
end;


// Code 2
uses
ActiveX;

function TForm1.GetFrame(FrameNo: Integer): IWebbrowser2;
// Returns the IWebBrowser2 interface of the FrameNo-th (zero-based) object
// embedded in Webbrowser1's document.
// Blocks, while pumping messages, until the browser reports
// READYSTATE_COMPLETE.
// Returns nil when there is no document, the document is not an OLE
// container, there is no such embedded object, or the object does not
// expose IWebBrowser2.
var
  OleContainer: IOleContainer;
  Enum: IEnumUnknown;
  Unk: IUnknown;
begin
  Result := nil;

  while Webbrowser1.ReadyState <> READYSTATE_COMPLETE do
    Application.ProcessMessages;

  if FrameNo < 0 then
    Exit;

  // Supports returns False for a nil document as well.
  if not Supports(Webbrowser1.Document, IOleContainer, OleContainer) then
    Exit;

  if (OleContainer.EnumObjects(OLECONTF_EMBEDDINGS, Enum) <> S_OK) or (Enum = nil) then
    Exit;

  // Anything but S_OK means fewer embedded objects than requested.
  if (FrameNo > 0) and (Enum.Skip(FrameNo) <> S_OK) then
    Exit;

  if (Enum.Next(1, Unk, nil) <> S_OK) or (Unk = nil) then
    Exit;

  // Not every embedded object is a browser; return nil instead of raising.
  if not Supports(Unk, IWebbrowser2, Result) then
    Result := nil;
end;

// Load sample page
// (test handler: loads the page used to try out the frame export)
procedure TForm1.Button1Click(Sender: TObject);
begin
// Navigates Webbrowser1 to the hard-coded sample URL.
Webbrowser1.Navigate('http://www.warebizprogramming.com/tutorials/html/framesEx1.htm');
end;

// Save all frames in single files
// For every frame of the loaded document the frame's source is streamed
// into memory through IPersistStreamInit and written to
// 'c:/frame<index>.txt'. Saving is best effort per frame: a frame that
// cannot be obtained or saved is skipped and the loop continues.
procedure TForm1.Button2Click(Sender: TObject);
var
  IpStream: IPersistStreamInit;
  Adapter: IStream;
  AStream: TMemoryStream;
  iw: IWebbrowser2;
  i: Integer;
  sl: TStringList;
begin
  // Nothing to export when no document is loaded.
  if not Assigned(Webbrowser1.Document) then
    Exit;

  for i := 0 to Webbrowser1.OleObject.Document.frames.Length - 1 do
  begin
    iw := GetFrame(i);
    if iw = nil then
      Continue;

    AStream := TMemoryStream.Create;
    try
      try
        if Supports(iw.Document, IPersistStreamInit, IpStream) then
        begin
          // The adapter only references AStream; it does not own it.
          Adapter := TStreamAdapter.Create(AStream);
          try
            if Succeeded(IpStream.Save(Adapter, True)) then
            begin
              AStream.Seek(0, 0);
              sl := TStringList.Create;
              try
                sl.LoadFromStream(AStream);
                sl.SaveToFile('c:/frame' + IntToStr(i) + '.txt');
                // memo1.Lines.LoadFromStream(AStream);
              finally
                sl.Free;
              end;
            end;
          finally
            // Drop the adapter before AStream is freed below.
            Adapter := nil;
          end;
        end;
      except
        on Exception do
        begin
          // Deliberate best effort: a failure on one frame must not
          // prevent the remaining frames from being saved.
        end;
      end;
    finally
      AStream.Free;
    end;
  end;
end;

end.
 
既然Avalon那么热心,那我就再加一个条件,答完就送分。

要求:只要是能显示出来的文字,就要能全部拷贝下来。
====================================================================
由于部分ele的源文件是加密的。类似于:

<OBJECT CLASSID="CLSID:8C9D5912-EED6-4488-B778-2D74EF9B859D" id="Object1" codebase=http://www.drcnet.com.cn/fish_dll/Ip3HtmlView.dll#version2,0,0,1 VIEWASTEXT>
<param name="meta_data1" value="ncCanfEMPNlfkbmcKanfhgPMjhgdCNPLmdMJaaMLDPaIjeKJlhGNCOoamfJOFaiaPaaabgKNebkhIKfepcmhINbfjclaBJfeOKBMcbndkhBPHMifjafcJPicLNidkdgd">


那么如何直接从浏览器中找出他们的内存地址,并全屏拷贝下来。
拷贝出来的结果是txt。而不是html源代码。
===============================================================
ps:要求全屏幕拷贝,而不是荧屏取词。
 
哇! 我现在可没有这个本事 呵呵
如果你研究出来就共享一下吧
截取屏幕不难,但要达到你的要求,鄙人修炼的还不够 呵呵
 
呵呵,我已经研究出来了。
 
后退
顶部