如何转换 html 文件为 TEXT 文件? 谁有解析html语法的控件啊(50分)

  • 主题发起人 主题发起人 xgwzw
  • 开始时间 开始时间
? 改个
后缀
不就好了?
 
用TWebBrowser载入 然后就可以把符号去掉 取得文本
 
用word打开,另存为txt
 
关键是如何去掉html 标记
 
webbrowser1.Navigate('c:/1.htm');
Memo1.Lines.Add(IHtmlDocument2(WebBrowser1.Document).Body.OuterText);

两句话就可以解决的问题
 
function HtmlToTxt(const HTMLText:string;MarkLinks:boolean):string;
var
NextToken,s0:string;
i:integer;
HelpIdx:integer;
inQuot:boolean; // 去除<script>段之用
InputLen:integer;
InputIdx:integer; // 指向输入字符的下一个待处理字符
inPre:boolean; // 表示是否在<pre>...</pre>段内
CurrLink:string;

// 取得下一段字符串
function GetNextToken(const s:string; const StartIdx:integer):string;
var
i:integer;
begin
result:=s[StartIdx];
if result='&' then
begin
for i:=StartIdx+1 to Length(s) do
begin
if s in ['&',' ',#13,'<'] then break;
result:=result+s;
if s=';' then break;
end;
end
else if result='<' then
begin
for i:=StartIdx+1 to Length(s) do
begin
result:=result+s;
if s='>' then break;
end;
end
else
begin
for i:=StartIdx+1 to Length(s) do
if s in ['&','<'] then break
else result:=result+s;
end;
end;

// 输入:<a href="http://delphigroup.yeah.net">
// 输出:http://delphigroup.yeah.net
function GetLink(s:string):string;
var
LPos,RPos,LQuot,RQuot:integer;
begin
result:='';

// 去掉'....<'
LPos:=pos('<',s);
if LPos=0 then exit;
delete(s,1,LPos);
s:=Trim(s);

// 去掉'>....'
RPos:=pos('>',s);
if RPos=0 then exit;
delete(s,RPos,MaxInt);

if uppercase(copy(s,1,2))='A ' then
begin
LPos:=pos('HREF',uppercase(s));
if LPos=0 then exit;

LQuot:=PosX('"',s,1);
RQuot:=PosX('"',s,2);

if (LQuot<LPos) or (RQuot>RPos) then exit;

// 开头带'#'的超链接,视为无效
if s[LQuot+1]='#' then exit;

// 开头带'javascript:'的超链接,也视为无效
// 如:<div align=right><a href="javascript:window.close()"><IMG SRC="button_close.gif"></a></div>
if copy(s,LQuot+1,11)='javascript:' then exit;

result:=copy(s,LQuot+1,RQuot-LQuot-1);
end;
end;

// 把所有&xxx的转义;所有<xxx>取消;其它照样返回
function ConvertHTMLToken(const s:string;var inPre:boolean):string;
var
s0,s0_2,s0_3,s0_4:string;
begin
if s[1]='&' then
begin
s0:=lowerCase(s);
result:='';
if s0=' ' then result:=' '
else if s0='"' then result:='"'
else if s0='>' then result:='>'
else if s0='<' then result:='<'
else if s0='·' then result:='·'
else if s0='™' then result:=' TM '
else if s0='©' then result:='(c)'
else if s0='&' then result:='&'
else if s0='&amp' then result:='&';
end
else if s[1]='<' then
begin
s0:=lowerCase(s);
s0_2:=copy(s0,1,2);
s0_3:=copy(s0,1,3);
s0_4:=copy(s0,1,4);

result:='';
// 将所有<hr>替换成为'------'
if s0='<br>' then result:=CR
else if s0_4='<pre' then // <pre 一定要在 <p 之前判断!
begin inPre:=true;result:=CR; end
else if s0_2='<p' then result:=CR+CR
else if s0_3='<hr' then result:=CR+DupString('-',40)+CR
else if s0_3='<ol' then result:=CR
else if s0_3='<ul' then result:=CR
// else if s0_4='</ol' then result:=CR
// else if s0_4='</ul' then result:=CR
else if s0_3='<li' then result:='·'
else if s0_4='</li' then result:=CR
else if s0_4='</tr' then result:=CR
else if s0='</td>' then result:=#9
else if s0='<title>' then result:='《'
else if s0='</title>' then result:='》'+CR+CR
else if s0='</pre>' then inPre:=false
else if copy(s0,1,6)='<table' then result:=CR
else if MarkLinks and (s0[2]='a') then
begin
CurrLink:=GetLink(s);
if CurrLink<>'' then result:='[';
end
else if MarkLinks and (s0='</a>') then
if CurrLink<>'' then result:=format(' %s ]',[CurrLink]);
end
else if inPre then
result:=s
else // 不在<pre>..</pre>内,则删除所有CR
result:=Replacing(s,CR,'');
end;

begin
if (pos('<',HTMLText)=0) and
(pos('>',HTMLText)=0) then exit;

s0:=UnixToDos(HTMLText);
result:='';
InputLen:=Length(s0);
InputIdx:=1;
inPre:=false;
CurrLink:='';

while InputIdx<=InputLen do
begin
NextToken:=GetNextToken(s0,InputIdx);

// 去除<style ...> -- </style>之间的内容
if lowercase(copy(NextToken,1,6))='<style' then
begin
while lowercase(NextToken)<>'</style>' do
begin
inc(InputIdx,Length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
inc(InputIdx,Length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;

// 去除<Script ...> -- </Script>之间的内容
if lowercase(copy(NextToken,1,7))='<script' then
begin
// while lowercase(NextToken)<>'</script>' do
// begin
// inc(InputIdx,strlen(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
// end;
// inc(InputIdx,strlen(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,Length(NextToken));
inQuot:=false;
i:=InputIdx-1;
while I<InputLen do
begin
inc(i);
if s0='"' then
begin
inQuot:=not inQuot;
continue;
end;
if not inQuot then
// 去除<script>段里的<!-- ... -->注释段, 99.8.2
if copy(s0,i,4)='<!--' then
begin
HelpIdx:=pos('-->',copy(s0,i+4,MaxInt));
if HelpIdx>0 then
begin
inc(i,4+HelpIdx+2);
end
else
begin
i:=InputLen;
break;
end;
end;
if lowercase(copy(s0,i,9))='</script>' then
begin
break;
end;
end;
InputIdx:=i;
end;

NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,Length(NextToken));
result:=result+ConvertHTMLToken(NextToken,inPre);
end;
end;
 
/////////// 1.html内容 放在c盘目录下 //////////////
<HTML>
<BODY>
<TBODY>
<TR>
<TD>冰力不足是不是全世界最漂亮的?</TD>

<INPUT type=radio CHECKED value=是 name=gq> 是 <INPUT type=radio value=否
name=gq> 否</TD></TR>
</BODY>
</HTML>


////////////////// Unit1.pas ////////////////////
unit Unit1;

interface

uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs,MSHTML, OleCtrls, SHDocVw, StdCtrls;

type
TForm1 = class(TForm)
Memo1: TMemo;
Button1: TButton;
WebBrowser1: TWebBrowser;
procedure Button1Click(Sender: TObject);
procedure FormCreate(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;

var
Form1: TForm1;

implementation

{$R *.dfm}

procedure TForm1.Button1Click(Sender: TObject);
begin

Memo1.Lines.Add(IHtmlDocument2(WebBrowser1.Document).Body.OuterText);
end;

procedure TForm1.FormCreate(Sender: TObject);
begin
webbrowser1.Navigate('c:/1.html');
end;

end.
///////////////// Unit1.dfm /////////////////////
object Form1: TForm1
Left = 192
Top = 114
Width = 462
Height = 514
Caption = 'Form1'
Color = clBtnFace
Font.Charset = DEFAULT_CHARSET
Font.Color = clWindowText
Font.Height = -11
Font.Name = 'MS Sans Serif'
Font.Style = []
OldCreateOrder = False
OnCreate = FormCreate
PixelsPerInch = 96
TextHeight = 13
object Memo1: TMemo
Left = 24
Top = 208
Width = 401
Height = 151
ScrollBars = ssBoth
TabOrder = 0
end
object Button1: TButton
Left = 192
Top = 400
Width = 89
Height = 25
Caption = #36716#25442
TabOrder = 1
OnClick = Button1Click
end
object WebBrowser1: TWebBrowser
Left = 16
Top = 34
Width = 401
Height = 151
TabOrder = 2
ControlData = {
4C000000722900009B0F00000000000000000000000000000000000000000000
000000004C000000000000000000000001000000E0D057007335CF11AE690800
2B2E126208000000000000004C0000000114020000000000C000000000000046
8000000000000000000000000000000000000000000000000000000000000000
00000000000000000100000000000000000000000000000000000000}
end
end

///////////////// Project1.dpr //////////////////
program Project1;

uses
Forms,
Unit1 in 'Unit1.pas' {Form1};

{$R *.res}

begin
Application.Initialize;
Application.CreateForm(TForm1, Form1);
Application.Run;
end.


笑死我了 太委屈我啦 不过曾经感谢无私教我的黑冰老师 我当时也许比你菜 他能够很有耐心教我 所以菜秒会了也要帮助更多菜鸟 所以这样写以后有菜鸟看一定也明白了
 
这里有个来自faststring的函数,WantHTML参数指示是抽取文本还是抽取html标记
function StripHTMLorNonHTML(const S : string; WantHTML : Boolean) : string;
var
X: Integer;
TagCnt: Integer;
ResChar: PChar;
SrcChar: PChar;
begin
TagCnt := 0;
SetLength(Result, Length(S));
if Length(S) = 0 then Exit;

ResChar := @Result[1];
SrcChar := @S[1];
for X:=1 to Length(S) do
begin
case SrcChar^ of
'<':
begin
Inc(TagCnt);
if WantHTML and (TagCnt = 1) then
begin
ResChar^ := '<';
Inc(ResChar);
end;
end;
'>':
begin
Dec(TagCnt);
if WantHTML and (TagCnt = 0) then
begin
ResChar^ := '>';
Inc(ResChar);
end;
end;
else
case WantHTML of
False:
if TagCnt <= 0 then
begin
ResChar^ := SrcChar^;
Inc(ResChar);
TagCnt := 0;
end;
True:
if TagCnt >= 1 then
begin
ResChar^ := SrcChar^;
Inc(ResChar);
end else
if TagCnt < 0 then TagCnt := 0;
end;
end;
Inc(SrcChar);
end;
SetLength(Result, ResChar - PChar(@Result[1]));
Result := FastReplace(Result, ' ', ' ', False);
Result := FastReplace(Result,'&','&', False);
Result := FastReplace(Result,'<','<', False);
Result := FastReplace(Result,'>','>', False);
Result := FastReplace(Result,'"','"', False);
end;
function FastReplace(const aSourceString : string; const aFindString, aReplaceString : string;
CaseSensitive : Boolean = False) : string;
var
PResult : PChar;
PReplace : PChar;
PSource : PChar;
PFind : PChar;
PPosition : PChar;
CurrentPos,
BytesUsed,
lResult,
lReplace,
lSource,
lFind : Integer;
Find : TFastPosProc;
CopySize : Integer;
JumpTable : TBMJumpTable;
begin
LSource := Length(aSourceString);
if LSource = 0 then begin
Result := aSourceString;
exit;
end;
PSource := @aSourceString[1];

LFind := Length(aFindString);
if LFind = 0 then exit;
PFind := @aFindString[1];

LReplace := Length(aReplaceString);

//Here we may get an Integer Overflow, or OutOfMemory, if so, we use a Delta
try
if LReplace <= LFind then
SetLength(Result,lSource)
else
SetLength(Result, (LSource *LReplace) div LFind);
except
SetLength(Result,0);
end;

LResult := Length(Result);
if LResult = 0 then begin
LResult := Trunc((LSource + LReplace) * cDeltaSize);
SetLength(Result, LResult);
end;


PResult := @Result[1];


if CaseSensitive then
begin
MakeBMTable(PChar(AFindString), lFind, JumpTable);
Find := BMPos;
end else
begin
MakeBMTableNoCase(PChar(AFindString), lFind, JumpTable);
Find := BMPosNoCase;
end;


BytesUsed := 0;
if LReplace > 0 then begin
PReplace := @aReplaceString[1];
repeat
PPosition := Find(PSource,PFind,lSource, lFind, JumpTable);
if PPosition = nil then break;

CopySize := PPosition - PSource;
Inc(BytesUsed, CopySize + LReplace);

if BytesUsed >= LResult then begin
//We have run out of space
CurrentPos := Integer(PResult) - Integer(@Result[1]) +1;
LResult := Trunc(LResult * cDeltaSize);
SetLength(Result,LResult);
PResult := @Result[CurrentPos];
end;

FastCharMove(PSource^,PResult^,CopySize);
Dec(lSource,CopySize + LFind);
Inc(PSource,CopySize + LFind);
Inc(PResult,CopySize);

FastCharMove(PReplace^,PResult^,LReplace);
Inc(PResult,LReplace);

until lSource < lFind;
end else begin
repeat
PPosition := Find(PSource,PFind,lSource, lFind, JumpTable);
if PPosition = nil then break;

CopySize := PPosition - PSource;
FastCharMove(PSource^,PResult^,CopySize);
Dec(lSource,CopySize + LFind);
Inc(PSource,CopySize + LFind);
Inc(PResult,CopySize);
Inc(BytesUsed, CopySize);
until lSource < lFind;
end;

SetLength(Result, (PResult+LSource) - @Result[1]);
if LSource > 0 then
FastCharMove(PSource^, Result[BytesUsed + 1], LSource);
end;
 
用正则表达式组件分析
如script表达式: '<script .*?>(.|/n)*?<//script>'
 
“冰力不足”的方法是正确的
 
后退
顶部