就用这个函数吧,相信能够帮你解决问题。
function HtmlToTxt(const HTMLText:string;MarkLinks:boolean):string;
CONST CR = #13#10;
var
NextToken,s0:string;
i:integer;
HelpIdx:integer;
inQuot:boolean; // 去除<script>段之用
InputLen:integer;
InputIdx:integer; // 指向输入字符的下一个待处理字符
inPre:boolean; // 表示是否在<pre>...</pre>段内
CurrLink:string;
// 取得下一段字符串
function GetNextToken(const s:string; const StartIdx:integer):string;
var
i:integer;
begin
result:=s[StartIdx];
if result='&' then
begin
for i:=StartIdx+1 to length(s) do
begin
if s in ['&',' ',#13,'<'] then break;
result:=result+s;
if s=';' then break;
end;
end
else if result='<' then
begin
for i:=StartIdx+1 to length(s) do
begin
result:=result+s;
if s='>' then break;
end;
end
else
begin
for i:=StartIdx+1 to length(s) do
if s in ['&','<'] then break
else result:=result+s;
end;
end;
//在str 中找到第 id 个 substr的位置, length(SubStr) =1
function NPos(SubStr, Str: String; Id: Integer) : Integer;
var i, Count :integer;
begin
Count := 0;
for i := 1 to Length(Str) do
begin
if Str = SubStr then Inc(Count);
if Count >= Id then Break;
end;
Result := i;
end;
// 输入:<a href="http://delphigroup.yeah.net">
// 输出:http://delphigroup.yeah.net
function GetLink(s:string):string;
var
LPos,RPos,LQuot,RQuot:integer;
begin
result:='';
// 去掉'....<'
LPos:=pos('<',s);
if LPos=0 then exit;
delete(s,1,LPos);
s:=Trim(s);
// 去掉'>....'
RPos:=pos('>',s);
if RPos=0 then exit;
delete(s,RPos,MaxInt);
if uppercase(copy(s,1,2))='A ' then
begin
LPos:=pos('HREF',uppercase(s));
if LPos=0 then exit;
LQuot:=NPos('"',s,1);
RQuot:=NPos('"',s,2);
if (LQuot<LPos) or (RQuot>RPos) then exit;
// 开头带'#'的超链接,视为无效
if s[LQuot+1]='#' then exit;
// 开头带'javascript:'的超链接,也视为无效
// 如:<div align=right><a href="javascript:window.close()"><IMG SRC="button_close.gif"></a></div>
if copy(s,LQuot+1,11)='javascript:' then exit;
result:=copy(s,LQuot+1,RQuot-LQuot-1);
end;
end;
function MakeStr(Str: String; id : Integer):String;
var i : integer;
Buffer : String;
begin
Buffer := '';
for i := 1 to id do Buffer := Buffer + Str;
Result := Buffer;
end;
function ReplaceStr(const S, Srch, Replace: string): string;
var
I: Integer;
Source: string;
begin
Source := S;
Result := '';
repeat
I := Pos(uppercase(Srch), uppercase(Source));
if I > 0 then begin
Result := Result + Copy(Source, 1, I - 1) + Replace;
Source := Copy(Source, I + Length(Srch), MaxInt);
end
else Result := Result + Source;
until I <= 0;
end;
// 把所有&xxx的转义;所有<xxx>取消;其它照样返回
function ConvertHTMLToken(const s:string;var inPre:boolean):string;
var
s0,s0_2,s0_3,s0_4:string;
begin
if s[1]='&' then
begin
s0:=lowerCase(s);
result:='';
if s0=' ' then result:=' '
else if s0='"' then result:='"'
else if s0='>' then result:='>'
else if s0='<' then result:='<'
else if s0='·' then result:='·'
else if s0='™' then result:=' TM '
else if s0='©' then result:='(c)'
else if s0='&' then result:='&'
else if s0='&amp' then result:='&';
end
else if s[1]='<' then
begin
s0:=lowerCase(s);
s0_2:=copy(s0,1,2);
s0_3:=copy(s0,1,3);
s0_4:=copy(s0,1,4);
result:='';
// 将所有<hr>替换成为'------'
if s0='<br>' then result:= CR
else if s0_4='<pre' then // <pre 一定要在 <p 之前判断!
begin inPre:=true;result:=CR; end
else if s0_2='<p' then result:=CR+CR
else if s0_3='<hr' then result:=CR+MakeStr('-',40)+CR
else if s0_3='<ol' then result:=CR
else if s0_3='<ul' then result:=CR
// else if s0_4='</ol' then result:=CR
// else if s0_4='</ul' then result:=CR
else if s0_3='<li' then result:='·'
else if s0_4='</li' then result:=CR
else if s0_4='</tr' then result:=CR
else if s0='</td>' then result:=#9
else if s0='<title>' then result:=' 标题 <<'
else if s0='</title>' then result:='>>'+CR+CR
else if s0='</pre>' then inPre:=false
else if copy(s0,1,6)='<table' then result:=CR
else if MarkLinks and (s0[2]='a') then
begin
CurrLink:=GetLink(s);
if CurrLink<>'' then result:='[';
end
else if MarkLinks and (s0='</a>') then
if CurrLink<>'' then result:=format(' %s ]',[CurrLink]);
end
else if inPre then
result:=s
else // 不在<pre>..</pre>内,则删除所有CR
result:=ReplaceStr(s,CR,'');
end;
begin
s0:=HTMLText;
result:='';
InputLen:=length(s0);
InputIdx:=1;
inPre:=false;
CurrLink:='';
while InputIdx<=InputLen do
begin
NextToken:=GetNextToken(s0,InputIdx);
// 去除<style ...> -- </style>之间的内容
if lowercase(copy(NextToken,1,6))='<style' then
begin
while lowercase(NextToken)<>'</style>' do
begin
inc(InputIdx,length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
inc(InputIdx,length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
// 去除<Script ...> -- </Script>之间的内容
if lowercase(copy(NextToken,1,7))='<script' then
begin
// while lowercase(NextToken)<>'</script>' do
// begin
// inc(InputIdx,length(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
// end;
// inc(InputIdx,length(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,length(NextToken));
inQuot:=false;
i:=InputIdx-1;
while I<InputLen do
begin
inc(i);
if s0='"' then
begin
inQuot:=not inQuot;
continue;
end;
if not inQuot then
// 去除<script>段里的<!-- ... -->注释段, 99.8.2
if copy(s0,i,4)='<!--' then
begin
HelpIdx:=pos('-->',copy(s0,i+4,MaxInt));
if HelpIdx>0 then
begin
inc(i,4+HelpIdx+2);
end
else
begin
i:=InputLen;
break;
end;
end;
if lowercase(copy(s0,i,9))='</script>' then
begin
break;
end;
end;
InputIdx:=i;
end;
NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,length(NextToken));
result:=result+ConvertHTMLToken(NextToken,inPre);
end;
end;