function HtmlToTxt(const HTMLText:string;MarkLinks:boolean):string;
var
NextToken,s0:string;
i:integer;
HelpIdx:integer;
inQuot:boolean; // 去除<script>段之用
InputLen:integer;
InputIdx:integer; // 指向输入字符的下一个待处理字符
inPre:boolean; // 表示是否在<pre>...</pre>段内
CurrLink:string;
// 取得下一段字符串
function GetNextToken(const s:string; const StartIdx:integer):string;
var
i:integer;
begin
result:=s[StartIdx];
if result='&' then
begin
for i:=StartIdx+1 to Length(s) do
begin
if s in ['&',' ',#13,'<'] then break;
result:=result+s;
if s=';' then break;
end;
end
else if result='<' then
begin
for i:=StartIdx+1 to Length(s) do
begin
result:=result+s;
if s='>' then break;
end;
end
else
begin
for i:=StartIdx+1 to Length(s) do
if s in ['&','<'] then break
else result:=result+s;
end;
end;
// 输入:<a href="http://delphigroup.yeah.net">
// 输出:http://delphigroup.yeah.net
function GetLink(s:string):string;
var
LPos,RPos,LQuot,RQuot:integer;
begin
result:='';
// 去掉'....<'
LPos:=pos('<',s);
if LPos=0 then exit;
delete(s,1,LPos);
s:=Trim(s);
// 去掉'>....'
RPos:=pos('>',s);
if RPos=0 then exit;
delete(s,RPos,MaxInt);
if uppercase(copy(s,1,2))='A ' then
begin
LPos:=pos('HREF',uppercase(s));
if LPos=0 then exit;
LQuot:=PosX('"',s,1);
RQuot:=PosX('"',s,2);
if (LQuot<LPos) or (RQuot>RPos) then exit;
// 开头带'#'的超链接,视为无效
if s[LQuot+1]='#' then exit;
// 开头带'javascript:'的超链接,也视为无效
// 如:<div align=right><a href="javascript:window.close()"><IMG SRC="button_close.gif"></a></div>
if copy(s,LQuot+1,11)='javascript:' then exit;
result:=copy(s,LQuot+1,RQuot-LQuot-1);
end;
end;
// 把所有&xxx的转义;所有<xxx>取消;其它照样返回
function ConvertHTMLToken(const s:string;var inPre:boolean):string;
var
s0,s0_2,s0_3,s0_4:string;
begin
if s[1]='&' then
begin
s0:=lowerCase(s);
result:='';
if s0=' ' then result:=' '
else if s0='"' then result:='"'
else if s0='>' then result:='>'
else if s0='<' then result:='<'
else if s0='·' then result:='·'
else if s0='™' then result:=' TM '
else if s0='©' then result:='(c)'
else if s0='&' then result:='&'
else if s0='&' then result:='&';
end
else if s[1]='<' then
begin
s0:=lowerCase(s);
s0_2:=copy(s0,1,2);
s0_3:=copy(s0,1,3);
s0_4:=copy(s0,1,4);
result:='';
// 将所有<hr>替换成为'------'
if s0='<br>' then result:=CR
else if s0_4='<pre' then // <pre 一定要在 <p 之前判断!
begin inPre:=true;result:=CR; end
else if s0_2='<p' then result:=CR+CR
else if s0_3='<hr' then result:=CR+DupString('-',40)+CR
else if s0_3='<ol' then result:=CR
else if s0_3='<ul' then result:=CR
// else if s0_4='</ol' then result:=CR
// else if s0_4='</ul' then result:=CR
else if s0_3='<li' then result:='·'
else if s0_4='</li' then result:=CR
else if s0_4='</tr' then result:=CR
else if s0='</td>' then result:=#9
else if s0='<title>' then result:='《'
else if s0='</title>' then result:='》'+CR+CR
else if s0='</pre>' then inPre:=false
else if copy(s0,1,6)='<table' then result:=CR
else if MarkLinks and (s0[2]='a') then
begin
CurrLink:=GetLink(s);
if CurrLink<>'' then result:='[';
end
else if MarkLinks and (s0='</a>') then
if CurrLink<>'' then result:=format(' %s ]',[CurrLink]);
end
else if inPre then
result:=s
else // 不在<pre>..</pre>内,则删除所有CR
result:=Replacing(s,CR,'');
end;
begin
if (pos('<',HTMLText)=0) and
(pos('>',HTMLText)=0) then exit;
s0:=UnixToDos(HTMLText);
result:='';
InputLen:=Length(s0);
InputIdx:=1;
inPre:=false;
CurrLink:='';
while InputIdx<=InputLen do
begin
NextToken:=GetNextToken(s0,InputIdx);
// 去除<style ...> -- </style>之间的内容
if lowercase(copy(NextToken,1,6))='<style' then
begin
while lowercase(NextToken)<>'</style>' do
begin
inc(InputIdx,Length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
inc(InputIdx,Length(NextToken));
NextToken:=GetNextToken(s0,InputIdx);
end;
// 去除<Script ...> -- </Script>之间的内容
if lowercase(copy(NextToken,1,7))='<script' then
begin
// while lowercase(NextToken)<>'</script>' do
// begin
// inc(InputIdx,strlen(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
// end;
// inc(InputIdx,strlen(NextToken));
// NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,Length(NextToken));
inQuot:=false;
i:=InputIdx-1;
while I<InputLen do
begin
inc(i);
if s0='"' then
begin
inQuot:=not inQuot;
continue;
end;
if not inQuot then
// 去除<script>段里的<!-- ... -->注释段, 99.8.2
if copy(s0,i,4)='<!--' then
begin
HelpIdx:=pos('-->',copy(s0,i+4,MaxInt));
if HelpIdx>0 then
begin
inc(i,4+HelpIdx+2);
end
else
begin
i:=InputLen;
break;
end;
end;
if lowercase(copy(s0,i,9))='</script>' then
begin
break;
end;
end;
InputIdx:=i;
end;
NextToken:=GetNextToken(s0,InputIdx);
inc(InputIdx,Length(NextToken));
result:=result+ConvertHTMLToken(NextToken,inPre);
end;
end;