可以使用 TWebBrowser 来分析。
JvHtmlParser没分析过。
下面我自己写的分析,可以参考下的。
// psInput: 输入HTML源码
// pbShowTitle 是否保留标题
// pbShowLink 是否保留超级链接地址串
function GetHtmlText(const psInput:string;pbShowTitle:boolean=True;
pbShowLink:boolean=False;pbOnlyLink:boolean =False;ALinks :TStringList=nil):string;
// 得到所有超级链接地址与文本
// psInput: 输入HTML源码
// pbGetText: 是否取文本内容
// ALinks: 得到的超级链接地址与文本,如果取文本,则地址在奇数行,文本在偶数行
// 返回值: 超级链接数量
function GetLinks(const psInput:string;pbGetText :boolean;ALinks :TStringList):integer;
implementation
function GetHtmlText(const psInput:string;pbShowTitle,pbShowLink:boolean;
pbOnlyLink:boolean ;ALinks :TStringList):string;
const
CrLf = #13#10;
// sHtmlChar: HTML格式的特殊字符 &XXX; 的形式保存
// sTextChar: 转换后的文本
// 这二个常量做成字典,一定要一一对应!!!
sHtmlChar:array[0..7] of string =
(' ','&','<','>','"','®','©','·');
sTextChar:array[0..7] of string =
(' ','&','<','>','"','(R)','(c)','·');
var
sOutput, CurrentLink:string;
iInputLen, iPos, iMaxHtmlChar, iLoop,
iIndex , iOutputLen,iOutputIndex:integer;
bInPre, // 当前字符是否处于 Pre块、 ---bInScript,Style或Script块
bFindEndTag:boolean; // 主过程是否继续查找标签结束符 '>'
////bHasSpaceTagBefoer // 标签字符前是否有空格
procedure AddOutputLength(const piSize :integer);
begin
if pbOnlyLink then Exit;
// 分配输出串大小 -- 累加
iOutputLen := iOutputIndex + piSize;
SetLength(sOutput,iOutputLen);
end; // Local
procedure AddToOutput(const psValue:string);
var
iLen ,iLoop:integer;
begin
if pbOnlyLink then Exit;
// 累计结果串
iLen := Length(psValue);
if iLen =0 then Exit;
if iOutputIndex + iLen > iOutputLen then
AddOutputLength( (iLen+ iInputLen - iIndex)*2 );
for iLoop :=1 to iLen do
begin
Inc(iOutputIndex); // 初始为 0
sOutput[iOutputIndex] := psValue[iLoop];
end;
end; // Local
function IsEnd:boolean;
begin
// 是否结尾
Result := iIndex > iInputLen;
end; // Local
function Current:Char;
begin
if Not IsEnd then
Result :=psInput[iIndex]
else
Result := #0;
end;
function FindCharIndex(const pcChar:Char):boolean;
begin
// 只有找到才返回真 -- 目前没用使用返回值
while Not IsEnd and (Current <> pcChar) do Inc(iIndex);
Result := Not IsEnd and (Current = pcChar);
end;
procedure RemoveSpace;
begin
//移除空格
while Not IsEnd and (Current=' ') do Inc(iIndex);
end; // Local
function GetHtmlTextChar(const psToken :string):string;
var
iLoop :integer;
begin
// 处理特殊字符
if pbOnlyLink then
begin
Result :='';
Exit;
end;
Result := psToken;
for iLoop := Low(sHtmlChar) to High(sHtmlChar) do
if psToken = sHtmlChar[iLoop] then
begin
Result := sTextChar[iLoop];
Exit;
end;
end; // Local
function GetLinkStr: string;
var
bInValid{,bHasDoubleQuot,bHasSingleQuot} :boolean;
iChar :Char;
begin
// 得到超级链接地址
Result := '';
RemoveSpace;
iPos := iIndex;
while (not IsEnd) do
begin
//if Copy(psInput,iIndex,5)='class' then
// ShowMessage('aa');
if ((FindCharIndex('h') or FindCharIndex('H'))
and (LowerCase(Copy(psInput,iIndex,4)) = 'href')) then
begin
Inc(iIndex,3);
break;
end;
end;
FindCharIndex('=') ;
// 超链接:
// (1) <a class="b" href="xxx">..</a>
// (2) <a href = fff>..</a>
// (3) <a href='xx'>..</a>
// (4) <a href=#>..</a>
// (5) <a href=" javascript:xxx">..</a>
if not IsEnd then //LowerCase(Trim(Copy(psInput,iPos,iIndex-iPos)))='href' then
begin
Inc(iIndex);
RemoveSpace;
//bHasDoubleQuot := Current='"'; // 是否有双引号
//bHasSingleQuot := Current = ''''; // 是否有单引号
iChar := Current;
if (iChar ='"') or (iChar='''') then
Inc(iIndex)
else
iChar := #0;
//if bHasDoubleQuot or bHasSingleQuot then Inc(iIndex);
// 下面几种不要处理:
// <1>href=#... 之类不处理--书签
// <2>javascript: 脚本语言
//ToDo: 问题:如果此脚本语言中包含 '>' 符号,则要怎么处理?
RemoveSpace; // 去除可能单引号或双引号后的空格
bInValid := (Current='#') or (LowerCase(Copy(psInput,iIndex,11))='javascript:');
if not bInValid then
begin
iPos := iIndex;
//if bHasDoubleQuot then // 定位到结束的双引号
// FindCharIndex('"')
//else if bHasSingleQuot then // 定位到结束的单引号
// FindCharIndex('''')
if iChar <> #0 then
FindCharIndex(iChar)
else
while not IsEnd and (Current<>' ') and (Current<>'>') do Inc(iIndex);
Result := Copy(psInput,iPos,iIndex - iPos); // 不含结束符 >
end;
end;
// 保证 iIndex 指向 '>' 或结尾
FindCharIndex('>')
end; // Local
procedure RemoveStyle;
var
bInComm :boolean;
begin
// 处理格式中的注释中 /* */ 可能出现的 </style
// ....
bInComm := False;
while Not IsEnd do
begin
case Current of
'/':
if Not bInComm and (Copy(psInput,iIndex+1,1)='*') then
begin
bInComm := True;
Inc(iIndex);
end;
'*':
if bInComm and (Copy(psInput,iIndex+1,1)='/') then
begin
bInComm := False;
Inc(iIndex);
end;
'<':
if Not bInComm and
(LowerCase(Copy(psInput,iIndex+2,5)) = 'style') then Break;
end; // case end.
Inc(iIndex);
end; // while end
FindCharIndex('>');
end; // Local
function RemoveScript:boolean;
var
bInString,bIsDoubleQuot, // 当前处于 字串中 ,双引号字串(可含单引号),
bIsSingleQuot, // 单引号字串(可含双引号)
bIsJavaScript : boolean; // 语言,True :JavaScript ,False:VBScript
sTag :string;
begin
// 处理脚本语言
// 语言 注释 字串
// vbscript 单引号 ' </script 双引号 "</script"
// javascript 双斜杠 // </script 双引号 "</script" (转义符/)
Result :=FindCharIndex('>'); // 定位结束 '>'
if Not Result then Exit; // 没找到 > 返回 False ,-- 没有处理
// <script type=text/javascript
// <script language=javascript
sTag := LowerCase(Copy(psInput,iPos,iIndex-iPos+1));
bIsJavaScript := Not (Pos('vbscript',sTag)>0); //Todo: 不指定是 javascript??? !!!
bInString := False;
Inc(iIndex);
while Not IsEnd do
begin
case Current of
'''': // Vb的单行注释
begin
if Not bIsJavaScript then
begin
if Not bInString then
begin
while Not IsEnd and Not (Current in [#10,#13]) do Inc(iIndex);
end;
end;
// JavaScript 字串 ,当不是处于双引号内部时
if bIsJavaScript and Not bIsDoubleQuot then
begin
if bInString and (psInput[iIndex-1]<>'/') then
begin
bInString := False;
bIsSingleQuot := False;
end
else
begin
bInString := True;
bIsSingleQuot := True;
end;
end;
end;
'"': // 字串
// ToDo: 当处于正则表达式中时怎么处理?
begin
if bIsJavaScript then
begin
if Not bIsSingleQuot then
begin
if bInString and (psInput[iIndex-1]<>'/') then
begin
bInString := False;
bIsDoubleQuot := False;
end
else
begin
bInString := True;
bIsDoubleQuot := True;
end;
end;
end
else
begin
bInString := Not bInString;
end;
end;
'/': // Java的单行注释 双斜线 // ...#13#10
// ToDo: Java 的多行注释是 /* ... */ ? -- 是
begin
if bIsJavaScript then
begin
if Not bInString then
begin
if (Copy(psInput,iIndex,2)='//') then
while Not IsEnd and Not (Current in [#10,#13]) do Inc(iIndex)
else if (Copy(psInput,iIndex,2)='/*') then
begin
Inc(iIndex);
Inc(iIndex); // 指向开始符之后
while Not IsEnd do
begin
FindCharIndex('*');
if Not IsEnd and (Copy(psInput,iIndex,2)='*/') then
begin
Inc(iIndex); // 指向结束符 /, 外面的 ++ 再指向下一个字符
Break;
end;
Inc(iIndex);
end;
end;
end; // if Not bInString end.
end;
end;
'<':
begin
if Not bInString and // 找到结束符 </script
(LowerCase(Copy(psInput,iIndex+2,6)) = 'script') then Break;
end;
end; // case .. end.
Inc(iIndex)
end; // while .. end.
FindCharIndex('>');
end; // Local
procedure RemoveComm;
begin
while Not IsEnd and FindCharIndex('>') do
begin
if (psInput[iIndex-1] ='-') and (psInput[iIndex-2] ='-') then
Break;
Inc(iIndex);
end;
end; // Local
function GetHtmlToken(const psTag :string):string;
var
iTagLen :integer;
s_2,s_3,s_4 :string;
begin
// 处理HTML标签,Result 就是处理标签的内容 ,标签头与尾都调用
// bFindEndTag 设置为 False 说明已处理标签到 '>' 符,后续不再处理
// bFindendTag = False 时要把 iIndex 设置指向 '>' 符或结束符
if psTag='' then Exit;
iTagLen := Length(psTag);
s_2 := Copy(psTag,1,2);
s_3 := Copy(psTag,1,3);
s_4 := Copy(psTag,1,4);
Result := '';
if s_3 = '<br' then // 加车换行 <br> <br /> ,< br>?
Result := CrLf
else if s_4 = '<div' then
Result := CrLf
else if s_4 = '<pre' then
bInPre := True
else if Copy(psTag,1,5) = '</pre' then
bInPre := False
else if s_2 = '<p' then // 注: <pre 判断要比 <p 前
Result := CrLf + CrLf
else if s_3 = '<li' then
Result := CrLf
else if s_3 = '<tr' then
Result := CrLf
else if s_4 = '</td' then
Result := #9
else if pbShowLink and (psTag = '<a ') then // 超级链接 开始
begin
CurrentLink := GetLinkStr;
if CurrentLink <> '' then Result := ' [';
bFindEndTag := False;
end
else if pbShowLink and (s_4 = '</a>') then // 超级链接 结束
begin
if CurrentLink<>'' then
begin
Result := ' '+ CurrentLink + '] ';
if pbOnlyLink then ALinks.Add(CurrentLink);
end;
CurrentLink :='';
end
else
begin
if iTagLen>=6 then
begin
// 注释: <!----> <!--comman--> <!-- xx -->
// <!--script xx>...</script-->
// 脚本: <style xx>...</style>
// <script xx>...</script>
if (Copy(psTag, 2, 5) = 'style') or // 2-5: <style
((iTagLen>=9) and (Copy(psTag,5,5)='style')) then // 5-5: <!--style
begin
RemoveStyle;
bFindEndTag := False;
end
else if (Copy(psTag, 2, 6) = 'script') or // 2-6: <script
((iTagLen>=10) and (Copy(psTag, 5, 6)='script')) then // 5-6: <!--script
begin
RemoveScript;
bFindEndTag := False;
end
else if Copy(psTag,1,4) ='<!--' then // HTML注释,可能注释中包含在脚本里,所以要后判断
begin
RemoveComm;
bFindEndTag := False;
end
//else if bInScript and ((iTagLen>=7) and
// ((Copy(psTag,3,5) = 'style') or // 3-5: </style
// (Copy(psTag,3,6)='script'))) then // 3-6: </script
// bInScript := False
else if Not pbShowTitle and (psTag='<title>') then
begin
// 不显示标题,处理掉
Inc(iIndex);
while Current<>'>' do Inc(iIndex);
bFindEndTag := False;
end;
end;
end;
end; // Local
begin // Main
// 注:由于对输入字串只循环一次,效率应该比较高吧!!! ^_^!! by liqj 2006-09
Result := '';
iInputLen := Length(psInput);
if iInputLen=0 then Exit;
sOutput := ''; //ToDo: 想节省一个变量则用 Result 代替.
iOutputIndex := 0;
AddOutputLength(iInputLen); // 初始化结果串等于输入串
iMaxHtmlChar := Length(sHtmlChar[Low(sHtmlChar)]);
for iLoop := Low(sHtmlChar)+1 to High(sHtmlChar) do
if iMaxHtmlChar < Length(sHtmlChar[iLoop]) then
iMaxHtmlChar := Length(sHtmlChar[iLoop]);
iIndex :=1;
bInPre := False;
//bInScript := False;
CurrentLink :='';
while Not IsEnd do
begin
case Current of
'<':
begin
iPos := iIndex;
RemoveSpace; // 保证 '<' 后的空格无效 : < /p -> </p ; < br -> <br
if iIndex <> iPos then // '<' 后有空格
iPos := iIndex // 开始符重新指定
else // 否则开始符指向下一个,即不要包含 '<' 符
Inc(iPos);
while Not IsEnd and (Current<>' ') and (Current<>'>') do Inc(iIndex);
bFindEndTag := True;
AddToOutput(GetHtmlToken('<'+LowerCase(Copy(psInput,iPos,iIndex -iPos+1))));
if bFindEndTag and (Current = ' ') then
FindCharIndex('>');
end;
'&':
begin
iPos := iIndex;
//FindCharIndex(';');
while Not IsEnd and Not (Current in [' ',';','<']) do Inc(iIndex);
if Current='<' then Continue;
if (iIndex-iPos+1<=iMaxHtmlChar) and (Current=';') then // 这里的长度是字典的最长字串的长度
AddToOutput(GetHtmlTextChar(LowerCase(Copy(psInput,iPos,iIndex -iPos+1))))
else
AddToOutput(Copy(psInput,iPos,iIndex -iPos+1));
end;
#13: // #13,#13#10
begin
if bInPre then
AddToOutput(CrLf)
else
AddToOutput(' '); //ToDo:1. 回车用空格代替
Inc(iIndex);
if Current=#10 then
Inc(iIndex);
Continue;
end;
#10: // #10,#10#13
begin
if bInPre then
AddToOutput(CrLf)
else
AddToOutput(' '); //ToDo:2. 回车用空格代替
Inc(iIndex);
if Current=#13 then
Inc(iIndex);
Continue;
end;
else
AddToOutput(psInput[iIndex]); //* 注:对于指多次分配资源的事,可以使用预分配方式解决
// 使用一过程来增加到结果,可以判断当前分配是否用完
// 开始可以预分配与输入一样大,一般够用了。
// 用流也行!!!
end; // case end.
Inc(iIndex);
end;
if iOutputIndex <> iOutputLen then
SetLength(sOutput,iOutputIndex);
Result := sOutput;
end;
// 比较字串,以左字串为准,可忽略右字串中的某个字符
// pbIsAllMatch : True 指右串中只能有左串与忽略字符
function SameTextLeftEx(const psValueLeft,psValueRight:string;
piIgnoreChar:Char;pbIsAllMatch:boolean):boolean;
var
iLen,iLen2 :integer;
begin
iLen := Length(psValueLeft);
iLen2 := Length(psValueRight);
while (iLen >=1) and (iLen2>=1) do
begin
if psValueRight[iLen2] = piIgnoreChar then
begin
Dec(iLen2);
if psValueLeft[iLen] = piIgnoreChar then Dec(iLen);
Continue;
end;
if psValueLeft[iLen]<>psValueRight[iLen2] then
Break;
Dec(iLen);
Dec(iLen2);
end;
if pbIsAllMatch and (iLen =0) and (iLen2>0) then // 处理后缀
while (iLen2<=1) and (psValueRight[iLen2]= piIgnoreChar) do Dec(iLen2);
Result := (not pbIsAllMatch and (iLen = 0)) or
(pbIsAllMatch and (iLen = 0) and (iLen2=0));
end;
function GetLinks(const psInput:string;pbGetText :boolean;ALinks :TStringList):integer;
var
iIndex,iLen,iLinkCount,
iPos,iPos2 :integer;
function FindChar(iChar :Char):boolean;
begin
while (iIndex<=iLen) and (psInput[iIndex]<>iChar) do Inc(iIndex);
Result := (iIndex<=iLen) and (psInput[iIndex]=iChar);
end;
function ProcessLink(const psLink:string):boolean;
var
iChar :Char;
iLen :integer;
begin
iLen := Length(psLink);
iPos := Pos('href',psLink);
Result := iPos >0;
if Result then
begin
iPos2 := PosEx('=',psLink,iPos+4); // uses StrUtils
Inc(iPos2);
while (iPos2<=iLen) and (psLink[iPos2]=' ') do Inc(iPos2);
if (iPos2<=iLen) and (psLink[iPos2]<>' ') then
begin
iChar := psLink[iPos2];
iPos := iPos2;
if (iChar = '"') or (iChar='''') then
iPos := iPos +1
else
iChar := ' ';
iPos2 := PosEx(iChar,psLink,iPos);
if iPos2=0 then iPos2 := iLen;
iLen := iPos2 - iPos;
//if psLink[iPos2 -1]='/' then Dec(iLen);
ALinks.Add(Copy(psLink,iPos,iLen));
end
else
ALinks.Add('#');
Inc(iLinkCount); // 超级链接地址数量
end;
end;
procedure GetLinkText;
begin
iPos := iIndex;
if FindChar('<') then
begin
while iIndex<=iLen do // 直到找到 </a>
begin
iPos2 := iIndex;
if FindChar('>') and
SameTextLeftEx('</a>',LowerCase(Copy(psInput,iPos2,iIndex-iPos2+1)),' ',False) then
begin
ALinks.Add(Copy(psInput,iPos +1,iPos2-iPos-1));
Break;
end;
FindChar('<');
end;
end
else
ALinks.Add('');
end;
begin
Result := 0;
iLinkCount :=0;
ALinks.Clear;
iLen := Length(psInput);
iIndex :=1;
while iIndex<=iLen do
begin
if psInput[iIndex]='<' then
begin
Inc(iIndex);
while (iIndex<=iLen) and (psInput[iIndex]=' ') do Inc(iIndex); // Ignore #32
if (iIndex < iLen) and (psInput[iIndex+1] =' ') and
((psInput[iIndex] = 'a') or (psInput[iIndex] = 'A')) then
begin
Inc(iIndex,2); // Jump: A#32
iPos := iIndex;
if FindChar('>') then
begin
//sLink := Copy(psInput,iPos,iIndex - iPos);
if ProcessLink(LowerCase(Copy(psInput,iPos,iIndex - iPos)))
and pbGetText then
GetLinkText;
end;
end
else
FindChar('>');
end;
Inc(iIndex);
end;
Result := iLinkCount;
end;