呵呵,我写过一个类似的程序(抓取新浪新闻,保存进入数据库 ),贴出来大家看看吧:
dfm文件:
object FormMain: TFormMain
Left = 343
Top = 218
BorderIcons = [biSystemMenu, biMinimize]
BorderStyle = bsSingle
Caption = 'News Crack'
ClientHeight = 125
ClientWidth = 287
Color = clBtnFace
Font.Charset = GB2312_CHARSET
Font.Color = clWindowText
Font.Height = -12
Font.Name = '宋体'
Font.Style = []
Icon.Data = {
0000010002002020100000000000E80200002600000010101000000000002801
00000E0300002800000020000000400000000100040000000000800200000000
0000000000000000000000000000000000000000800000800000008080008000
0000800080008080000080808000C0C0C0000000FF0000FF000000FFFF00FF00
0000FF00FF00FFFF0000FFFFFF00000000000000000000000000000000000777
7777777777777777777770000000000000000000000000000000700000000FFF
FFFFFFFFFFFFFFFFFFF0700000000FFFFFFF000000FFFFFFFFF0700000000FFF
FFFFFFFFFFFFFFFFFFF0700000000FFFFFFF00000000FFFFFFF0700000000FFF
FFFFFFFFFFFFFFFFFFF0700000000FFFFFFF000000000FFFFFF0700000000FFF
FFFFFFFFFFFFFFFFFFF0000000000F00000FFFFFFFFFFFF000F0700000000FFF
FFFFFFFFFFFFFFF040F0070000000F00000FFFFFFFFFFFF000F0707000000FFF
FFFFFFFFFFFFFFFFFFF007070000000000000000000000000000F07070000000
000FFFFF0FFF0FFF0FFFFF070700000000FFFFF0FFF0FFF0FFF0FFF070000000
000FFF0FFF0FFF0FFF0FFFFF070000000000FFFFF0FFF0FFF0FFFFFFF0000000
00000FFF0FFF0FFF0FFFFFFFFF000000000000FFFFF0FFF0FFFFFFFFF0000000
0000000FFF0FFF0FFFFFFFFF0000000000000000FFFFF0FFF0FFFFF000000000
000000000FFF0FFF0F0FFF00000000000000000000FFFFF0F0FFF00000000000
00000000000FFF0F0FFF000000000000000000000000FFF0FFF0000000000000
0000000000000FFFFF0000000000000000000000000000FFF000000000000000
000000000000000F000000000000000000000000000000000000000000000000
0000000000000000000000000000FFFFFFFF8000007F0000007F0000007F0000
007F0000007F0000007F0000007F0000007F0000007F0000003F0000001F0000
000F0000000700000003FC000001F8000003FC000001FE000003FF000001FF80
0003FFC00007FFE0000FFFF0001FFFF8003FFFFC007FFFFE00FFFFFF01FFFFFF
83FFFFFFC7FFFFFFEFFFFFFFFFFF280000001000000020000000010004000000
0000C00000000000000000000000000000000000000000000000000080000080
00000080800080000000800080008080000080808000C0C0C0000000FF0000FF
000000FFFF00FF000000FF00FF00FFFF0000FFFFFF0007777777777770000000
0000000070000FFF777FFFF070000FFF7777FFF070000FFF77777FF000000777
FFFFF11070000777FFFFF11007000000000000007070000FF0F0F0F007000000
FF0F0F0FF07000000FF0F0FFFF00000000FF0FF0F0000000000FFF0F00000000
0000F0F00000000000000F000000000000000000000000070000000700000007
00000007000000070000000300000001000000010000C0010000E0010000F001
0000F8030000FC070000FE0F0000FF1F0000FFBF0000}
OldCreateOrder = False
OnCreate = FormCreate
PixelsPerInch = 96
TextHeight = 12
object Panel2: TPanel
Left = 0
Top = 0
Width = 287
Height = 125
Align = alClient
BevelOuter = bvNone
TabOrder = 0
object LabelTime: TLabel
Left = 103
Top = 9
Width = 60
Height = 12
Caption = '时间设定:'
end
object Bevel1: TBevel
Left = 8
Top = 80
Width = 273
Height = 10
Shape = bsBottomLine
end
object Label1: TLabel
Left = 16
Top = 6
Width = 48
Height = 12
Caption = 'Success:'
end
object Label2: TLabel
Left = 16
Top = 42
Width = 36
Height = 12
Caption = 'Error:'
end
object Label3: TLabel
Left = 16
Top = 24
Width = 42
Height = 12
Caption = 'Ingore:'
end
object LabelSuccess: TLabel
Left = 66
Top = 6
Width = 6
Height = 12
Caption = '0'
Font.Charset = GB2312_CHARSET
Font.Color = clBlue
Font.Height = -12
Font.Name = '宋体'
Font.Style = []
ParentFont = False
end
object LabelError: TLabel
Left = 66
Top = 42
Width = 6
Height = 12
Caption = '0'
Font.Charset = GB2312_CHARSET
Font.Color = clRed
Font.Height = -12
Font.Name = '宋体'
Font.Style = []
ParentFont = False
end
object LabelIngore: TLabel
Left = 66
Top = 24
Width = 6
Height = 12
Caption = '0'
end
object ButtonClear: TButton
Left = 221
Top = 29
Width = 60
Height = 21
Caption = '&Clear'
TabOrder = 0
OnClick = ButtonClearClick
end
object ButtonStart: TButton
Left = 221
Top = 55
Width = 60
Height = 21
Caption = '&Stop'
Default = True
Enabled = False
TabOrder = 1
OnClick = ButtonStartClick
end
object EditTime: TEdit
Left = 103
Top = 29
Width = 57
Height = 20
ReadOnly = True
TabOrder = 2
Text = '5'
end
object UpDownTime: TUpDown
Left = 160
Top = 29
Width = 17
Height = 20
Associate = EditTime
Min = 1
Position = 5
TabOrder = 3
Wrap = False
OnClick = UpDownTimeClick
end
object ButtonLoad: TButton
Left = 221
Top = 4
Width = 60
Height = 21
Caption = '&Manual'
TabOrder = 4
OnClick = ButtonLoadClick
end
object ProgressBar: TProgressBar
Left = 12
Top = 62
Width = 193
Height = 18
Min = 0
Max = 100
TabOrder = 5
end
object Edit1: TEdit
Left = 8
Top = 96
Width = 269
Height = 20
TabOrder = 6
end
end
object Timer: TTimer
OnTimer = TimerTimer
Left = 220
Top = 92
end
object NMHTTP: TNMHTTP
Port = 0
ReportLevel = 0
Body = 'Default.htm'
Header = 'Head.txt'
InputFileMode = False
OutputFileMode = False
ProxyPort = 0
Left = 252
Top = 92
end
object DataSourceNews: TDataSource
DataSet = Query
Left = 188
Top = 92
end
object Query: TADOQuery
Connection = Database
CursorType = ctStatic
Parameters = <>
SQL.Strings = (
'SELECT newstype.type_name as 类型, news.news_title AS 标题, news.new' +
's_detail AS 内容, news.news_url as 地址,news.news_time AS 时间'
'FROM news, newstype'
'WHERE news.type_id = newstype.type_id'
'ORDER BY news.type_id ASC, news.news_time DESC')
Left = 156
Top = 92
end
object Database: TADOConnection
CommandTimeout = 60
ConnectionString =
'Provider=MSDASQL.1;Password=xxx;Persist Security Info=True;Us' +
'er ID=xxx;Data Source=xxx;Connect Timeout=15;Extended Properti' +
'es="DSN=xxx;SERVER=xxxx;UID=xxx;PWD=xxx;WSID=xxx' +
'2";Locale Identifier=2052'
LoginPrompt = False
Provider = 'MSDASQL.1'
AfterConnect = DatabaseAfterConnect
Left = 124
Top = 92
end
end
pas文件:
unit UnitMain;
interface
uses
Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs,
StdCtrls, ComCtrls, ExtCtrls, Math, Psock, NMHttp, Grids, DBGrids, ADODB, Db, DBCtrls;
type
TFormMain = class(TForm)
Timer: TTimer;
NMHTTP: TNMHTTP;
DataSourceNews: TDataSource;
Panel2: TPanel;
LabelTime: TLabel;
ButtonClear: TButton;
ButtonStart: TButton;
EditTime: TEdit;
UpDownTime: TUpDown;
ButtonLoad: TButton;
Bevel1: TBevel;
Label1: TLabel;
Label2: TLabel;
Label3: TLabel;
LabelSuccess: TLabel;
LabelError: TLabel;
LabelIngore: TLabel;
ProgressBar: TProgressBar;
Query: TADOQuery;
Database: TADOConnection;
Edit1: TEdit;
procedure FormCreate(Sender: TObject);
procedure UpDownTimeClick(Sender: TObject; Button: TUDBtnType);
procedure TimerTimer(Sender: TObject);
procedure ButtonStartClick(Sender: TObject);
procedure ButtonClearClick(Sender: TObject);
procedure ButtonLoadClick(Sender: TObject);
procedure CrackOnce(Sender: TObject);
procedure PhraseNews(NewsIndex:integer;LinePos:integer);
function GetDetail(NewsIndex:integer;Title:string):string;
procedure DatabaseAfterConnect(Sender: TObject);
private
{ Private declarations }
public
TimeInc:integer;
Error:integer;
Success:integer;
Ingore:integer;
LastUrls1:array [0..10] of string;
LastUrls2:array [0..10] of string;
Source:string;
S:string;
{ Public declarations }
end;
const
NEWSURL = 'http://dailynews.sina.com.cn';
CLASSCOUNT = 10;
TokenStrings:array [0..CLASSCOUNT] of String=
(
'<!--开始新浪动态内容-->',
' 国内新闻',
' 国际新闻',
' 体育新闻',
' 科技新闻',
' 文化教育',
' 观点与评论',
' 游戏新闻',
' 财经新闻',
' 娱乐新闻',
' 社会新闻'
);
var
FormMain: TFormMain;
CurUrl:string;
InRound:boolean;
implementation
{$R *.DFM}
procedure CheckUrl(var Url:string);
begin
if (UpperCase(Copy(Url,1,4))<>'HTTP') then
begin
Url:=NEWSURL+'/'+Url;
CurUrl:=url;
end;
end;
procedure TFormMain.FormCreate(Sender: TObject);
var
I:integer;
begin
TimeInc:=UpDownTime.Position*60*1000;
Timer.Interval:=TimeInc;
ProgressBar.Max:=CLASSCOUNT;
ProgressBar.Min:=0;
ProgressBar.Position:=0;
Success:=0;
Ingore:=0;
Error:=0;
for I:=0 to CLASSCOUNT do
LastUrls1:='123';
for I:=0 to CLASSCOUNT do
LastUrls2:='123';
end;
procedure TFormMain.UpDownTimeClick(Sender: TObject; Button: TUDBtnType);
begin
TimeInc:=UpDownTime.Position*60*1000;
Timer.Interval:=TimeInc;
end;
procedure TFormMain.ButtonStartClick(Sender: TObject);
begin
Timer.Enabled:=Not Timer.Enabled;
if Timer.Enabled then
ButtonStart.Caption:='&Stop'
else
ButtonStart.Caption:='&Start'
end;
procedure TFormMain.ButtonClearClick(Sender: TObject);
begin
Query.Active:=False;
Query.SQL.Text:='DELETE FROM [free].[dbo].[news]';
Query.ExecSQL;
end;
procedure TFormMain.ButtonLoadClick(Sender: TObject);
begin
TimerTimer(Sender);
end;
procedure TFormMain.TimerTimer(Sender: TObject);
begin
if InRound=True then exit;
InRound:=True;
NMHTTP.Disconnect;
try
NMHTTP.Get(NEWSURL);
Source:=NMHTTP.Body;
CrackOnce(Sender);
InRound:=False;
except
Inc(Error,CLASSCOUNT);
LabelError.Caption:=IntToStr(Error);
InRound:=False;
end;
end;
procedure TFormMain.CrackOnce(Sender: TObject);
var
iPos:integer;
sFindStr:string;
I:integer;
begin
for I:=0 to CLASSCOUNT do
begin
sFindStr:=TokenStrings;
iPos:=Pos(sFindStr,Source);
Delete(Source,1,iPos-1);
PhraseNews(I,iPos);
ProgressBar.Position:=I;
if I=CLASSCOUNT then ProgressBar.Position:=0;
end;
end;
procedure TFormMain.PhraseNews(NewsIndex:integer;LinePos:integer);
var
sTitle:string;
sUrl:string;
sDetail:string;
tempStr1:string;
tempstr2:string;
I:integer;
begin
sTitle:='';
sUrl:='';
sDetail:='';
//get news url
I:=Pos('a href="',Source)+Length('a href="');
while (Source<>'"') and (Source<>' ') do
begin
sUrl:=sUrl+Source;
Inc(I);
end;
while Source<>'>' do
Inc(I);
Inc(I);
//get news title
while Source<>'<' do
begin
sTitle:=sTitle+Source;
Inc(I);
end;
if NewsIndex = 0 then //头条新闻
begin
if pos('china',sUrl)>0 then NewsIndex:=-1;
if pos('world',sUrl)>0 then NewsIndex:=-2;
if pos('sports',sUrl)>0 then NewsIndex:=-3;
if pos('tech',sUrl)>0 then NewsIndex:=-4;
if pos('culture',sUrl)>0 then NewsIndex:=-5;
if pos('comment',sUrl)>0 then NewsIndex:=-6;
if pos('game',sUrl)>0 then NewsIndex:=-7;
if pos('finance',sUrl)>0 then NewsIndex:=-8;
if pos('living',sUrl)>0 then NewsIndex:=-9;
if pos('society',sUrl)>0 then NewsIndex:=-10;
end;
{if NewsIndex=1 then //国内新闻
begin
end;
if NewsIndex=2 then //国际新闻
begin
end;
if NewsIndex=3 then //体育新闻
begin
end;
if NewsIndex=4 then //科技新闻
begin
end;
if NewsIndex=5 then //文化新闻
begin
end;
if NewsIndex=6 then //观点与评论新闻
begin
end;
if NewsIndex=7 then //游戏新闻
begin
end;
if NewsIndex=8 then //财经新闻
begin
end;
if NewsIndex=9 then //娱乐新闻
begin
end;
if NewsIndex=10 then //社会新闻
begin
end;}
if NewsIndex<0 then NewsIndex:=-NewsIndex;
CheckUrl(sUrl);
Edit1.Text:=sUrl+' '+sTitle;
try
NMHTTP.Disconnect;
NMHTTP.Body:='';
NMHTTP.Get(sUrl);
S:=NMHTTP.Body;
sDetail:=GetDetail(NewsIndex,sTitle);
except
Inc(Error);
LabelError.Caption:=IntToStr(Error);
exit;
end;
if (sTitle='') or (sDetail='') then
begin
Inc(error);
labelerror.caption:=IntToStr(error);
exit;
end;
if (sUrl=LastUrls1[NewsIndex]) or (sUrl=LastUrls2[NewsIndex]) then
begin
Inc(Ingore);
LabelIngore.Caption:=IntToStr(Ingore);
exit;
end;
LastUrls2[NewsIndex]:=LastUrls1[NewsIndex];
LastUrls1[NewsIndex]:=sUrl;
Query.Close;
sTitle:=StringReplace(sTitle,'''','"',[rfReplaceAll]);
if Length(sDetail)<8000 then
begin
//分成两个sql语句插入大于4000小于8000的新闻
if (Length(sDetail)< 8000) and (Length(sDetail)> 4000) then
begin
tempstr1 := sDetail;
tempstr2 := sDetail;
delete(tempstr2,1,4000);
delete(tempstr1,4000,length(tempstr1));
Query.active:=false;
Query.SQL.Text:='INSERT INTO [free].[dbo].[news](n_title,n_detail,n_url,t_id) VALUES('+''''+stitle+''''+','+''''+ tempstr1 +''''+','+''''+sUrl+''''+','+IntToStr(NewsIndex)+')';
try
Query.ExecSQL;
Inc(Success);
labelsuccess.caption:=IntToStr(success);
except
Inc(error);
labelerror.caption:=IntToStr(error);
end;
Query.active:=false;
Query.SQL.Text:='update [free].[dbo].[news] set n_detail=n_detail + ' + tempstr2 + ' where n_id=(select max(n_id) from news) ';
try
Query.ExecSQL;
Inc(Success);
labelsuccess.caption:=IntToStr(success);
except
Inc(error);
Labelerror.Caption:=IntToStr(error);
end;
end
else
begin
Query.Active:=False;
Query.SQL.Text:='INSERT INTO [free].[dbo].[news](n_title,n_detail,n_url,t_id) VALUES('+''''+stitle+''''+','+''''+sDetail+''''+','+''''+sUrl+''''+','+IntToStr(NewsIndex)+')';
try
Query.ExecSQL;
Inc(Success);
labelsuccess.caption:=IntToStr(Success);
except
Inc(error);
labelerror.caption:=IntToStr(Error);
end;
end; //end of 分成两个sql语句插入大于4000小于8000的新闻
end;
end;
function TFormMain.GetDetail(NewsIndex:integer;Title:string):string;
function VarifyText(S:string):string;
begin
Result:=StringReplace(S,'新浪科技','科技',[rfReplaceAll]);
Result:=StringReplace(S,'新浪娱乐','娱乐',[rfReplaceAll]);
if Pos('新浪',S)>0 then
begin
Result:='';
exit;
end;
Result:=StringReplace(S,'''','"',[rfReplaceAll]);
Result:=StringReplace(Result,'http://www.sina.com.cn','http://www.textclick.com',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'http://sports.sina.com.cn','http://www.textclick.com',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'http://tech.sina.com.cn','http://www.textclick.com',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'http://games.sina.com.cn','http://www.textclick.com',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'http://living.sina.com.cn','http://www.textclick.com',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'http://finance.sina.com.cn','http://www.textclick.com',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(S,#13,'',[rfReplaceAll]);
Result:=StringReplace(S,#10,#13#10,[rfReplaceAll]);
Result:=StringReplace(Result,#13#10#13#10#13#10,'',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,#13#10#13#10,'',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,#13#10,'',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'IMG SRC="/','IMG SRC="'+NEWSURL+'/',[rfReplaceAll,rfIgnoreCase]);
Result:=StringReplace(Result,'IMG SRC=/','IMG SRC='+NEWSURL+'/',[rfReplaceAll,rfIgnoreCase]);
end;
var
I,J,J1,J2,J3:integer;
begin
if newsindex=7 then
I:=Pos('<hr',S)
else
begin
I:=Pos('<HR',S);
if I=0 then I:=Pos('<hr',S);
end;
Delete(S,1,I+37);
J1:=Pos('<BR',S);
J2:=Pos('<br',S);
J3:=Pos('相关报道',S);
if pos('图文',title)= 1 then
begin
J2:=Pos(LowerCase(#13#10+'<br'),S);
end;
if J1+J2+J3=0 then
begin
Result:='';
exit;
end;
if J1=0 then J1:=J2;
if J2=0 then J2:=J1;
if J3=0 then J3:=J1;
J:=J1;
if J2<J then J:=J2;
if J3<J then J:=J3;
if NewsIndex=7 then //game
J:=Pos('<ul>',S);
Delete(S,J,Length(S)-J+1);
Result:=S;
Result:=VarifyText(Result);
end;
procedure TFormMain.DatabaseAfterConnect(Sender: TObject);
begin
ButtonStart.Enabled:=True;
end;
end.
根据网页的标记来判断的,缺点就是如果dailynews.sina.com.cn的版面改动过了,程序也要
做相应的修改。