看到 http://www.delphibbs.com/delphibbs/dispq.asp?lid=3924528 在讨论如何得知很大的文件是否相同,突然有个想法,是否可以这样做:
利用黄金分割点的办法,快速地选择检查点,从而大致判定两个很大的文件是否相同,于是就写了下面这个代码。方法就是选择一些特别的点去评估,代码中只是从头到尾地做了一遍,假如再从尾到头地来一遍,准确度就不见得会输于用 MD5 的比较,而耗费的时间却极短,完全可以用于非人为做假前提下的超大文件的相似性判定(代码中只做了是否相同的判定,没有做相似性判定,如果有需要,我想,做相似性判定的意义更大些)。
实际上,不论是 MD5 还是其他的方法(校验和除外),本质上都是获取文件特征,而获取文件特征就有粗略的和精细的分别,对于很大的文件,用这种特定点的取样来处理,未必不是一条可行之路。
完整的源代码贴于下面,并在最后还有简单的测试代码样例,供大家参考和批评:
unit CheckPoint;
interface
uses
Windows, SysUtils, Classes;
const
// Number of check points created when Create is called without an argument.
DefaultPoint = 1000;
type
// Compares two files by sampling a few bytes at selected offsets instead of
// reading them completely. The inherited TList storage holds the offsets,
// each one kept as an integer cast to a pointer.
TSampleCompare = class(TList)
private
// Fills the offset slots from index 3 upwards by calling CreateCheckPoint.
procedure Init;
// Returns the offset stored in slot Num; a slot still holding -1 is
// computed as the sum of the two preceding slots and cached.
function CreateCheckPoint(Num:integer): integer;
// Read accessor of DiffPoint; an out-of-range Index yields slot 0.
function GetPosition(Index: Integer): Longword;
// Reads sizeof(integer) bytes from FileName at each stored offset that is
// below the file size and appends them to SampleBuffer.
procedure SelectSample(FileName: string;
SampleBuffer: TMemoryStream);
public
// CheckPoint is the number of offset slots to allocate.
constructor Create(CheckPoint: integer = DefaultPoint);
// Returns True when the samples taken from both files are byte-identical.
function FileCompare(FileName1, FileName2: string): Boolean;
// Offset of the Index-th check point (default array property).
property DiffPoint[Index: Integer]: Longword read GetPosition;
default;
end;
implementation
{ TSampleCompare }
{ Creates the sampler with CheckPoint offset slots.

  The first three slots are seeded with 0, 1 and 1; every further slot is
  marked with -1 ("not computed yet") and then filled in by Init, which makes
  each slot the sum of the two slots before it.

  CheckPoint: number of slots to allocate. Negative values are treated as 0,
  and values below 3 simply receive fewer seed entries instead of raising a
  list-index error. }
constructor TSampleCompare.Create(CheckPoint: integer);
const
  // Starting values of the additive sequence used as sample offsets.
  Seeds: array[0..2] of integer = (0, 1, 1);
var
  i: integer;
begin
  inherited Create;
  if CheckPoint < 0 then
    CheckPoint := 0;
  Count := CheckPoint;
  for i := 0 to Count - 1 do
    if i <= High(Seeds) then
      Items[i] := pointer(Seeds[i])
    else
      // -1 is the sentinel CreateCheckPoint tests for.
      Items[i] := pointer(-1);
  Init;
end;
{ Returns the offset stored in slot Num.

  A slot that still contains the sentinel -1 has not been computed yet; it is
  then derived as the sum of the two preceding slots, written back into the
  list and returned. }
function TSampleCompare.CreateCheckPoint(Num: integer): integer;
var
  Cached: integer;
begin
  Cached := integer(Items[Num]);
  if Cached <> -1 then
  begin
    // Already known: hand back the stored value untouched.
    Result := Cached;
    Exit;
  end;
  Result := CreateCheckPoint(Num - 2) + CreateCheckPoint(Num - 1);
  Items[Num] := pointer(Result);
end;
{ Reports whether two files look identical, judged by their length and by the
  bytes sampled at the stored check-point offsets.

  FileName1, FileName2: paths of the files to compare.

  Returns False when either path is missing or is a directory, when the two
  file lengths differ, or when any sampled byte differs; True otherwise.
  Because only samples are inspected, True means "probably identical". }
function TSampleCompare.FileCompare(FileName1, FileName2: string): Boolean;
var
  M1, M2: TMemoryStream;
  Size1, Size2: Int64;

  // Fetches the length of an existing regular file; False if it cannot be
  // queried or the path names a directory.
  function TryGetFileSize(const Name: string; var Size: Int64): Boolean;
  var
    Info: TWin32FileAttributeData;
  begin
    Result := GetFileAttributesEx(PChar(Name), GetFileExInfoStandard, @Info)
      and ((Info.dwFileAttributes and FILE_ATTRIBUTE_DIRECTORY) = 0);
    if Result then
      Size := (Int64(Info.nFileSizeHigh) shl 32) + Int64(Info.nFileSizeLow);
  end;

begin
  Result := False;
  if not TryGetFileSize(FileName1, Size1) then
    Exit;
  if not TryGetFileSize(FileName2, Size2) then
    Exit;
  // Sampling alone cannot see a length difference when no check point falls
  // into the extra bytes, so the lengths are compared explicitly first.
  if Size1 <> Size2 then
    Exit;
  M1 := TMemoryStream.Create;
  try
    // Nested try blocks: M1 is released even if creating M2 raises.
    M2 := TMemoryStream.Create;
    try
      SelectSample(FileName1, M1);
      SelectSample(FileName2, M2);
      if M1.Size = M2.Size then
        Result := CompareMem(M1.Memory, M2.Memory, M1.Size);
    finally
      M2.Free;
    end;
  finally
    M1.Free;
  end;
end;
{ Read accessor of the DiffPoint property.

  Returns the offset stored at Index; any Index outside 0..Count-1 is
  redirected to slot 0. }
function TSampleCompare.GetPosition(Index: Integer): Longword;
var
  Slot: Integer;
begin
  Slot := Index;
  if (Slot < 0) or (Slot >= Count) then
    Slot := 0;
  Result := Longword(Items[Slot]);
end;
{ Computes every offset slot from index 3 up to the last one by calling
  CreateCheckPoint on each in ascending order. }
procedure TSampleCompare.Init;
var
  Slot, Last: integer;
begin
  Last := Count - 1;
  Slot := 3;
  while Slot <= Last do
  begin
    CreateCheckPoint(Slot);
    Inc(Slot);
  end;
end;
{ Collects the sample bytes of one file.

  FileName:     path of the file to sample.
  SampleBuffer: receives the samples; it is cleared first. Nothing happens
                when it is nil.

  For each check point, in list order, up to SizeOf(Longword) bytes are read
  at that offset and appended to SampleBuffer. Sampling stops at the first
  check point whose offset is not below the file length.

  Raises the stream's open exception (EFOpenError) when the file cannot be
  opened, instead of silently sampling an invalid handle. }
procedure TSampleCompare.SelectSample(FileName: string;
  SampleBuffer: TMemoryStream);
var
  Source: TFileStream;
  FileLen: Int64;
  Offset: Longword;
  Chunk: Longword;
  BytesRead: Integer;
  i: integer;
begin
  if not Assigned(SampleBuffer) then
    Exit;
  SampleBuffer.Clear;
  Source := TFileStream.Create(FileName, fmOpenRead or fmShareDenyWrite);
  try
    // The stream reports a 64-bit length, so files above 4 GB are not
    // truncated to their low 32 bits.
    FileLen := Source.Size;
    for i := 0 to Count - 1 do
    begin
      Offset := DiffPoint[i];
      if Int64(Offset) >= FileLen then
        Break;
      Source.Position := Offset;
      Chunk := 0;
      BytesRead := Source.Read(Chunk, SizeOf(Chunk));
      // Near the end of the file fewer than four bytes may be available;
      // only the bytes actually read are stored, so the buffer never
      // contains uninitialised memory.
      if BytesRead > 0 then
        SampleBuffer.WriteBuffer(Chunk, BytesRead);
    end;
  finally
    Source.Free;
  end;
end;
end.
测试的样例代码如下,也非常简单:
unit Unit1;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, CheckPoint;
type
// Test form: one button that compares the two files named by the
// FileName1 / FileName2 constants and shows the verdict.
TForm1 = class(TForm)
Button1: TButton;
// Creates the TSampleCompare instance held in FCheckPoint.
procedure FormCreate(Sender: TObject);
// Frees the TSampleCompare instance.
procedure FormDestroy(Sender: TObject);
// Runs the comparison and displays the result message.
procedure Button1Click(Sender: TObject);
private
// Sampler shared by all button clicks; lives as long as the form.
FCheckPoint: TSampleCompare;
public
{ Public declarations }
end;
var
// Form instance variable of this unit.
Form1: TForm1;
implementation
{$R *.dfm}
// Paths of the two sample files compared by Button1Click.
const
FileName1 = 'C:/Heroes5.zip';
// Author's note: this file is larger than 1.5 GB.
FileName2 = 'C:/TrackMania.rar';
// Author's note: this file is larger than 1.5 GB.
{ Form start-up: builds the file sampler with its default number of check
  points and keeps it in FCheckPoint for later button clicks. }
procedure TForm1.FormCreate(Sender: TObject);
begin
  FCheckPoint := TSampleCompare.Create;
end;
{ Form shutdown: releases the sampler created in FormCreate. }
procedure TForm1.FormDestroy(Sender: TObject);
begin
  FCheckPoint.Free;
end;
{ Compares FileName1 with FileName2 through the sampler and shows one of two
  messages: "appear to be the same" or "seem to be different". }
procedure TForm1.Button1Click(Sender: TObject);
var
  Verdict: string;
begin
  if FCheckPoint.FileCompare(FileName1, FileName2) then
    Verdict := '二个文件好像是相同的.'
  else
    Verdict := '二个文件似乎是不相同的!';
  ShowMessage(Verdict);
end;
end.