const
MAX_BUFCOUNT = 1024;
MAX_HASHSIZE = $FFFF;
type
PStr = ^TStr;
TStr = record
buf: PChar;
len: Integer;
next: PStr;
end;
PStrArray = ^TStrArray;
TStrArray = record
Next: PStrArray;
Strs: array [0..MAX_BUFCOUNT - 1] of TStr;
end;
THashData = Cardinal;
PHash = ^THash;
THash = record
HashData: THashData;
Str: PStr;
Next: PHash;
end;
PHashArray = ^THashArray;
THashArray = record
Next: PHashArray;
Hashs: array [0..MAX_BUFCOUNT - 1] of THash;
end;
PStrHash = ^TStrHash;
TStrHash = array [0..MAX_HASHSIZE - 1] of PHash;
PDocData = ^TDocData;
TDocData = record
SourceDocData: TStr
// 保存源串数据
Word: PStr
// 保存第一个单词指针,第二个单词在next
WordCount: Integer
// 源串中有多少个单词, 通过Word字段访问
Hash: TStrHash
// Hash表
IdleStr: PStr
// 空闲单词指针,from StrBufs
StrBufCount: Integer
// 空闲单词指针数量
StrBufs: PStrArray
// 单词指针缓存,一次性申请N个指针
IdleHash: PHash
// 如上str
HashBufCount: Integer
// 如上strbufcount
HashArrays: PHashArray
// 如上strbufs
end;
TCharToByteValue = array [Char] of Byte;
var
CHAR_2_BYTE: TCharToByteValue
// 区分大小写的值
CHAR_2_BYTE_CASEINSENSITIVE: TCharToByteValue
// 不区分大小写的值
CHAR_2_BYTEs: array [Boolean] of TCharToByteValue;
procedure Init_CHAR_2_BYTEs;
var
I: Char;
begin
for I := #0 to Chr(MAXCHAR - 1) do
begin
CHAR_2_BYTE := Ord(I);
CHAR_2_BYTE_CASEINSENSITIVE := Ord(I);
end;
for I := 'a' to 'z' do
Dec(CHAR_2_BYTE_CASEINSENSITIVE, 32);
CHAR_2_BYTEs[False] := CHAR_2_BYTE;
CHAR_2_BYTEs[True] := CHAR_2_BYTE_CASEINSENSITIVE;
end;
function GetHashValue(const Buf
Len: Integer
HashSeed: Cardinal
IgnoreCase: Boolean = False): Cardinal;
var
P: PChar;
I: Integer;
CharByteArray: ^TCharToByteValue;
begin
Result := HashSeed;
P := @Buf;
CharByteArray := @CHAR_2_BYTEs[IgnoreCase];
for I := 1 to Len do
begin
// 不区分大小写
Result := Result + CharByteArray[P^] * 37;
Inc(P);
end;
end;
function GetStrHashData(AStr: PStr): THashData;
//var
// PreHash: Int64;
// H1, H2: PCardinal;
begin
//Result := HashValue(AStr.Buf^, AStr.len, $DEEDBEEF) and MAX_HASHSIZE;
//H1 := @Int64Rec(Result).Hi;
//H2 := @Int64Rec(Result).Lo;
//H1^ := GetHashValue(AStr.Buf^, AStr.len, 0);
//H2^ := GetHashValue(AStr.Buf^, AStr.len, $DEEDBEEF);
Result := GetHashValue(AStr.Buf^, AStr.len, $DEEDBEEF);
end;
function GetIdleHash(ADocData: PDocData): PHash;
var
HashArray: PHashArray;
begin
if ADocData.HashBufCount <= 0 then
begin
HashArray := AllocMem(SizeOf(THashArray));
HashArray.Next := ADocData.HashArrays;
ADocData.HashArrays := HashArray;
ADocData.HashBufCount := MAX_BUFCOUNT;
ADocData.IdleHash := @HashArray.Hashs[0];
end;
Result := ADocData.IdleHash;
Inc(ADocData.IdleHash);
Dec(ADocData.HashBufCount);
end;
// 在Hash表中检查是否已存在该WORD,存在则return False(Hash表未修改)
function AddHash(ADocData: PDocData
AStr: PStr): Boolean;
var
IsFind: Boolean;
HashData: THashData;
HashValue: Word;
Hash, NewHash: PHash;
begin
HashData := GetStrHashData(AStr);
HashValue := HashData and MAX_HASHSIZE;
//HashValue := GetHashValue(HashData, SizeOf(HashData), 0) and MAX_HASHSIZE;
Hash := ADocData.Hash[HashValue];
IsFind := False;
while Hash <> nil do
begin
IsFind := (Hash.Str.len = AStr.len) and (Hash.HashData = HashData);
if IsFind then break;
Hash := Hash.Next;
end;
Result := not IsFind;
if Result then
begin
Hash := ADocData.Hash[HashValue];
NewHash := GetIdleHash(ADocData);
Newhash.Str := AStr;
NewHash.HashData := HashData;
if Hash = nil then
NewHash.Next := nil
else
NewHash.Next := Hash;
ADocData.Hash[HashValue] := NewHash;
end;
end;
function GetIdleStr(ADocData: PDocData): PStr;
var
StrArray: PStrArray;
begin
if ADocData.StrBufCount <= 0 then
begin
StrArray := AllocMem(SizeOf(TStrArray));
StrArray.Next := ADocData.StrBufs;
ADocData.StrBufs := StrArray;
ADocData.StrBufCount := MAX_BUFCOUNT;
ADocData.IdleStr := @StrArray.Strs[0];
end;
Result := ADocData.IdleStr;
Inc(ADocData.IdleStr);
Dec(ADocData.StrBufCount);
end;
function InitDocData(const AFileName: string
var ADocData: PDocData): Boolean;
var
FileHandle: THandle;
begin
FileHandle := FileOpen(AFileName, fmShareDenyNone);
Result := FileHandle <> INVALID_HANDLE_VALUE;
if not Result then Exit;
New(ADocData);
try
ADocData.SourceDocData.len := FileSeek(FileHandle, 0, FILE_END);
FileSeek(FileHandle, 0, FILE_BEGIN);
ADocData.SourceDocData.buf := AllocMem(ADocData.SourceDocData.len);
FileRead(FileHandle, ADocData.SourceDocData.buf^, ADocData.SourceDocData.len);
finally
CloseHandle(FileHandle);
end;
ADocData.WordCount := 0;
ADocData.Word := nil;
ADocData.IdleStr := nil;
ADocData.StrBufCount := 0;
ADocData.StrBufs := nil;
FillChar(ADocData.Hash, SizeOf(ADocData.Hash), 0);
ADocData.IdleHash := nil;
ADocData.HashBufCount := 0;
ADocData.HashArrays := nil;
end;
procedure FreeDocData(var ADocData: PDocData);
var
StrArray, Next: PStrArray;
HashArray, NextHash: PHashArray;
begin
if ADocData.SourceDocData.len > 0 then
begin
FreeMem(ADocData.SourceDocData.buf);
ADocData.SourceDocData.len := 0;
end;
StrArray := ADocData.StrBufs;
while StrArray <> nil do
begin
Next := StrArray.Next;
FreeMem(StrArray);
StrArray := Next;
end;
ADocData.StrBufCount := 0;
ADocData.StrBufs := nil;
ADocData.IdleStr := nil;
HashArray := ADocData.HashArrays;
while HashArray <> nil do
begin
NextHash := HashArray.Next;
FreeMem(HashArray);
HashArray := NextHash;
end
ADocData.HashBufCount := 0;
ADocData.HashArrays := nil;
ADocData.IdleHash := nil;
Dispose(ADocData);
ADocData := nil;
end;
procedure SplitterDocData(ADocData: PDocData);
var
DocLen: Integer;
DocPtr, StartPtr, CurrPtr: PChar;
FirstWord, LastWord, CurrWord: PStr;
begin
DocPtr := ADocData.SourceDocData.buf;
DocLen := ADocData.SourceDocData.len;
CurrPtr := DocPtr;
FirstWord := GetIdleStr(ADocData);
CurrWord := FirstWord;
ADocData.Word := FirstWord;
LastWord := nil;
while CurrPtr - DocPtr < DocLen do
begin
while not (CurrPtr^ in ['a'..'z', 'A'..'Z', '0'..'9', '_', '-', #0]) do Inc(CurrPtr);
if CurrPtr - DocPtr > DocLen then
break;
StartPtr := CurrPtr;
while CurrPtr^ in ['a'..'z', 'A'..'Z', '0'..'9', '_', '-'] do Inc(CurrPtr);
if (CurrPtr - StartPtr = 1) and (StartPtr^ in ['0'..'9', '_', '-']) then
begin
Inc(CurrPtr);
Continue;
end;
CurrWord.buf := StartPtr;
CurrWord.len := CurrPtr - StartPtr;
CurrWord.next := nil;
if AddHash(ADocData, CurrWord) then
begin
if LastWord <> nil then
LastWord.next := CurrWord;
LastWord := CurrWord;
CurrWord := GetIdleStr(ADocData);
Inc(ADocData.WordCount);
end;
Inc(CurrPtr);
end;
end;
procedure TForm1.Button1Click(Sender: TObject);
var
MyWord: PStr;
DocData: PDocData;
TempStr: string;
StartSplitterTime, EndSplitterTime, StartWorkerTime, EndWorkerTime: Cardinal;
begin
StartWorkerTime := GetTickCount;
// 初始数据
if not InitDocData('c:/test2.txt', DocData) then
raise Exception.Create(SysErrorMessage(GetLastError));
// 分隔单词
StartSplitterTime := GetTickCount;
SplitterDocData(DocData);
EndSplitterTime := GetTickCount;
// 显示
MyWord := DocData.Word;
Memo1.Lines.BeginUpdate;
while MyWord <> nil do
begin
SetString(TempStr, MyWord.buf, MyWord.len);
Memo1.Lines.Add(TempStr);
MyWord := MyWord.next;
end;
Memo1.Lines.EndUpdate;
EndWorkerTime := GetTickCount;
Caption := Format('ALL Time: %d, Splitter Time: %d, WordCount: %d', [
EndWorkerTime - StartWorkerTime, EndSplitterTime - StartSplitterTime, DocData.WordCount]);
// Free
FreeDocData(DocData);
end
initialization
Init_CHAR_2_BYTEs;