英文分割,求最快速度算法 ( 积分: 300 )

  • 主题发起人 A delphi
  • 开始时间
U

ufo

Unregistered / Unconfirmed
GUEST, unregistred user!
to: A delphi,下面是我修改的errorcode的完整代码,你试试:
unit Unit1;

interface

uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls;

type
TForm1 = class(TForm)
Memo1: TMemo;
Button1: TButton;
procedure Button1Click(Sender: TObject);
private
{ Private declarations }
public
{ Public declarations }
end;

var
Form1: TForm1;

implementation

{$R *.dfm}
const
MAX_BUFCOUNT = 1024;
MAX_HASHSIZE = $FFFF;

type
PStr = ^TStr;
TStr = record
buf: PChar;
len: Integer;
next: PStr;
end;

PStrArray = ^TStrArray;
TStrArray = record
Next: PStrArray;
Strs: array [0..MAX_BUFCOUNT - 1] of TStr;
end;

THashData = Cardinal;

PHash = ^THash;
THash = record
HashData: THashData;
Str: PStr;
Next: PHash;
end;

PHashArray = ^THashArray;
THashArray = record
Next: PHashArray;
Hashs: array [0..MAX_BUFCOUNT - 1] of THash;
end;

PStrHash = ^TStrHash;
TStrHash = array [0..MAX_HASHSIZE - 1] of PHash;

PDocData = ^TDocData;
TDocData = record
SourceDocData: TStr
// 保存源串数据
Word: PStr
// 保存第一个单词指针,第二个单词在next
WordCount: Integer
// 源串中有多少个单词, 通过Word字段访问
Hash: TStrHash
// Hash表
IdleStr: PStr
// 空闲单词指针,from StrBufs
StrBufCount: Integer
// 空闲单词指针数量
StrBufs: PStrArray
// 单词指针缓存,一次性申请N个指针
IdleHash: PHash
// 如上str
HashBufCount: Integer
// 如上strbufcount
HashArrays: PHashArray
// 如上strbufs
end;

TCharToByteValue = array [Char] of Byte;

var
CHAR_2_BYTE: TCharToByteValue
// 区分大小写的值
CHAR_2_BYTE_CASEINSENSITIVE: TCharToByteValue
// 不区分大小写的值
CHAR_2_BYTEs: array [Boolean] of TCharToByteValue;

procedure Init_CHAR_2_BYTEs;
var
I: Char;
begin
for I := #0 to Chr(MAXCHAR - 1) do
begin
CHAR_2_BYTE := Ord(I);
CHAR_2_BYTE_CASEINSENSITIVE := Ord(I);
end;
for I := 'a' to 'z' do
Dec(CHAR_2_BYTE_CASEINSENSITIVE, 32);
CHAR_2_BYTEs[False] := CHAR_2_BYTE;
CHAR_2_BYTEs[True] := CHAR_2_BYTE_CASEINSENSITIVE;
end;

function GetHashValue(const Buf
Len: Integer
HashSeed: Cardinal
IgnoreCase: Boolean = False): Cardinal;
var
P: PChar;
I: Integer;
CharByteArray: ^TCharToByteValue;
begin
Result := HashSeed;
P := @Buf;
CharByteArray := @CHAR_2_BYTEs[IgnoreCase];
for I := 1 to Len do
begin
result:=((Result shl 2) or (Result shr (SizeOf(Result) * 8 - 2))) xor
Ord(CharByteArray[P^])
//ufo! 修改
inc(p);
end;
end;

function GetStrHashData(AStr: PStr): THashData;
//var
// PreHash: Int64;
// H1, H2: PCardinal;
begin
//Result := HashValue(AStr.Buf^, AStr.len, $DEEDBEEF) and MAX_HASHSIZE;
//H1 := @Int64Rec(Result).Hi;
//H2 := @Int64Rec(Result).Lo;
//H1^ := GetHashValue(AStr.Buf^, AStr.len, 0);
//H2^ := GetHashValue(AStr.Buf^, AStr.len, $DEEDBEEF);
Result := GetHashValue(AStr.Buf^, AStr.len, $DEEDBEEF);
end;

function GetIdleHash(ADocData: PDocData): PHash;
var
HashArray: PHashArray;
begin
if ADocData.HashBufCount <= 0 then
begin
HashArray := AllocMem(SizeOf(THashArray));
HashArray.Next := ADocData.HashArrays;
ADocData.HashArrays := HashArray;
ADocData.HashBufCount := MAX_BUFCOUNT;
ADocData.IdleHash := @HashArray.Hashs[0];
end;
Result := ADocData.IdleHash;
Inc(ADocData.IdleHash);
Dec(ADocData.HashBufCount);
end;

// 在Hash表中检查是否已存在该WORD,存在则return False(Hash表未修改)
function AddHash(ADocData: PDocData
AStr: PStr): Boolean;
var
IsFind: Boolean;
HashData: THashData;
HashValue: Word;
Hash, NewHash: PHash;
begin
HashData := GetStrHashData(AStr);
HashValue := HashData and MAX_HASHSIZE;
//HashValue := GetHashValue(HashData, SizeOf(HashData), 0) and MAX_HASHSIZE;
Hash := ADocData.Hash[HashValue];

IsFind := False;
while Hash <> nil do
begin
IsFind := CompareMem(Hash.Str.buf, AStr.buf,AStr.len) and (Hash.HashData = HashData)
//ufo! 修改
if IsFind then break;
Hash := Hash.Next;
end;

Result := not IsFind;
if Result then
begin
Hash := ADocData.Hash[HashValue];
NewHash := GetIdleHash(ADocData);
Newhash.Str := AStr;
NewHash.HashData := HashData;
if Hash = nil then
NewHash.Next := nil
else
NewHash.Next := Hash;
ADocData.Hash[HashValue] := NewHash;
end;
end;

function GetIdleStr(ADocData: PDocData): PStr;
var
StrArray: PStrArray;
begin
if ADocData.StrBufCount <= 0 then
begin
StrArray := AllocMem(SizeOf(TStrArray));
StrArray.Next := ADocData.StrBufs;
ADocData.StrBufs := StrArray;
ADocData.StrBufCount := MAX_BUFCOUNT;
ADocData.IdleStr := @StrArray.Strs[0];
end;
Result := ADocData.IdleStr;
Inc(ADocData.IdleStr);
Dec(ADocData.StrBufCount);
end;

function InitDocData(const AFileName: string
var ADocData: PDocData): Boolean;
var
FileHandle: THandle;
begin
FileHandle := FileOpen(AFileName, fmShareDenyNone);
Result := FileHandle <> INVALID_HANDLE_VALUE;
if not Result then Exit;
New(ADocData);
try
ADocData.SourceDocData.len := FileSeek(FileHandle, 0, FILE_END);
FileSeek(FileHandle, 0, FILE_BEGIN);
ADocData.SourceDocData.buf := AllocMem(ADocData.SourceDocData.len);
FileRead(FileHandle, ADocData.SourceDocData.buf^, ADocData.SourceDocData.len);
finally
CloseHandle(FileHandle);
end;
ADocData.WordCount := 0;
ADocData.Word := nil;
ADocData.IdleStr := nil;
ADocData.StrBufCount := 0;
ADocData.StrBufs := nil;
FillChar(ADocData.Hash, SizeOf(ADocData.Hash), 0);
ADocData.IdleHash := nil;
ADocData.HashBufCount := 0;
ADocData.HashArrays := nil;
end;

procedure FreeDocData(var ADocData: PDocData);
var
StrArray, Next: PStrArray;
HashArray, NextHash: PHashArray;
begin
if ADocData.SourceDocData.len > 0 then
begin
FreeMem(ADocData.SourceDocData.buf);
ADocData.SourceDocData.len := 0;
end;
StrArray := ADocData.StrBufs;
while StrArray <> nil do
begin
Next := StrArray.Next;
FreeMem(StrArray);
StrArray := Next;
end;
ADocData.StrBufCount := 0;
ADocData.StrBufs := nil;
ADocData.IdleStr := nil;

HashArray := ADocData.HashArrays;
while HashArray <> nil do
begin
NextHash := HashArray.Next;
FreeMem(HashArray);
HashArray := NextHash;
end

ADocData.HashBufCount := 0;
ADocData.HashArrays := nil;
ADocData.IdleHash := nil;
Dispose(ADocData);
ADocData := nil;
end;

procedure SplitterDocData(ADocData: PDocData);
var
DocLen: Integer;
DocPtr, StartPtr, CurrPtr: PChar;

FirstWord, LastWord, CurrWord: PStr;
begin
DocPtr := ADocData.SourceDocData.buf;
DocLen := ADocData.SourceDocData.len;
CurrPtr := DocPtr;

FirstWord := GetIdleStr(ADocData);
CurrWord := FirstWord;
ADocData.Word := FirstWord;
LastWord := nil;
while CurrPtr - DocPtr < DocLen do
begin
while not (CurrPtr^ in ['a'..'z', 'A'..'Z', '0'..'9', '_', '-', '''']) do Inc(CurrPtr);
if CurrPtr - DocPtr > DocLen then
break;
StartPtr := CurrPtr;
while CurrPtr^ in ['a'..'z', 'A'..'Z', '0'..'9', '_', '-',''''] do Inc(CurrPtr);

if (CurrPtr - StartPtr = 1) and (StartPtr^ in ['0'..'9', '_', '-','''']) then
begin
Inc(CurrPtr);
Continue;
end;

CurrWord.buf := StartPtr;
CurrWord.len := CurrPtr - StartPtr;
CurrWord.next := nil;
if AddHash(ADocData, CurrWord) then
begin
if LastWord <> nil then
LastWord.next := CurrWord;
LastWord := CurrWord;
CurrWord := GetIdleStr(ADocData);
Inc(ADocData.WordCount);
end;
Inc(CurrPtr);
end;
end;

procedure TForm1.Button1Click(Sender: TObject);
var
MyWord: PStr;
DocData: PDocData;
TempStr: string;
StartSplitterTime, EndSplitterTime, StartWorkerTime, EndWorkerTime: Cardinal;
begin
StartWorkerTime := GetTickCount;
// 初始数据
if not InitDocData('e:/ss2.txt', DocData) then
raise Exception.Create(SysErrorMessage(GetLastError));

// 分隔单词
StartSplitterTime := GetTickCount;
SplitterDocData(DocData);
EndSplitterTime := GetTickCount;

// 显示
MyWord := DocData.Word;
Memo1.Lines.BeginUpdate;
while MyWord <> nil do
begin
SetString(TempStr, MyWord.buf, MyWord.len);
Memo1.Lines.Add(TempStr);
MyWord := MyWord.next;
end;
Memo1.Lines.EndUpdate;

EndWorkerTime := GetTickCount;

Caption := Format('ALL Time: %d, Splitter Time: %d, WordCount: %d,单词数:%d', [
EndWorkerTime - StartWorkerTime, EndSplitterTime - StartSplitterTime,
DocData.WordCount,memo1.lines.count]);
// Free
FreeDocData(DocData);
end;


initialization
Init_CHAR_2_BYTEs;
end.
 
E

errorcode

Unregistered / Unconfirmed
GUEST, unregistred user!
ufo:
我测试的情况是只简单的几个E单词,所以,hash冲突重复是正常,所以,起始我也是像你这样用CompareMem,不过后来发现,用两次得到的Hash的值,再做一次hash,得到的HashValue,这样的HashData就是重复率基本无,也就是所说的:一而再,再而三,这种几率是很小的。所以我保留那几段注解的意思。当然也可以用CompareMem,不过失去hash的本意,所以一早就改过。
此程序很简单,不想说之。

还是打游戏来的爽。不说了。
 
U

ufo

Unregistered / Unconfirmed
GUEST, unregistred user!
to: errorcode,什么游戏这么吸引人?
另外,你注释掉的那几行代码是编译不过的,类型不匹配。@Int64Rec(X).hi 这里的X必须是合适的变量。
另外,比较原值而不是hash值是链地址抗冲突法的必备手段。
且用hash直接定址找到原值的,因此效率还是有保证,不存在失去hash本意的说法,毕竟,如果结果不可靠,再快的速度也没用。
其次,在数据源不可预知的情况下,多次hash不能保证无冲突,因为丢失一个数据都是不允许的。
再次,如果hash域空间过大,造成内存浪费严重。
 
A

A&nbsp;delphi

Unregistered / Unconfirmed
GUEST, unregistred user!
经过测试,好像还是有出入,具体在那我没有细瞧了
 
U

ufo

Unregistered / Unconfirmed
GUEST, unregistred user!
哈希表比二分表快,这是原理上决定的。
楼主应该仔细看看errorcode的代码,我原先的代码速度慢,主要的原因是字符串的比对很耗时以及字符串拷贝了两次,一次在建立stringlist时,再一次在显示时。
errorcode大量使用了pchar指针,来引用原始的数据,减少了字符串拷贝,这也提高了效率。
errorcode的代码还可以优化,比如,在while循环内多次判断字符串,这个可以想办法合并。
至于单词数量的出入,我想是不是errorcode在分词代码上有纰漏?
因为只有不同单词产生相同哈希码的可能,不会有相同的单词产生不同的哈希码,因此单词只会少不会多。如果单词多出来了,应该重点考虑分词模块。
 
顶部