这是一个人名识别的单元
////////////////////////////////////////////////////////////////////////////
//使用说明:这是一个抽象类,该类被treeCutWord类继承 // //
// 1,设置TRecgName的DictPath(姓名字典的路径)。 //
// 2,覆盖类的抽象方法---GetsFee,GetgFee,GetsgFee,ComputeName. //
// 3,调用 CheckName方法分析字符串中的姓名。该方法将姓名的前后加上 //
// 分隔标志(StartSplit,EndSplit),然后返回加上了分隔标志的字串。 //
////////////////////////////////////////////////////////////////////////////
unit RecgName;
interface
uses SysUtils, StrUtils, Variants, Classes, Dialogs, untDict;
const
HZ_NUM=6768; //国标码中汉字的个数
CorpusSize=200000; // 语料库规模,用于计算词的出现概率
SurNameSize=174000; // 姓名语料库中姓氏用字总数
GivenNameSize=320000;// 姓名语料库中人名用字总数
Max2Fee=-2.14; //2个字的姓名的费用最大值
Max3Fee=-0.80; //3个字的姓名的费用最大值
hanzixm='hanzixm.txt'; //字典文件名
type
PMaybeName=^MaybeName;
MaybeName=record //可能是名字的词
offset,len:smallint; //偏移,长度
fee:double; //费用
end;
type
PNameFreq=^recNameFreq;
recNameFreq=record
HanZi:Word;
wSFreq,wGFreq:Word;
end;
type TNameDict=class
private
HZFreqs:array[0..65535] of PNameFreq; //保存汉字作为人名和姓氏的费用 。因为GBK码共有汉字20902个,所以数组大小为20902
function GetIndex(character:string):word; //获得数组索引
procedure InitArray;
public
protected
//procedure SetArrayValue(C:string;sf:word=0;gf:word=0);
published
constructor create(Apath:string);
destructor destroy;override;
procedure LoadNameDict(APath:string); //载入汉字作为姓,名,的费用表
function GetsNameFreq(character:string):Word; //查找汉字当作姓的频率
function GetgNameFreq(character:string):Word; //查找汉字当作名的频率
end;
type TRecgName=class
private
FNameDict:TNameDict;
FDict:TInterfacedDict;
MaybeNames:TList;
lstName:TStringList; //保存所有姓名
public
constructor Create(APath:string);
destructor Destroy;override;
function isCrossPair(p1, p2: PMaybeName): boolean; //侯选姓名是否有重叠
function isHomoPair(p1, p2: PMaybeName): boolean; //侯选姓名是否有相同的起点
published
function CheckName(S, StartSplit, EndSplit: string): string; //检查分词碎片
function GetgNameFreq(character: string): Word;
function GetsNameFreq(character: string): Word;
function GetsFee(character:string)
ouble;virtual;//abstract; //查找姓的费用
function GetgFee(character:string)
ouble;virtual;//abstract; //查找名的费用
function GetsgFee(character:string)
ouble;virtual;//abstract; //姓+名的费用
procedure ClearNames;
function GetNames:TStringList;
end;
implementation
{ TNameDict }
constructor TNameDict.create(Apath:string);
begin
InitArray;
LoadNameDict(Apath+hanzixm);
end;
destructor TNameDict.destroy;
var
i:Word;
begin
for i:=0 to 65535 do
Dispose(hzfreqs
);
inherited;
end;
function TNameDict.GetgNameFreq(character: string): Word;
begin
Result:=HZfreqs[GetIndex(character)].wGFreq;
end;
function TNameDict.GetIndex(character: string): word;
begin
Result:=Word(byte(character[1]) shl 8) + Word(character[2]);
end;
function TNameDict.GetsNameFreq(character: string): Word;
begin
Result:=HZfreqs[GetIndex(character)].wSFreq;
end;
procedure TNameDict.InitArray;
var
i:Word;
PNameFreq;
begin
for i:=0 to 65535 do
begin
New(HZFreqs);
HZFreqs.HanZi:=i;
HZFreqs.wSFreq:=0;
HZFreqs.wGFreq:=0;
end;
end;
procedure TNameDict.LoadNameDict(APath: string);
var
F:TextFile;
Line,HanZi:string;
QuotePos1,QuotePos2:Integer;
index:Word;
begin
try
AssignFile(F,APath);
//ShowMessage(APath);
Reset(F);
while (not Eof(F)) do
begin
Readln(F,line);
QuotePos1:=Pos('"',Line);
QuotePos2:=LastDelimiter('"',Line);
HanZi:=Copy(Line,QuotePos1+1,QuotePos2-QuotePos1-1);
index:=GetIndex(HanZi);
QuotePos1:=Pos(#9,Line);
QuotePos2:=LastDelimiter(#9,Line);
if QuotePos2-QuotePos1<>1 then
HZFreqs[index].wSFreq:=StrToInt(Copy(Line,QuotePos1+1,QuotePos2-QuotePos1-1)); //获取姓氏频度
if QuotePos2<>Length(Line) then
HZFreqs[index].wGFreq:=StrToInt(Copy(Line,QuotePos2+1,Length(Line)-QuotePos2));//获取人名频度
end;
finally
CloseFile(F);
end;
end;
{ TRecgName }
function TRecgName.CheckName(S, StartSplit, EndSplit: string): string;
var
i,j,len,Slen:integer;
fee:double;
PName,PName1,PName2MaybeName; //会不会被过程里的局部变量释放掉?
iDelete:boolean;
S2,W:string;
preName,sucName:string;
begin
Slen:=length(S) div 2;
for i:=1 to Slen do //遍历句子里的每个字
begin
len:=2;
while ((len<=3) and (i+len-1<=Slen)) do //向前取2个字或3个字,看是否是姓名
begin
fee:=GetsgFee(copy(S,i*2-1,len*2)); //计算一下当作“姓+名”模式的费用
//ShowMessage(copy(S,i*2-1,len*2)+FloatToStr(fee)); ////////////////////////////////////
if ((len=2) and (fee>=Max2Fee)) or ((len=3) and (fee>=Max3Fee)) then //费用超过最大值,不是姓名
begin
inc(len);
continue;
end;
new(PName); //否则,存在这个姓名,将它加入到列表
PName^.offset :=i * 2-1;
PName^.len:=len *2;
PName^.fee:=fee;
MaybeNames.Add(PName);
inc(len);
end;
end;
iDelete:=false;
i:=0;
while (i< MaybeNames.Count) do
begin
PName1:=PMaybeName(MaybeNames.Items);
//ShowMessage(Copy(s, Pname1.offset,pname1.len) );
j:=i+1;
while ((j<= i+2) and (j<MaybeNames.Count) ) do //只比较临近的2个姓名
begin
PName2:=PMaybeName(MaybeNames.Items[j]);
//ShowMessage(Copy(s, Pname2.offset,pname2.len) );
if (isHomoPair(PName1,PName2)) then //2个姓名是否有相同的起点
if (PName1^.fee >PName2^.fee) then //比较姓名的费用,删除
begin
MaybeNames.Delete(i); //删除
iDelete:=true;
break;
end
else
begin
MaybeNames.Delete(j);
end
else
if (isCrossPair(PName1,PName2)) then //2个姓名是否有重叠部分
if (PName1^.fee >PName2^.fee) then
begin
MaybeNames.Delete(i); //删除
iDelete:=true;
break;
end
else
begin
MaybeNames.Delete(j);
end
else //如果有重复,下一个词的比较没有必要,所以inc(j).
inc(j);
end; //内层while循环结束
if not iDelete then
inc(i) //跳过下次循环
else
iDelete:=false;
end;//最外层while循环结束
//----------------------------------------
s2:='';
if MaybeNames.Count<1 then //不存在姓名,作为单字输出。。
begin
for i:=1 to Slen do
s2:=s2+copy(S,i*2-1,2)+' ';
Result:=s2;
exit;
end;
for i:=0 to MaybeNames.Count-1 do
begin
if i=0 then
j:=1
else
j:=Pname^.offset+PName^.len;
PName:=PMaybeName(MaybeNames.Items);
while(j<PName.offset) do
begin
S2:=S2+copy(S,j,2)+' ';
inc(j,2);
end;
W:=copy(S,PName^.offset ,PName^.len);
if lstName.IndexOf(W)=-1 then //不能重复
lstName.Add(W);
S2:=S2+ StartSplit+W+endsplit; //加上词性标志
//preName:=leftbstr(W,2);
//sucName:=copy(W,3,2);
end;
j:=PName^.offset+PName^.len;
while(j<Slen*2) do
begin
S2:=S2+copy(s,j,2)+' ';
inc(j,2);
end;
for i:=MaybeNames.Count-1 downto 0 do
MaybeNames.Delete(i);
Result:=S2;
end;
procedure TRecgName.ClearNames;
begin
lstName.Clear;
end;
constructor TRecgName.Create(APath:string);
begin
//ShowMessage(APath);
FNameDict:=TNameDict.create(APath);
//FDict:=TDict.CreateInstance(APath); //不创建单例类的实例。只是在要用到的地方实时地引用
MaybeNames:=TList.Create;
lstName:=TStringList.create;
end;
destructor TRecgName.Destroy;
begin
FNameDict.Free;
//TDict(FDict).ReleaseInstance;
MaybeNames.Free;
lstName.Free;
inherited;
end;
function TRecgName.GetgFee(character: string): Double;
var
wFreq:Word;
wFee,Feeouble;
index,ID:Word;
nd:TCharTreeNode;
begin
FDict:=TDict.GetInstance(0);
nd:=FDict.GetRoot(character);
wFreq:=FDict.GetFreq(nd);
wFee:=-ln((wFreq+1.000000)/CorpusSize);
Fee:=GetgNameFreq(character);
ID:=(Byte(character[1])-176)*94+byte(character[2])-161;
if (ID>=0) and (ID<=HZ_NUM) and (Fee>0.0) then
if Fee>0.0 then
Result:=-ln((fee +1.00000)/GivenNameSize)-wFee
else
Result:=-ln(1.000000/GivenNameSize)-wFee //作为姓的费用减去作为单字的费用
else
Result:=20.000000;
end;
function TRecgName.GetgNameFreq(character: string): Word;
begin
Result:=FNameDict.GetgNameFreq(character);
end;
function TRecgName.GetNames: TStringList;
begin
Result:=lstName;
end;
function TRecgName.GetsFee(character: string): Double;
var
wFreq:Word;
wFee,Feeouble;
index,ID:Word;
nd:TCharTreeNode;
begin
FDict:=TDict.GetInstance(0);
nd:=FDict.GetRoot(character);
wFreq:=FDict.GetFreq(nd);
wFee:=-ln((wFreq+1.000000)/CorpusSize); //作为单字使用的费用
Fee:=GetsNameFreq(character); //作为姓的费用
ID:=(Byte(character[1])-176)*94+byte(character[2])-161;
if (ID>=0) and (ID<=HZ_NUM) and (Fee>0.0) then
Result:=-ln((fee +1.000000)/SurNameSize)-wFee //作为姓的费用减去作为单字的费用
else
Result:=20.000000;
end;
function TRecgName.GetsgFee(character: string): Double;
var
fee:double;
preName,sucName,lstName:string;
preByte,sucByte:byte;
begin
preName:=leftbstr(character,2);
sucName:=copy(character,3,2 );
lstName:=RightBStr(character,2);
if GetsNameFreq(preName)>0 then //字串最左边的一个字是否是姓
begin
fee:=GetsFee(preName)+GetgFee(sucName);
if (Length(character)=4) then
fee:=fee-ln(0.370000)
else
fee:=fee+GetgFee(lstName)+(-ln(0.630000));
end
else //全部当作名的费用
begin
if (Length(character)>6) then
fee:=20.000000
else if (Length(character)=4) then
fee:=GetgFee(preName)+GetgFee(lstName);
end;
Result:=fee;
end;
function TRecgName.GetsNameFreq(character: string): Word;
begin
Result:=FNameDict.GetsNameFreq(character);
end;
function TRecgName.isCrossPair(p1, p2: PMaybeName): boolean;
begin
Result:=not ((p1^.offset=p2^.offset) or (p1^.offset+p1^.len<=p2^.offset)or(p2^.offset+p2^.len<=p1^.offset));
end;
function TRecgName.isHomoPair(p1, p2: PMaybeName): boolean;
begin
Result:=(p1^.offset=p2^.offset);
end;
end.