为两个文本文件的相似度比较,求DIFF源码(VB,VC等均可,YYSUN大侠请进).(200分)

  • 主题发起人 主题发起人 maxwell_zhang
  • 开始时间 开始时间
M

maxwell_zhang

Unregistered / Unconfirmed
GUEST, unregistred user!
因工作上用到两个文本文件的相似度比较,因不是简单比较,特别对算法有点不清,故
对5月9日yysun大侠发在"随便说说"中DIFF标题的文章提到Open Wiki 带的350
行的 VBScript写的DIFF源码很感兴趣.但本人愚笨,未在所列网址http://www.openwiki.com/ow.asp?OpenWiki上找到可下载的源码.
请那位有此源码或其它Diff源码的大侠将源码贴在答复里,如太大
则请发到我的信箱,maxwell_zhang@sina.com,不胜感激.
 
《葵》中的一段比较字串的代码,希望帮到你
'John' and 'John' = 100%
'John' and 'Jon' = 75%
'Jim' and 'James' = 40%
"Luke Skywalker" and 'Darth Vader' = 0%
function StrSimilar (s1, s2: string): Integer;
var hit: Integer;
// Number of identical chars
p1, p2: Integer;
// Position count
l1, l2: Integer;
// Length of strings
pt: Integer;
// for counter
diff: Integer;
// unsharp factor
hstr: string;
// help var for swapping strings
// Array shows is position is already tested
test: array [1..255] of Boolean;
begin

// Test Length and swap, if s1 is smaller
// we alway search along the longer string
if Length(s1) < Length(s2) then
begin

hstr:= s2;
s2:= s1;
s1:= hstr;
end;

// store length of strings to speed up the function
l1:= Length (s1);
l2:= Length (s2);
p1:= 1;
p2:= 1;
hit:= 0;
// calc the unsharp factor depending on the length
// of the strings. Its about a third of the length
diff:= Max (l1, l2) div 3 + ABS (l1 - l2);
// init the test array
for pt:= 1 to l1do
test[pt]:= False;
// loop through the string
repeat
// position tested?
if not test[p1] then
begin

// found a matching character?
if (s1[p1] = s2[p2]) and (ABS(p1-p2) <= diff) then
begin

test[p1]:= True;
Inc (hit);
// increment the hit count
// next positions
Inc (p1);
Inc (p2);
if p1 > l1 then
p1:= 1;
end else
begin

// Set test array
test[p1]:= False;
Inc (p1);
// Loop back to next test position if end of the string
if p1 > l1 then
begin

while (p1 > 1) and not (test[p1])do
Dec (p1);
Inc (p2)
end;

end;

end else
begin

Inc (p1);
// Loop back to next test position if end of string
if p1 > l1 then
begin

repeat Dec (p1);
until (p1 = 1) or test[p1];
Inc (p2);
end;

end;

until p2 > Length(s2);
// calc procentual value
Result:= 100 * hit DIV l1;
end;
 
那个叶面上有一个.exe可以下载,下载安装后就可以找到那个程序了。
<%
'
' ---------------------------------------------------------------------------
' Copyright(c) 2000-2002, Laurens Pit
' All rights reserved.
'
' Redistribution and use in source and binary forms, with or without
' modification, are permitted provided that the following conditions
' are met:
'
' * Redistributions of source code must retain the above copyright
' notice, this list of conditions and the following disclaimer.
' * Redistributions in binary form must reproduce the above
' copyright notice, this list of conditions and the following
' disclaimer in thedo
cumentation and/or other materials provided
' with the distribution.
' * Neither the name of OpenWiki nor the names of its contributors
' may be used to endorse or promote products derived from this
' software without specific prior written permission.
'
' THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
' "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
' LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
' FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
' REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
' INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
' BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
' LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER
' CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
' LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
' ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
' POSSIBILITY OF SUCH DAMAGE.
'
' ---------------------------------------------------------------------------
' $Source: /usr/local/cvsroot/openwiki/dist/owbase/ow/owdiff.asp,v $
' $Revision: 1.2 $
' $Author: pit $
' ---------------------------------------------------------------------------
'
' Computes the difference between two page revisions.
'
Class Matcher
Private vLineBreak
Private vLineOriented
Private vA
Private vB
Private vBhash
Private vOut
Private vOutlen
Private vDebug
Private Sub Class_Initialize()
vLineBreak = vbCRLF
vLineOriented = True
vOut = ""
vOutlen = 0
vDebug = False
End Sub
Private Sub Class_Terminate()
End Sub
Public Property Let Preformatted(pPreformatted)
If pPreformatted then
vLineBreak = vbCRLF
else
vLineBreak = "<br/>"
End If
End Property
Public Property Let LineOriented(pLineOriented)
vLineOriented = pLineOriented
End Property
Public Property Let Debug(pDebug)
vDebug = pDebug
End Property
Public Property Let Outlen(pOutlen)
vOutlen = pOutlen
End Property
Private Function Tokenize(pText)
Dim vRegEx, vMatches, vMatch, vRegEx2, vMatches2, vMatch2, vValue
Set Tokenize = New Vector
Set vRegEx = New RegExp
vRegEx.IgnoreCase = False
vRegEx.Global = True
vRegEx.Pattern = ".+"
pText = Replace(pText, Chr(9), Space(8))
If Not vLineOriented then
Set vRegEx2 = New RegExp
vRegEx2.IgnoreCase = False
vRegEx2.Global = True
vRegEx2.Pattern = "/s*/S+"
End If
Set vMatches = vRegEx.Execute(pText)
For Each vMatch In vMatches
vValue = Replace(vMatch.Value, vbCR, "")
If vLineOriented then
Tokenize.Push(vValue)
else
If Trim(vValue) = "" then
Tokenize.Push(vValue)
else
Set vMatches2 = vRegEx2.Execute(vValue)
For Each vMatch2 In vMatches2
Tokenize.Push(vMatch2.Value)
Next
End If
Tokenize.Push(vbCRLF)
End If
Next
If vValue = "" then
Tokenize.Push("")
else
if Not vLineOriented then
Tokenize.Pop()
End If
Set vRegEx = Nothing
If Not vLineOriented then
Set vRegEx2 = Nothing
End If
End Function
Private Sub HashB
Dim i, vElem, vList
Set vBhash = CreateObject("Scripting.Dictionary")
For i = 0 To vB.Count - 1
vElem = vB.ElementAt(i)
If Trim(vElem) <> "" And vElem <> vbCRLF then
If vBhash.Exists(vElem) then
Set vList = vBhash.Item(vElem)
vList.Push(i)
else
Set vList = New Vector
vList.Push(i)
vBhash.Add vElem, vList
End If
End If
Next
End Sub
Private bestStartA
Private bestStartB
Private bestSize
' find longest matching block in vA[pALow,pAHigh] and vB[pBLow,pBHigh]
Private Sub FindLongestMatch(pALow, pAHigh, pBLow, pBHigh)
Dim i, j, k, x, vList
bestStartA = pALow
bestStartB = pBLow
bestSize = 0
Dim vLen, vNewLen, vElem
Set vLen = New Vector
vLen.Dimension = vB.Count
For i = pALow To pAHigh
Set vNewLen = New Vector
vNewLen.Dimension = vB.Count
vElem = vA.ElementAt(i)
If vBhash.Exists(vElem) then
Set vList = vBhash.Item(vElem)
For x = 0 To vList.Count - 1
j = vList.ElementAt(x)
If j > pBHigh then
Exit For
End If
If j >= pBLow then
If j > 0 then
k = vLen.ElementAt(j - 1) + 1
else
k = 1
End If
vNewLen.SetElementAt j, k
If k > bestSize then
bestStartA = i - k + 1
bestStartB = j - k + 1
bestSize = k
End If
End If
Next
End If
Set vLen = vNewLen
Next
' add junk on both sides
do
While bestStartA > pALow And bestStartB > pBLow
If (Trim(vA.ElementAt(bestStartA - 1)) = "" Or vA.ElementAt(bestStartA - 1) = vbCRLF) And (Trim(vB.ElementAt(bestStartB - 1)) = "" Or vB.ElementAt(bestStartB - 1) = vbCRLF) then
bestStartA = bestStartA - 1
bestStartB = bestStartB - 1
bestSize = bestSize + 1
else
Exitdo
End If
Loop
do
While bestStartA + bestSize <= pAHigh And bestStartB + bestSize <= pBHigh
If (Trim(vA.ElementAt(bestStartA + bestSize)) = "" Or vA.ElementAt(bestStartA + bestSize) = vbCRLF) And (Trim(vB.ElementAt(bestStartB + bestSize)) = "" Or vB.ElementAt(bestStartB + bestSize) = vbCRLF) then
bestSize = bestSize + 1
else
Exitdo
End If
Loop
End Sub
Private Sub SplitLine(pLine)
Dim i
do
i = InStrRev(pLine, " ", 80)
If i > 0 then
vOut = vOut &amp;
Left(pLine, i) &amp;
vLineBreak
pLine = LTrim(Mid(pLine, i))
else
vOut = vOut &amp;
pLine
End If
Loop While i > 0
End Sub
Private Sub Output(pTag, pVector, pFrom, pTo)
Dim i, vElem
If pTag = "delete" then
vOut = vOut &amp;
"<strike class='diff'>"
else
if pTag = "insert" then
vOut = vOut &amp;
"<u class='diff'>"
End If
For i = pFrom To pTo
vElem = pVector.ElementAt(i)
If vElem = vbCRLF then
vElem = vLineBreak
vOutlen = 0
else
if vElem = "" then
vElem = " "
End If
vOutlen = vOutlen + Len(vElem)
If vOutlen > 80 then
If Len(vElem) > 80 then
SplitLine(vElem)
vElem = ""
else
vOut = vOut &amp;
vLineBreak
vElem = LTrim(vElem)
vOutlen = Len(vElem)
End If
End If
vOut = vOut &amp;
vElem
If vLineOriented then
vOut = vOut &amp;
vLineBreak
vOutlen = 0
End If
Next
If pTag = "delete" then
vOut = vOut &amp;
"</strike>"
else
if pTag = "insert" then
vOut = vOut &amp;
"</u>"
End If
End Sub
Private Sub InnerReplace(pAFrom, pATo, pBFrom, pBTo)
Dim i, vText1, vText2, vMatcher
vText1 = ""
vText2 = ""
For i = pAFrom To pATo
vText1 = vText1 &amp;
vA.ElementAt(i)
If i < pATo then
vText1 = vText1 &amp;
vbCRLF
End If
Next
For i = pBFrom To pBTo
vText2 = vText2 &amp;
vB.ElementAt(i)
If i < pBTo then
vText2 = vText2 &amp;
vbCRLF
End If
Next
Set vMatcher = New Matcher
vMatcher.Outlen = vOutlen
vMatcher.LineOriented = False
vMatcher.Debug = vDebug
vOut = vOut &amp;
vMatcher.Compare(vText1, vText2) &amp;
vLineBreak
End Sub
Private Sub Out(vAFound, vBFound, vSize)
If matchedI < vAFound And matchedJ < vBFound then
If vLineOriented then
Call InnerReplace(matchedI, vAFound - 1, matchedJ, vBFound - 1)
else
Call Output("delete", vA, matchedI, vAFound - 1)
' TODO: maybe, add "<br/>" when the intraline deleted was part of the last line
Call Output("insert", vB, matchedJ, vBFound - 1)
End If
else
if matchedI < vAFound then
Call Output("delete", vA, matchedI, vAFound - 1)
else
if matchedJ < vBFound then
Call Output("insert", vB, matchedJ, vBFound - 1)
End If
If vSize > 0 then
Call Output("equal", vA, vAFound, vAFound + vSize - 1)
End If
End Sub
Dim matchedI, matchedJ
' match between [pALow,pAHigh] and [pBLow,pBHigh]
Private Sub GetMatchingBlocks(pDepth, pALow, pAHigh, pBLow, pBHigh)
If pDepth = 1 then
matchedI = 0
matchedJ = 0
End If
Call FindLongestMatch(pALow, pAHigh, pBLow, pBHigh)
If bestSize > 0 then
Dim vAFound, vBFound, vSize
vAFound = bestStartA
vBFound = bestStartB
vSize = bestSize
If pALow < vAFound And pBLow < vBFound then
Call GetMatchingBlocks(pDepth + 1, pALow, vAFound - 1, pBLow, vBFound - 1)
End If
Call Out(vAFound, vBFound, vSize)
matchedI = vAFound + vSize
matchedJ = vBFound + vSize
If matchedI <= pAHigh And matchedJ <= pBHigh then
Call GetMatchingBlocks(pDepth + 1, matchedI, pAHigh, matchedJ, pBHigh)
End If
End If
If pDepth = 1 then
Call Out(vA.Count, vB.Count, 0)
End If
End Sub
Public Function Compare(pText1, pText2)
vOut = ""
Set vA = Tokenize(pText1)
Set vB = Tokenize(pText2)
HashB()
Call GetMatchingBlocks(1, 0, vA.Count - 1, 0, vB.Count - 1)
Compare = vOut
End Function
End Class
%>
 
多人接受答案了。
 
后退
顶部