一种更准确、更高效的模糊搜索算法
A more accurate and more efficient fuzzy searching algorithm
我一直在研究互联网上的模糊匹配/搜索算法。我已经尝试了几个解决方案。
唯一给出比较准确结果的是 Excel 先生 (http://www.mrexcel.com/pc07.shtml)。这种方法的问题是单词中字符的顺序或相对位置,单词本身的顺序对结果没有影响。
我想根据单词的相对位置以及每个单词的字母顺序获得更好的结果。
Function FuzzyMatchByWord(ByVal lsPhrase1 As String, ByVal lsPhrase2 As String, Optional lbStripVowels As Boolean = False, Optional lbDiscardExtra As Boolean = False) As Double
'
' Compare two phrases and return a similarity value (between 0 and 100).
'
' Arguments:
'
' 1. Phrase1 String; any text string
' 2. Phrase2 String; any text string
' 3. StripVowels Optional to strip all vowels from the phrases
' 4. DiscardExtra Optional to discard any unmatched words
'
'local variables
Dim lsWord1() As String
Dim lsWord2() As String
Dim ldMatch() As Double
Dim ldCur As Double
Dim ldMax As Double
Dim liCnt1 As Integer
Dim liCnt2 As Integer
Dim liCnt3 As Integer
Dim lbMatched() As Boolean
Dim lsNew As String
Dim lsChr As String
Dim lsKeep As String
'set default value as failure
FuzzyMatchByWord = 0
'create list of characters to keep
lsKeep = "BCDFGHJKLMNPQRSTVWXYZ0123456789 "
If Not lbStripVowels Then
lsKeep = lsKeep & "AEIOU"
End If
'clean up phrases by stripping undesired characters
'phrase1
lsPhrase1 = Trim$(UCase$(lsPhrase1))
lsNew = ""
For liCnt1 = 1 To Len(lsPhrase1)
lsChr = Mid$(lsPhrase1, liCnt1, 1)
If InStr(lsKeep, lsChr) <> 0 Then
lsNew = lsNew & lsChr
End If
Next
lsPhrase1 = lsNew
lsPhrase1 = Replace(lsPhrase1, " ", " ")
lsWord1 = Split(lsPhrase1, " ")
If UBound(lsWord1) = -1 Then
Exit Function
End If
ReDim ldMatch(UBound(lsWord1))
'phrase2
lsPhrase2 = Trim$(UCase$(lsPhrase2))
lsNew = ""
For liCnt1 = 1 To Len(lsPhrase2)
lsChr = Mid$(lsPhrase2, liCnt1, 1)
If InStr(lsKeep, lsChr) <> 0 Then
lsNew = lsNew & lsChr
End If
Next
lsPhrase2 = lsNew
lsPhrase2 = Replace(lsPhrase2, " ", " ")
lsWord2 = Split(lsPhrase2, " ")
If UBound(lsWord2) = -1 Then
Exit Function
End If
ReDim lbMatched(UBound(lsWord2))
'exit if empty
If Trim$(lsPhrase1) = "" Or Trim$(lsPhrase2) = "" Then
Exit Function
End If
'compare words in each phrase
For liCnt1 = 0 To UBound(lsWord1)
ldMax = 0
For liCnt2 = 0 To UBound(lsWord2)
If Not lbMatched(liCnt2) Then
ldCur = FuzzyMatch(lsWord1(liCnt1), lsWord2(liCnt2))
If ldCur > ldMax Then
liCnt3 = liCnt2
ldMax = ldCur
End If
End If
Next
lbMatched(liCnt3) = True
ldMatch(liCnt1) = ldMax
Next
'discard extra words
ldMax = 0
For liCnt1 = 0 To UBound(ldMatch)
ldMax = ldMax + ldMatch(liCnt1)
Next
If lbDiscardExtra Then
liCnt2 = 0
For liCnt1 = 0 To UBound(lbMatched)
If lbMatched(liCnt1) Then
liCnt2 = liCnt2 + 1
End If
Next
Else
liCnt2 = UBound(lsWord2) + 1
End If
'return overall similarity
FuzzyMatchByWord = 100 * (ldMax / liCnt2)
End Function
Function FuzzyMatch(Fstr As String, Sstr As String) As Double
'
' Code sourced from: http://www.mrexcel.com/pc07.shtml
' Credited to: Ed Acosta
' Modified: Joe Stanton
'
Dim L, L1, L2, M, SC, T, R As Integer
L = 0
M = 0
SC = 1
L1 = Len(Fstr)
L2 = Len(Sstr)
Do While L < L1
L = L + 1
For T = SC To L1
If Mid$(Sstr, L, 1) = Mid$(Fstr, T, 1) Then
M = M + 1
SC = T
T = L1 + 1
End If
Next T
Loop
If L1 = 0 Then
FuzzyMatch = 0
Else
FuzzyMatch = M / L1
End If
End Function
我正在尝试将试算表中的帐户描述与过去 30,000 个帐户描述的列表进行比较,我想找到每个帐户的前 5 个结果。
举个例子:
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Bank and Cash")
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Cash and Bank")
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Shack sequential")
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Sequential shack")
Returns:
75
75
37.5
37.5
我希望单词在短语中的相对位置对分数的影响更大,我也希望字母的顺序有更大的影响。与现金和现金等价物相比,连续的小屋不应该得分那么高。
比较字符串时我通常使用Levenshtein-Distance. You can find an implementation of the algorithm here。您可以通过比率扩展函数,这是衡量 "close" 两个字符串如何的一个很好的指标。
Function levenshtein(a As String, b As String, Optional ratio As Boolean) As Double
Dim i As Long, j As Long, cost As Long
Dim d() As Long
Dim min1 As Long, min2 As Long, min3 As Long
If Len(a) = 0 Then
levenshtein = Len(b)
Exit Function
End If
If Len(b) = 0 Then
levenshtein = Len(a)
Exit Function
End If
ReDim d(Len(a), Len(b))
For i = 0 To Len(a)
d(i, 0) = i
Next
For j = 0 To Len(b)
d(0, j) = j
Next
For i = 1 To Len(a)
For j = 1 To Len(b)
If Mid(a, i, 1) = Mid(b, j, 1) Then
cost = 0
Else
cost = 1
End If
min1 = (d(i - 1, j) + 1)
min2 = (d(i, j - 1) + 1)
min3 = (d(i - 1, j - 1) + cost)
d(i, j) = Application.WorksheetFunction.Min(min1, min2, min3)
Next
Next
If ratio Then
levenshtein = (Len(a) + Len(b) - d(Len(a), Len(b))) / (Len(a) + Len(b))
Else
levenshtein = d(Len(a), Len(b))
End If
End Function
举个例子:
Debug.Print levenshtein("Cash and Cash Equivalents", "Bank and Cash", True)
Debug.Print levenshtein("Cash and Cash Equivalents", "Cash and Bank", True)
Debug.Print levenshtein("Cash and Cash Equivalents", "Shack sequential", True)
Debug.Print levenshtein("Cash and Cash Equivalents", "Sequential shack", True)
Returns:
0.605263157894737
0.631578947368421
0.560975609756098
0.48780487804878
编辑
我想字符串比较会大大降低速度。加快速度的一种方法是将字符串转换为字节数组并比较数值。可以这样做:
Function levenshtein(a As String, b As String, Optional ratio As Boolean) As Double
Dim i As Long, j As Long
Dim k As Long, l As Long
Dim cost As Long
Dim d() As Long
Dim min1 As Long, min2 As Long, min3 As Long
Dim aByte1() As Byte, aByte2() As Byte
If Len(a) = 0 Then
levenshtein = Len(b)
Exit Function
End If
If Len(b) = 0 Then
levenshtein = Len(a)
Exit Function
End If
ReDim d(Len(a), Len(b))
For i = 0 To Len(a)
d(i, 0) = i
Next
For j = 0 To Len(b)
d(0, j) = j
Next
aByte1 = a
aByte2 = b
For i = 0 To UBound(aByte1, 1) Step 2
k = Int(i / 2) + 1
For j = 0 To UBound(aByte2, 1) Step 2
If aByte1(i) = aByte2(j) Then
cost = 0
Else
cost = 1
End If
l = Int(j / 2) + 1
min1 = (d(k - 1, l) + 1)
min2 = (d(k, l - 1) + 1)
min3 = (d(k - 1, l - 1) + cost)
d(k, l) = Application.WorksheetFunction.Min(min1, min2, min3)
Next
Next
If ratio Then
levenshtein = (Len(a) + Len(b) - d(Len(a), Len(b))) / (Len(a) + Len(b))
Else
levenshtein = d(Len(a), Len(b))
End If
End Function
我一直在研究互联网上的模糊匹配/搜索算法。我已经尝试了几个解决方案。
唯一给出比较准确结果的是 Excel 先生 (http://www.mrexcel.com/pc07.shtml)。这种方法的问题是单词中字符的顺序或相对位置,单词本身的顺序对结果没有影响。
我想根据单词的相对位置以及每个单词的字母顺序获得更好的结果。
Function FuzzyMatchByWord(ByVal lsPhrase1 As String, ByVal lsPhrase2 As String, Optional lbStripVowels As Boolean = False, Optional lbDiscardExtra As Boolean = False) As Double
'
' Compare two phrases and return a similarity value (between 0 and 100).
'
' Arguments:
'
' 1. Phrase1 String; any text string
' 2. Phrase2 String; any text string
' 3. StripVowels Optional to strip all vowels from the phrases
' 4. DiscardExtra Optional to discard any unmatched words
'
'local variables
Dim lsWord1() As String
Dim lsWord2() As String
Dim ldMatch() As Double
Dim ldCur As Double
Dim ldMax As Double
Dim liCnt1 As Integer
Dim liCnt2 As Integer
Dim liCnt3 As Integer
Dim lbMatched() As Boolean
Dim lsNew As String
Dim lsChr As String
Dim lsKeep As String
'set default value as failure
FuzzyMatchByWord = 0
'create list of characters to keep
lsKeep = "BCDFGHJKLMNPQRSTVWXYZ0123456789 "
If Not lbStripVowels Then
lsKeep = lsKeep & "AEIOU"
End If
'clean up phrases by stripping undesired characters
'phrase1
lsPhrase1 = Trim$(UCase$(lsPhrase1))
lsNew = ""
For liCnt1 = 1 To Len(lsPhrase1)
lsChr = Mid$(lsPhrase1, liCnt1, 1)
If InStr(lsKeep, lsChr) <> 0 Then
lsNew = lsNew & lsChr
End If
Next
lsPhrase1 = lsNew
lsPhrase1 = Replace(lsPhrase1, " ", " ")
lsWord1 = Split(lsPhrase1, " ")
If UBound(lsWord1) = -1 Then
Exit Function
End If
ReDim ldMatch(UBound(lsWord1))
'phrase2
lsPhrase2 = Trim$(UCase$(lsPhrase2))
lsNew = ""
For liCnt1 = 1 To Len(lsPhrase2)
lsChr = Mid$(lsPhrase2, liCnt1, 1)
If InStr(lsKeep, lsChr) <> 0 Then
lsNew = lsNew & lsChr
End If
Next
lsPhrase2 = lsNew
lsPhrase2 = Replace(lsPhrase2, " ", " ")
lsWord2 = Split(lsPhrase2, " ")
If UBound(lsWord2) = -1 Then
Exit Function
End If
ReDim lbMatched(UBound(lsWord2))
'exit if empty
If Trim$(lsPhrase1) = "" Or Trim$(lsPhrase2) = "" Then
Exit Function
End If
'compare words in each phrase
For liCnt1 = 0 To UBound(lsWord1)
ldMax = 0
For liCnt2 = 0 To UBound(lsWord2)
If Not lbMatched(liCnt2) Then
ldCur = FuzzyMatch(lsWord1(liCnt1), lsWord2(liCnt2))
If ldCur > ldMax Then
liCnt3 = liCnt2
ldMax = ldCur
End If
End If
Next
lbMatched(liCnt3) = True
ldMatch(liCnt1) = ldMax
Next
'discard extra words
ldMax = 0
For liCnt1 = 0 To UBound(ldMatch)
ldMax = ldMax + ldMatch(liCnt1)
Next
If lbDiscardExtra Then
liCnt2 = 0
For liCnt1 = 0 To UBound(lbMatched)
If lbMatched(liCnt1) Then
liCnt2 = liCnt2 + 1
End If
Next
Else
liCnt2 = UBound(lsWord2) + 1
End If
'return overall similarity
FuzzyMatchByWord = 100 * (ldMax / liCnt2)
End Function
Function FuzzyMatch(Fstr As String, Sstr As String) As Double
'
' Code sourced from: http://www.mrexcel.com/pc07.shtml
' Credited to: Ed Acosta
' Modified: Joe Stanton
'
Dim L, L1, L2, M, SC, T, R As Integer
L = 0
M = 0
SC = 1
L1 = Len(Fstr)
L2 = Len(Sstr)
Do While L < L1
L = L + 1
For T = SC To L1
If Mid$(Sstr, L, 1) = Mid$(Fstr, T, 1) Then
M = M + 1
SC = T
T = L1 + 1
End If
Next T
Loop
If L1 = 0 Then
FuzzyMatch = 0
Else
FuzzyMatch = M / L1
End If
End Function
我正在尝试将试算表中的帐户描述与过去 30,000 个帐户描述的列表进行比较,我想找到每个帐户的前 5 个结果。
举个例子:
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Bank and Cash")
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Cash and Bank")
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Shack sequential")
Debug.Print FuzzyMatchByWord("Cash and Cash Equivalents", "Sequential shack")
Returns:
75
75
37.5
37.5
我希望单词在短语中的相对位置对分数的影响更大,我也希望字母的顺序有更大的影响。与现金和现金等价物相比,连续的小屋不应该得分那么高。
比较字符串时我通常使用Levenshtein-Distance. You can find an implementation of the algorithm here。您可以通过比率扩展函数,这是衡量 "close" 两个字符串如何的一个很好的指标。
Function levenshtein(a As String, b As String, Optional ratio As Boolean) As Double
Dim i As Long, j As Long, cost As Long
Dim d() As Long
Dim min1 As Long, min2 As Long, min3 As Long
If Len(a) = 0 Then
levenshtein = Len(b)
Exit Function
End If
If Len(b) = 0 Then
levenshtein = Len(a)
Exit Function
End If
ReDim d(Len(a), Len(b))
For i = 0 To Len(a)
d(i, 0) = i
Next
For j = 0 To Len(b)
d(0, j) = j
Next
For i = 1 To Len(a)
For j = 1 To Len(b)
If Mid(a, i, 1) = Mid(b, j, 1) Then
cost = 0
Else
cost = 1
End If
min1 = (d(i - 1, j) + 1)
min2 = (d(i, j - 1) + 1)
min3 = (d(i - 1, j - 1) + cost)
d(i, j) = Application.WorksheetFunction.Min(min1, min2, min3)
Next
Next
If ratio Then
levenshtein = (Len(a) + Len(b) - d(Len(a), Len(b))) / (Len(a) + Len(b))
Else
levenshtein = d(Len(a), Len(b))
End If
End Function
举个例子:
Debug.Print levenshtein("Cash and Cash Equivalents", "Bank and Cash", True)
Debug.Print levenshtein("Cash and Cash Equivalents", "Cash and Bank", True)
Debug.Print levenshtein("Cash and Cash Equivalents", "Shack sequential", True)
Debug.Print levenshtein("Cash and Cash Equivalents", "Sequential shack", True)
Returns:
0.605263157894737
0.631578947368421
0.560975609756098
0.48780487804878
编辑
我想字符串比较会大大降低速度。加快速度的一种方法是将字符串转换为字节数组并比较数值。可以这样做:
Function levenshtein(a As String, b As String, Optional ratio As Boolean) As Double
Dim i As Long, j As Long
Dim k As Long, l As Long
Dim cost As Long
Dim d() As Long
Dim min1 As Long, min2 As Long, min3 As Long
Dim aByte1() As Byte, aByte2() As Byte
If Len(a) = 0 Then
levenshtein = Len(b)
Exit Function
End If
If Len(b) = 0 Then
levenshtein = Len(a)
Exit Function
End If
ReDim d(Len(a), Len(b))
For i = 0 To Len(a)
d(i, 0) = i
Next
For j = 0 To Len(b)
d(0, j) = j
Next
aByte1 = a
aByte2 = b
For i = 0 To UBound(aByte1, 1) Step 2
k = Int(i / 2) + 1
For j = 0 To UBound(aByte2, 1) Step 2
If aByte1(i) = aByte2(j) Then
cost = 0
Else
cost = 1
End If
l = Int(j / 2) + 1
min1 = (d(k - 1, l) + 1)
min2 = (d(k, l - 1) + 1)
min3 = (d(k - 1, l - 1) + cost)
d(k, l) = Application.WorksheetFunction.Min(min1, min2, min3)
Next
Next
If ratio Then
levenshtein = (Len(a) + Len(b) - d(Len(a), Len(b))) / (Len(a) + Len(b))
Else
levenshtein = d(Len(a), Len(b))
End If
End Function