文本相似度分析(Excel)
Text similarity analysis (Excel)
我有一个项目列表,我想确定它们与此列表中其他项目的相似性。
我想要的输出是这样的:
相似度栏中显示的百分比纯粹是说明性的。我认为相似性测试类似于:
number of concurrent letters / by the total number of letters in the
matched item
但很想就此征求意见。
这在 Excel 上是否合理可行?我是一个仅包含字母数字值的小数据集 (140kb)。
我也愿意接受其他方法来解决这个问题,因为我以前没有处理过这样的事情!
P.s。我已经学习 Python 几个月了,所以建议使用 Python 也不错!
在python中,您可以使用编辑距离来获得结果。查看此答案:
Fuzzy string comparison in Python, confused with which library to use
我真的不明白整个逻辑,但如果你需要 100% 的逻辑,那就是:
Option Explicit
Sub TestMe()
Dim rngCell As Range
Dim rngCell2 As Range
Dim lngTotal As Long
Dim lngTotal2 As Long
Dim lngCount As Long
For Each rngCell In Sheets(1).Range("A1:A5")
For Each rngCell2 In Sheets(1).Range("A1:A5")
If rngCell.Address <> rngCell2.Address Then
If InStr(1, rngCell, rngCell2) Then
rngCell.Offset(0, 1) = 1
Else
If InStr(1, rngCell2, rngCell) Then
rngCell.Offset(0, 2) = Round(CDbl(Len(rngCell) / Len(rngCell2)), 2)
End If
End If
End If
Next rngCell2
Next rngCell
End Sub
给你配图:
这是一个使用 VBA UDF 的解决方案:
EDIT:添加了一个名为 arg_lMinConsecutive
的新可选参数,用于确定必须匹配的最少连续字符数。请注意以下公式中的额外参数 2
,它表示必须至少匹配 2 个连续的字符。
Public Function FuzzyMatch(ByVal arg_sText As String, _
ByVal arg_vList As Variant, _
ByVal arg_lOutput As Long, _
Optional ByVal arg_lMinConsecutive As Long = 1, _
Optional ByVal arg_bMatchCase As Boolean = True, _
Optional ByVal arg_bExactCount As Boolean = True) _
As Variant
Dim dExactCounts As Object
Dim aResults() As Variant
Dim vList As Variant
Dim vListItem As Variant
Dim sLetter As String
Dim dMaxMatch As Double
Dim lMaxIndex As Long
Dim lResultIndex As Long
Dim lLastMatch As Long
Dim i As Long
Dim bMatch As Boolean
If arg_lMinConsecutive <= 0 Then
FuzzyMatch = CVErr(xlErrNum)
Exit Function
End If
If arg_bExactCount = True Then Set dExactCounts = CreateObject("Scripting.Dictionary")
If TypeName(arg_vList) = "Collection" Or TypeName(arg_vList) = "Range" Then
ReDim aResults(1 To arg_vList.Count, 1 To 3)
Set vList = arg_vList
ElseIf IsArray(arg_vList) Then
ReDim aResults(1 To UBound(arg_vList) - LBound(arg_vList) + 1, 1 To 3)
vList = arg_vList
Else
ReDim vList(1 To 1)
vList(1) = arg_vList
ReDim aResults(1 To 1, 1 To 3)
End If
dMaxMatch = 0#
lMaxIndex = 0
lResultIndex = 0
For Each vListItem In vList
If vListItem <> arg_sText Then
lLastMatch = -arg_lMinConsecutive
lResultIndex = lResultIndex + 1
aResults(lResultIndex, 3) = vListItem
If arg_bExactCount Then dExactCounts.RemoveAll
For i = 1 To Len(arg_sText) - arg_lMinConsecutive + 1
bMatch = False
sLetter = Mid(arg_sText, i, arg_lMinConsecutive)
If Not arg_bMatchCase Then sLetter = LCase(sLetter)
If arg_bExactCount Then dExactCounts(sLetter) = dExactCounts(sLetter) + 1
Select Case Abs(arg_bMatchCase) + Abs(arg_bExactCount) * 2
Case 0
'MatchCase is false and ExactCount is false
If InStr(1, vListItem, sLetter, vbTextCompare) > 0 Then bMatch = True
Case 1
'MatchCase is true and ExactCount is false
If InStr(1, vListItem, sLetter) > 0 Then bMatch = True
Case 2
'MatchCase is false and ExactCount is true
If Len(vListItem) - Len(Replace(vListItem, sLetter, vbNullString, Compare:=vbTextCompare)) >= dExactCounts(sLetter) Then bMatch = True
Case 3
'MatchCase is true and ExactCount is true
If Len(vListItem) - Len(Replace(vListItem, sLetter, vbNullString)) >= dExactCounts(sLetter) Then bMatch = True
End Select
If bMatch Then
aResults(lResultIndex, 1) = aResults(lResultIndex, 1) + WorksheetFunction.Min(arg_lMinConsecutive, i - lLastMatch)
lLastMatch = i
End If
Next i
If Len(vListItem) > 0 Then
aResults(lResultIndex, 2) = aResults(lResultIndex, 1) / Len(vListItem)
If aResults(lResultIndex, 2) > dMaxMatch Then
dMaxMatch = aResults(lResultIndex, 2)
lMaxIndex = lResultIndex
End If
Else
aResults(lResultIndex, 2) = 0
End If
End If
Next vListItem
If dMaxMatch = 0# Then
Select Case arg_lOutput
Case 1: FuzzyMatch = 0
Case 2: FuzzyMatch = vbNullString
Case Else: FuzzyMatch = CVErr(xlErrNum)
End Select
Else
Select Case arg_lOutput
Case 1: FuzzyMatch = Application.Min(1, aResults(lMaxIndex, 2))
Case 2: FuzzyMatch = aResults(lMaxIndex, 3)
Case Else: FuzzyMatch = CVErr(xlErrNum)
End Select
End If
End Function
仅使用 A 列和 B 列中的原始数据,您可以使用此 UDF 在 C 列和 D 列中获得所需的结果:
在单元格 C2 中复制下来的是这个公式:
=FuzzyMatch($B2,$B:$B,COLUMN(A2),2)
在单元格 D2 中复制下来的是这个公式:
=IFERROR(INDEX(A:A,MATCH(FuzzyMatch($B2,$B:$B,COLUMN(B2),2),B:B,0)),"-")
请注意,它们都使用 FuzzyMatch
UDF。
我有一个项目列表,我想确定它们与此列表中其他项目的相似性。
我想要的输出是这样的:
相似度栏中显示的百分比纯粹是说明性的。我认为相似性测试类似于:
number of concurrent letters / by the total number of letters in the matched item
但很想就此征求意见。
这在 Excel 上是否合理可行?我是一个仅包含字母数字值的小数据集 (140kb)。
我也愿意接受其他方法来解决这个问题,因为我以前没有处理过这样的事情!
P.s。我已经学习 Python 几个月了,所以建议使用 Python 也不错!
在python中,您可以使用编辑距离来获得结果。查看此答案:
Fuzzy string comparison in Python, confused with which library to use
我真的不明白整个逻辑,但如果你需要 100% 的逻辑,那就是:
Option Explicit
Sub TestMe()
Dim rngCell As Range
Dim rngCell2 As Range
Dim lngTotal As Long
Dim lngTotal2 As Long
Dim lngCount As Long
For Each rngCell In Sheets(1).Range("A1:A5")
For Each rngCell2 In Sheets(1).Range("A1:A5")
If rngCell.Address <> rngCell2.Address Then
If InStr(1, rngCell, rngCell2) Then
rngCell.Offset(0, 1) = 1
Else
If InStr(1, rngCell2, rngCell) Then
rngCell.Offset(0, 2) = Round(CDbl(Len(rngCell) / Len(rngCell2)), 2)
End If
End If
End If
Next rngCell2
Next rngCell
End Sub
给你配图:
这是一个使用 VBA UDF 的解决方案:
EDIT:添加了一个名为 arg_lMinConsecutive
的新可选参数,用于确定必须匹配的最少连续字符数。请注意以下公式中的额外参数 2
,它表示必须至少匹配 2 个连续的字符。
Public Function FuzzyMatch(ByVal arg_sText As String, _
ByVal arg_vList As Variant, _
ByVal arg_lOutput As Long, _
Optional ByVal arg_lMinConsecutive As Long = 1, _
Optional ByVal arg_bMatchCase As Boolean = True, _
Optional ByVal arg_bExactCount As Boolean = True) _
As Variant
Dim dExactCounts As Object
Dim aResults() As Variant
Dim vList As Variant
Dim vListItem As Variant
Dim sLetter As String
Dim dMaxMatch As Double
Dim lMaxIndex As Long
Dim lResultIndex As Long
Dim lLastMatch As Long
Dim i As Long
Dim bMatch As Boolean
If arg_lMinConsecutive <= 0 Then
FuzzyMatch = CVErr(xlErrNum)
Exit Function
End If
If arg_bExactCount = True Then Set dExactCounts = CreateObject("Scripting.Dictionary")
If TypeName(arg_vList) = "Collection" Or TypeName(arg_vList) = "Range" Then
ReDim aResults(1 To arg_vList.Count, 1 To 3)
Set vList = arg_vList
ElseIf IsArray(arg_vList) Then
ReDim aResults(1 To UBound(arg_vList) - LBound(arg_vList) + 1, 1 To 3)
vList = arg_vList
Else
ReDim vList(1 To 1)
vList(1) = arg_vList
ReDim aResults(1 To 1, 1 To 3)
End If
dMaxMatch = 0#
lMaxIndex = 0
lResultIndex = 0
For Each vListItem In vList
If vListItem <> arg_sText Then
lLastMatch = -arg_lMinConsecutive
lResultIndex = lResultIndex + 1
aResults(lResultIndex, 3) = vListItem
If arg_bExactCount Then dExactCounts.RemoveAll
For i = 1 To Len(arg_sText) - arg_lMinConsecutive + 1
bMatch = False
sLetter = Mid(arg_sText, i, arg_lMinConsecutive)
If Not arg_bMatchCase Then sLetter = LCase(sLetter)
If arg_bExactCount Then dExactCounts(sLetter) = dExactCounts(sLetter) + 1
Select Case Abs(arg_bMatchCase) + Abs(arg_bExactCount) * 2
Case 0
'MatchCase is false and ExactCount is false
If InStr(1, vListItem, sLetter, vbTextCompare) > 0 Then bMatch = True
Case 1
'MatchCase is true and ExactCount is false
If InStr(1, vListItem, sLetter) > 0 Then bMatch = True
Case 2
'MatchCase is false and ExactCount is true
If Len(vListItem) - Len(Replace(vListItem, sLetter, vbNullString, Compare:=vbTextCompare)) >= dExactCounts(sLetter) Then bMatch = True
Case 3
'MatchCase is true and ExactCount is true
If Len(vListItem) - Len(Replace(vListItem, sLetter, vbNullString)) >= dExactCounts(sLetter) Then bMatch = True
End Select
If bMatch Then
aResults(lResultIndex, 1) = aResults(lResultIndex, 1) + WorksheetFunction.Min(arg_lMinConsecutive, i - lLastMatch)
lLastMatch = i
End If
Next i
If Len(vListItem) > 0 Then
aResults(lResultIndex, 2) = aResults(lResultIndex, 1) / Len(vListItem)
If aResults(lResultIndex, 2) > dMaxMatch Then
dMaxMatch = aResults(lResultIndex, 2)
lMaxIndex = lResultIndex
End If
Else
aResults(lResultIndex, 2) = 0
End If
End If
Next vListItem
If dMaxMatch = 0# Then
Select Case arg_lOutput
Case 1: FuzzyMatch = 0
Case 2: FuzzyMatch = vbNullString
Case Else: FuzzyMatch = CVErr(xlErrNum)
End Select
Else
Select Case arg_lOutput
Case 1: FuzzyMatch = Application.Min(1, aResults(lMaxIndex, 2))
Case 2: FuzzyMatch = aResults(lMaxIndex, 3)
Case Else: FuzzyMatch = CVErr(xlErrNum)
End Select
End If
End Function
仅使用 A 列和 B 列中的原始数据,您可以使用此 UDF 在 C 列和 D 列中获得所需的结果:
在单元格 C2 中复制下来的是这个公式:
=FuzzyMatch($B2,$B:$B,COLUMN(A2),2)
在单元格 D2 中复制下来的是这个公式:
=IFERROR(INDEX(A:A,MATCH(FuzzyMatch($B2,$B:$B,COLUMN(B2),2),B:B,0)),"-")
请注意,它们都使用 FuzzyMatch
UDF。