更有效地从文档中删除重复的单词

Question

我将每个单词与其他单词进行比较，检查是否重复，如果是，则将其删除。 1 到 4 页最多需要 5 分钟。

对于 50 或 100 页的文档，我需要修改或新想法，以便用更少的时间比较和删除重复项。

Sub Delete_Duplicates()
    '***********'
    'By
    'MBA
    '***********'
    Dim AD As Range
    Dim F As Range
    Dim i As Long
    
    Set AD = ActiveDocument.Range
    Z = AD.Words.Count
    y = 1
    For i = Z To 1 Step -1
        y = y + 1
        
        Set F = AD.Words(i)
        
        On Error Resume Next
        Set s = AD.Words(i - 1)
        If Trim(AD.Words(i - 1)) = "," Then Set s = AD.Words(i - 2): Set c = AD.Words(i - 1)
        If Err.Number > 0 Then Exit Sub
            
        If Not F.Text = Chr(13) And UCase(Trim(F.Text)) = UCase(Trim(s.Text)) Then
            F.Text = ""
            If Not c Is Nothing Then c.Text = " ": Set c = Nothing
        End If
        If Not c Is Nothing Then Set c = Nothing
    
        On Error Resume Next
        Call ProgressBar.Progress(y / Z * 100, True) '<<-- Progress Bar
        On Error GoTo 0
    
    Next
    Beep
End Sub

Before/After

Answer 1

这只是概念，但请尝试准备文档中所有单词的列表，如果存在，则替换双字或三字。

Private Sub DeleteDuplicate()
    
    Dim wholeTxt As String
    
    Dim w As Range
    Dim col As New Collection
    Dim c
    
    For Each w In ActiveDocument.Words
        AddUniqueItem col, Trim(w.Text)
    Next w

    wholeTxt = ActiveDocument.Range.Text
    
    For Each c In col
        
        'add case with ","
        'maybe one letter word should be forbidden, or add extra boundary
        If InStr(1, wholeTxt, c & " " & c, vbBinaryCompare) <> 0 Then
            'start of doc
            Selection.HomeKey wdStory
            
            'here should be all stuff to prepare replacement
            '(...)
            Selection.Find.Execute Findtext:=c & " " & c, ReplaceWith:=c
            wholeTxt = ActiveDocument.Range.Text
        End If
    Next c
    
    Set col = Nothing
End Sub
Private Sub AddUniqueItem(ByRef col As Collection, ByVal itemValAndKey As String)
    Dim s As String
    On Error Resume Next
    s = col(itemValAndKey)
    If Err.Number <> 0 Then
        col.Add itemValAndKey, itemValAndKey
        Err.Clear
    End If
    On Error GoTo 0
End Sub

Answer 2

假设整个文档都是纯文本，我们可以赋值整个文档的文本，然后用Split转换成单词数组

因为它在数组中，所以处理它们比访问 Words 集合更快。

这是我能想到的全部，但也许有更好的方法来做到这一点？下面的示例使用正则表达式搜索并替换所有匹配的重复项：

Option Explicit

Sub Delete_Duplicate()
    Const maxWord As Long = 2 'Change this to increase the max amount of words should be used to match as a phrase.
    
    Dim fullTxt As String
    fullTxt = ActiveDocument.Range.Text
    
    Dim txtArr() As String
    txtArr = Split(fullTxt, " ")
    
    Dim regex As RegExp
    Set regex = New RegExp
    regex.Global = True
    regex.IgnoreCase = True
    
    Dim outputTxt As String
    outputTxt = fullTxt
    
    Dim n As Long
    Dim i As Long
    
    For i = UBound(txtArr) To 0 Step -1
        Dim matchWord As String
        
        matchWord = vbNullString
        For n = 0 To maxWord - 1
            If (i - n) < 0 Then Exit For
            
            matchWord = txtArr(i - n) & " " & matchWord
            matchWord = Trim$(Replace(matchWord, vbCr, vbNullString))
        
            regex.Pattern = matchWord & "[, ]{0,}" & matchWord
            If regex.test(outputTxt) Then
                outputTxt = regex.Replace(outputTxt, matchWord)
            End If
        Next n
    Next i
    Set regex = Nothing

    Application.UndoRecord.StartCustomRecord "Delete Duplicates"
    ActiveDocument.Range.Text = outputTxt
    Application.UndoRecord.EndCustomRecord
End Sub

Answer 3

您可以按照以下方式尝试：

Sub Demo()
Application.ScreenUpdating = False
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Replacement.Text = ""
    .Forward = True
    .Wrap = wdFindContinue
    .MatchWildcards = True
    .Text = "([A-Za-z0-9'’]@)[, ]@"
    .Execute
    Do While .Found = True
      .Execute Replace:=wdReplaceAll
    Loop
    .Text = "([A-Za-z0-9'’]@[, ]@[A-Za-z0-9'’]@)[, ]@"
    .Execute
    Do While .Found = True
      .Execute Replace:=wdReplaceAll
    Loop
  End With
End With
Application.ScreenUpdating = True
End Sub

更有效地从文档中删除重复的单词

Delete duplicate words from document more efficiently

performance

vba

ms-word

duplicates