更有效地从文档中删除重复的单词
Delete duplicate words from document more efficiently
我将每个单词与其他单词进行比较,检查是否重复,如果是,则将其删除。 1 到 4 页最多需要 5 分钟。
对于 50 或 100 页的文档,我需要修改或新想法,以便用更少的时间比较和删除重复项。
Sub Delete_Duplicates()
'***********'
'By
'MBA
'***********'
Dim AD As Range
Dim F As Range
Dim i As Long
Set AD = ActiveDocument.Range
Z = AD.Words.Count
y = 1
For i = Z To 1 Step -1
y = y + 1
Set F = AD.Words(i)
On Error Resume Next
Set s = AD.Words(i - 1)
If Trim(AD.Words(i - 1)) = "," Then Set s = AD.Words(i - 2): Set c = AD.Words(i - 1)
If Err.Number > 0 Then Exit Sub
If Not F.Text = Chr(13) And UCase(Trim(F.Text)) = UCase(Trim(s.Text)) Then
F.Text = ""
If Not c Is Nothing Then c.Text = " ": Set c = Nothing
End If
If Not c Is Nothing Then Set c = Nothing
On Error Resume Next
Call ProgressBar.Progress(y / Z * 100, True) '<<-- Progress Bar
On Error GoTo 0
Next
Beep
End Sub
Before/After
这只是概念,但请尝试准备文档中所有单词的列表,如果存在,则替换双字或三字。
Private Sub DeleteDuplicate()
Dim wholeTxt As String
Dim w As Range
Dim col As New Collection
Dim c
For Each w In ActiveDocument.Words
AddUniqueItem col, Trim(w.Text)
Next w
wholeTxt = ActiveDocument.Range.Text
For Each c In col
'add case with ","
'maybe one letter word should be forbidden, or add extra boundary
If InStr(1, wholeTxt, c & " " & c, vbBinaryCompare) <> 0 Then
'start of doc
Selection.HomeKey wdStory
'here should be all stuff to prepare replacement
'(...)
Selection.Find.Execute Findtext:=c & " " & c, ReplaceWith:=c
wholeTxt = ActiveDocument.Range.Text
End If
Next c
Set col = Nothing
End Sub
Private Sub AddUniqueItem(ByRef col As Collection, ByVal itemValAndKey As String)
Dim s As String
On Error Resume Next
s = col(itemValAndKey)
If Err.Number <> 0 Then
col.Add itemValAndKey, itemValAndKey
Err.Clear
End If
On Error GoTo 0
End Sub
假设整个文档都是纯文本,我们可以赋值整个文档的文本,然后用Split
转换成单词数组
因为它在数组中,所以处理它们比访问 Words
集合更快。
这是我能想到的全部,但也许有更好的方法来做到这一点?下面的示例使用正则表达式搜索并替换所有匹配的重复项:
Option Explicit
Sub Delete_Duplicate()
Const maxWord As Long = 2 'Change this to increase the max amount of words should be used to match as a phrase.
Dim fullTxt As String
fullTxt = ActiveDocument.Range.Text
Dim txtArr() As String
txtArr = Split(fullTxt, " ")
Dim regex As RegExp
Set regex = New RegExp
regex.Global = True
regex.IgnoreCase = True
Dim outputTxt As String
outputTxt = fullTxt
Dim n As Long
Dim i As Long
For i = UBound(txtArr) To 0 Step -1
Dim matchWord As String
matchWord = vbNullString
For n = 0 To maxWord - 1
If (i - n) < 0 Then Exit For
matchWord = txtArr(i - n) & " " & matchWord
matchWord = Trim$(Replace(matchWord, vbCr, vbNullString))
regex.Pattern = matchWord & "[, ]{0,}" & matchWord
If regex.test(outputTxt) Then
outputTxt = regex.Replace(outputTxt, matchWord)
End If
Next n
Next i
Set regex = Nothing
Application.UndoRecord.StartCustomRecord "Delete Duplicates"
ActiveDocument.Range.Text = outputTxt
Application.UndoRecord.EndCustomRecord
End Sub
您可以按照以下方式尝试:
Sub Demo()
Application.ScreenUpdating = False
With ActiveDocument.Range
With .Find
.ClearFormatting
.Replacement.ClearFormatting
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.MatchWildcards = True
.Text = "([A-Za-z0-9'’]@)[, ]@"
.Execute
Do While .Found = True
.Execute Replace:=wdReplaceAll
Loop
.Text = "([A-Za-z0-9'’]@[, ]@[A-Za-z0-9'’]@)[, ]@"
.Execute
Do While .Found = True
.Execute Replace:=wdReplaceAll
Loop
End With
End With
Application.ScreenUpdating = True
End Sub
我将每个单词与其他单词进行比较,检查是否重复,如果是,则将其删除。 1 到 4 页最多需要 5 分钟。
对于 50 或 100 页的文档,我需要修改或新想法,以便用更少的时间比较和删除重复项。
Sub Delete_Duplicates()
'***********'
'By
'MBA
'***********'
Dim AD As Range
Dim F As Range
Dim i As Long
Set AD = ActiveDocument.Range
Z = AD.Words.Count
y = 1
For i = Z To 1 Step -1
y = y + 1
Set F = AD.Words(i)
On Error Resume Next
Set s = AD.Words(i - 1)
If Trim(AD.Words(i - 1)) = "," Then Set s = AD.Words(i - 2): Set c = AD.Words(i - 1)
If Err.Number > 0 Then Exit Sub
If Not F.Text = Chr(13) And UCase(Trim(F.Text)) = UCase(Trim(s.Text)) Then
F.Text = ""
If Not c Is Nothing Then c.Text = " ": Set c = Nothing
End If
If Not c Is Nothing Then Set c = Nothing
On Error Resume Next
Call ProgressBar.Progress(y / Z * 100, True) '<<-- Progress Bar
On Error GoTo 0
Next
Beep
End Sub
Before/After
这只是概念,但请尝试准备文档中所有单词的列表,如果存在,则替换双字或三字。
Private Sub DeleteDuplicate()
Dim wholeTxt As String
Dim w As Range
Dim col As New Collection
Dim c
For Each w In ActiveDocument.Words
AddUniqueItem col, Trim(w.Text)
Next w
wholeTxt = ActiveDocument.Range.Text
For Each c In col
'add case with ","
'maybe one letter word should be forbidden, or add extra boundary
If InStr(1, wholeTxt, c & " " & c, vbBinaryCompare) <> 0 Then
'start of doc
Selection.HomeKey wdStory
'here should be all stuff to prepare replacement
'(...)
Selection.Find.Execute Findtext:=c & " " & c, ReplaceWith:=c
wholeTxt = ActiveDocument.Range.Text
End If
Next c
Set col = Nothing
End Sub
Private Sub AddUniqueItem(ByRef col As Collection, ByVal itemValAndKey As String)
Dim s As String
On Error Resume Next
s = col(itemValAndKey)
If Err.Number <> 0 Then
col.Add itemValAndKey, itemValAndKey
Err.Clear
End If
On Error GoTo 0
End Sub
假设整个文档都是纯文本,我们可以赋值整个文档的文本,然后用Split
转换成单词数组
因为它在数组中,所以处理它们比访问 Words
集合更快。
这是我能想到的全部,但也许有更好的方法来做到这一点?下面的示例使用正则表达式搜索并替换所有匹配的重复项:
Option Explicit
Sub Delete_Duplicate()
Const maxWord As Long = 2 'Change this to increase the max amount of words should be used to match as a phrase.
Dim fullTxt As String
fullTxt = ActiveDocument.Range.Text
Dim txtArr() As String
txtArr = Split(fullTxt, " ")
Dim regex As RegExp
Set regex = New RegExp
regex.Global = True
regex.IgnoreCase = True
Dim outputTxt As String
outputTxt = fullTxt
Dim n As Long
Dim i As Long
For i = UBound(txtArr) To 0 Step -1
Dim matchWord As String
matchWord = vbNullString
For n = 0 To maxWord - 1
If (i - n) < 0 Then Exit For
matchWord = txtArr(i - n) & " " & matchWord
matchWord = Trim$(Replace(matchWord, vbCr, vbNullString))
regex.Pattern = matchWord & "[, ]{0,}" & matchWord
If regex.test(outputTxt) Then
outputTxt = regex.Replace(outputTxt, matchWord)
End If
Next n
Next i
Set regex = Nothing
Application.UndoRecord.StartCustomRecord "Delete Duplicates"
ActiveDocument.Range.Text = outputTxt
Application.UndoRecord.EndCustomRecord
End Sub
您可以按照以下方式尝试:
Sub Demo()
Application.ScreenUpdating = False
With ActiveDocument.Range
With .Find
.ClearFormatting
.Replacement.ClearFormatting
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.MatchWildcards = True
.Text = "([A-Za-z0-9'’]@)[, ]@"
.Execute
Do While .Found = True
.Execute Replace:=wdReplaceAll
Loop
.Text = "([A-Za-z0-9'’]@[, ]@[A-Za-z0-9'’]@)[, ]@"
.Execute
Do While .Found = True
.Execute Replace:=wdReplaceAll
Loop
End With
End With
Application.ScreenUpdating = True
End Sub