合并重叠日期并查找 non-overlaps 多个唯一 ID
Combining overlapping dates and finding non-overlaps for multiple unique IDs
我有一个包含 headers(超过 180k 行)的大型电子表格,在 A 中具有唯一 ID,在 B 中具有开始日期,在 C 中具有结束日期。每个 ID 有多行并且开始日期和结束日期重叠.
我需要找出每个 ID 的日期范围内的任何空白。我编写了一些不同的公式和宏,尝试并调整了我找到的 VBA 脚本。我尝试了一个电源查询和电源枢轴抓住吸管,但如果 Excel 没有崩溃,我就没有得到可用的输出。
示例数据:
ID
start
end
100
1/1/2015
3/1/2015
100
3/1/2015
1/1/2300
100
1/1/2018
1/1/2019
096
7/1/2020
1/1/2021
182
9/17/2017
1/1/2018
182
1/1/2018
1/1/2019
607
1/1/2015
9/1/2015
607
9/1/2015
1/1/2017
607
1/1/2018
1/1/2020
607
1/1/2021
1/1/2300
我想合并或合并这些以删除在日期范围内没有任何间隙的 ID 的额外行,但会为有以下 ID 的 ID 留下额外的行:
ID
start
end
100
1/1/2015
1/1/2300
096
7/1/2020
1/1/2021
182
9/17/2017
1/1/2019
607
1/1/2015
1/1/2017
607
1/1/2018
1/1/2020
607
1/1/2021
1/1/2300
我不需要它来组合;不过,为了演示,这会很好。此外,我会满足于能够告诉我哪些 ID 在范围内有差距的东西,即使它没有合并日期或删除额外的行。
我确实从另一个网站找到了一个几乎可以完成这项工作的脚本,但是因为日期范围不能全部按正确的顺序排序,比如示例中的 ID 100,它会在不应该的时候创建一个额外的行' t.
Sub Consolidate_Dates()
Dim cell As Range
Dim Nextrow As Long
Dim Startdate As Date
Nextrow = Range("A" & Rows.Count).End(xlUp).Row + 2
Startdate = Range("B2").Value
Application.ScreenUpdating = False
For Each cell In Range("A2", Range("A2").End(xlDown))
If cell.Value <> cell.Offset(1).Value Or _
cell.Offset(0, 2).Value < cell.Offset(1, 1).Value - 1 Then
Range("A" & Nextrow).Resize(1, 3).Value = cell.Resize(1, 3).Value
Range("B" & Nextrow).Value = Startdate
Nextrow = Nextrow + 1
Startdate = cell.Offset(1, 1).Value
End If
Next cell
Application.ScreenUpdating = True
End sub
试试这个。在开始之前确保数据范围按 ID 和开始日期排序。
Option Explicit
Public Enum ColId
ColId_Id = 1
ColId_Start_Date
ColId_End_Date
End Enum
Public Sub Test()
Dim row As Integer
' Skip the header row & the first data row.
' Start on the second data row.
row = 3
With Worksheets("Sheet1")
' Loop until you run out of data
Do While .Cells(row, ColId_Id) <> ""
' Compare the current row to the previous row.
' We're looking for the same id value and a start date that is
' within or adjoins the previous row's date range.
If .Cells(row, ColId_Id).Value = .Cells(row - 1, ColId_Id).Value _
And .Cells(row, ColId_Start_Date).Value >= .Cells(row - 1, ColId_Start_Date).Value _
And .Cells(row, ColId_Start_Date).Value <= .Cells(row - 1, ColId_End_Date).Value _
And .Cells(row, ColId_End_Date).Value > .Cells(row - 1, ColId_End_Date).Value _
Then
' Update the previous row and delete the current row.
.Cells(row - 1, ColId_End_Date).Value = .Cells(row, ColId_End_Date).Value
.Rows(row).Delete
Else
' Next row.
row = row + 1
End If
Loop
End With
End Sub
这是一个 Power Query 解决方案:
请阅读代码中的注释并探索应用步骤 window 以更好地理解算法,但是:
- 为每个 ID 在每个范围内创建 包含日期 的列表
- 将它们合并成一个列表
- 为每个 ID 创建一个包含 所有 个可能日期的列表
- 如果“ALL”范围内的所有日期都包含在组合列表中,那么我们就没有间隔。
- 创建两个单独的 table
- 一个与无间隙列表的组
- 一秒钟,然后我们 expand
- 追加两个 table。
请注意,许多步骤无法从 UI
完成
M码
粘贴到高级编辑器
确保将第 2 行中的 table 姓名更改为您实际的 table 姓名
let
Source = Excel.CurrentWorkbook(){[Name="Table1"]}[Content],
#"Changed Type" = Table.TransformColumnTypes(Source,{{"ID", Int64.Type}, {"start", type date}, {"end", type date}}),
//Turn each date range into a list
#"Added Custom" = Table.AddColumn(#"Changed Type", "dateList", each
List.Dates([start],
Duration.Days([end]-[start])+1,
#duration(1,0,0,0))),
//Group the rows by ID
/*Generate columns where
actual date ranges are combined into a list,
and a list of the Full date range for that ID*/
#"Grouped Rows" = Table.Group(#"Added Custom", {"ID"},
{{"All", each _, type table [ID=nullable number, start=nullable date, end=nullable date, dateList=list]},
{"combinedDates", each List.Distinct(List.Combine([dateList]))},
{"startToEnd", each List.Dates(List.Min([start]),
Duration.Days(List.Max([end])-List.Min([start]))+1,
#duration(1,0,0,0))}
}),
//if the full list and the combined list Match, then there are no gaps and return True else False
#"Added Custom1" = Table.AddColumn(#"Grouped Rows",
"Custom", each List.IsEmpty(List.Difference([startToEnd],[combinedDates]))),
#"Added Custom2" = Table.AddColumn(#"Added Custom1",
"start", each if [Custom] = false then null
else List.Min([combinedDates])),
#"Added Custom3" = Table.AddColumn(#"Added Custom2",
"end", each if [Custom] = false then null
else List.Max([combinedDates])),
//create the table of Trues which we will NOT expand
trueTbl = Table.SelectRows(#"Added Custom3", each [Custom] = true),
trueRemoveColumns = Table.RemoveColumns(trueTbl,
{"All", "combinedDates", "startToEnd","Custom"}),
trueTyped = Table.TransformColumnTypes(trueRemoveColumns,
{{"start", type date}, {"end", type date}}),
//create the table of False which we WILL expand
falseTbl = Table.SelectRows(#"Added Custom3", each [Custom] = false),
expandFalse = Table.ExpandTableColumn(falseTbl, "All",
{"start", "end"}, {"start.1", "end.1"}),
falseRemoveColumns = Table.RemoveColumns(expandFalse,
{"combinedDates", "startToEnd", "Custom", "start", "end"}),
falseRenameColumns = Table.RenameColumns(falseRemoveColumns,
{{"start.1", "start"}, {"end.1", "end"}}),
//Combine the tables
comb = Table.Combine({trueTyped, falseRenameColumns})
in
comb
这使用面向对象的方法。它首先将一组 ID 对象添加到字典中,每个对象对应一个唯一 ID。它向每个 ID 对象添加 ID 具有的日期跨度集合。添加每个跨度时,将开始数据与前一个结束日期进行比较,以确定是否存在差距。数据必须按ID、开始日期
排序
输入数据到sheet1,输出到sheet2。它显示了 D 和 E 列中的差距。还显示了创建测试数据的脚本。
Option Explicit
Sub Consolidate_Dates()
Const SHT_DATA = "Sheet1"
Const SHT_OUTPUT = "Sheet2"
Dim wb As Workbook, ws As Worksheet
Dim iLastRow As Long, i As Long, n As Integer
Dim dict As Object, id As String, objID As clsID
Dim t0 As Single, ar As Variant, obj As Variant
t0 = Timer
Set dict = CreateObject("Scripting.Dictionary")
' scan data on sheet 1
Set wb = ThisWorkbook
Set ws = wb.Sheets(SHT_DATA)
iLastRow = ws.Cells(rows.count, "A").End(xlUp).row
ar = ws.Range("A2:C" & iLastRow).Value2 ' put data in array
For i = 1 To UBound(ar)
id = Trim(ar(i, 1))
If Not dict.exists(id) Then
Set objID = New clsID
objID.id = id
dict.Add id, objID
End If
dict(id).AddSpan CDate(ar(i, 2)), CDate(ar(i, 3))
Next
' results sheet
With wb.Sheets(SHT_OUTPUT)
.Cells.Clear
.Range("A1:E1") = Array("ID", "Start", "End", "Gap Start", "Gap End")
.Columns("B:E").NumberFormat = "mm/dd/yyyy"
End With
ReDim ar(1 To iLastRow, 1 To 5) ' reuse part of array for output
i = 1
For Each obj In dict.items
Set objID = obj
' output spans and gaps
For n = 1 To obj.spansOut.count
ar(i, 1) = objID.id
ar(i, 2) = objID.spansOut(n).StartDate
ar(i, 3) = objID.spansOut(n).EndDate
' show gaps
If n > 1 Then
ar(i - 1, 4) = objID.spansOut(n - 1).EndDate
ar(i - 1, 5) = objID.spansOut(n).StartDate
End If
i = i + 1
Next
Next
' finish
Set dict = Nothing
With wb.Sheets(SHT_OUTPUT)
.Range("A2:E" & i).Value2 = ar
.Columns("A:E").AutoFit
.Activate
.Range("A1").Select
End With
Erase ar
MsgBox Format(i - 1, "#,###") & " rows output to " & SHT_OUTPUT, vbInformation, Int(Timer - t0) & " seconds"
End Sub
一个名为 clsID
的 Class 模块
Option Explicit
Public id As String ' unique id
Public hasGaps As Boolean
Public spans As New Collection
Public spansOut As New Collection
Sub AddSpan(dtStart As Date, dtEnd As Date)
Dim spNew As New clsSpan, spLast As clsSpan
spNew.StartDate = dtStart
spNew.EndDate = dtEnd
spans.Add spNew, CStr(spans.count + 1)
If spansOut.count = 0 Then
spansOut.Add spNew, "1"
hasGaps = False
Else
Set spLast = spansOut(spansOut.count)
If spNew.StartDate < spLast.StartDate Then
MsgBox "Start dates not sorted correctly for " & id, vbCritical
ElseIf spNew.StartDate > spLast.EndDate Then
' add new span
spansOut.Add spNew, CStr(spansOut.count + 1)
hasGaps = True
ElseIf spNew.EndDate > spLast.EndDate Then
' extend last span
spLast.EndDate = spNew.EndDate
Else
' no change
End If
End If
End Sub
一个名为 clsSpan
的 Class 模块
Option Explicit
Public StartDate As Date
Public EndDate As Date
生成随机测试数据的脚本
Sub testdata()
Const ROW_COUNT = 200000
Dim dt1 As Date, i As Long
Sheet1.Cells.Clear
For i = 2 To ROW_COUNT + 1
Sheet1.Cells(i, 1) = 1000 + Int(9000 * Rnd)
dt1 = CDate("1/1/2000") + Int(3650 * Rnd)
Sheet1.Cells(i, 2) = dt1
Sheet1.Cells(i, 3) = dt1 + Int(1000 * Rnd)
Next
With Sheet1.Sort
.SortFields.Clear
.SortFields.Add key:=Range("A1:A" & i)
.SortFields.Add key:=Range("B1:B" & i)
.SetRange Range("A1:C" & i)
.Header = xlYes
.MatchCase = False
.Orientation = xlTopToBottom
.SortMethod = xlPinYin
.Apply
End With
Sheet1.Activate
Sheet1.Range("A" & ROW_COUNT + 1).Select
MsgBox Format(ROW_COUNT, "#,###") & " rows created and sorted"
End Sub
我有一个包含 headers(超过 180k 行)的大型电子表格,在 A 中具有唯一 ID,在 B 中具有开始日期,在 C 中具有结束日期。每个 ID 有多行并且开始日期和结束日期重叠.
我需要找出每个 ID 的日期范围内的任何空白。我编写了一些不同的公式和宏,尝试并调整了我找到的 VBA 脚本。我尝试了一个电源查询和电源枢轴抓住吸管,但如果 Excel 没有崩溃,我就没有得到可用的输出。
示例数据:
ID | start | end |
---|---|---|
100 | 1/1/2015 | 3/1/2015 |
100 | 3/1/2015 | 1/1/2300 |
100 | 1/1/2018 | 1/1/2019 |
096 | 7/1/2020 | 1/1/2021 |
182 | 9/17/2017 | 1/1/2018 |
182 | 1/1/2018 | 1/1/2019 |
607 | 1/1/2015 | 9/1/2015 |
607 | 9/1/2015 | 1/1/2017 |
607 | 1/1/2018 | 1/1/2020 |
607 | 1/1/2021 | 1/1/2300 |
我想合并或合并这些以删除在日期范围内没有任何间隙的 ID 的额外行,但会为有以下 ID 的 ID 留下额外的行:
ID | start | end |
---|---|---|
100 | 1/1/2015 | 1/1/2300 |
096 | 7/1/2020 | 1/1/2021 |
182 | 9/17/2017 | 1/1/2019 |
607 | 1/1/2015 | 1/1/2017 |
607 | 1/1/2018 | 1/1/2020 |
607 | 1/1/2021 | 1/1/2300 |
我不需要它来组合;不过,为了演示,这会很好。此外,我会满足于能够告诉我哪些 ID 在范围内有差距的东西,即使它没有合并日期或删除额外的行。
我确实从另一个网站找到了一个几乎可以完成这项工作的脚本,但是因为日期范围不能全部按正确的顺序排序,比如示例中的 ID 100,它会在不应该的时候创建一个额外的行' t.
Sub Consolidate_Dates()
Dim cell As Range
Dim Nextrow As Long
Dim Startdate As Date
Nextrow = Range("A" & Rows.Count).End(xlUp).Row + 2
Startdate = Range("B2").Value
Application.ScreenUpdating = False
For Each cell In Range("A2", Range("A2").End(xlDown))
If cell.Value <> cell.Offset(1).Value Or _
cell.Offset(0, 2).Value < cell.Offset(1, 1).Value - 1 Then
Range("A" & Nextrow).Resize(1, 3).Value = cell.Resize(1, 3).Value
Range("B" & Nextrow).Value = Startdate
Nextrow = Nextrow + 1
Startdate = cell.Offset(1, 1).Value
End If
Next cell
Application.ScreenUpdating = True
End sub
试试这个。在开始之前确保数据范围按 ID 和开始日期排序。
Option Explicit
Public Enum ColId
ColId_Id = 1
ColId_Start_Date
ColId_End_Date
End Enum
Public Sub Test()
Dim row As Integer
' Skip the header row & the first data row.
' Start on the second data row.
row = 3
With Worksheets("Sheet1")
' Loop until you run out of data
Do While .Cells(row, ColId_Id) <> ""
' Compare the current row to the previous row.
' We're looking for the same id value and a start date that is
' within or adjoins the previous row's date range.
If .Cells(row, ColId_Id).Value = .Cells(row - 1, ColId_Id).Value _
And .Cells(row, ColId_Start_Date).Value >= .Cells(row - 1, ColId_Start_Date).Value _
And .Cells(row, ColId_Start_Date).Value <= .Cells(row - 1, ColId_End_Date).Value _
And .Cells(row, ColId_End_Date).Value > .Cells(row - 1, ColId_End_Date).Value _
Then
' Update the previous row and delete the current row.
.Cells(row - 1, ColId_End_Date).Value = .Cells(row, ColId_End_Date).Value
.Rows(row).Delete
Else
' Next row.
row = row + 1
End If
Loop
End With
End Sub
这是一个 Power Query 解决方案:
请阅读代码中的注释并探索应用步骤 window 以更好地理解算法,但是:
- 为每个 ID 在每个范围内创建 包含日期 的列表
- 将它们合并成一个列表
- 为每个 ID 创建一个包含 所有 个可能日期的列表
- 如果“ALL”范围内的所有日期都包含在组合列表中,那么我们就没有间隔。
- 创建两个单独的 table
- 一个与无间隙列表的组
- 一秒钟,然后我们 expand
- 追加两个 table。
请注意,许多步骤无法从 UI
完成M码
粘贴到高级编辑器
确保将第 2 行中的 table 姓名更改为您实际的 table 姓名
let
Source = Excel.CurrentWorkbook(){[Name="Table1"]}[Content],
#"Changed Type" = Table.TransformColumnTypes(Source,{{"ID", Int64.Type}, {"start", type date}, {"end", type date}}),
//Turn each date range into a list
#"Added Custom" = Table.AddColumn(#"Changed Type", "dateList", each
List.Dates([start],
Duration.Days([end]-[start])+1,
#duration(1,0,0,0))),
//Group the rows by ID
/*Generate columns where
actual date ranges are combined into a list,
and a list of the Full date range for that ID*/
#"Grouped Rows" = Table.Group(#"Added Custom", {"ID"},
{{"All", each _, type table [ID=nullable number, start=nullable date, end=nullable date, dateList=list]},
{"combinedDates", each List.Distinct(List.Combine([dateList]))},
{"startToEnd", each List.Dates(List.Min([start]),
Duration.Days(List.Max([end])-List.Min([start]))+1,
#duration(1,0,0,0))}
}),
//if the full list and the combined list Match, then there are no gaps and return True else False
#"Added Custom1" = Table.AddColumn(#"Grouped Rows",
"Custom", each List.IsEmpty(List.Difference([startToEnd],[combinedDates]))),
#"Added Custom2" = Table.AddColumn(#"Added Custom1",
"start", each if [Custom] = false then null
else List.Min([combinedDates])),
#"Added Custom3" = Table.AddColumn(#"Added Custom2",
"end", each if [Custom] = false then null
else List.Max([combinedDates])),
//create the table of Trues which we will NOT expand
trueTbl = Table.SelectRows(#"Added Custom3", each [Custom] = true),
trueRemoveColumns = Table.RemoveColumns(trueTbl,
{"All", "combinedDates", "startToEnd","Custom"}),
trueTyped = Table.TransformColumnTypes(trueRemoveColumns,
{{"start", type date}, {"end", type date}}),
//create the table of False which we WILL expand
falseTbl = Table.SelectRows(#"Added Custom3", each [Custom] = false),
expandFalse = Table.ExpandTableColumn(falseTbl, "All",
{"start", "end"}, {"start.1", "end.1"}),
falseRemoveColumns = Table.RemoveColumns(expandFalse,
{"combinedDates", "startToEnd", "Custom", "start", "end"}),
falseRenameColumns = Table.RenameColumns(falseRemoveColumns,
{{"start.1", "start"}, {"end.1", "end"}}),
//Combine the tables
comb = Table.Combine({trueTyped, falseRenameColumns})
in
comb
这使用面向对象的方法。它首先将一组 ID 对象添加到字典中,每个对象对应一个唯一 ID。它向每个 ID 对象添加 ID 具有的日期跨度集合。添加每个跨度时,将开始数据与前一个结束日期进行比较,以确定是否存在差距。数据必须按ID、开始日期
排序输入数据到sheet1,输出到sheet2。它显示了 D 和 E 列中的差距。还显示了创建测试数据的脚本。
Option Explicit
Sub Consolidate_Dates()
Const SHT_DATA = "Sheet1"
Const SHT_OUTPUT = "Sheet2"
Dim wb As Workbook, ws As Worksheet
Dim iLastRow As Long, i As Long, n As Integer
Dim dict As Object, id As String, objID As clsID
Dim t0 As Single, ar As Variant, obj As Variant
t0 = Timer
Set dict = CreateObject("Scripting.Dictionary")
' scan data on sheet 1
Set wb = ThisWorkbook
Set ws = wb.Sheets(SHT_DATA)
iLastRow = ws.Cells(rows.count, "A").End(xlUp).row
ar = ws.Range("A2:C" & iLastRow).Value2 ' put data in array
For i = 1 To UBound(ar)
id = Trim(ar(i, 1))
If Not dict.exists(id) Then
Set objID = New clsID
objID.id = id
dict.Add id, objID
End If
dict(id).AddSpan CDate(ar(i, 2)), CDate(ar(i, 3))
Next
' results sheet
With wb.Sheets(SHT_OUTPUT)
.Cells.Clear
.Range("A1:E1") = Array("ID", "Start", "End", "Gap Start", "Gap End")
.Columns("B:E").NumberFormat = "mm/dd/yyyy"
End With
ReDim ar(1 To iLastRow, 1 To 5) ' reuse part of array for output
i = 1
For Each obj In dict.items
Set objID = obj
' output spans and gaps
For n = 1 To obj.spansOut.count
ar(i, 1) = objID.id
ar(i, 2) = objID.spansOut(n).StartDate
ar(i, 3) = objID.spansOut(n).EndDate
' show gaps
If n > 1 Then
ar(i - 1, 4) = objID.spansOut(n - 1).EndDate
ar(i - 1, 5) = objID.spansOut(n).StartDate
End If
i = i + 1
Next
Next
' finish
Set dict = Nothing
With wb.Sheets(SHT_OUTPUT)
.Range("A2:E" & i).Value2 = ar
.Columns("A:E").AutoFit
.Activate
.Range("A1").Select
End With
Erase ar
MsgBox Format(i - 1, "#,###") & " rows output to " & SHT_OUTPUT, vbInformation, Int(Timer - t0) & " seconds"
End Sub
一个名为 clsID
Option Explicit
Public id As String ' unique id
Public hasGaps As Boolean
Public spans As New Collection
Public spansOut As New Collection
Sub AddSpan(dtStart As Date, dtEnd As Date)
Dim spNew As New clsSpan, spLast As clsSpan
spNew.StartDate = dtStart
spNew.EndDate = dtEnd
spans.Add spNew, CStr(spans.count + 1)
If spansOut.count = 0 Then
spansOut.Add spNew, "1"
hasGaps = False
Else
Set spLast = spansOut(spansOut.count)
If spNew.StartDate < spLast.StartDate Then
MsgBox "Start dates not sorted correctly for " & id, vbCritical
ElseIf spNew.StartDate > spLast.EndDate Then
' add new span
spansOut.Add spNew, CStr(spansOut.count + 1)
hasGaps = True
ElseIf spNew.EndDate > spLast.EndDate Then
' extend last span
spLast.EndDate = spNew.EndDate
Else
' no change
End If
End If
End Sub
一个名为 clsSpan
Option Explicit
Public StartDate As Date
Public EndDate As Date
生成随机测试数据的脚本
Sub testdata()
Const ROW_COUNT = 200000
Dim dt1 As Date, i As Long
Sheet1.Cells.Clear
For i = 2 To ROW_COUNT + 1
Sheet1.Cells(i, 1) = 1000 + Int(9000 * Rnd)
dt1 = CDate("1/1/2000") + Int(3650 * Rnd)
Sheet1.Cells(i, 2) = dt1
Sheet1.Cells(i, 3) = dt1 + Int(1000 * Rnd)
Next
With Sheet1.Sort
.SortFields.Clear
.SortFields.Add key:=Range("A1:A" & i)
.SortFields.Add key:=Range("B1:B" & i)
.SetRange Range("A1:C" & i)
.Header = xlYes
.MatchCase = False
.Orientation = xlTopToBottom
.SortMethod = xlPinYin
.Apply
End With
Sheet1.Activate
Sheet1.Range("A" & ROW_COUNT + 1).Select
MsgBox Format(ROW_COUNT, "#,###") & " rows created and sorted"
End Sub