合并重叠日期并查找 non-overlaps 多个唯一 ID

Combining overlapping dates and finding non-overlaps for multiple unique IDs

我有一个包含 headers(超过 180k 行)的大型电子表格,在 A 中具有唯一 ID,在 B 中具有开始日期,在 C 中具有结束日期。每个 ID 有多行并且开始日期和结束日期重叠.

我需要找出每个 ID 的日期范围内的任何空白。我编写了一些不同的公式和宏,尝试并调整了我找到的 VBA 脚本。我尝试了一个电源查询和电源枢轴抓住吸管,但如果 Excel 没有崩溃,我就没有得到可用的输出。

示例数据:

ID start end
100 1/1/2015 3/1/2015
100 3/1/2015 1/1/2300
100 1/1/2018 1/1/2019
096 7/1/2020 1/1/2021
182 9/17/2017 1/1/2018
182 1/1/2018 1/1/2019
607 1/1/2015 9/1/2015
607 9/1/2015 1/1/2017
607 1/1/2018 1/1/2020
607 1/1/2021 1/1/2300

我想合并或合并这些以删除在日期范围内没有任何间隙的 ID 的额外行,但会为有以下 ID 的 ID 留下额外的行:

ID start end
100 1/1/2015 1/1/2300
096 7/1/2020 1/1/2021
182 9/17/2017 1/1/2019
607 1/1/2015 1/1/2017
607 1/1/2018 1/1/2020
607 1/1/2021 1/1/2300

不需要它来组合;不过,为了演示,这会很好。此外,我会满足于能够告诉我哪些 ID 在范围内有差距的东西,即使它没有合并日期或删除额外的行。

我确实从另一个网站找到了一个几乎可以完成这项工作的脚本,但是因为日期范围不能全部按正确的顺序排序,比如示例中的 ID 100,它会在不应该的时候创建一个额外的行' t.

Sub Consolidate_Dates()
    
    Dim cell As Range
    Dim Nextrow As Long
    Dim Startdate As Date
    
    Nextrow = Range("A" & Rows.Count).End(xlUp).Row + 2
    Startdate = Range("B2").Value
    
    Application.ScreenUpdating = False
    For Each cell In Range("A2", Range("A2").End(xlDown))
        If cell.Value <> cell.Offset(1).Value Or _
           cell.Offset(0, 2).Value < cell.Offset(1, 1).Value - 1 Then
            Range("A" & Nextrow).Resize(1, 3).Value = cell.Resize(1, 3).Value
            Range("B" & Nextrow).Value = Startdate
            Nextrow = Nextrow + 1
            Startdate = cell.Offset(1, 1).Value
        End If
    Next cell
    Application.ScreenUpdating = True
End sub

试试这个。在开始之前确保数据范围按 ID 和开始日期排序。

Option Explicit

Public Enum ColId
    ColId_Id = 1
    ColId_Start_Date
    ColId_End_Date
End Enum

Public Sub Test()

    Dim row As Integer
    
    ' Skip the header row & the first data row. 
    ' Start on the second data row.
    row = 3
    
    With Worksheets("Sheet1")
        
        ' Loop until you run out of data
        Do While .Cells(row, ColId_Id) <> ""
            
            ' Compare the current row to the previous row.
            ' We're looking for the same id value and a start date that is 
            ' within or adjoins the previous row's date range.
            If .Cells(row, ColId_Id).Value = .Cells(row - 1, ColId_Id).Value _
            And .Cells(row, ColId_Start_Date).Value >= .Cells(row - 1, ColId_Start_Date).Value _
            And .Cells(row, ColId_Start_Date).Value <= .Cells(row - 1, ColId_End_Date).Value _
            And .Cells(row, ColId_End_Date).Value > .Cells(row - 1, ColId_End_Date).Value _
            Then

                ' Update the previous row and delete the current row.
                .Cells(row - 1, ColId_End_Date).Value = .Cells(row, ColId_End_Date).Value
                .Rows(row).Delete

            Else

                ' Next row.
                row = row + 1

            End If
        
        Loop
    
    End With

End Sub

这是一个 Power Query 解决方案:

请阅读代码中的注释并探索应用步骤 window 以更好地理解算法,但是:

  • 为每个 ID 在每个范围内创建 包含日期 的列表
    • 将它们合并成一个列表
  • 为每个 ID 创建一个包含 所有 个可能日期的列表
  • 如果“ALL”范围内的所有日期都包含在组合列表中,那么我们就没有间隔。
  • 创建两个单独的 table
    • 一个与无间隙列表的组
    • 一秒钟,然后我们 expand
  • 追加两个 table。

请注意,许多步骤无法从 UI

完成

M码

粘贴到高级编辑器

确保将第 2 行中的 table 姓名更改为您实际的 table 姓名

let
    Source = Excel.CurrentWorkbook(){[Name="Table1"]}[Content],
    #"Changed Type" = Table.TransformColumnTypes(Source,{{"ID", Int64.Type}, {"start", type date}, {"end", type date}}),

//Turn each date range into a list
    #"Added Custom" = Table.AddColumn(#"Changed Type", "dateList", each 
        List.Dates([start],
                    Duration.Days([end]-[start])+1,
                    #duration(1,0,0,0))),

  //Group the rows by ID
  /*Generate columns where 
      actual date ranges are combined into a list,
      and a list of the Full date range for that ID*/
    #"Grouped Rows" = Table.Group(#"Added Custom", {"ID"}, 
        {{"All", each _, type table [ID=nullable number, start=nullable date, end=nullable date, dateList=list]},
        {"combinedDates", each List.Distinct(List.Combine([dateList]))},
        {"startToEnd", each List.Dates(List.Min([start]),
                                Duration.Days(List.Max([end])-List.Min([start]))+1,
                                #duration(1,0,0,0))}        
        }),

  //if the full list and the combined list Match, then there are no gaps and return True else False        
    #"Added Custom1" = Table.AddColumn(#"Grouped Rows", 
          "Custom", each List.IsEmpty(List.Difference([startToEnd],[combinedDates]))),
    #"Added Custom2" = Table.AddColumn(#"Added Custom1", 
          "start", each if [Custom] = false then null
                else List.Min([combinedDates])),
    #"Added Custom3" = Table.AddColumn(#"Added Custom2", 
          "end", each if [Custom] = false then null 
                else List.Max([combinedDates])),

  //create the table of Trues which we will NOT expand
  trueTbl = Table.SelectRows(#"Added Custom3", each [Custom] = true),
    trueRemoveColumns = Table.RemoveColumns(trueTbl,
            {"All", "combinedDates", "startToEnd","Custom"}),
    trueTyped = Table.TransformColumnTypes(trueRemoveColumns,
            {{"start", type date}, {"end", type date}}),

   //create the table of False which we WILL expand 
  falseTbl = Table.SelectRows(#"Added Custom3", each [Custom] = false),
    expandFalse = Table.ExpandTableColumn(falseTbl, "All", 
            {"start", "end"}, {"start.1", "end.1"}),
    falseRemoveColumns = Table.RemoveColumns(expandFalse,
            {"combinedDates", "startToEnd", "Custom", "start", "end"}),
    falseRenameColumns = Table.RenameColumns(falseRemoveColumns,
            {{"start.1", "start"}, {"end.1", "end"}}),

//Combine the tables
    comb = Table.Combine({trueTyped, falseRenameColumns})
in 
   comb

这使用面向对象的方法。它首先将一组 ID 对象添加到字典中,每个对象对应一个唯一 ID。它向每个 ID 对象添加 ID 具有的日期跨度集合。添加每个跨度时,将开始数据与前一个结束日期进行比较,以确定是否存在差距。数据必须按ID、开始日期

排序

输入数据到sheet1,输出到sheet2。它显示了 D 和 E 列中的差距。还显示了创建测试数据的脚本。

Option Explicit

Sub Consolidate_Dates()
    
    Const SHT_DATA = "Sheet1"
    Const SHT_OUTPUT = "Sheet2"

    Dim wb As Workbook, ws As Worksheet
    Dim iLastRow As Long, i As Long, n As Integer
    Dim dict As Object, id As String, objID As clsID
    Dim t0 As Single, ar As Variant, obj As Variant
    t0 = Timer
   
    Set dict = CreateObject("Scripting.Dictionary")
  
    ' scan data on sheet 1
    Set wb = ThisWorkbook
    Set ws = wb.Sheets(SHT_DATA)
    iLastRow = ws.Cells(rows.count, "A").End(xlUp).row
    ar = ws.Range("A2:C" & iLastRow).Value2 ' put data in array
    For i = 1 To UBound(ar)
        id = Trim(ar(i, 1))
        If Not dict.exists(id) Then
            Set objID = New clsID
            objID.id = id
            dict.Add id, objID
        End If
        dict(id).AddSpan CDate(ar(i, 2)), CDate(ar(i, 3))
    Next

    ' results sheet
    With wb.Sheets(SHT_OUTPUT)
        .Cells.Clear
        .Range("A1:E1") = Array("ID", "Start", "End", "Gap Start", "Gap End")
        .Columns("B:E").NumberFormat = "mm/dd/yyyy"
    End With
    ReDim ar(1 To iLastRow, 1 To 5) ' reuse part of array for output
    i = 1
    For Each obj In dict.items
        Set objID = obj
        ' output spans and gaps
        For n = 1 To obj.spansOut.count
            ar(i, 1) = objID.id
            ar(i, 2) = objID.spansOut(n).StartDate
            ar(i, 3) = objID.spansOut(n).EndDate
            ' show gaps
            If n > 1 Then
                ar(i - 1, 4) = objID.spansOut(n - 1).EndDate
                ar(i - 1, 5) = objID.spansOut(n).StartDate
            End If
            i = i + 1
        Next
    Next
    
    ' finish
    Set dict = Nothing
    With wb.Sheets(SHT_OUTPUT)
        .Range("A2:E" & i).Value2 = ar
        .Columns("A:E").AutoFit
        .Activate
        .Range("A1").Select
    End With
    Erase ar
   
    MsgBox Format(i - 1, "#,###") & " rows output to " & SHT_OUTPUT, vbInformation, Int(Timer - t0) & " seconds"
End Sub

一个名为 clsID

的 Class 模块
Option Explicit

Public id As String ' unique id
Public hasGaps As Boolean
Public spans As New Collection
Public spansOut As New Collection

Sub AddSpan(dtStart As Date, dtEnd As Date)
    
    Dim spNew As New clsSpan, spLast As clsSpan
    spNew.StartDate = dtStart
    spNew.EndDate = dtEnd
    spans.Add spNew, CStr(spans.count + 1)

    If spansOut.count = 0 Then
        spansOut.Add spNew, "1"
        hasGaps = False
    Else
        Set spLast = spansOut(spansOut.count)
        If spNew.StartDate < spLast.StartDate Then
            MsgBox "Start dates not sorted correctly for " & id, vbCritical
        ElseIf spNew.StartDate > spLast.EndDate Then
            ' add new span
            spansOut.Add spNew, CStr(spansOut.count + 1)
            hasGaps = True
        ElseIf spNew.EndDate > spLast.EndDate Then
            ' extend last span
            spLast.EndDate = spNew.EndDate
        Else
            ' no change
        End If
    End If
End Sub

一个名为 clsSpan

的 Class 模块
Option Explicit

Public StartDate As Date
Public EndDate As Date

生成随机测试数据的脚本

Sub testdata()
    Const ROW_COUNT = 200000
    Dim dt1 As Date, i As Long
    Sheet1.Cells.Clear
    For i = 2 To ROW_COUNT + 1
        Sheet1.Cells(i, 1) = 1000 + Int(9000 * Rnd)
        dt1 = CDate("1/1/2000") + Int(3650 * Rnd)
        Sheet1.Cells(i, 2) = dt1
        Sheet1.Cells(i, 3) = dt1 + Int(1000 * Rnd)
    Next

    With Sheet1.Sort
        .SortFields.Clear
        .SortFields.Add key:=Range("A1:A" & i)
        .SortFields.Add key:=Range("B1:B" & i)
        .SetRange Range("A1:C" & i)
        .Header = xlYes
        .MatchCase = False
        .Orientation = xlTopToBottom
        .SortMethod = xlPinYin
        .Apply
    End With
    Sheet1.Activate
    Sheet1.Range("A" & ROW_COUNT + 1).Select
   
    MsgBox Format(ROW_COUNT, "#,###") & " rows created and sorted"
End Sub