将重叠范围拆分为所有唯一范围

Split overlapping ranges into all unique ranges

我正在尝试拆分具有关联属性的动态数量的范围,以便每当 2 个或更多范围重叠时,重叠部分将拆分为具有所有重叠属性组合的唯一范围。

数据集的类型是

case class testCC(bgn_ts: java.sql.Timestamp, end_ts: java.sql.Timestamp, supply1: Int, supply2: Int)

现在数据 dataset1 的 supply2 始终为 0,dataset2 的 supply 1 始终为 0。

                         Set1                                    
bgn_ts            | end_ts            |  supply1   |supply2 |    
2020-12-13T11:45  | 2020-12-16T11:45  | 10         |  0     |    
2020-12-16T11:45  | 9999-12-31T00:00  | 11         |  0     |

                         Set2
bgn_ts            | end_ts            |  supply1   |supply2 | 
2020-12-15T11:45  | 9999-12-31T00:00  | 0          |  12    |

这应该导致

bgn_ts            | end_ts            |  supply1   |supply2 |
2020-12-13T11:45  | 2020-12-15T11:45  |   10       |  0     |
2020-12-15T11:45  | 2020-12-16T11:45  |   10       |  12    |
2020-12-16T11:45  | 9999-12-31T00:00  |   11       |  12    |

另一个例子:

                         Set1                                    
bgn_ts            | end_ts            |  supply1   |supply2 |    
2020-12-13T11:45  | 2020-12-16T11:45  | 10         |  0     |    
2020-12-16T11:45  | 2020-12-17T11:45  | 12         |  0     |
2020-12-17T11:45  | 9999-12-31T00:00  | 20         |  0     |

                         Set2                                    
bgn_ts            | end_ts            |  supply1   |supply2 |    
2020-12-13T11:45  | 2020-12-15T11:45  | 0          | 10     |    
2020-12-15T11:45  | 2020-12-16T11:45  | 0          |  9     |
2020-12-16T11:45  | 2020-12-18T11:45  | 0          | 12     |
2020-12-18T11:45  | 9999-12-31T00:00  | 0          | 21     |

这应该导致

bgn_ts            | end_ts            |  supply1   |supply2 |    
2020-12-13T11:45  | 2020-12-15T11:45  | 10         | 10     |    
2020-12-15T11:45  | 2020-12-16T11:45  | 10         |  9     |
2020-12-16T11:45  | 2020-12-17T11:45  | 12         | 12     |
2020-12-17T11:45  | 2020-12-18T11:45  | 20         | 12     |
2020-12-18T11:45  | 9999-12-31T00:00  | 20         | 21     |

下面是我试过的代码:

implicit def ts_ordering: Ordering[Timestamp] = new Ordering[Timestamp] {
            override def compare(x: Timestamp, y: Timestamp): Int = x compareTo y
          }

set2.sortBy(_.end_ts)
.foldLeft(set1.sortBy(_.end_ts)) {
  case (result, set2_entry) =>
    result.span(!_.end_ts.after(set2_entry.bgn_ts)) match {
      case (before, remaining) =>
        val (overlap, after) = remaining.span(_.bgn_ts.before(set2_entry.end_ts))
        before ++ (
          overlap.flatMap {
            case o if o.bgn_ts.equals(set2_entry.bgn_ts) && o.end_ts.equals(set2_entry.end_ts) =>
              o.copy(supply2 = set2_entry.supply2) :: Nil
            case o if o.bgn_ts.before(set2_entry.bgn_ts) && o.end_ts.after(set2_entry.end_ts) =>
              // need to split into three here, set2 record is entirely within this record
              o.copy(end_ts = set2_entry.bgn_ts) ::
              o.copy(bgn_ts = set2_entry.bgn_ts, end_ts = set2_entry.end_ts, supply2 = set2_entry.supply2) ::
              o.copy(bgn_ts = set2_entry.end_ts) ::
              Nil
            case o if !o.bgn_ts.after(set2_entry.bgn_ts) =>
              // split into two here, set2 record overlaps after the beginning of this record
              o.copy(end_ts = set2_entry.bgn_ts) ::
              o.copy(bgn_ts = set2_entry.bgn_ts, supply2 = set2_entry.supply2) ::
              Nil
            case o =>
              // split into two here, set2 record overlaps before the end of this record
              o.copy(end_ts = set2_entry.end_ts, supply2 = set2_entry.supply2) ::
              o.copy(bgn_ts = set2_entry.end_ts) ::
              Nil
          }
        ) ++ after
    }
}

为了方便测试,我提供测试数据:

Example1
val set1 = List(
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-13 11:45:00"),
        end_ts = Timestamp.valueOf("2020-12-16 11:45:00"), supply2 = 0, supply1 = 10),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-16 11:45:00"),
        end_ts = Timestamp.valueOf("9999-12-31 00:00:00"), supply2 = 0, supply1 = 11)
)

val set2 = List(
  testCC(
    bgn_ts = Timestamp.valueOf("2020-12-15 11:45:00"),
    end_ts = Timestamp.valueOf("9999-12-31 00:00:00"), supply1 = 0, supply2 = 12)
)

val expected1 = List(
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-13 11:45:00.0"),
        end_ts = Timestamp.valueOf("2020-12-15 11:45:00.0"), supply1 = 10, supply2 = 0),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-15 11:45:00.0"),
        end_ts = Timestamp.valueOf("2020-12-16 11:45:00.0"), supply1 = 10, supply2 = 12),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-16 11:45:00.0"),
        end_ts = Timestamp.valueOf("9999-12-31 00:00:00"), supply1 = 11, supply2 = 12)
    )


Example2
val set1 = List(
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-13 11:45:00"),
        end_ts = Timestamp.valueOf("2020-12-16 11:45:00"), supply2 = 0, supply1 = 10),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-16 11:45:00"),
        end_ts = Timestamp.valueOf("2020-12-17 11:45:00"), supply2 = 0, supply1 = 12),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-17 11:45:00"),
        end_ts = Timestamp.valueOf("9999-12-31 00:00:00"), supply2 = 0, supply1 = 20)
    )

val set2 = List(
  testCC(
    bgn_ts = Timestamp.valueOf("2020-12-13 11:45:00"),
    end_ts = Timestamp.valueOf("2020-12-15 11:45:00"), supply1 = 0, supply2 = 10),
  testCC(
    bgn_ts = Timestamp.valueOf("2020-12-15 11:45:00"),
    end_ts = Timestamp.valueOf("2020-12-16 11:45:00"), supply1 = 0, supply2 = 9),
  testCC(
    bgn_ts = Timestamp.valueOf("2020-12-16 11:45:00"),
    end_ts = Timestamp.valueOf("2020-12-18 11:45:00"), supply1 = 0, supply2 = 12),
  testCC(
    bgn_ts = Timestamp.valueOf("2020-12-18 11:45:00"),
    end_ts = Timestamp.valueOf("9999-12-31 00:00:00"), supply1 = 0, supply2 = 21)
)

val expected1 = List(
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-13 11:45:00.0"),
        end_ts = Timestamp.valueOf("2020-12-15 11:45:00.0"), supply1 = 10, supply2 = 10),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-15 11:45:00.0"),
        end_ts = Timestamp.valueOf("2020-12-16 11:45:00.0"), supply1 = 10, supply2 = 9),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-16 11:45:00.0"),
        end_ts = Timestamp.valueOf("2020-12-17 11:45:00.0"), supply1 = 12, supply2 = 12),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-17 11:45:00.0"),
        end_ts = Timestamp.valueOf("2020-12-18 11:45:00.0"), supply1 = 20, supply2 = 12),
      testCC(
        bgn_ts = Timestamp.valueOf("2020-12-18 11:45:00.0"),
        end_ts = Timestamp.valueOf("9999-12-31 00:00:00"), supply1 = 20, supply2 = 21)
    )

这似乎适用于提供的 2 个测试示例。

import java.sql.Timestamp

case class TestCC(bgn_ts: Timestamp, end_ts: Timestamp
                 ,supply1: Int, supply2: Int)

def mergeRanges(lst1: List[TestCC]
               ,lst2: List[TestCC]): List[TestCC] = {
  val lsts = lst1 ++ lst2
  val bMap = lsts.groupMapReduce(_.bgn_ts)(identity){
    case (a,b) => TestCC(a.bgn_ts, b.end_ts
                        ,a.supply1+b.supply1, a.supply2+b.supply2)
  }
  val eMap = lsts.groupMapReduce(_.end_ts)(identity){
    case (a,b) => TestCC(a.bgn_ts, b.end_ts
                        ,a.supply1+b.supply1, a.supply2+b.supply2)
  }
  List.unfold(bMap.keys.toList.sorted ->
              eMap.keys.toList.sorted){
    case (Nil, Nil) => None
    case (Nil,_)|(_,Nil) => throw new Error("unexpected")
    case (b::bs, e::es) =>
      if (bs.nonEmpty && bs.head.before(e))
        Some(bMap(b).copy(end_ts = bs.head) -> (bs,e::es))
      else
        Some(TestCC(b, e
                   ,bMap(b).supply1 max eMap(e).supply1
                   ,bMap(b).supply2 max eMap(e).supply2) -> (bs,es))
  }
}

该代码对输入数据做了一些假设,我不确定它处理格式错误的输入的效果如何,但试试看它是否能让您更接近您的目标。


第二次尝试

经过深思熟虑,我决定采用不同的方法。

def mergeRanges(lst1: List[TestCC]
               ,lst2: List[TestCC]): List[TestCC] = {
  val lsts = lst1 ++ lst2
  val s1Map = lsts.filter(_.supply1 > 0)
                  .sortBy(_.bgn_ts)
                  .foldLeft(Map.empty[Timestamp,Int]){
                    case (mp, TestCC(b,e,s1,_)) =>
                      mp + (b -> s1) + (e -> s1)
                  }.withDefaultValue(0)
  val s1Lst = s1Map.keys.toList.sorted.reverse
  val s2Map = lsts.filter(_.supply2 > 0)
                  .sortBy(_.bgn_ts)
                  .foldLeft(Map.empty[Timestamp,Int]){
                    case (mp, TestCC(b,e,_,s2)) =>
                      mp + (b -> s2) + (e -> s2)
                  }.withDefaultValue(0)
  val s2Lst = s2Map.keys.toList.sorted.reverse
  lsts.flatMap{case TestCC(b,e,_,_) => List(b,e)}
      .distinct.sorted.sliding(2).toList
      .map{case List(ts1, ts2) =>
        TestCC(ts1, ts2
          ,s1Lst.dropWhile(_.after(ts1)).headOption.fold(0)(s1Map)
          ,s2Lst.dropWhile(_.after(ts1)).headOption.fold(0)(s2Map))
      }
}

我相信这通过了所有 current 测试示例。

这个是我想出来的,有点像@jwvh的思路。我没有使用地图,而是将事件放在 Set


case class Event(offset: Timestamp, isEnd: Boolean, supply1: Int, supply2: Int)
case class CoreData(supply1: Int, supply2: Int)

def createEventsFromSymbol(x: TestCC): List[Event] = {
    Event(x.bgn_ts, isEnd = false, x.supply1, x.supply2) ::
    Event(x.end_ts, isEnd = true, x.supply1, x.supply2) :: Nil
}
val lst = ats ++ inv

val events = lst
      .flatMap(createEventsFromSymbol)
      .sortBy(x => (x.offset, x.isEnd))

val defaultTs = Timestamp.valueOf("1970-01-01 00:00:00.0")

val (result1, _, _) =
    events.foldLeft(List.empty[InvAts], defaultTs, List.empty[CoreData]){

      case ((result, currentStart, currentSet), currentEvent) =>
        if(currentEvent.isEnd) {
          val mergedEvent =
            if(!currentStart.equals(defaultTs) && currentEvent.offset.after(currentStart)) {
              val coreData: CoreData = currentSet
                .reduce((a, b) => CoreData(a.current_supply + b.current_supply, a.available_to_sell + b.available_to_sell))

              TestCC(currentStart, currentEvent.offset, x.current_supply, x.available_to_sell) :: Nil
            }
            else {
              Nil
            }

            val new_set = currentSet
                .filterNot(x =>  x.current_supply == currentEvent.current_supply && x.available_to_sell == currentEvent.available_to_sell)

            (mergedEvent ++ result, currentEvent.offset, new_set)
        }
        else {
            val mergedEvent =
            if (!currentStart.equals(defaultTs) && !currentEvent.offset.equals(currentStart) && currentEvent.offset.after(currentStart)) {
              val x = currentSet
                .reduce((a, b) => CoreData(a.current_supply + b.current_supply, a.available_to_sell + b.available_to_sell))

              TestCC(currentStart, currentEvent.offset, x.current_supply, x.available_to_sell) :: Nil
            }
            else {
              Nil
            }

            val new_set = (CoreData(currentEvent.current_supply, currentEvent.available_to_sell) :: currentSet).distinct
            (mergedEvent ++ result, currentEvent.offset, new_set)
        }
    }