使用 Deedle 创建 K 折交叉验证的测试集和训练集

Question

假设我每个月都有一批订单，而且同一个订单可能会延续多个月（即同一个订单 ID 会出现在多个月份中）。我做了一些预测，想用 K 折（KFold）交叉验证来检验这些预测。我的目标是创建 K 组测试集和训练集。

我先按订单 ID 对行进行分组，再把各组的索引划分为测试集和训练集，最后根据这些索引收集对应的行。

我想出的解决方案很慢，因为它对每个订单 ID 使用 Series.filter 和 Seq.contains。有人知道在 F# 中更有效的方法吗？

简化示例：

#r "nuget: Deedle"
#r "nuget: FSharp.Stats"

open Deedle

/// One order line: an order id, the month it applies to, and its amount.
type Order =
  { OrderId: string
    Month: int
    Amount: int }

// Sample data: orders I1..I6, each present in months 1, 2 and 3,
// with Amount = 100 * order number (month-major order, as in a listing by month).
let OrderRecds =
  [ for month in 1 .. 3 do
      for orderNo in 1 .. 6 do
        yield { OrderId = sprintf "I%d" orderNo; Month = month; Amount = orderNo * 100 } ]

// Data frame built from the sample records.
let df_order = OrderRecds |> Frame.ofRecords

// One nested frame per OrderId, re-indexed ordinally so that the folds below
// can be expressed with the integer keys 0 .. n-1.
let order_ids_series =
    let groupedByOrder = Frame.groupRowsByString "OrderId" df_order
    groupedByOrder
    |> Frame.nest
    |> Series.indexOrdinally

// Number of folds (K) used for the cross-validation.
let NUM_FOLDS = 3
// Number of keys in the ordinally indexed series (one key per order id).
let n = order_ids_series |> Series.countKeys
// Shuffled ordinal keys, cut into exactly NUM_FOLDS chunks whose sizes differ
// by at most one.
// Chunking by a fixed size of ceil(n / NUM_FOLDS) can yield fewer chunks than
// folds (e.g. n = 4 with 3 folds gives 2 chunks of 2), which would make the
// later `Seq.removeAt i` fail for the missing folds; `Array.splitInto` avoids
// that as long as there are at least NUM_FOLDS keys.
let chunkIndices =
    if n < NUM_FOLDS then
        failwithf "Need at least %d distinct orders to build %d folds, but found %d." NUM_FOLDS NUM_FOLDS n
    [|0 .. n-1|]
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Array.splitInto NUM_FOLDS

// For fold i the training keys are every chunk except chunk i, flattened
// into a single sequence.
let train_indexes =
    seq {
        for heldOut in 0 .. NUM_FOLDS - 1 ->
            chunkIndices
            |> Seq.removeAt heldOut
            |> Seq.concat
    }

// The chunk left out of fold i is that fold's test set.
let test_indexes = chunkIndices

// Training frame for each fold: keep the nested frames whose ordinal key is in
// the fold, flatten them back into one frame, and keep only the second
// component of each resulting row key.
// Note: every key is tested with Seq.contains against the fold's key sequence.
let train_data =
    seq {
        for foldKeys in train_indexes ->
            order_ids_series
            |> Series.filter (fun key _ -> foldKeys |> Seq.contains key)
            |> Frame.unnest
            |> Frame.mapRowKeys snd
    }

Answer 1

我认为诀窍是通过键访问系列，因此您可以直接转到相应的值。所以尝试这样的事情：

// Training frame for each fold, built by looking every fold key up directly in
// the series instead of filtering the whole series.
let train_data =
    // Collects the nested frames for one fold (keys visited in sorted order)
    // and flattens them into a single frame.
    let frameForFold indexes =
        let nestedFrames =
            indexes
            |> Seq.sort
            |> Seq.map (fun index -> order_ids_series.[index])   // access the series by its key
        nestedFrames
        |> Series.ofValues
        |> Frame.unnest
        |> Frame.mapRowKeys snd
    train_indexes |> Seq.map frameForFold

更新 #1

为了提高性能，我觉得你可以去掉Frame.unnest，像这样：

// Training frame for each fold: look up the nested frame for every key in the
// fold and merge them with Frame.mergeAll (no Frame.unnest step).
let train_data =
    seq {
        for indexes in train_indexes ->
            indexes
            |> Seq.map (fun index -> order_ids_series.[index])
            |> Frame.mergeAll
    }

这种写法似乎不会保留行的顺序，但希望它仍然能满足您的需求。沿着这个思路或许还可以进一步优化。

更新#2

我刚刚用 24,000 行数据对 unnest 和 mergeAll 两种写法做了性能基准测试，在我的机器上两者都需要大约 9.6 秒。我不确定如何进一步降低这个数字。我确实认为 mergeAll 版本更容易理解，但它实际上似乎并没有更快。

Answer 2

我对另一个答案的一个担心是 Frame.unnest 或 Frame.mergeAll 需要很多时间。此解决方案创建索引映射并使用 filterRows。在我的数据集上，它大约快了 60 倍。

// Rows of df_order grouped by the "OrderId" column; this grouped frame is what
// the folds are later filtered from.
let df_order_grouped = Frame.groupRowsByString "OrderId" df_order

// One nested frame per order id.
let order_ids_series = Frame.nest df_order_grouped

// The order ids themselves (the keys of the nested series).
let order_ids = order_ids_series.Keys

// Number of folds (K) used for the cross-validation.
let NUM_FOLDS = 3
// Number of distinct order ids (keys of the nested series).
let n = order_ids_series |> Series.countKeys
// Shuffled order ids, cut into exactly NUM_FOLDS chunks whose sizes differ by
// at most one.
// Chunking by a fixed size of ceil(n / NUM_FOLDS) can yield fewer chunks than
// folds (e.g. n = 4 with 3 folds gives 2 chunks of 2), which would make the
// later `Seq.removeAt i` fail for the missing folds; `Array.splitInto` avoids
// that as long as there are at least NUM_FOLDS order ids.
let chunkIndices =
    if n < NUM_FOLDS then
        failwithf "Need at least %d distinct orders to build %d folds, but found %d." NUM_FOLDS NUM_FOLDS n
    order_ids
    |> Array.ofSeq
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Array.splitInto NUM_FOLDS

// For fold i the training ids are every chunk except chunk i, flattened into a
// single sequence.
let train_indexes =
    seq {
        for heldOut in 0 .. NUM_FOLDS - 1 ->
            chunkIndices
            |> Seq.removeAt heldOut
            |> Seq.concat
    }

// The chunk left out of fold i is that fold's test set.
let test_indexes = chunkIndices

/// Builds a lookup map from a sequence of ids: every id in `indexes` becomes a
/// key mapped to `true`. Duplicate ids collapse into a single key.
let toMap (indexes:seq<string>)=
    (Map.empty, indexes)
    ||> Seq.fold (fun lookup orderId -> Map.add orderId true lookup)

/// Keeps the rows of `df` whose row key's first component (the order id) is a
/// key of `indexes`.
let filter (indexes:Map<string,bool>) (df:Frame<string*int,string>) =
    let isSelected (orderId, _) = Map.containsKey orderId indexes
    Frame.filterRows (fun rowKey _ -> isSelected rowKey) df

// Training frame for each fold: the grouped frame restricted to the order ids
// that belong to the fold's training set.
let train_data:seq<Frame<string*int,string>> =
    seq {
        for indexes in train_indexes ->
            df_order_grouped |> filter (toMap indexes)
    }

// Print every training frame.
for df in train_data do
    df.Print()

Answer 3

我觉得使用基础库更简单，速度更快。使用 162k 样本大小，这在我的笔记本电脑上花费的时间不到十分之一秒。

/// Randomly splits `data` into `folds` parts and returns one (train, test)
/// pair per part: `test` is that part and `train` is the concatenation of all
/// the other parts.
///
/// folds: number of folds; must be positive and not larger than the number
///        of items in `data`.
/// data:  the items to split.
///
/// Raises System.ArgumentException when `folds` is not positive or when
/// `data` has fewer items than `folds`. Without this check the split yields
/// fewer parts than requested and the fold construction fails with an
/// unrelated "index" error.
let kfold folds data =
    if folds <= 0 then
        invalidArg "folds" "The number of folds must be positive."
    let count = List.length data
    if count < folds then
        invalidArg "data" (sprintf "Cannot build %d folds from only %d items." folds count)
    let rand = System.Random()
    let splits =
        data
        // Shuffle by sorting on a random key.
        |> List.sortBy (fun _ -> rand.NextDouble())
        |> List.splitInto folds
    // Pair every part (as the test set) with all remaining parts (as the training set).
    splits
    |> List.mapi (fun i test ->
        let train = splits |> List.removeAt i |> List.concat
        train, test)

以一个整数列表为例：

kfold 3 [0..5]
// Example output (kfold shuffles with System.Random, so the folds differ between runs):
// [([5; 2; 1; 0], [3; 4]); ([3; 4; 1; 0], [5; 2]); ([3; 4; 5; 2], [1; 0])]

现在用一份与您的数据结构相同、但规模大得多的样本来试验：

// 162k sample size
// 4,500 order ids x 12 months x 3 repeats = 162,000 records;
// the innermost loop only repeats each record three times.
let orderRecds =
    [ for orderNo in 1 .. 4_500 do
        for month in 1 .. 12 do
          for _ in 1 .. 3 do
            yield { OrderId = string orderNo; Month = month; Amount = orderNo * month } ]

// Records grouped by order id, as (OrderId, records) pairs, so that a whole
// order always lands in the same fold.
let groupedOrderRecds =
    List.groupBy (fun (order: Order) -> order.OrderId) orderRecds

// Split the order groups into 3 folds, then flatten each side of every fold
// back into a plain list of records (dropping the group keys).
let train_data =
    [ for (train, test) in kfold 3 groupedOrderRecds ->
        (List.collect snd train, List.collect snd test) ]

// first train and test sample
let train0, test0 = List.head train_data

使用 Deedle 创建 K 折交叉验证的测试集和训练集

Create KFold crossvalidation test and training sets with Deedle

f#

deedle

更新 #1

更新#2