使用 Deedle 创建 K 折交叉验证测试和训练集
Create KFold crossvalidation test and training sets with Deedle
假设我每个月都有一堆订单。订单也可以延长到特定月份。我做了一些预测,我想用 KFold 交叉验证来验证我的预测。我的目标是创建 K 个测试和训练集。
我将我的订单 ID 分组,将索引分成测试训练集并根据这些索引收集行。
我想出的解决方案很慢,因为它对每个订单 ID 使用 Series.filter
和 Seq.contains
。有人知道在 F# 中更有效的方法吗?
简化示例:
#r "nuget: Deedle"
#r "nuget: FSharp.Stats"
open Deedle
/// One order line: the order it belongs to, the month it applies to, and its amount.
type Order =
    { OrderId: string
      Month: int
      Amount: int }

/// Sample data: orders "I1".."I6" repeated for months 1..3, where order "I<k>"
/// always has amount k * 100. Rows are listed month by month, and by order id
/// within each month.
let OrderRecds =
    [ for month in 1 .. 3 do
        for id in 1 .. 6 do
            { OrderId = sprintf "I%d" id; Month = month; Amount = 100 * id } ]
// Load the sample records into a Deedle frame.
let df_order = Frame.ofRecords OrderRecds

// Group the rows by the "OrderId" column, nest the groups into a series of
// per-order frames, and re-key that series ordinally so that folds can be
// described with integer keys.
let order_ids_series =
    let groupedByOrder = df_order |> Frame.groupRowsByString "OrderId"
    groupedByOrder |> Frame.nest |> Series.indexOrdinally
// Number of cross-validation folds.
let NUM_FOLDS = 3
// Number of keys in the nested per-order series.
let n = order_ids_series |> Series.countKeys
// Largest number of keys a single fold can receive.
let chunkSize = int (ceil (float n / float NUM_FOLDS))
// Shuffle the ordinal keys once, then split them into NUM_FOLDS folds of
// near-equal size. Chunking by a fixed `chunkSize` can yield fewer than
// NUM_FOLDS chunks (e.g. 9 keys and 4 folds give 3 chunks of 3), which made
// `Seq.removeAt` fail for the missing fold.
let chunkIndices =
    [| 0 .. n - 1 |]
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Array.splitInto NUM_FOLDS
    |> Seq.ofArray
// Training keys of fold i: every chunk except chunk i. Enumerating the chunks
// that actually exist keeps this valid when there are fewer keys than folds.
let train_indexes =
    chunkIndices
    |> Seq.mapi (fun i _ -> chunkIndices |> Seq.removeAt i |> Seq.concat)
// Test keys of fold i: chunk i itself.
let test_indexes = chunkIndices
// Build one training frame per fold from the nested per-order series.
let train_data =
    train_indexes
    |> Seq.map (fun indexes ->
        // `indexes` is a lazy sequence, so `Seq.contains k indexes` walked it
        // again for every key of the series. Materialise it once into a hash
        // set so each membership test is a constant-time lookup.
        let keep = System.Collections.Generic.HashSet<int>(indexes)
        order_ids_series
        |> Series.filter (fun k _ -> keep.Contains k)
        // Flatten the selected per-order frames back into a single frame and
        // keep only the second component of each resulting row key.
        |> Frame.unnest
        |> Frame.mapRowKeys snd)
我认为诀窍是通过键访问系列,因此您可以直接转到相应的值。所以尝试这样的事情:
// One training frame per fold, built by looking each key up directly in the
// nested series instead of filtering the whole series.
let train_data =
    train_indexes
    |> Seq.map (fun fold ->
        let selected =
            Seq.sort fold
            |> Seq.map (fun key -> order_ids_series.[key]) // direct lookup by key
            |> Series.ofValues
        selected |> Frame.unnest |> Frame.mapRowKeys snd)
更新 #1
为了提高性能,我觉得你可以去掉Frame.unnest
,像这样:
// One training frame per fold: look up each selected per-order frame by key
// and merge them, without going through Frame.unnest.
let train_data =
    seq {
        for fold in train_indexes do
            let frames = fold |> Seq.map (fun key -> order_ids_series.[key])
            yield Frame.mergeAll frames
    }
这似乎无法保留行顺序,但希望对您仍然有效。沿着这个思路还可以做进一步的优化。
更新#2
我刚刚对 24,000 行的 unnest
与 mergeAll
性能进行了基准测试,它们在我的机器上都需要大约 9.6 秒。我不确定如何进一步降低这个数字。我确实认为 mergeAll
版本更容易理解,但它实际上似乎并没有更快。
我对另一个答案的一个担心是 Frame.unnest
或 Frame.mergeAll
需要很多时间。此解决方案创建索引映射并使用 filterRows
。在我的数据集上,它大约快了 60 倍。
// Rows of df_order grouped by the "OrderId" column.
let df_order_grouped = Frame.groupRowsByString "OrderId" df_order

// The grouped frame nested into a series of per-order frames.
let order_ids_series = Frame.nest df_order_grouped

// Keys of the nested series; these are split into folds.
let order_ids = order_ids_series.Keys
// Number of cross-validation folds.
let NUM_FOLDS = 3
// Number of keys in the nested per-order series.
let n = order_ids_series |> Series.countKeys
// Largest number of order ids a single fold can receive.
let chunkSize = int (ceil (float n / float NUM_FOLDS))
// Shuffle the order ids once, then split them into NUM_FOLDS folds of
// near-equal size. Chunking by a fixed `chunkSize` can yield fewer than
// NUM_FOLDS chunks (e.g. 9 ids and 4 folds give 3 chunks of 3), which made
// `Seq.removeAt` fail for the missing fold.
let chunkIndices =
    order_ids
    |> Array.ofSeq
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Array.splitInto NUM_FOLDS
    |> Seq.ofArray
// Training ids of fold i: every chunk except chunk i. Enumerating the chunks
// that actually exist keeps this valid when there are fewer ids than folds.
let train_indexes =
    chunkIndices
    |> Seq.mapi (fun i _ -> chunkIndices |> Seq.removeAt i |> Seq.concat)
// Test ids of fold i: chunk i itself.
let test_indexes = chunkIndices
/// Turns a sequence of ids into a map keyed by id in which every value is
/// `true`, so membership can be tested with `ContainsKey`. Duplicate ids
/// collapse into a single entry.
let toMap (indexes: seq<string>) =
    (Map.empty, indexes)
    ||> Seq.fold (fun lookup id -> Map.add id true lookup)
/// Keeps only the rows of `df` whose row key has, as its first component,
/// a key that is present in `indexes`.
let filter (indexes: Map<string, bool>) (df: Frame<string * int, string>) =
    df
    |> Frame.filterRows (fun (orderId, _) _ -> Map.containsKey orderId indexes)
// One training frame per fold: the grouped frame restricted to that fold's ids.
let train_data: seq<Frame<string * int, string>> =
    seq {
        for foldIds in train_indexes do
            yield filter (toMap foldIds) df_order_grouped
    }

// Print every training frame.
for frame in train_data do
    frame.Print()
我觉得使用基础库更简单,速度更快。使用 162k 样本大小,这在我的笔记本电脑上花费的时间不到十分之一秒。
/// Splits `data` into cross-validation folds.
///
/// The list is shuffled and cut into `folds` chunks of near-equal size. For
/// each chunk a `(train, test)` pair is returned, where `test` is the chunk
/// and `train` is the concatenation of all the other chunks.
///
/// When `data` has fewer than `folds` elements, only as many pairs as there
/// are chunks are returned (an empty list for empty input) instead of failing
/// with an out-of-range error. `folds` must be positive; `List.splitInto`
/// rejects other values.
let kfold folds data =
    let rand = System.Random()
    let splits =
        data
        // Shuffle by sorting on random keys.
        |> List.sortBy (fun _ -> rand.NextDouble())
        |> List.splitInto folds
    // Enumerate the chunks that actually exist rather than 0 .. folds-1:
    // List.splitInto returns fewer chunks when the input is shorter than `folds`.
    splits
    |> List.mapi (fun i test -> (splits |> List.removeAt i |> List.concat, test))
显示整数列表的示例
// The shuffle is random, so the grouping differs between runs; the result
// shown below is one possible outcome.
[ 0 .. 5 ] |> kfold 3
// [([5; 2; 1; 0], [3; 4]); ([3; 4; 1; 0], [5; 2]); ([3; 4; 5; 2], [1; 0])]
现在用您的数据的一个大样本来演示:
// 162k sample size: 4,500 order ids x 12 months x 3 identical copies of each row.
let orderRecds =
    [ for orderNo in 1 .. 4_500 do
        for month in 1 .. 12 do
            for _copy in 1 .. 3 do
                { OrderId = string orderNo; Month = month; Amount = orderNo * month } ]
// Group the records by order id, so folds are formed from whole orders
// rather than from individual rows.
let groupedOrderRecds =
    List.groupBy (fun (order: Order) -> order.OrderId) orderRecds

// For every fold, flatten the (orderId, rows) groups back into plain record lists.
let train_data =
    let flatten groups = groups |> List.collect snd
    [ for (train, test) in kfold 3 groupedOrderRecds -> (flatten train, flatten test) ]

// first train and test sample
let train0, test0 = List.item 0 train_data
假设我每个月都有一堆订单。订单也可以延长到特定月份。我做了一些预测,我想用 KFold 交叉验证来验证我的预测。我的目标是创建 K 个测试和训练集。
我将我的订单 ID 分组,将索引分成测试训练集并根据这些索引收集行。
我想出的解决方案很慢,因为它对每个订单 ID 使用 Series.filter
和 Seq.contains
。有人知道在 F# 中更有效的方法吗?
简化示例:
#r "nuget: Deedle"
#r "nuget: FSharp.Stats"
open Deedle
/// One order line: the order it belongs to, the month it applies to, and its amount.
type Order =
    { OrderId: string
      Month: int
      Amount: int }

/// Sample data: orders "I1".."I6" repeated for months 1..3, where order "I<k>"
/// always has amount k * 100. Rows are listed month by month, and by order id
/// within each month.
let OrderRecds =
    [ for month in 1 .. 3 do
        for id in 1 .. 6 do
            { OrderId = sprintf "I%d" id; Month = month; Amount = 100 * id } ]
// Load the sample records into a Deedle frame.
let df_order = Frame.ofRecords OrderRecds

// Group the rows by the "OrderId" column, nest the groups into a series of
// per-order frames, and re-key that series ordinally so that folds can be
// described with integer keys.
let order_ids_series =
    let groupedByOrder = df_order |> Frame.groupRowsByString "OrderId"
    groupedByOrder |> Frame.nest |> Series.indexOrdinally
// Number of cross-validation folds.
let NUM_FOLDS = 3
// Number of keys in the nested per-order series.
let n = order_ids_series |> Series.countKeys
// Largest number of keys a single fold can receive.
let chunkSize = int (ceil (float n / float NUM_FOLDS))
// Shuffle the ordinal keys once, then split them into NUM_FOLDS folds of
// near-equal size. Chunking by a fixed `chunkSize` can yield fewer than
// NUM_FOLDS chunks (e.g. 9 keys and 4 folds give 3 chunks of 3), which made
// `Seq.removeAt` fail for the missing fold.
let chunkIndices =
    [| 0 .. n - 1 |]
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Array.splitInto NUM_FOLDS
    |> Seq.ofArray
// Training keys of fold i: every chunk except chunk i. Enumerating the chunks
// that actually exist keeps this valid when there are fewer keys than folds.
let train_indexes =
    chunkIndices
    |> Seq.mapi (fun i _ -> chunkIndices |> Seq.removeAt i |> Seq.concat)
// Test keys of fold i: chunk i itself.
let test_indexes = chunkIndices
// Build one training frame per fold from the nested per-order series.
let train_data =
    train_indexes
    |> Seq.map (fun indexes ->
        // `indexes` is a lazy sequence, so `Seq.contains k indexes` walked it
        // again for every key of the series. Materialise it once into a hash
        // set so each membership test is a constant-time lookup.
        let keep = System.Collections.Generic.HashSet<int>(indexes)
        order_ids_series
        |> Series.filter (fun k _ -> keep.Contains k)
        // Flatten the selected per-order frames back into a single frame and
        // keep only the second component of each resulting row key.
        |> Frame.unnest
        |> Frame.mapRowKeys snd)
我认为诀窍是通过键访问系列,因此您可以直接转到相应的值。所以尝试这样的事情:
// One training frame per fold, built by looking each key up directly in the
// nested series instead of filtering the whole series.
let train_data =
    train_indexes
    |> Seq.map (fun fold ->
        let selected =
            Seq.sort fold
            |> Seq.map (fun key -> order_ids_series.[key]) // direct lookup by key
            |> Series.ofValues
        selected |> Frame.unnest |> Frame.mapRowKeys snd)
更新 #1
为了提高性能,我觉得你可以去掉Frame.unnest
,像这样:
// One training frame per fold: look up each selected per-order frame by key
// and merge them, without going through Frame.unnest.
let train_data =
    seq {
        for fold in train_indexes do
            let frames = fold |> Seq.map (fun key -> order_ids_series.[key])
            yield Frame.mergeAll frames
    }
这似乎无法保留行顺序,但希望对您仍然有效。沿着这个思路还可以做进一步的优化。
更新#2
我刚刚对 24,000 行的 unnest
与 mergeAll
性能进行了基准测试,它们在我的机器上都需要大约 9.6 秒。我不确定如何进一步降低这个数字。我确实认为 mergeAll
版本更容易理解,但它实际上似乎并没有更快。
我对另一个答案的一个担心是 Frame.unnest
或 Frame.mergeAll
需要很多时间。此解决方案创建索引映射并使用 filterRows
。在我的数据集上,它大约快了 60 倍。
// Rows of df_order grouped by the "OrderId" column.
let df_order_grouped = Frame.groupRowsByString "OrderId" df_order

// The grouped frame nested into a series of per-order frames.
let order_ids_series = Frame.nest df_order_grouped

// Keys of the nested series; these are split into folds.
let order_ids = order_ids_series.Keys
// Number of cross-validation folds.
let NUM_FOLDS = 3
// Number of keys in the nested per-order series.
let n = order_ids_series |> Series.countKeys
// Largest number of order ids a single fold can receive.
let chunkSize = int (ceil (float n / float NUM_FOLDS))
// Shuffle the order ids once, then split them into NUM_FOLDS folds of
// near-equal size. Chunking by a fixed `chunkSize` can yield fewer than
// NUM_FOLDS chunks (e.g. 9 ids and 4 folds give 3 chunks of 3), which made
// `Seq.removeAt` fail for the missing fold.
let chunkIndices =
    order_ids
    |> Array.ofSeq
    |> FSharp.Stats.Array.shuffleFisherYates
    |> Array.splitInto NUM_FOLDS
    |> Seq.ofArray
// Training ids of fold i: every chunk except chunk i. Enumerating the chunks
// that actually exist keeps this valid when there are fewer ids than folds.
let train_indexes =
    chunkIndices
    |> Seq.mapi (fun i _ -> chunkIndices |> Seq.removeAt i |> Seq.concat)
// Test ids of fold i: chunk i itself.
let test_indexes = chunkIndices
/// Turns a sequence of ids into a map keyed by id in which every value is
/// `true`, so membership can be tested with `ContainsKey`. Duplicate ids
/// collapse into a single entry.
let toMap (indexes: seq<string>) =
    (Map.empty, indexes)
    ||> Seq.fold (fun lookup id -> Map.add id true lookup)
/// Keeps only the rows of `df` whose row key has, as its first component,
/// a key that is present in `indexes`.
let filter (indexes: Map<string, bool>) (df: Frame<string * int, string>) =
    df
    |> Frame.filterRows (fun (orderId, _) _ -> Map.containsKey orderId indexes)
// One training frame per fold: the grouped frame restricted to that fold's ids.
let train_data: seq<Frame<string * int, string>> =
    seq {
        for foldIds in train_indexes do
            yield filter (toMap foldIds) df_order_grouped
    }

// Print every training frame.
for frame in train_data do
    frame.Print()
我觉得使用基础库更简单,速度更快。使用 162k 样本大小,这在我的笔记本电脑上花费的时间不到十分之一秒。
/// Splits `data` into cross-validation folds.
///
/// The list is shuffled and cut into `folds` chunks of near-equal size. For
/// each chunk a `(train, test)` pair is returned, where `test` is the chunk
/// and `train` is the concatenation of all the other chunks.
///
/// When `data` has fewer than `folds` elements, only as many pairs as there
/// are chunks are returned (an empty list for empty input) instead of failing
/// with an out-of-range error. `folds` must be positive; `List.splitInto`
/// rejects other values.
let kfold folds data =
    let rand = System.Random()
    let splits =
        data
        // Shuffle by sorting on random keys.
        |> List.sortBy (fun _ -> rand.NextDouble())
        |> List.splitInto folds
    // Enumerate the chunks that actually exist rather than 0 .. folds-1:
    // List.splitInto returns fewer chunks when the input is shorter than `folds`.
    splits
    |> List.mapi (fun i test -> (splits |> List.removeAt i |> List.concat, test))
显示整数列表的示例
// The shuffle is random, so the grouping differs between runs; the result
// shown below is one possible outcome.
[ 0 .. 5 ] |> kfold 3
// [([5; 2; 1; 0], [3; 4]); ([3; 4; 1; 0], [5; 2]); ([3; 4; 5; 2], [1; 0])]
现在用您的数据的一个大样本来演示:
// 162k sample size: 4,500 order ids x 12 months x 3 identical copies of each row.
let orderRecds =
    [ for orderNo in 1 .. 4_500 do
        for month in 1 .. 12 do
            for _copy in 1 .. 3 do
                { OrderId = string orderNo; Month = month; Amount = orderNo * month } ]
// Group the records by order id, so folds are formed from whole orders
// rather than from individual rows.
let groupedOrderRecds =
    List.groupBy (fun (order: Order) -> order.OrderId) orderRecds

// For every fold, flatten the (orderId, rows) groups back into plain record lists.
let train_data =
    let flatten groups = groups |> List.collect snd
    [ for (train, test) in kfold 3 groupedOrderRecds -> (flatten train, flatten test) ]

// first train and test sample
let train0, test0 = List.item 0 train_data