F#:按重复出现的元素序列分组
F#: grouping by recurring sequences of elements
我有一个序列对(键,值),比如
[("a", 1), ("a", 2), ("a", 111), ("b", 3), ("bb", 1), ("bb", -1), ...]
,将其转换成像
这样的序列的最有效方法是什么
[("a", [1,2,111]), ("b", [3]), ("bb", [1,-1])]
或类似的?
序列有以下内容属性:它真的很大 (>2Gb)
这使得 Seq.groupBy 确实无效且不正确,还有其他方法吗?
P.S.: 这个序列:
[("a", 1), ("a", 2), ("a", 111), ("bb", 1), ("bb", -1), ("a", 5), ("a", 6), ...]
应转换为
[("a", [1,2,111]), ("bb", [1,-1]), ("a", [5,6]), ...]
--
编辑 #1:修复了不正确的示例
编辑 #2:序列很大,因此首选惰性(或最快)解决方案
如果您想要获得惰性结果的选项,那么我认为没有保持可变状态的优雅方式。这是一个相对简单的突变。您维护了您看到的最后一个键的存储,以及与该键对应的所有值:
let s = [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5); ("a", 6)]
let s2 =
[
let mutable prevKey = None
let mutable values = System.Collections.Generic.List<_>()
let init key value =
prevKey <- Some key
values.Clear()
values.Add value
for (key, value) in s do
match prevKey with
| None -> init key value
| Some k when k = key -> values.Add value
| Some k ->
yield (k, List.ofSeq values)
init key value
match prevKey with
| Some k -> yield (k, List.ofSeq values)
| _ -> ()
]
这给出:
val s2 : (string * int list) list =
[("a", [1; 2; 111]); ("bb", [1; -1]); ("a", [5; 6])]
对于延迟评估,将 [ ... ]
替换为 seq { ... }
没有可变状态的简单递归方法。
let rec chunk inseq (accumelem,accumlist) =
match inseq with
|(a,b)::c ->
match accumelem with
|Some(t) -> if t=a then chunk c (accumelem,b::accumlist) else (t,accumlist)::(chunk c (Some(a),b::[]))
|None -> chunk c (Some a,b::[])
|[] ->
match accumelem with
|Some(t) -> (t,accumlist)::[]
|None -> []
chunk [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5);("a", 6)] (None,[])
val it : (string * int list) list =
[("a", [111; 2; 1]); ("bb", [-1; 1]); ("a", [6; 5])]
这是一个递归的解决方案:
let test = [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5); ("a", 6)]
let groupByAdjacentElements alist =
let rec group a groupAcc prevElement adjacentAcc =
match a with
| [] -> match adjacentAcc with
| [] -> groupAcc
| _ -> (prevElement, List.rev adjacentAcc)::groupAcc
| (b, c)::tail -> if b = prevElement then
group tail groupAcc prevElement (c::adjacentAcc)
else
group tail ((prevElement, List.rev adjacentAcc)::groupAcc) b [c]
group alist [] (fst alist.Head) []
|> List.rev
let b = groupByAdjacentElements test
它returns:[("a", [1; 2; 111]); ("bb", [1; -1]); ("a", [5; 6])]
如果你想要惰性评估,你应该考虑尝试LazyList
编辑:这是一个脚本,将 ExtCore 中的 LazyList 与公认的解决方案进行比较。它生成一个大文本文件,然后执行要求的转换。请注意,LazyList 以相反的顺序返回:
open System.Diagnostics
open System.IO
open ExtCore
let fileName = "Test.txt"
let outFile = new StreamWriter(fileName)
for i in [1..20000*300] do
outFile.WriteLine("a,1")
outFile.WriteLine("a,2")
outFile.WriteLine("a,111")
outFile.WriteLine("bb,1")
outFile.WriteLine("bb,-1")
outFile.WriteLine("a,5")
outFile.WriteLine("a,6")
outFile.WriteLine("c,8")
outFile.Close()
printfn "Finished Writing to File"
let data = System.IO.File.ReadLines(fileName)
|> Seq.map (fun i -> let parts = i.Split(',')
(parts.[0], parts.[1]))
printfn "Finished Reading File"
let s2 data =
[
let mutable prevKey = None
let mutable values = System.Collections.Generic.List<_>()
let init key value =
prevKey <- Some key
values.Clear()
values.Add value
for (key, value) in data do
match prevKey with
| None -> init key value
| Some k when k = key -> values.Add value
| Some k ->
yield (k, List.ofSeq values)
init key value
match prevKey with
| Some key -> yield (key, List.ofSeq values)
| _ -> ()
]
let groupByAdjacentElements aseq =
let alist = LazyList.ofSeq aseq
let rec group alist groupAcc prevElement adjacentAcc =
match alist with
| Cons((b, c), tail) ->
if b = prevElement then
group tail groupAcc prevElement (c::adjacentAcc)
else
group tail (LazyList.consDelayed (prevElement, List.rev adjacentAcc) (fun () -> groupAcc)) b [c]
| Nil ->
match adjacentAcc with
| [] -> groupAcc
| _ -> LazyList.consDelayed (prevElement, List.rev adjacentAcc) (fun () -> groupAcc)
group alist LazyList.empty (fst (alist.Head())) []
let groupByAdjacentElementsList aseq =
let alist = aseq |> Seq.toList
let rec group a groupAcc prevElement adjacentAcc =
match a with
| [] -> match adjacentAcc with
| [] -> groupAcc
| _ -> (prevElement, List.rev adjacentAcc)::groupAcc
| (b, c)::tail -> if b = prevElement then
group tail groupAcc prevElement (c::adjacentAcc)
else
group tail ((prevElement, List.rev adjacentAcc)::groupAcc) b [c]
group alist [] (fst alist.Head) []
|> List.rev
[<EntryPoint>]
let main argv =
let stopwatch = new Stopwatch()
stopwatch.Start()
let b = s2 data
printfn "The result is: %A" b
stopwatch.Stop()
printfn "It took %A ms." stopwatch.ElapsedMilliseconds
System.GC.WaitForFullGCComplete() |> ignore
stopwatch.Reset()
stopwatch.Start()
let b = groupByAdjacentElements data
printfn "The result is: %A" b
stopwatch.Stop()
printfn "It took %A ms." stopwatch.ElapsedMilliseconds
System.GC.WaitForFullGCComplete() |> ignore
stopwatch.Reset()
stopwatch.Start()
let b = groupByAdjacentElementsList data
printfn "The result is: %A" b
stopwatch.Stop()
printfn "It took %A ms." stopwatch.ElapsedMilliseconds
0
我在使用大约 300MB 大小的文件时,LazyList
比 seq
解决方案稍慢(83 秒到 94 秒)。也就是说,与序列解决方案不同,LazyList 的主要优点是迭代它是缓存的。即使在执行 List.rev 时,普通列表解决方案也比两者都快(没有它大约 73s)。
按相邻键分组也可以在没有可变绑定的情况下完成。使用 Seq.scan
,可以生成带有 eager chunk 的惰性序列。它已经提供了一种特殊情况,即序列的第一个元素;通过将输入序列包装为选项,然后是 None
我们可以处理另一个。之后,我们跳过中间结果并用 Seq.choose
.
去除状态
为了获得最大的通用性,我想推荐一个类似于 Seq.groupBy
、
的签名
f:('T -> 'Key) -> xs:seq<'T> -> seq<'Key * 'T list> when 'Key : equality
它以一个键投影函数作为第一个参数。
let chunkBy (f : 'T-> 'Key) xs =
// Determine key and wrap in option
seq{for x in xs -> Some(f x, x)
// Indicates end of sequence
yield None }
|> Seq.scan (fun (_, acc, previous) current ->
match previous, current with
| Some(pKey, _), Some(key, value) when pKey = key ->
// No intermediate result, but add to accumulator
None, value::acc, current
| _ ->
// New state is 3-tuple of previous key and completed chunk,
// accumulator from current element, and new previous element
Option.map (fun (k, _) -> k, List.rev acc) previous,
Option.map snd current |> Option.toList, current )
(None, [], None)
|> Seq.choose (fun (result, _, _) -> result)
这可以通过提供结果投影功能来满足 OP 的要求。
let chunkBy2 (f : 'T-> 'Key) (g : 'T->'Result) =
chunkBy f >> Seq.map (fun (k, gs) -> k, List.map g gs)
// val chunkBy2 :
// f:('T -> 'Key) -> g:('T -> 'Result) -> (seq<'T> -> seq<'Key * 'Result list>)
// when 'Key : equality
["a", 1; "a", 2; "a", 111; "b", 3; "bb", 1; "bb", -1]
|> chunkBy2 fst snd
// val it : seq<string * int list> =
// seq [("a", [1; 2; 111]); ("b", [3]); ("bb", [1; -1])]
Seq.initInfinite (fun x ->
if (x / 2) % 2 = 0 then "a", x else "b", x)
|> chunkBy2 fst snd
|> Seq.skip 50000
// val it : seq<string * int list> =
// seq
// [("a", [100000; 100001]); ("b", [100002; 100003]); ("a", [100004; 100005]);
// ("b", [100006; 100007]); ...]
我有一个序列对(键,值),比如
[("a", 1), ("a", 2), ("a", 111), ("b", 3), ("bb", 1), ("bb", -1), ...]
,将其转换成像
这样的序列的最有效方法是什么[("a", [1,2,111]), ("b", [3]), ("bb", [1,-1])]
或类似的?
序列有以下内容属性:它真的很大 (>2Gb)
这使得 Seq.groupBy 确实无效且不正确,还有其他方法吗?
P.S.: 这个序列:
[("a", 1), ("a", 2), ("a", 111), ("bb", 1), ("bb", -1), ("a", 5), ("a", 6), ...]
应转换为
[("a", [1,2,111]), ("bb", [1,-1]), ("a", [5,6]), ...]
--
编辑 #1:修复了不正确的示例
编辑 #2:序列很大,因此首选惰性(或最快)解决方案
如果您想要获得惰性结果的选项,那么我认为没有保持可变状态的优雅方式。这是一个相对简单的突变。您维护了您看到的最后一个键的存储,以及与该键对应的所有值:
let s = [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5); ("a", 6)]
let s2 =
[
let mutable prevKey = None
let mutable values = System.Collections.Generic.List<_>()
let init key value =
prevKey <- Some key
values.Clear()
values.Add value
for (key, value) in s do
match prevKey with
| None -> init key value
| Some k when k = key -> values.Add value
| Some k ->
yield (k, List.ofSeq values)
init key value
match prevKey with
| Some k -> yield (k, List.ofSeq values)
| _ -> ()
]
这给出:
val s2 : (string * int list) list =
[("a", [1; 2; 111]); ("bb", [1; -1]); ("a", [5; 6])]
对于延迟评估,将 [ ... ]
替换为 seq { ... }
没有可变状态的简单递归方法。
let rec chunk inseq (accumelem,accumlist) =
match inseq with
|(a,b)::c ->
match accumelem with
|Some(t) -> if t=a then chunk c (accumelem,b::accumlist) else (t,accumlist)::(chunk c (Some(a),b::[]))
|None -> chunk c (Some a,b::[])
|[] ->
match accumelem with
|Some(t) -> (t,accumlist)::[]
|None -> []
chunk [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5);("a", 6)] (None,[])
val it : (string * int list) list =
[("a", [111; 2; 1]); ("bb", [-1; 1]); ("a", [6; 5])]
这是一个递归的解决方案:
let test = [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5); ("a", 6)]
let groupByAdjacentElements alist =
let rec group a groupAcc prevElement adjacentAcc =
match a with
| [] -> match adjacentAcc with
| [] -> groupAcc
| _ -> (prevElement, List.rev adjacentAcc)::groupAcc
| (b, c)::tail -> if b = prevElement then
group tail groupAcc prevElement (c::adjacentAcc)
else
group tail ((prevElement, List.rev adjacentAcc)::groupAcc) b [c]
group alist [] (fst alist.Head) []
|> List.rev
let b = groupByAdjacentElements test
它returns:[("a", [1; 2; 111]); ("bb", [1; -1]); ("a", [5; 6])]
如果你想要惰性评估,你应该考虑尝试LazyList
编辑:这是一个脚本,将 ExtCore 中的 LazyList 与公认的解决方案进行比较。它生成一个大文本文件,然后执行要求的转换。请注意,LazyList 以相反的顺序返回:
open System.Diagnostics
open System.IO
open ExtCore
let fileName = "Test.txt"
let outFile = new StreamWriter(fileName)
for i in [1..20000*300] do
outFile.WriteLine("a,1")
outFile.WriteLine("a,2")
outFile.WriteLine("a,111")
outFile.WriteLine("bb,1")
outFile.WriteLine("bb,-1")
outFile.WriteLine("a,5")
outFile.WriteLine("a,6")
outFile.WriteLine("c,8")
outFile.Close()
printfn "Finished Writing to File"
let data = System.IO.File.ReadLines(fileName)
|> Seq.map (fun i -> let parts = i.Split(',')
(parts.[0], parts.[1]))
printfn "Finished Reading File"
let s2 data =
[
let mutable prevKey = None
let mutable values = System.Collections.Generic.List<_>()
let init key value =
prevKey <- Some key
values.Clear()
values.Add value
for (key, value) in data do
match prevKey with
| None -> init key value
| Some k when k = key -> values.Add value
| Some k ->
yield (k, List.ofSeq values)
init key value
match prevKey with
| Some key -> yield (key, List.ofSeq values)
| _ -> ()
]
let groupByAdjacentElements aseq =
let alist = LazyList.ofSeq aseq
let rec group alist groupAcc prevElement adjacentAcc =
match alist with
| Cons((b, c), tail) ->
if b = prevElement then
group tail groupAcc prevElement (c::adjacentAcc)
else
group tail (LazyList.consDelayed (prevElement, List.rev adjacentAcc) (fun () -> groupAcc)) b [c]
| Nil ->
match adjacentAcc with
| [] -> groupAcc
| _ -> LazyList.consDelayed (prevElement, List.rev adjacentAcc) (fun () -> groupAcc)
group alist LazyList.empty (fst (alist.Head())) []
let groupByAdjacentElementsList aseq =
let alist = aseq |> Seq.toList
let rec group a groupAcc prevElement adjacentAcc =
match a with
| [] -> match adjacentAcc with
| [] -> groupAcc
| _ -> (prevElement, List.rev adjacentAcc)::groupAcc
| (b, c)::tail -> if b = prevElement then
group tail groupAcc prevElement (c::adjacentAcc)
else
group tail ((prevElement, List.rev adjacentAcc)::groupAcc) b [c]
group alist [] (fst alist.Head) []
|> List.rev
[<EntryPoint>]
let main argv =
let stopwatch = new Stopwatch()
stopwatch.Start()
let b = s2 data
printfn "The result is: %A" b
stopwatch.Stop()
printfn "It took %A ms." stopwatch.ElapsedMilliseconds
System.GC.WaitForFullGCComplete() |> ignore
stopwatch.Reset()
stopwatch.Start()
let b = groupByAdjacentElements data
printfn "The result is: %A" b
stopwatch.Stop()
printfn "It took %A ms." stopwatch.ElapsedMilliseconds
System.GC.WaitForFullGCComplete() |> ignore
stopwatch.Reset()
stopwatch.Start()
let b = groupByAdjacentElementsList data
printfn "The result is: %A" b
stopwatch.Stop()
printfn "It took %A ms." stopwatch.ElapsedMilliseconds
0
我在使用大约 300MB 大小的文件时,LazyList
比 seq
解决方案稍慢(83 秒到 94 秒)。也就是说,与序列解决方案不同,LazyList 的主要优点是迭代它是缓存的。即使在执行 List.rev 时,普通列表解决方案也比两者都快(没有它大约 73s)。
按相邻键分组也可以在没有可变绑定的情况下完成。使用 Seq.scan
,可以生成带有 eager chunk 的惰性序列。它已经提供了一种特殊情况,即序列的第一个元素;通过将输入序列包装为选项,然后是 None
我们可以处理另一个。之后,我们跳过中间结果并用 Seq.choose
.
为了获得最大的通用性,我想推荐一个类似于 Seq.groupBy
、
f:('T -> 'Key) -> xs:seq<'T> -> seq<'Key * 'T list> when 'Key : equality
它以一个键投影函数作为第一个参数。
let chunkBy (f : 'T-> 'Key) xs =
// Determine key and wrap in option
seq{for x in xs -> Some(f x, x)
// Indicates end of sequence
yield None }
|> Seq.scan (fun (_, acc, previous) current ->
match previous, current with
| Some(pKey, _), Some(key, value) when pKey = key ->
// No intermediate result, but add to accumulator
None, value::acc, current
| _ ->
// New state is 3-tuple of previous key and completed chunk,
// accumulator from current element, and new previous element
Option.map (fun (k, _) -> k, List.rev acc) previous,
Option.map snd current |> Option.toList, current )
(None, [], None)
|> Seq.choose (fun (result, _, _) -> result)
这可以通过提供结果投影功能来满足 OP 的要求。
let chunkBy2 (f : 'T-> 'Key) (g : 'T->'Result) =
chunkBy f >> Seq.map (fun (k, gs) -> k, List.map g gs)
// val chunkBy2 :
// f:('T -> 'Key) -> g:('T -> 'Result) -> (seq<'T> -> seq<'Key * 'Result list>)
// when 'Key : equality
["a", 1; "a", 2; "a", 111; "b", 3; "bb", 1; "bb", -1]
|> chunkBy2 fst snd
// val it : seq<string * int list> =
// seq [("a", [1; 2; 111]); ("b", [3]); ("bb", [1; -1])]
Seq.initInfinite (fun x ->
if (x / 2) % 2 = 0 then "a", x else "b", x)
|> chunkBy2 fst snd
|> Seq.skip 50000
// val it : seq<string * int list> =
// seq
// [("a", [100000; 100001]); ("b", [100002; 100003]); ("a", [100004; 100005]);
// ("b", [100006; 100007]); ...]