如何将大流分组为子流
How to group large stream into sub streams
我想将大 Stream[F, A]
分组到 Stream[Stream[F, A]]
中,内部流最多 n
个元素。
这就是我所做的,基本上是将块通过管道传输到 Queue[F, Queue[F, Chunk[A]]
,然后生成队列元素作为结果流。
implicit class StreamSyntax[F[_], A](s: Stream[F, A])(
implicit F: Concurrent[F]) {
def groupedPipe(
lastQRef: Ref[F, Queue[F, Option[Chunk[A]]]],
n: Int): Pipe[F, A, Stream[F, A]] = { in =>
val initQs =
Queue.unbounded[F, Option[Queue[F, Option[Chunk[A]]]]].flatMap { qq =>
Queue.bounded[F, Option[Chunk[A]]](1).flatMap { q =>
lastQRef.set(q) *> qq.enqueue1(Some(q)).as(qq -> q)
}
}
Stream.eval(initQs).flatMap {
case (qq, initQ) =>
def newQueue = Queue.bounded[F, Option[Chunk[A]]](1).flatMap { q =>
qq.enqueue1(Some(q)) *> lastQRef.set(q).as(q)
}
val evalStream = {
in.chunks
.evalMapAccumulate((0, initQ)) {
case ((i, q), c) if i + c.size >= n =>
val (l, r) = c.splitAt(n - i)
q.enqueue1(Some(l)) >> q.enqueue1(None) >> q
.enqueue1(None) >> newQueue.flatMap { nq =>
nq.enqueue1(Some(r)).as(((r.size, nq), c))
}
case ((i, q), c) if (i + c.size) < n =>
q.enqueue1(Some(c)).as(((i + c.size, q), c))
}
.attempt ++ Stream.eval {
lastQRef.get.flatMap { last =>
last.enqueue1(None) *> last.enqueue1(None)
} *> qq.enqueue1(None)
}
}
qq.dequeue.unNoneTerminate
.map(
q =>
q.dequeue.unNoneTerminate
.flatMap(Stream.chunk)
.onFinalize(
q.dequeueChunk(Int.MaxValue).unNoneTerminate.compile.drain))
.concurrently(evalStream)
}
}
def grouped(n: Int) = {
Stream.eval {
Queue.unbounded[F, Option[Chunk[A]]].flatMap { empty =>
Ref.of[F, Queue[F, Option[Chunk[A]]]](empty)
}
}.flatMap { ref =>
val p = groupedPipe(ref, n)
s.through(p)
}
}
}
但是很复杂,有没有更简单的方法?
fs2 有 chunkN
chunkLimit
方法可以帮助分组
stream.chunkN(n).map(Stream.chunk)
stream.chunkLimit(n).map(Stream.chunk)
chunkN
产生大小为 n 的块,直到流结束
chunkLimit
拆分现有块并可以生成可变大小的块。
scala> Stream(1,2,3).repeat.chunkN(2).take(5).toList
res0: List[Chunk[Int]] = List(Chunk(1, 2), Chunk(3, 1), Chunk(2, 3), Chunk(1, 2), Chunk(3, 1))
scala> (Stream(1) ++ Stream(2, 3) ++ Stream(4, 5, 6)).chunkLimit(2).toList
res0: List[Chunk[Int]] = List(Chunk(1), Chunk(2, 3), Chunk(4, 5), Chunk(6))
除了已经提到的chunksN
,还可以考虑使用groupWithin
(fs2 1.0.1):
def groupWithin[F2[x] >: F[x]](n: Int, d: FiniteDuration)(implicit timer: Timer[F2], F: Concurrent[F2]): Stream[F2, Chunk[O]]
Divide this streams into groups of elements received within a time window, or limited by the number of the elements, whichever happens first. Empty groups, which can occur if no elements can be pulled from upstream in a given time window, will not be emitted.
Note: a time window starts each time downstream pulls.
我不确定您为什么希望它是嵌套流,因为要求是在一批中“最多 n
个元素”——这意味着您要跟踪有限数量的元素(这正是 Chunk
的用途)。无论哪种方式,Chunk
总是可以表示为 Stream
和 Stream.chunk
:
val chunks: Stream[F, Chunk[O]] = ???
val streamOfStreams: Stream[F, Stream[F, O]] = chunks.map(Stream.chunk)
下面是如何使用 groupWithin
的完整示例:
import cats.implicits._
import cats.effect.{ExitCode, IO, IOApp}
import fs2._
import scala.concurrent.duration._
object GroupingDemo extends IOApp {
override def run(args: List[String]): IO[ExitCode] = {
Stream('a, 'b, 'c).covary[IO]
.groupWithin(2, 1.second)
.map(_.toList)
.showLinesStdOut
.compile.drain
.as(ExitCode.Success)
}
}
输出:
List('a, 'b)
List('c)
最后我使用了一个更可靠的版本(使用 Hotswap 确保队列终止),就像这样。
def grouped(
innerSize: Int
)(implicit F: Async[F]): Stream[F, Stream[F, A]] = {
type InnerQueue = Queue[F, Option[Chunk[A]]]
type OuterQueue = Queue[F, Option[InnerQueue]]
def swapperInner(swapper: Hotswap[F, InnerQueue], outer: OuterQueue) = {
val innerRes =
Resource.make(Queue.unbounded[F, Option[Chunk[A]]])(_.offer(None))
swapper.swap(innerRes).flatTap(q => outer.offer(q.some))
}
def loopChunk(
gathered: Int,
curr: Queue[F, Option[Chunk[A]]],
chunk: Chunk[A],
newInnerQueue: F[InnerQueue]
): F[(Int, Queue[F, Option[Chunk[A]]])] = {
if (gathered + chunk.size > innerSize) {
val (left, right) = chunk.splitAt(innerSize - gathered)
curr.offer(left.some) >> newInnerQueue.flatMap { nq =>
loopChunk(0, nq, right, newInnerQueue)
}
} else if (gathered + chunk.size == innerSize) {
curr.offer(chunk.some) >> newInnerQueue.tupleLeft(
0
)
} else {
curr.offer(chunk.some).as(gathered + chunk.size -> curr)
}
}
val prepare = for {
outer <- Resource.eval(Queue.unbounded[F, Option[InnerQueue]])
swapper <- Hotswap.create[F, InnerQueue]
} yield outer -> swapper
Stream.resource(prepare).flatMap {
case (outer, swapper) =>
val newInner = swapperInner(swapper, outer)
val background = Stream.eval(newInner).flatMap { initQueue =>
s.chunks
.filter(_.nonEmpty)
.evalMapAccumulate(0 -> initQueue) { (state, chunk) =>
val (gathered, curr) = state
loopChunk(gathered, curr, chunk, newInner).tupleRight({})
}
.onFinalize(swapper.clear *> outer.offer(None))
}
val foreground = Stream
.fromQueueNoneTerminated(outer)
.map(i => Stream.fromQueueNoneTerminatedChunk(i))
foreground.concurrently(background)
}
}
我想将大 Stream[F, A]
分组到 Stream[Stream[F, A]]
中,内部流最多 n
个元素。
这就是我所做的,基本上是将块通过管道传输到 Queue[F, Queue[F, Chunk[A]]
,然后生成队列元素作为结果流。
implicit class StreamSyntax[F[_], A](s: Stream[F, A])(
implicit F: Concurrent[F]) {
def groupedPipe(
lastQRef: Ref[F, Queue[F, Option[Chunk[A]]]],
n: Int): Pipe[F, A, Stream[F, A]] = { in =>
val initQs =
Queue.unbounded[F, Option[Queue[F, Option[Chunk[A]]]]].flatMap { qq =>
Queue.bounded[F, Option[Chunk[A]]](1).flatMap { q =>
lastQRef.set(q) *> qq.enqueue1(Some(q)).as(qq -> q)
}
}
Stream.eval(initQs).flatMap {
case (qq, initQ) =>
def newQueue = Queue.bounded[F, Option[Chunk[A]]](1).flatMap { q =>
qq.enqueue1(Some(q)) *> lastQRef.set(q).as(q)
}
val evalStream = {
in.chunks
.evalMapAccumulate((0, initQ)) {
case ((i, q), c) if i + c.size >= n =>
val (l, r) = c.splitAt(n - i)
q.enqueue1(Some(l)) >> q.enqueue1(None) >> q
.enqueue1(None) >> newQueue.flatMap { nq =>
nq.enqueue1(Some(r)).as(((r.size, nq), c))
}
case ((i, q), c) if (i + c.size) < n =>
q.enqueue1(Some(c)).as(((i + c.size, q), c))
}
.attempt ++ Stream.eval {
lastQRef.get.flatMap { last =>
last.enqueue1(None) *> last.enqueue1(None)
} *> qq.enqueue1(None)
}
}
qq.dequeue.unNoneTerminate
.map(
q =>
q.dequeue.unNoneTerminate
.flatMap(Stream.chunk)
.onFinalize(
q.dequeueChunk(Int.MaxValue).unNoneTerminate.compile.drain))
.concurrently(evalStream)
}
}
def grouped(n: Int) = {
Stream.eval {
Queue.unbounded[F, Option[Chunk[A]]].flatMap { empty =>
Ref.of[F, Queue[F, Option[Chunk[A]]]](empty)
}
}.flatMap { ref =>
val p = groupedPipe(ref, n)
s.through(p)
}
}
}
但是很复杂,有没有更简单的方法?
fs2 有 chunkN
chunkLimit
方法可以帮助分组
stream.chunkN(n).map(Stream.chunk)
stream.chunkLimit(n).map(Stream.chunk)
chunkN
产生大小为 n 的块,直到流结束
chunkLimit
拆分现有块并可以生成可变大小的块。
scala> Stream(1,2,3).repeat.chunkN(2).take(5).toList
res0: List[Chunk[Int]] = List(Chunk(1, 2), Chunk(3, 1), Chunk(2, 3), Chunk(1, 2), Chunk(3, 1))
scala> (Stream(1) ++ Stream(2, 3) ++ Stream(4, 5, 6)).chunkLimit(2).toList
res0: List[Chunk[Int]] = List(Chunk(1), Chunk(2, 3), Chunk(4, 5), Chunk(6))
除了已经提到的chunksN
,还可以考虑使用groupWithin
(fs2 1.0.1):
def groupWithin[F2[x] >: F[x]](n: Int, d: FiniteDuration)(implicit timer: Timer[F2], F: Concurrent[F2]): Stream[F2, Chunk[O]]
Divide this streams into groups of elements received within a time window, or limited by the number of the elements, whichever happens first. Empty groups, which can occur if no elements can be pulled from upstream in a given time window, will not be emitted.
Note: a time window starts each time downstream pulls.
我不确定您为什么希望它是嵌套流,因为要求是在一批中“最多 n
个元素”——这意味着您要跟踪有限数量的元素(这正是 Chunk
的用途)。无论哪种方式,Chunk
总是可以表示为 Stream
和 Stream.chunk
:
val chunks: Stream[F, Chunk[O]] = ???
val streamOfStreams: Stream[F, Stream[F, O]] = chunks.map(Stream.chunk)
下面是如何使用 groupWithin
的完整示例:
import cats.implicits._
import cats.effect.{ExitCode, IO, IOApp}
import fs2._
import scala.concurrent.duration._
object GroupingDemo extends IOApp {
override def run(args: List[String]): IO[ExitCode] = {
Stream('a, 'b, 'c).covary[IO]
.groupWithin(2, 1.second)
.map(_.toList)
.showLinesStdOut
.compile.drain
.as(ExitCode.Success)
}
}
输出:
List('a, 'b)
List('c)
最后我使用了一个更可靠的版本(使用 Hotswap 确保队列终止),就像这样。
def grouped(
innerSize: Int
)(implicit F: Async[F]): Stream[F, Stream[F, A]] = {
type InnerQueue = Queue[F, Option[Chunk[A]]]
type OuterQueue = Queue[F, Option[InnerQueue]]
def swapperInner(swapper: Hotswap[F, InnerQueue], outer: OuterQueue) = {
val innerRes =
Resource.make(Queue.unbounded[F, Option[Chunk[A]]])(_.offer(None))
swapper.swap(innerRes).flatTap(q => outer.offer(q.some))
}
def loopChunk(
gathered: Int,
curr: Queue[F, Option[Chunk[A]]],
chunk: Chunk[A],
newInnerQueue: F[InnerQueue]
): F[(Int, Queue[F, Option[Chunk[A]]])] = {
if (gathered + chunk.size > innerSize) {
val (left, right) = chunk.splitAt(innerSize - gathered)
curr.offer(left.some) >> newInnerQueue.flatMap { nq =>
loopChunk(0, nq, right, newInnerQueue)
}
} else if (gathered + chunk.size == innerSize) {
curr.offer(chunk.some) >> newInnerQueue.tupleLeft(
0
)
} else {
curr.offer(chunk.some).as(gathered + chunk.size -> curr)
}
}
val prepare = for {
outer <- Resource.eval(Queue.unbounded[F, Option[InnerQueue]])
swapper <- Hotswap.create[F, InnerQueue]
} yield outer -> swapper
Stream.resource(prepare).flatMap {
case (outer, swapper) =>
val newInner = swapperInner(swapper, outer)
val background = Stream.eval(newInner).flatMap { initQueue =>
s.chunks
.filter(_.nonEmpty)
.evalMapAccumulate(0 -> initQueue) { (state, chunk) =>
val (gathered, curr) = state
loopChunk(gathered, curr, chunk, newInner).tupleRight({})
}
.onFinalize(swapper.clear *> outer.offer(None))
}
val foreground = Stream
.fromQueueNoneTerminated(outer)
.map(i => Stream.fromQueueNoneTerminatedChunk(i))
foreground.concurrently(background)
}
}