如何逐页阅读可迭代的页面?
How to read an iterable page by page?
我已经尝试了很多方法来逐页阅读项目,而无需加载列表中的每一页并返回它,这可能会在大页面上占用太多内存。我想避免获得一大堆项目,只是为了必须再次扫描列表以对每个项目进行 post 处理。
所以,要么我得到的生成器会不断返回空数据,并用无限多个空列表填充 pages 列表(使用 page_from_iterable2 时),要么我只能得到第一页(使用 page_from_iterable1 时)。
关于我做错了什么的任何提示?
谢谢。
from typing import Iterable, Iterator
def read_paginated_items(
    it: Iterator,
    page_size: int,
) -> Iterable:
    """Yield at most ``page_size`` items pulled from the iterator ``it``.

    The generator ends early, without raising, when ``it`` runs out of
    items before the page is full.
    """
    try:
        for _ in range(page_size):
            yield next(it)
    except StopIteration:
        # The source ran dry before the page was full.
        return
def page_from_iterable1(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield one lazy page covering the first ``page_size`` items.

    Exactly one page generator is produced, whatever the length of
    ``iterable``.
    """
    yield read_paginated_items(iter(iterable), page_size)
def page_from_iterable2(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield a fresh lazy page generator on every pass of the loop.

    The loop tests the generator object itself, not the items it would
    produce, and a generator object is always truthy.
    """
    source = iter(iterable)
    while True:
        page_items_generator = read_paginated_items(source, page_size)
        if not page_items_generator:
            # Never taken: a generator object is truthy even when it
            # will produce no items.
            return
        yield page_items_generator
def test_read_by_page():
    """Check that five items read in pages of two give ``[[1, 2], [3, 4], [5]]``."""
    pages = []
    for page in page_from_iterable1([1, 2, 3, 4, 5], 2):
        # Materialise each lazy page before the next one is requested.
        pages.append(list(page))
    # The expected value previously read [[1, 2], [2, 3], [5]], which no
    # pagination of 1..5 by two can produce.
    assert pages == [[1, 2], [3, 4], [5]]
您需要某种方法来维护生成器中的状态。
这听起来像是一个可迭代(iterable)类的工作。
from typing import Iterable, Iterator
class Page:
    """Reusable iterator that serves successive pages from one source iterator.

    Each ``iter(page)`` call starts a new page of at most ``page_size``
    items, continuing from where the previous page stopped.

    One item is always read ahead of the consumer (the first one already
    in ``__init__``), so that ``done`` becomes true as soon as the last
    item has been handed out. Without the look-ahead, a source whose
    length is a multiple of ``page_size`` (or an empty source) would only
    be detected as exhausted while serving an extra, empty page.

    Attributes:
        it: the source iterator shared by all pages.
        page_size: maximum number of items per page.
        done: true once the source iterator is exhausted.
        item: number of items served so far in the current page.
    """

    # Marker meaning "no look-ahead item is currently buffered".
    _NOTHING = object()

    def __init__(self, it: Iterator, page_size: int):
        self.it = it
        self.page_size = page_size
        self.done = False
        self.item = 0
        self._lookahead = self._NOTHING
        self._prefetch()

    def _prefetch(self):
        """Buffer the next source item, or mark the source as exhausted."""
        try:
            self._lookahead = next(self.it)
        except StopIteration:
            self._lookahead = self._NOTHING
            # self.done is our way of telling the caller to stop asking
            # for further pages.
            self.done = True

    def __iter__(self):
        # Only the per-page counter is reset; the position in the source
        # iterator is deliberately kept.
        self.item = 0
        return self

    def __next__(self):
        page_full = self.item >= self.page_size
        if page_full or self._lookahead is self._NOTHING:
            raise StopIteration
        value = self._lookahead
        self.item += 1
        self._prefetch()
        return value
def page_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield one shared ``Page`` object repeatedly until it reports ``done``.

    The same instance is yielded every time; the consumer is expected to
    iterate it before requesting the next page.
    """
    page = Page(iter(iterable), page_size)
    while True:
        if page.done:
            return
        yield page
def test_read_by_page():
    """Collect every page as a list and print the list of pages."""
    # Each page is materialised before the next one is requested.
    pages = [list(page) for page in page_from_iterable([1, 2, 3, 4, 5], 2)]
    print(pages)
test_read_by_page()
这个生成器的工作方式是:每一页都 yield 同一个迭代器。由于它不断 yield 同一个 Page 实例,原始迭代器(self.it)的状态得以保持。
通常状态会在 __iter__ 方法中被重置;但由于我们希望从该迭代器继续往下迭代,这里只需要把项目计数重置为 0。
如果您愿意测试生成的包含 0 个元素的分页,则可以进行简化:
from typing import Iterable, Iterator
import itertools
def paginate(
    it: Iterator,
    page_size: int,
) -> Iterable:
    """Yield at most ``page_size`` items from the iterator ``it``.

    Ends quietly, possibly after yielding fewer items (or none), when
    ``it`` is exhausted.
    """
    try:
        for _ in range(page_size):
            # Use the built-in next() rather than calling the dunder
            # method directly.
            yield next(it)
    except StopIteration:
        # Source exhausted: finish this page early.
        return
def page_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield lazy pages of ``iterable`` without ever stopping.

    This generator is infinite: once the source is exhausted it keeps
    yielding pages that produce no items. The consumer must stop at the
    first empty page.
    """
    # iter() is the idiomatic spelling of iterable.__iter__() and also
    # accepts objects that only implement the sequence protocol.
    it = iter(iterable)
    while True:
        yield paginate(it, page_size)
def test_read_by_page():
    """Collect pages as lists until the first empty one, then print them."""
    pages = []
    for lazy_page in page_from_iterable([1, 2, 3, 4, 5], 2):
        items = list(lazy_page)
        if not items:
            # An empty page is the stop signal.
            break
        pages.append(items)
    print(pages)
test_read_by_page()
打印:
[[1, 2], [3, 4], [5]]
def page_from_iterable2(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield a fresh lazy page generator on every pass of the loop.

    The loop tests the generator object itself, not the items it would
    produce, and a generator object is always truthy.
    """
    source = iter(iterable)
    while True:
        page_items_generator = read_paginated_items(source, page_size)
        if not page_items_generator:
            # Never taken: a generator object is truthy even when it
            # will produce no items.
            return
        yield page_items_generator
这里的问题很简单:page_items_generator 是……生成器,而不是生成出来的项目。每次循环,你都会创建一个新的生成器对象;while 条件总是通过(因为生成器对象始终为真值);你 yield 了那个对象,却没有从嵌套的生成器中实际读取任何东西。
您需要显式地收集结果:
def pages_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield successive non-empty lists of at most ``page_size`` items.

    Each page is materialised as a list; iteration ends at the first
    empty page.
    """
    source = iter(iterable)
    while True:
        page = list(read_paginated_items(source, page_size))
        if not page:
            return
        yield page
现在,每次循环时创建的生成器都会被用来读取最多 page_size 个项目,并生成一个项目列表。当源项目耗尽时,您可能会先得到一个项目数少于 page_size 的列表,然后得到一个空列表(这两种情况都源于对 StopIteration 的处理)。由于空列表为假值,while 循环会中断,并且不会 yield 该空列表。
这意味着我们不需要在外部收集每页的结果:
def test_read_by_page():
    """Print every page list on its own line."""
    for page_list in pages_from_iterable([1, 2, 3, 4, 5], 2):
        print(page_list)
也许您希望把页面结果的收集推迟到生成器之外进行。不幸的是,这根本行不通:无论会生成什么,生成器对象始终为真值,而且在一般情况下,弄清楚它们将生成什么的唯一方法就是让它们真正去生成。幸运的是,您的页面大小是有限的,而且可能很小,所以这样做仍然可以避免内存问题。这正是分页的意义所在,对吗?
Calling list() would fix my problem indeed, but would create a list of all the items of the page which is exactly what I am trying to avoid. I want to be able to iterate over them on the fly.
靠 __length_hint__ 来解决这个问题并不可靠;但如果允许在生成页面时从每一页中试探性地预读一项,我们就可以:
- 通过尝试读取一项来检测页面是否为空
- 如果为空,返回一个哨兵值而不是生成器,并由外层生成器做相应处理
- 否则,用一个包装器把读出的那一项放回去
看起来像这样:
def generator_with_prepended(iterator, value):
    """Yield ``value`` first, then everything that remains in ``iterator``."""
    yield value
    # Delegate to the rest of the wrapped iterator.
    yield from iterator
def sentinelize_empty_generator(generator):
    """Return ``None`` for an empty generator, else an equivalent generator.

    One item is read speculatively to find out whether ``generator`` is
    empty; if it is not, that item is put back in front of the remaining
    items by ``generator_with_prepended``.
    """
    source = iter(generator)
    try:
        head = next(source)
    except StopIteration:
        return None  # which is falsey
    return generator_with_prepended(source, head)
# read_paginated_items as before
def pages_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield lazy, non-empty pages of at most ``page_size`` items.

    Iteration ends when ``sentinelize_empty_generator`` reports an empty
    page by returning a falsy sentinel.
    """
    source = iter(iterable)
    while True:
        page_items_generator = sentinelize_empty_generator(
            read_paginated_items(source, page_size)
        )
        if not page_items_generator:
            return
        yield page_items_generator
我们再次需要从外部收集结果:
def test_read_by_page():
    """Print every item of every page, with a separator line after each page."""
    for lazy_page in pages_from_iterable([1, 2, 3, 4, 5], 2):
        for element in lazy_page:
            print(element)
        print('---')
感谢大家的帮助,总结如下:
from typing import Iterator
import pytest
class PageItems:
    """Iterator over a single page of at most ``page_size`` items.

    The items are pulled lazily from ``iterator``, so several
    ``PageItems`` built one after another on the same iterator read
    consecutive pages.
    """

    def __init__(self, iterator: Iterator, page_size: int):
        self.items_generator = self._create_items_generator(iterator, page_size)

    @staticmethod
    def _create_items_generator(iterator: Iterator, page_size: int):
        """Yield up to ``page_size`` items, ending early if the source runs out."""
        try:
            for _ in range(page_size):
                yield next(iterator)
        except StopIteration:
            return

    def __iter__(self):
        return self

    def __next__(self):
        # Delegates to the page generator, whose StopIteration marks the
        # end of the page.
        return next(self.items_generator)
def test_read_one_page():
    """A page of three yields the first three items, then stops."""
    page_items = PageItems(iter([1, 2, 3, 4, 5]), 3)
    for expected in (1, 2, 3):
        assert next(page_items) == expected
    with pytest.raises(StopIteration):
        next(page_items)
def test_read_pages():
    """Pages of two over five items are ``[[1, 2], [3, 4], [5]]``."""
    iterator = iter([1, 2, 3, 4, 5])
    pages = []
    while True:
        page_items = list(PageItems(iterator, 2))
        if not page_items:
            # An empty page means the source is exhausted.
            break
        pages.append(page_items)
    assert pages == [[1, 2], [3, 4], [5]]
def test_read_pages_modified_items():
    """Items can be transformed on the fly while a page is being read."""
    iterator = iter([(1, "A"), (2, "B"), (3, "C"), (4, "D"), (5, "E")])
    pages = []
    while True:
        # Keep only the first element of each tuple.
        page_items = [pair[0] for pair in PageItems(iterator, 2)]
        if not page_items:
            break
        pages.append(page_items)
    assert pages == [[1, 2], [3, 4], [5]]
我无法直接对 PageItems 使用 for 循环,因为它最后总会产生一个空页;但使用 while 时我可以检查页面是否为空,而不必求助于丑陋的 if/break 代码块。这也让我可以在只需要未经修改的项目时调用 list(PageItems(iterator, 2)),或者在返回的项目是元组而我只想要第一个元素时使用 [item[0] for item in PageItems(iterator, 2)]。
我已经尝试了很多方法来逐页阅读项目,而无需加载列表中的每一页并返回它,这可能会在大页面上占用太多内存。我想避免获得一大堆项目,只是为了必须再次扫描列表以对每个项目进行 post 处理。
所以,要么我得到的生成器会不断返回空数据,并用无限多个空列表填充 pages 列表(使用 page_from_iterable2 时),要么我只能得到第一页(使用 page_from_iterable1 时)。
关于我做错了什么的任何提示?
谢谢。
from typing import Iterable, Iterator
def read_paginated_items(
    it: Iterator,
    page_size: int,
) -> Iterable:
    """Yield at most ``page_size`` items pulled from the iterator ``it``.

    The generator ends early, without raising, when ``it`` runs out of
    items before the page is full.
    """
    try:
        for _ in range(page_size):
            yield next(it)
    except StopIteration:
        # The source ran dry before the page was full.
        return
def page_from_iterable1(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield one lazy page covering the first ``page_size`` items.

    Exactly one page generator is produced, whatever the length of
    ``iterable``.
    """
    yield read_paginated_items(iter(iterable), page_size)
def page_from_iterable2(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield a fresh lazy page generator on every pass of the loop.

    The loop tests the generator object itself, not the items it would
    produce, and a generator object is always truthy.
    """
    source = iter(iterable)
    while True:
        page_items_generator = read_paginated_items(source, page_size)
        if not page_items_generator:
            # Never taken: a generator object is truthy even when it
            # will produce no items.
            return
        yield page_items_generator
def test_read_by_page():
    """Check that five items read in pages of two give ``[[1, 2], [3, 4], [5]]``."""
    pages = []
    for page in page_from_iterable1([1, 2, 3, 4, 5], 2):
        # Materialise each lazy page before the next one is requested.
        pages.append(list(page))
    # The expected value previously read [[1, 2], [2, 3], [5]], which no
    # pagination of 1..5 by two can produce.
    assert pages == [[1, 2], [3, 4], [5]]
您需要某种方法来维护生成器中的状态。
这听起来像是一个可迭代(iterable)类的工作。
from typing import Iterable, Iterator
class Page:
    """Reusable iterator that serves successive pages from one source iterator.

    Each ``iter(page)`` call starts a new page of at most ``page_size``
    items, continuing from where the previous page stopped.

    One item is always read ahead of the consumer (the first one already
    in ``__init__``), so that ``done`` becomes true as soon as the last
    item has been handed out. Without the look-ahead, a source whose
    length is a multiple of ``page_size`` (or an empty source) would only
    be detected as exhausted while serving an extra, empty page.

    Attributes:
        it: the source iterator shared by all pages.
        page_size: maximum number of items per page.
        done: true once the source iterator is exhausted.
        item: number of items served so far in the current page.
    """

    # Marker meaning "no look-ahead item is currently buffered".
    _NOTHING = object()

    def __init__(self, it: Iterator, page_size: int):
        self.it = it
        self.page_size = page_size
        self.done = False
        self.item = 0
        self._lookahead = self._NOTHING
        self._prefetch()

    def _prefetch(self):
        """Buffer the next source item, or mark the source as exhausted."""
        try:
            self._lookahead = next(self.it)
        except StopIteration:
            self._lookahead = self._NOTHING
            # self.done is our way of telling the caller to stop asking
            # for further pages.
            self.done = True

    def __iter__(self):
        # Only the per-page counter is reset; the position in the source
        # iterator is deliberately kept.
        self.item = 0
        return self

    def __next__(self):
        page_full = self.item >= self.page_size
        if page_full or self._lookahead is self._NOTHING:
            raise StopIteration
        value = self._lookahead
        self.item += 1
        self._prefetch()
        return value
def page_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield one shared ``Page`` object repeatedly until it reports ``done``.

    The same instance is yielded every time; the consumer is expected to
    iterate it before requesting the next page.
    """
    page = Page(iter(iterable), page_size)
    while True:
        if page.done:
            return
        yield page
def test_read_by_page():
    """Collect every page as a list and print the list of pages."""
    # Each page is materialised before the next one is requested.
    pages = [list(page) for page in page_from_iterable([1, 2, 3, 4, 5], 2)]
    print(pages)
test_read_by_page()
这个生成器的工作方式是:每一页都 yield 同一个迭代器。由于它不断 yield 同一个 Page 实例,原始迭代器(self.it)的状态得以保持。
通常状态会在 __iter__ 方法中被重置;但由于我们希望从该迭代器继续往下迭代,这里只需要把项目计数重置为 0。
如果您愿意测试生成的包含 0 个元素的分页,则可以进行简化:
from typing import Iterable, Iterator
import itertools
def paginate(
    it: Iterator,
    page_size: int,
) -> Iterable:
    """Yield at most ``page_size`` items from the iterator ``it``.

    Ends quietly, possibly after yielding fewer items (or none), when
    ``it`` is exhausted.
    """
    try:
        for _ in range(page_size):
            # Use the built-in next() rather than calling the dunder
            # method directly.
            yield next(it)
    except StopIteration:
        # Source exhausted: finish this page early.
        return
def page_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield lazy pages of ``iterable`` without ever stopping.

    This generator is infinite: once the source is exhausted it keeps
    yielding pages that produce no items. The consumer must stop at the
    first empty page.
    """
    # iter() is the idiomatic spelling of iterable.__iter__() and also
    # accepts objects that only implement the sequence protocol.
    it = iter(iterable)
    while True:
        yield paginate(it, page_size)
def test_read_by_page():
    """Collect pages as lists until the first empty one, then print them."""
    pages = []
    for lazy_page in page_from_iterable([1, 2, 3, 4, 5], 2):
        items = list(lazy_page)
        if not items:
            # An empty page is the stop signal.
            break
        pages.append(items)
    print(pages)
test_read_by_page()
打印:
[[1, 2], [3, 4], [5]]
def page_from_iterable2(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield a fresh lazy page generator on every pass of the loop.

    The loop tests the generator object itself, not the items it would
    produce, and a generator object is always truthy.
    """
    source = iter(iterable)
    while True:
        page_items_generator = read_paginated_items(source, page_size)
        if not page_items_generator:
            # Never taken: a generator object is truthy even when it
            # will produce no items.
            return
        yield page_items_generator
这里的问题很简单:page_items_generator 是……生成器,而不是生成出来的项目。每次循环,你都会创建一个新的生成器对象;while 条件总是通过(因为生成器对象始终为真值);你 yield 了那个对象,却没有从嵌套的生成器中实际读取任何东西。
您需要显式地收集结果:
def pages_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield successive non-empty lists of at most ``page_size`` items.

    Each page is materialised as a list; iteration ends at the first
    empty page.
    """
    source = iter(iterable)
    while True:
        page = list(read_paginated_items(source, page_size))
        if not page:
            return
        yield page
现在,每次循环时创建的生成器都会被用来读取最多 page_size 个项目,并生成一个项目列表。当源项目耗尽时,您可能会先得到一个项目数少于 page_size 的列表,然后得到一个空列表(这两种情况都源于对 StopIteration 的处理)。由于空列表为假值,while 循环会中断,并且不会 yield 该空列表。
这意味着我们不需要在外部收集每页的结果:
def test_read_by_page():
    """Print every page list on its own line."""
    for page_list in pages_from_iterable([1, 2, 3, 4, 5], 2):
        print(page_list)
也许您希望把页面结果的收集推迟到生成器之外进行。不幸的是,这根本行不通:无论会生成什么,生成器对象始终为真值,而且在一般情况下,弄清楚它们将生成什么的唯一方法就是让它们真正去生成。幸运的是,您的页面大小是有限的,而且可能很小,所以这样做仍然可以避免内存问题。这正是分页的意义所在,对吗?
Calling list() would fix my problem indeed, but would create a list of all the items of the page which is exactly what I am trying to avoid. I want to be able to iterate over them on the fly.
靠 __length_hint__ 来解决这个问题并不可靠;但如果允许在生成页面时从每一页中试探性地预读一项,我们就可以:
- 通过尝试读取一项来检测页面是否为空
- 如果为空,返回一个哨兵值而不是生成器,并由外层生成器做相应处理
- 否则,用一个包装器把读出的那一项放回去
看起来像这样:
def generator_with_prepended(iterator, value):
    """Yield ``value`` first, then everything that remains in ``iterator``."""
    yield value
    # Delegate to the rest of the wrapped iterator.
    yield from iterator
def sentinelize_empty_generator(generator):
    """Return ``None`` for an empty generator, else an equivalent generator.

    One item is read speculatively to find out whether ``generator`` is
    empty; if it is not, that item is put back in front of the remaining
    items by ``generator_with_prepended``.
    """
    source = iter(generator)
    try:
        head = next(source)
    except StopIteration:
        return None  # which is falsey
    return generator_with_prepended(source, head)
# read_paginated_items as before
def pages_from_iterable(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield lazy, non-empty pages of at most ``page_size`` items.

    Iteration ends when ``sentinelize_empty_generator`` reports an empty
    page by returning a falsy sentinel.
    """
    source = iter(iterable)
    while True:
        page_items_generator = sentinelize_empty_generator(
            read_paginated_items(source, page_size)
        )
        if not page_items_generator:
            return
        yield page_items_generator
我们再次需要从外部收集结果:
def test_read_by_page():
    """Print every item of every page, with a separator line after each page."""
    for lazy_page in pages_from_iterable([1, 2, 3, 4, 5], 2):
        for element in lazy_page:
            print(element)
        print('---')
感谢大家的帮助,总结如下:
from typing import Iterator
import pytest
class PageItems:
    """Iterator over a single page of at most ``page_size`` items.

    The items are pulled lazily from ``iterator``, so several
    ``PageItems`` built one after another on the same iterator read
    consecutive pages.
    """

    def __init__(self, iterator: Iterator, page_size: int):
        self.items_generator = self._create_items_generator(iterator, page_size)

    @staticmethod
    def _create_items_generator(iterator: Iterator, page_size: int):
        """Yield up to ``page_size`` items, ending early if the source runs out."""
        try:
            for _ in range(page_size):
                yield next(iterator)
        except StopIteration:
            return

    def __iter__(self):
        return self

    def __next__(self):
        # Delegates to the page generator, whose StopIteration marks the
        # end of the page.
        return next(self.items_generator)
def test_read_one_page():
    """A page of three yields the first three items, then stops."""
    page_items = PageItems(iter([1, 2, 3, 4, 5]), 3)
    for expected in (1, 2, 3):
        assert next(page_items) == expected
    with pytest.raises(StopIteration):
        next(page_items)
def test_read_pages():
    """Pages of two over five items are ``[[1, 2], [3, 4], [5]]``."""
    iterator = iter([1, 2, 3, 4, 5])
    pages = []
    while True:
        page_items = list(PageItems(iterator, 2))
        if not page_items:
            # An empty page means the source is exhausted.
            break
        pages.append(page_items)
    assert pages == [[1, 2], [3, 4], [5]]
def test_read_pages_modified_items():
    """Items can be transformed on the fly while a page is being read."""
    iterator = iter([(1, "A"), (2, "B"), (3, "C"), (4, "D"), (5, "E")])
    pages = []
    while True:
        # Keep only the first element of each tuple.
        page_items = [pair[0] for pair in PageItems(iterator, 2)]
        if not page_items:
            break
        pages.append(page_items)
    assert pages == [[1, 2], [3, 4], [5]]
我无法直接对 PageItems 使用 for 循环,因为它最后总会产生一个空页;但使用 while 时我可以检查页面是否为空,而不必求助于丑陋的 if/break 代码块。这也让我可以在只需要未经修改的项目时调用 list(PageItems(iterator, 2)),或者在返回的项目是元组而我只想要第一个元素时使用 [item[0] for item in PageItems(iterator, 2)]。