嵌套生成器未正确触发
Nested generators not triggered properly
python 生成器的新手我想嵌套它们,即让生成器 A 依赖于生成器 B 的输出(B 生成文件路径,A 解析文档),但只读取第一个文件.
这是一个最小样本(使用即 TREC8all 数据)
import itertools
import spacy
from bs4 import BeautifulSoup
import os
def iter_all_files(p):
for root, dirs, files in os.walk(p):
for file in files:
if not file.startswith('.'):
print('using: ' + str(os.path.join(root, file)))
yield os.path.join(root, file)
def gen_items(path):
path = next(path)
text_file = open(path, 'r').read()
soup = BeautifulSoup(text_file,'html.parser')
for doc in soup.find_all("doc"):
strdoc = doc.docno.string.strip()
text_only = str(doc.find_all("text")[0])
yield (strdoc, text_only)
file_counter = 0
g = iter_all_files("data/TREC8all/Adhoc")
gen1, gen2 = itertools.tee(gen_items(g))
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=50, n_threads=4)
for id_, doc in zip(ids, docs):
file_counter += 1
file_counter
这只会输出
using: data/TREC8all/Adhoc/fbis/fb396002
Out[10]:
33
下面显示肯定还有一些文件需要解析:
g = iter_all_files("data/TREC8all/Adhoc")
file_counter = 0
for file in g:
file_counter += 1
# print(file)
for item in gen_items(g):
item_counter += 1
print(item_counter)
file_counter
将 return 大约 2000 个文件,如
using: data/TREC8all/Adhoc/fbis/fb396002
using: data/TREC8all/Adhoc/fbis/fb396003
using: data/TREC8all/Adhoc/fbis/fb396004
using: data/TREC8all/Adhoc/fbis/fb396005
using: data/TREC8all/Adhoc/fbis/fb396006
using: data/TREC8all/Adhoc/fbis/fb396007
using: data/TREC8all/Adhoc/fbis/fb396008
using: data/TREC8all/Adhoc/fbis/fb396009
using: data/TREC8all/Adhoc/fbis/fb396010
using: data/TREC8all/Adhoc/fbis/fb396011
using: data/TREC8all/Adhoc/fbis/fb396012
using: data/TREC8all/Adhoc/fbis/fb396013
显然是我的
g = iter_all_files("data/TREC8all/Adhoc")
gen1, gen2 = itertools.tee(gen_items(g))
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=50, n_threads=4)
for id_, doc in zip(ids, docs):
没有以正确的方式使用嵌套生成器。
编辑
带有外部 for 循环的嵌套似乎可行,但效果不佳。有没有更好的表达方式?
g = iter_all_files("data/TREC8all/Adhoc")
for file in g:
file_counter += 1
# print(file)
#for item in gen_items(g):
gen1, gen2 = itertools.tee(genFiles(g)
查看了代码,我不知道"nlp.pipe"的意思,这样试试
#docs = nlp.pipe(texts, batch_size=50, n_threads=4)
for id_, doc in zip(ids, texts ):
file_counter += 1
file_counter
看"file_counter",错误就知道了。
but only the first file is read
嗯,你只告诉 Python 读取一个文件:
def gen_items(path):
path = next(path)
...
如果你想遍历所有文件,你需要一个循环。
def gen_items(paths):
for path in paths:
...
python 生成器的新手我想嵌套它们,即让生成器 A 依赖于生成器 B 的输出(B 生成文件路径,A 解析文档),但只读取第一个文件.
这是一个最小样本(使用即 TREC8all 数据)
import itertools
import spacy
from bs4 import BeautifulSoup
import os
def iter_all_files(p):
for root, dirs, files in os.walk(p):
for file in files:
if not file.startswith('.'):
print('using: ' + str(os.path.join(root, file)))
yield os.path.join(root, file)
def gen_items(path):
path = next(path)
text_file = open(path, 'r').read()
soup = BeautifulSoup(text_file,'html.parser')
for doc in soup.find_all("doc"):
strdoc = doc.docno.string.strip()
text_only = str(doc.find_all("text")[0])
yield (strdoc, text_only)
file_counter = 0
g = iter_all_files("data/TREC8all/Adhoc")
gen1, gen2 = itertools.tee(gen_items(g))
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=50, n_threads=4)
for id_, doc in zip(ids, docs):
file_counter += 1
file_counter
这只会输出
using: data/TREC8all/Adhoc/fbis/fb396002
Out[10]:
33
下面显示肯定还有一些文件需要解析:
g = iter_all_files("data/TREC8all/Adhoc")
file_counter = 0
for file in g:
file_counter += 1
# print(file)
for item in gen_items(g):
item_counter += 1
print(item_counter)
file_counter
将 return 大约 2000 个文件,如
using: data/TREC8all/Adhoc/fbis/fb396002
using: data/TREC8all/Adhoc/fbis/fb396003
using: data/TREC8all/Adhoc/fbis/fb396004
using: data/TREC8all/Adhoc/fbis/fb396005
using: data/TREC8all/Adhoc/fbis/fb396006
using: data/TREC8all/Adhoc/fbis/fb396007
using: data/TREC8all/Adhoc/fbis/fb396008
using: data/TREC8all/Adhoc/fbis/fb396009
using: data/TREC8all/Adhoc/fbis/fb396010
using: data/TREC8all/Adhoc/fbis/fb396011
using: data/TREC8all/Adhoc/fbis/fb396012
using: data/TREC8all/Adhoc/fbis/fb396013
显然是我的
g = iter_all_files("data/TREC8all/Adhoc")
gen1, gen2 = itertools.tee(gen_items(g))
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)
docs = nlp.pipe(texts, batch_size=50, n_threads=4)
for id_, doc in zip(ids, docs):
没有以正确的方式使用嵌套生成器。
编辑
带有外部 for 循环的嵌套似乎可行,但效果不佳。有没有更好的表达方式?
g = iter_all_files("data/TREC8all/Adhoc")
for file in g:
file_counter += 1
# print(file)
#for item in gen_items(g):
gen1, gen2 = itertools.tee(genFiles(g)
查看了代码,我不知道"nlp.pipe"的意思,这样试试
#docs = nlp.pipe(texts, batch_size=50, n_threads=4)
for id_, doc in zip(ids, texts ):
file_counter += 1
file_counter
看"file_counter",错误就知道了。
but only the first file is read
嗯,你只告诉 Python 读取一个文件:
def gen_items(path):
path = next(path)
...
如果你想遍历所有文件,你需要一个循环。
def gen_items(paths):
for path in paths:
...