用 Python 读写大型文本文件太慢
Read and Write large text file python too slow
这段代码遍历一个 5.1GB 的大型文本文件,找出出现次数少于 100 次的单词,然后把这 5.1GB 的文本重写到输出文本文件中,并将这些单词替换为 <unk>。主要问题是生成 output.txt 需要很长时间。
我怀疑 write_text() 方法在打开数据集文件和输出文件时导致了问题。
此脚本的目标:我有一个预先构建的词汇表和一段文本。文本中可能包含词汇表里没有的新词,所以我想把它们添加到词汇表中。但我只想添加有意义的新词(出现超过 100 次)。出现次数少于 100 次的新词只是偶然出现,并不重要,所以我想把它们替换为“<unk>”。
from collections import Counter
# Module-level accumulators shared by the functions below.
extra_words = []  # out-of-vocabulary tokens, duplicates included
new_words = []  # out-of-vocabulary words counted fewer than 100 times; write_text() replaces them with '<unk>'
add_words = []  # out-of-vocabulary words counted 100 times or more; add_vocab() writes them to vocab.txt
def get_vocab(path='vocab.txt'):
    """Load the vocabulary words from a vocabulary file.

    Each non-blank line contributes its first whitespace-separated field;
    anything after it on the line is ignored.

    Args:
        path: Vocabulary file to read. Defaults to ``'vocab.txt'``.

    Returns:
        set: The distinct words found in the file.
    """
    vocab = set()
    with open(path, 'r', encoding='utf-8') as rd:
        # Iterate the file lazily rather than materialising every line.
        for line in rd:
            fields = line.split()
            # split() without an argument also drops the trailing newline,
            # so a line holding only a word is not stored as "word\n".
            if fields:
                vocab.add(fields[0])
    return vocab
def _count(text):
    """Classify the out-of-vocabulary words of a file by frequency.

    Words that are not in the vocabulary returned by ``get_vocab()`` are
    counted. Those seen fewer than 100 times are appended to the module-level
    ``new_words`` list, the rest to ``add_words``. Finally ``write_text()``
    is called without arguments, so it rewrites the file it is wired to by
    default rather than *text*.

    Counts are accumulated directly in a ``Counter``; the module-level
    ``extra_words`` list is left untouched, because storing every token of a
    multi-gigabyte file in a list is what exhausts memory.

    Args:
        text: Path of the UTF-8 text file to scan.

    Returns:
        None.
    """
    vocab = get_vocab()
    word_count = Counter()
    with open(text, 'r', encoding='utf-8') as fd:
        # Stream the file line by line; readlines() would load all of it.
        for line in fd:
            word_count.update(
                token for token in line.split() if token not in vocab
            )
    # To ignore a punctuation token, delete it here, e.g. del word_count['"'].
    for word, occurrences in word_count.items():
        if occurrences < 100:
            new_words.append(word)
        else:
            add_words.append(word)
    write_text()
def write_text(source='dataset', target='output.txt', rare_words=None):
    """Copy a text file, replacing every rare word with ``'<unk>'``.

    Tokens are split on whitespace and re-joined with single spaces, one
    output line per input line. An empty input yields an empty output file.

    Args:
        source: File to read. Defaults to ``'dataset'``.
        target: File to write. Defaults to ``'output.txt'``.
        rare_words: Iterable of words to replace. When omitted, the
            module-level ``new_words`` list is used.

    Returns:
        None.
    """
    if rare_words is None:
        rare_words = new_words
    # A list makes every ``in`` test a linear scan, which dominates the run
    # time on a large file; a set gives constant-time lookups.
    rare = frozenset(rare_words)
    with open(source, 'r', encoding='utf-8') as fd:
        with open(target, 'w', encoding='utf-8') as rd:
            # Stream line by line so neither the input nor the output is
            # ever held in memory as a whole.
            for line in fd:
                rd.write(' '.join(
                    '<unk>' if token in rare else token
                    for token in line.split()
                ))
                rd.write('\n')
def add_vocab():
    """Append the words in ``add_words`` to ``vocab.txt``.

    Each word is written on its own line as ``"<word> <index>"``. Indices
    continue after the current vocabulary size, so the first appended word
    gets ``len(get_vocab()) + 1``.
    NOTE(review): presumably the existing entries are numbered
    1..len(vocab) - confirm against the vocabulary file.

    Returns:
        None.
    """
    ln = len(get_vocab())
    # Append: opening with 'w' would erase the existing vocabulary that the
    # new indices continue from.
    with open('vocab.txt', 'a', encoding='utf-8') as fd:
        # enumerate() supplies the running index; add_words holds bare words.
        for idx, word in enumerate(add_words):
            # print() already ends the line, so no explicit newline is added.
            print(f'{word} {ln + idx + 1}', file=fd)
if __name__ == '__main__':
    # _count() returns nothing, so it is called directly instead of being
    # printed (which would only emit "None").
    _count('dataset')
    add_vocab()
我用莎士比亚全集对此进行了测试。在大小写和标点符号方面,您还有大量工作要做。在我的机器上,处理 100 份莎士比亚全集(约 500 MB)大约需要 15 秒。如果耗时仍然无法接受,您可能需要对代码进行性能分析(profiling)。请注意,我使用的是简化版的词汇文件,因为我没弄清楚您期望其中包含哪些内容;我用的版本每行只有一个单词。
import collections
def get_vocabulary(path):
    """Return the set of words listed one per line in a vocabulary file.

    Args:
        path: UTF-8 text file with one word per line.

    Returns:
        set: The distinct non-empty lines, with the line terminator removed.
    """
    with open(path, 'r', encoding='utf-8') as file_in:
        words = {line.strip("\n") for line in file_in}
    # A blank line would otherwise register the empty string as a word.
    words.discard("")
    return words
def get_interesting_word_counts(path, vocabulary):
    """Count the whitespace-separated tokens of a file that are not in *vocabulary*.

    Args:
        path: UTF-8 text file to scan.
        vocabulary: Container of known words; tokens found in it are skipped.

    Returns:
        collections.Counter: Occurrences of each out-of-vocabulary token.
    """
    word_counts = collections.Counter()
    with open(path, 'r', encoding='utf-8') as file_in:
        # The file is streamed line by line, so only the counts stay in memory.
        for line in file_in:
            word_counts.update(
                token for token in line.split() if token not in vocabulary
            )
    return word_counts
def get_cleaned_text(path, vocabulary, uncommon_words):
    """Yield the lines of a file with uncommon words replaced by ``"<unk>"``.

    A token is kept when it is in *vocabulary* or is not in *uncommon_words*;
    otherwise it is replaced. Tokens are split on whitespace and re-joined
    with single spaces.

    Args:
        path: UTF-8 text file to read.
        vocabulary: Container of known words that are always kept.
        uncommon_words: Container of words to replace.

    Yields:
        str: One cleaned line per input line, terminated by a newline.
    """
    with open(path, 'r', encoding='utf-8') as file_in:
        for line in file_in:
            # split() already discards the line terminator.
            line_out = " ".join(
                token if token in vocabulary or token not in uncommon_words else "<unk>"
                for token in line.split()
            )
            yield "{}\n".format(line_out)
vocabulary = get_vocabulary("vocabulary.txt")
word_counts = get_interesting_word_counts("shakespeare.txt", vocabulary)

## --------------------------------------
## Add frequent but missing words to vocabulary
## --------------------------------------
common_words = {word for word, count in word_counts.items() if count >= 100}
with open('vocabulary.txt', 'a', encoding='utf-8') as file_out:
    # Sorted so that repeated runs append the words in the same order.
    file_out.writelines("{}\n".format(word) for word in sorted(common_words))
## --------------------------------------

## --------------------------------------
## Rewrite the text censoring uncommon words
## --------------------------------------
uncommon_words = {word for word, count in word_counts.items() if count < 100}
# get_cleaned_text is a generator, so lines are written as they are produced.
cleaned_text = get_cleaned_text("shakespeare.txt", vocabulary, uncommon_words)
with open('shakespeare_out.txt', 'w', encoding='utf-8') as file_out:
    file_out.writelines(cleaned_text)
## --------------------------------------
你可以从这里获取我使用的文本:http://www.gutenberg.org/ebooks/100
来源开始:
The Project Gutenberg eBook of The Complete Works of William Shakespeare, by William Shakespeare
生成的文件开始于:
<unk> <unk> <unk> <unk> of The <unk> <unk> of <unk> <unk> by <unk> <unk>
更新的词汇文件开始于:
as
run
he’s
this.
there’s
like
you.
这段代码遍历一个 5.1GB 的大型文本文件,找出出现次数少于 100 次的单词,然后把这 5.1GB 的文本重写到输出文本文件中,并将这些单词替换为 <unk>。主要问题是生成 output.txt 需要很长时间。我怀疑问题出在 write_text() 方法打开数据集文件和输出文件的方式上。
此脚本的目标:我有一个预先构建的词汇表和一段文本。文本中可能包含词汇表里没有的新词,所以我想把它们添加到词汇表中。但我只想添加有意义的新词(出现超过 100 次)。出现次数少于 100 次的新词只是偶然出现,并不重要,所以我想把它们替换为“<unk>”。
from collections import Counter
# Module-level accumulators shared by the functions below.
extra_words = []  # out-of-vocabulary tokens, duplicates included
new_words = []  # out-of-vocabulary words counted fewer than 100 times; write_text() replaces them with '<unk>'
add_words = []  # out-of-vocabulary words counted 100 times or more; add_vocab() writes them to vocab.txt
def get_vocab(path='vocab.txt'):
    """Load the vocabulary words from a vocabulary file.

    Each non-blank line contributes its first whitespace-separated field;
    anything after it on the line is ignored.

    Args:
        path: Vocabulary file to read. Defaults to ``'vocab.txt'``.

    Returns:
        set: The distinct words found in the file.
    """
    vocab = set()
    with open(path, 'r', encoding='utf-8') as rd:
        # Iterate the file lazily rather than materialising every line.
        for line in rd:
            fields = line.split()
            # split() without an argument also drops the trailing newline,
            # so a line holding only a word is not stored as "word\n".
            if fields:
                vocab.add(fields[0])
    return vocab
def _count(text):
    """Classify the out-of-vocabulary words of a file by frequency.

    Words that are not in the vocabulary returned by ``get_vocab()`` are
    counted. Those seen fewer than 100 times are appended to the module-level
    ``new_words`` list, the rest to ``add_words``. Finally ``write_text()``
    is called without arguments, so it rewrites the file it is wired to by
    default rather than *text*.

    Counts are accumulated directly in a ``Counter``; the module-level
    ``extra_words`` list is left untouched, because storing every token of a
    multi-gigabyte file in a list is what exhausts memory.

    Args:
        text: Path of the UTF-8 text file to scan.

    Returns:
        None.
    """
    vocab = get_vocab()
    word_count = Counter()
    with open(text, 'r', encoding='utf-8') as fd:
        # Stream the file line by line; readlines() would load all of it.
        for line in fd:
            word_count.update(
                token for token in line.split() if token not in vocab
            )
    # To ignore a punctuation token, delete it here, e.g. del word_count['"'].
    for word, occurrences in word_count.items():
        if occurrences < 100:
            new_words.append(word)
        else:
            add_words.append(word)
    write_text()
def write_text(source='dataset', target='output.txt', rare_words=None):
    """Copy a text file, replacing every rare word with ``'<unk>'``.

    Tokens are split on whitespace and re-joined with single spaces, one
    output line per input line. An empty input yields an empty output file.

    Args:
        source: File to read. Defaults to ``'dataset'``.
        target: File to write. Defaults to ``'output.txt'``.
        rare_words: Iterable of words to replace. When omitted, the
            module-level ``new_words`` list is used.

    Returns:
        None.
    """
    if rare_words is None:
        rare_words = new_words
    # A list makes every ``in`` test a linear scan, which dominates the run
    # time on a large file; a set gives constant-time lookups.
    rare = frozenset(rare_words)
    with open(source, 'r', encoding='utf-8') as fd:
        with open(target, 'w', encoding='utf-8') as rd:
            # Stream line by line so neither the input nor the output is
            # ever held in memory as a whole.
            for line in fd:
                rd.write(' '.join(
                    '<unk>' if token in rare else token
                    for token in line.split()
                ))
                rd.write('\n')
def add_vocab():
    """Append the words in ``add_words`` to ``vocab.txt``.

    Each word is written on its own line as ``"<word> <index>"``. Indices
    continue after the current vocabulary size, so the first appended word
    gets ``len(get_vocab()) + 1``.
    NOTE(review): presumably the existing entries are numbered
    1..len(vocab) - confirm against the vocabulary file.

    Returns:
        None.
    """
    ln = len(get_vocab())
    # Append: opening with 'w' would erase the existing vocabulary that the
    # new indices continue from.
    with open('vocab.txt', 'a', encoding='utf-8') as fd:
        # enumerate() supplies the running index; add_words holds bare words.
        for idx, word in enumerate(add_words):
            # print() already ends the line, so no explicit newline is added.
            print(f'{word} {ln + idx + 1}', file=fd)
if __name__ == '__main__':
    # _count() returns nothing, so it is called directly instead of being
    # printed (which would only emit "None").
    _count('dataset')
    add_vocab()
我用莎士比亚全集对此进行了测试。在大小写和标点符号方面,您还有大量工作要做。在我的机器上,处理 100 份莎士比亚全集(约 500 MB)大约需要 15 秒。如果耗时仍然无法接受,您可能需要对代码进行性能分析(profiling)。请注意,我使用的是简化版的词汇文件,因为我没弄清楚您期望其中包含哪些内容;我用的版本每行只有一个单词。
import collections
def get_vocabulary(path):
    """Return the set of words listed one per line in a vocabulary file.

    Args:
        path: UTF-8 text file with one word per line.

    Returns:
        set: The distinct non-empty lines, with the line terminator removed.
    """
    with open(path, 'r', encoding='utf-8') as file_in:
        words = {line.strip("\n") for line in file_in}
    # A blank line would otherwise register the empty string as a word.
    words.discard("")
    return words
def get_interesting_word_counts(path, vocabulary):
    """Count the whitespace-separated tokens of a file that are not in *vocabulary*.

    Args:
        path: UTF-8 text file to scan.
        vocabulary: Container of known words; tokens found in it are skipped.

    Returns:
        collections.Counter: Occurrences of each out-of-vocabulary token.
    """
    word_counts = collections.Counter()
    with open(path, 'r', encoding='utf-8') as file_in:
        # The file is streamed line by line, so only the counts stay in memory.
        for line in file_in:
            word_counts.update(
                token for token in line.split() if token not in vocabulary
            )
    return word_counts
def get_cleaned_text(path, vocabulary, uncommon_words):
    """Yield the lines of a file with uncommon words replaced by ``"<unk>"``.

    A token is kept when it is in *vocabulary* or is not in *uncommon_words*;
    otherwise it is replaced. Tokens are split on whitespace and re-joined
    with single spaces.

    Args:
        path: UTF-8 text file to read.
        vocabulary: Container of known words that are always kept.
        uncommon_words: Container of words to replace.

    Yields:
        str: One cleaned line per input line, terminated by a newline.
    """
    with open(path, 'r', encoding='utf-8') as file_in:
        for line in file_in:
            # split() already discards the line terminator.
            line_out = " ".join(
                token if token in vocabulary or token not in uncommon_words else "<unk>"
                for token in line.split()
            )
            yield "{}\n".format(line_out)
vocabulary = get_vocabulary("vocabulary.txt")
word_counts = get_interesting_word_counts("shakespeare.txt", vocabulary)

## --------------------------------------
## Add frequent but missing words to vocabulary
## --------------------------------------
common_words = {word for word, count in word_counts.items() if count >= 100}
with open('vocabulary.txt', 'a', encoding='utf-8') as file_out:
    # Sorted so that repeated runs append the words in the same order.
    file_out.writelines("{}\n".format(word) for word in sorted(common_words))
## --------------------------------------

## --------------------------------------
## Rewrite the text censoring uncommon words
## --------------------------------------
uncommon_words = {word for word, count in word_counts.items() if count < 100}
# get_cleaned_text is a generator, so lines are written as they are produced.
cleaned_text = get_cleaned_text("shakespeare.txt", vocabulary, uncommon_words)
with open('shakespeare_out.txt', 'w', encoding='utf-8') as file_out:
    file_out.writelines(cleaned_text)
## --------------------------------------
你可以从这里获取我使用的文本:http://www.gutenberg.org/ebooks/100
来源开始:
The Project Gutenberg eBook of The Complete Works of William Shakespeare, by William Shakespeare
生成的文件开始于:
<unk> <unk> <unk> <unk> of The <unk> <unk> of <unk> <unk> by <unk> <unk>
更新的词汇文件开始于:
as
run
he’s
this.
there’s
like
you.