How do I delete specific words from sentences in a text file?
I have two text files. The first file contains English sentences, and the second file contains a list of English words (a vocabulary). I want to remove from the sentences in the first file every word that does not exist in the vocabulary, and then save the processed text back to the first file.

I have written code with which I can find the sentences that contain words missing from our second file (the vocabulary).

Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
            print("I:", i)
            file1 = open("MyFile.txt","a+")
            if i in file1:
                print("Sentence already exist")
            else:
                file1.write(i)
However, I am not able to delete those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    # keep only the words that appear in the vocabulary
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    # overwrite the original file with the cleaned text
    f.seek(0)
    f.write(result)
    f.truncate()
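For example, on made-up in-memory data (the sample sentence and vocabulary below are invented for illustration), the filtering step behaves like this:

vocabulary = {"the", "cat", "sat", "on", "mat"}
data = [["the", "cat", "sat", "on", "the", "red", "mat"]]

# keep only in-vocabulary words, exactly as in the answer above
removed = [[word for word in line if word in vocabulary] for line in data]
print(' '.join(removed[0]))  # -> "the cat sat on the mat" ("red" is dropped)

One caveat of this approach: splitting on spaces keeps punctuation attached to words, so a token like "mat." will not match the vocabulary entry "mat" and will be dropped.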
#Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt') as f:
    eng = f.readlines()

sentences = [i.strip().split(" ") for i in eng]

cleaned_sentences = []
# loop over the sentences and drop the words that are not in the vocabulary
for sent in sentences:
    cleaned_sentences.append(" ".join([i for i in sent if i in vocabulary]) + "\n")

#write the cleaned sentences back to the first file
with open('eng.txt', 'w') as f:
    f.writelines(cleaned_sentences)
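A note on the membership test used above: looking a word up in a set is a hash lookup (O(1) on average), while word in some_list scans the whole list, which adds up quickly against a 30,000-word vocabulary. A quick sketch with invented data:

import timeit

# Invented data: 30,000 synthetic words, held both as a list and as a set.
words_list = ["word%d" % i for i in range(30000)]
words_set = set(words_list)

# The list lookup scans up to 30,000 entries; the set lookup hashes once.
print(timeit.timeit(lambda: "word29999" in words_list, number=1000))
print(timeit.timeit(lambda: "word29999" in words_set, number=1000))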
You can try this code. In case you have large files, I tried not to loop over the text word by word, to save you running time.
import re

with open('eng.txt', 'r') as f:
    s = f.read()

s_copy = s
punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
# strip punctuation first so the extracted word list is clean;
# re.escape is needed because ".", "(", etc. are regex metacharacters
pattern = re.compile("(" + "|".join(re.escape(p) for p in punctuation) + ")")
s_copy = pattern.sub(" ", s_copy)
s_words = s_copy.split()

with open('vocab30000.txt', 'r') as f:
    check_words = set(f.read().split())

# words that occur in the text but not in the vocabulary
remove_words = [w for w in set(s_words) - check_words if w]
# \b must be written in a raw string, otherwise Python reads "\b" as a backspace
pattern = re.compile(r"\b(" + "|".join(re.escape(w) for w in remove_words) + r")\b", re.I)
s = pattern.sub("", s)

with open('eng.txt', 'w') as f:
    f.write(s)
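As a quick check of the substitution step (the sample sentence and word list here are invented), the compiled pattern removes all out-of-vocabulary words in a single pass:

import re

remove_words = ["foo", "qux"]
pattern = re.compile(r"\b(" + "|".join(re.escape(w) for w in remove_words) + r")\b", re.I)
print(pattern.sub("", "foo went to the qux market"))  # -> " went to the  market"

Note that removing a word leaves its surrounding spaces behind; a final " ".join(result.split()) pass would collapse them if that matters.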