Removing words which are common to strings present in two lists
I have two lists of strings:
data_1 = ['The art is performed by james john.', 'art is quite silent']
data_2 = ['The art is performed by hans.', 'art is very quite silent']
I want to remove the words that the corresponding strings have in common and return two separate lists:
result_1 = ['james john', '']
result_2 = ['hans', 'very']
This is what I tried:
print([' '.join(set(i.split()).difference(set(data_1))) for i in data_2])
How do I get results like result_1 and result_2?
You can try using numpy's setdiff1d function, like this:
import numpy as np

difference_1 = [" ".join(list(np.setdiff1d(np.array(x.split()), np.array(y.split())))) for x, y in zip(data_1, data_2)]
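For the sample data this prints ['james john.', '']; note that str.split() leaves punctuation attached to words, so the trailing period survives (and np.setdiff1d returns its result sorted):

print(difference_1)
# ['james john.', '']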
Using set.difference() should also work:
difference_1 = [" ".join(set(x.split()).difference(set(z.split()))) for x, z in zip(data_1, data_2)]
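One caveat: a Python set has no guaranteed iteration order, so the joined words may come out in any order. A minimal sketch that instead keeps each remaining word in its original position:

difference_1 = [" ".join(w for w in x.split() if w not in set(z.split()))
                for x, z in zip(data_1, data_2)]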
First, tokenize the sentences with nltk:
from nltk import word_tokenize

def list_tokenize(data):
    return [word_tokenize(sentence) for sentence in data]
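word_tokenize needs the punkt tokenizer models; if it raises a LookupError, run nltk.download('punkt') first (newer nltk versions may ask for 'punkt_tab'). Unlike str.split(), it splits punctuation off into its own token:

print(word_tokenize('The art is performed by james john.'))
# ['The', 'art', 'is', 'performed', 'by', 'james', 'john', '.']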
Then get the common words:
def get_common_words(data_1_tokenized, data_2_tokenized):
    return [
        list(set.intersection(set(sentence_1), set(sentence_2)))
        for sentence_1, sentence_2 in zip(data_1_tokenized, data_2_tokenized)
    ]
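For the first pair of sentences the shared tokens include the '.' that the tokenizer split off (shown sorted here, since set order is arbitrary):

tokens_1 = word_tokenize('The art is performed by james john.')
tokens_2 = word_tokenize('The art is performed by hans.')
print(sorted(set(tokens_1) & set(tokens_2)))
# ['.', 'The', 'art', 'by', 'is', 'performed']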
Then remove the common words:
def remove_common_words(data, common_words):
    result = []
    for i in range(len(data)):
        result.append(
            " ".join([word for word in data[i] if word not in common_words[i]]))
    return result
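Because common_words[i] is a list, each word not in common_words[i] check is a linear scan. For long sentences, converting it to a set first gives O(1) membership tests; an equivalent sketch:

def remove_common_words(data, common_words):
    result = []
    for sentence, common in zip(data, common_words):
        exclude = set(common)  # set gives O(1) membership tests
        result.append(" ".join(word for word in sentence if word not in exclude))
    return result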
Combine the functions to get the unique words:
def get_unique(data_1, data_2):
    data_1_tokenized = list_tokenize(data_1)
    data_2_tokenized = list_tokenize(data_2)
    common_words = get_common_words(data_1_tokenized, data_2_tokenized)
    result1 = remove_common_words(data_1_tokenized, common_words)
    result2 = remove_common_words(data_2_tokenized, common_words)
    return result1, result2
Final usage:
data_1 = ['The art is performed by james john.', 'art is quite silent']
data_2 = ['The art is performed by hans.', 'art is very quite silent']
result1, result2 = get_unique(data_1, data_2)
Result:
result1 = ['james john', '']
result2 = ['hans', 'very']