在 Python 中检查两个单词是否位于彼此的 n 个词范围内(使用 nltk 或其他方式)
Checking if words are within n space of one another (using nltk or otherwise) in Python
我有一个列表 file_contents,它的每个元素都是由单词标记(token)组成的列表。我想创建一个函数,它接收两个单词(字符串)和一个整数 size 作为输入,并返回这两个词彼此相距不超过 size 个词的所有实例。
到目前为止,我已经使用 nltk 对列表中的每个元素进行了分词,但不确定接下来该怎么做。有人能向我推荐可以实现此功能的 nltk 方法或 Python 代码吗?
函数应该是这样的
# Sample data: every inner list is a list of word tokens.
file_contents = [
    ['man', 'once', 'upon', 'time', 'love', 'princess'],
    ['python', 'code', 'cool', 'uses', 'java'],
    ['man', 'help', 'test', 'weird', 'love'],
    # ... further token lists would follow here
]
def check_words_within(word1: str, word2: str, size: int) -> list:
    """Placeholder for the function the question asks about.

    Intended contract (from the question text): return the token spans in
    which ``word1`` and ``word2`` occur within ``size`` words of one another.

    Args:
        word1: First word to look for.
        word2: Second word to look for.
        size: Maximum allowed distance between the two words.

    Raises:
        NotImplementedError: Always; the body is what is being asked for.
    """
    # how to implement?
    raise NotImplementedError
check_words_within('man','love', 4)
会 return [['man', 'once', 'upon', 'time', 'love'], ['man', 'help', 'test', 'weird', 'love']]
check_words_within('man','upon', 1)
会 return [['man', 'once', 'upon']]
check_words_within('man','document',4)
会 return []
nltk 有能帮助我完成此操作的功能吗?
- 创建一个字典列表,以便按单词查找其位置。
# One lookup table per token list, mapping each word to its index in that list.
# A word that repeats within a list keeps the index of its last occurrence.
dat = [
    {word: position for position, word in enumerate(tokens)}
    for tokens in file_contents
]
def foo(w1, w2, dist, f, fdat):
    """Collect the token spans in which ``w1`` is followed by ``w2`` nearby.

    Args:
        w1: Word that opens the span.
        w2: Word that closes the span; it must come after ``w1``.
        dist: Distance budget. A span is kept when the positions of the two
            words differ by at most ``dist + 1``.
        f: List of token lists the spans are sliced from.
        fdat: One ``{word: position}`` dict per entry of ``f``, in the same
            order.

    Returns:
        A list with one slice ``f[i][i1:i2 + 1]`` for every token list in
        which both words are present and ``w2`` follows ``w1`` within range.
    """
    arr = []
    for i, positions in enumerate(fdat):
        i1 = positions.get(w1)
        i2 = positions.get(w2)
        if i1 is None or i2 is None:
            continue
        # The lower bound matters: without it a negative gap (w2 before w1)
        # passes the check and an empty slice is appended to the result.
        if 0 < i2 - i1 <= dist + 1:
            arr.append(f[i][i1:i2 + 1])
    return arr
# Example call: spans where "man" is followed by "upon", with dist=1.
foo("man", "upon", 1, file_contents, dat)
# [['man', 'once', 'upon']]
- 创建 class
class Search:
    """Position index over a collection of token lists.

    The constructor builds a ``{word: {list_index: position}}`` mapping once,
    so each later query only needs dictionary lookups.
    """

    def __init__(self, words_list):
        """Store ``words_list`` and build the word-position index from it."""
        self.__words_list = words_list
        self.__words_dict = self.__get_dict()

    def __get_dict(self):
        """Return ``{word: {list_index: position}}`` for the stored lists.

        When a word repeats inside one list, the later position overwrites
        the earlier one.
        """
        index = {}
        for list_no, tokens in enumerate(self.__words_list):
            for position, token in enumerate(tokens):
                index.setdefault(token, {})[list_no] = position
        return index

    def check_words_within(self, w1, w2, dist):
        """Return the spans in which ``w1`` is followed by ``w2`` nearby.

        A span ``tokens[pos1:pos2 + 1]`` is reported for every list that holds
        both words with ``0 < pos2 - pos1 <= dist + 1``. The result is an
        empty list when either word is not in the index.
        """
        first = self.__words_dict.get(w1)
        if not first:
            return []
        second = self.__words_dict.get(w2)
        if not second:
            return []

        matches = []
        for list_no, start in first.items():
            end = second.get(list_no)
            if end is None:
                # w2 does not occur in this particular list.
                continue
            if 0 < end - start <= dist + 1:
                matches.append(self.__words_list[list_no][start:end + 1])
        return matches
# Build the index once, then query it.
# NOTE: this rebinds the name `foo`, which the earlier snippet used for a function.
foo = Search(file_contents)
foo.check_words_within("man", "love", 4)
# [['man', 'once', 'upon', 'time', 'love'],
# ['man', 'help', 'test', 'weird', 'love']]
我有一个列表 file_contents,它的每个元素都是由单词标记(token)组成的列表。我想创建一个函数,它接收两个单词(字符串)和一个整数 size 作为输入,并返回这两个词彼此相距不超过 size 个词的所有实例。
到目前为止,我已经使用 nltk 对列表中的每个元素进行了分词,但不确定接下来该怎么做。有人能向我推荐可以实现此功能的 nltk 方法或 Python 代码吗?
函数应该是这样的
# Sample data: every inner list is a list of word tokens.
file_contents = [
    ['man', 'once', 'upon', 'time', 'love', 'princess'],
    ['python', 'code', 'cool', 'uses', 'java'],
    ['man', 'help', 'test', 'weird', 'love'],
    # ... further token lists would follow here
]
def check_words_within(word1: str, word2: str, size: int) -> list:
    """Placeholder for the function the question asks about.

    Intended contract (from the question text): return the token spans in
    which ``word1`` and ``word2`` occur within ``size`` words of one another.

    Args:
        word1: First word to look for.
        word2: Second word to look for.
        size: Maximum allowed distance between the two words.

    Raises:
        NotImplementedError: Always; the body is what is being asked for.
    """
    # how to implement?
    raise NotImplementedError
check_words_within('man','love', 4)
会 return [['man', 'once', 'upon', 'time', 'love'], ['man', 'help', 'test', 'weird', 'love']]
check_words_within('man','upon', 1)
会 return [['man', 'once', 'upon']]
check_words_within('man','document',4)
会 return []
nltk 有能帮助我完成此操作的功能吗?
- 创建一个字典列表,以便按单词查找其位置。
# One lookup table per token list, mapping each word to its index in that list.
# A word that repeats within a list keeps the index of its last occurrence.
dat = [
    {word: position for position, word in enumerate(tokens)}
    for tokens in file_contents
]
def foo(w1, w2, dist, f, fdat):
    """Collect the token spans in which ``w1`` is followed by ``w2`` nearby.

    Args:
        w1: Word that opens the span.
        w2: Word that closes the span; it must come after ``w1``.
        dist: Distance budget. A span is kept when the positions of the two
            words differ by at most ``dist + 1``.
        f: List of token lists the spans are sliced from.
        fdat: One ``{word: position}`` dict per entry of ``f``, in the same
            order.

    Returns:
        A list with one slice ``f[i][i1:i2 + 1]`` for every token list in
        which both words are present and ``w2`` follows ``w1`` within range.
    """
    arr = []
    for i, positions in enumerate(fdat):
        i1 = positions.get(w1)
        i2 = positions.get(w2)
        if i1 is None or i2 is None:
            continue
        # The lower bound matters: without it a negative gap (w2 before w1)
        # passes the check and an empty slice is appended to the result.
        if 0 < i2 - i1 <= dist + 1:
            arr.append(f[i][i1:i2 + 1])
    return arr
# Example call: spans where "man" is followed by "upon", with dist=1.
foo("man", "upon", 1, file_contents, dat)
# [['man', 'once', 'upon']]
- 创建 class
class Search:
    """Position index over a collection of token lists.

    The constructor builds a ``{word: {list_index: position}}`` mapping once,
    so each later query only needs dictionary lookups.
    """

    def __init__(self, words_list):
        """Store ``words_list`` and build the word-position index from it."""
        self.__words_list = words_list
        self.__words_dict = self.__get_dict()

    def __get_dict(self):
        """Return ``{word: {list_index: position}}`` for the stored lists.

        When a word repeats inside one list, the later position overwrites
        the earlier one.
        """
        index = {}
        for list_no, tokens in enumerate(self.__words_list):
            for position, token in enumerate(tokens):
                index.setdefault(token, {})[list_no] = position
        return index

    def check_words_within(self, w1, w2, dist):
        """Return the spans in which ``w1`` is followed by ``w2`` nearby.

        A span ``tokens[pos1:pos2 + 1]`` is reported for every list that holds
        both words with ``0 < pos2 - pos1 <= dist + 1``. The result is an
        empty list when either word is not in the index.
        """
        first = self.__words_dict.get(w1)
        if not first:
            return []
        second = self.__words_dict.get(w2)
        if not second:
            return []

        matches = []
        for list_no, start in first.items():
            end = second.get(list_no)
            if end is None:
                # w2 does not occur in this particular list.
                continue
            if 0 < end - start <= dist + 1:
                matches.append(self.__words_list[list_no][start:end + 1])
        return matches
# Build the index once, then query it.
# NOTE: this rebinds the name `foo`, which the earlier snippet used for a function.
foo = Search(file_contents)
foo.check_words_within("man", "love", 4)
# [['man', 'once', 'upon', 'time', 'love'],
# ['man', 'help', 'test', 'weird', 'love']]