单词建议 Python
Word suggestion Python
我正在寻求有关使用 Python 编写单词建议系统的帮助。
在给定随机字符序列输入时,我希望能够搜索单词列表并提供一些单词建议。
我找到的最接近的方案是一个拼写校正系统 (https://norvig.com/spell-correct.html)。分析其中的 "edits1" 函数后,它确实能产生一些结果,但这些结果只基于一次编辑(例如,向输入字符串中插入一个 'a')。
我想要实现的是一次可以插入多个字母(元音或辅音)。
例如,给定字母 'prt',字典搜索应该推荐 'part' 和 'apart' 等。
Filler.py - https://norvig.com/spell-correct.html
import re
from collections import Counter
def words(text):
    """Return every run of word characters in `text`, lowercased, in order."""
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)
# Word list containing numerous words, e.g. 'prut', 'prot', 'port', 'part',
# 'prat', 'pert', 'pret', 'apart'.
# The path must be a raw string: in a normal literal the '\n' inside
# 'E:\new' is read as a newline escape and corrupts the path.
# The context manager closes the file once it has been read.
with open(r'E:\new\words.txt') as word_file:
    WORDS = Counter(words(word_file.read()))
def candidates(word):
    """Return the first non-empty tier of dictionary matches for `word`.

    Tiers are tried in order: the word itself, the strings produced by
    `edits1`, then the strings produced by `edits2`. When no tier yields a
    known word, the fallback is the one-element list ``[word]``.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]
def known(words):
    """Return, as a set, the entries of `words` that are present in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}
def edits1(word, letters='aeiouxyz'):
    """Return every string formed by inserting one of `letters` into `word`.

    Only insertions are generated (no deletes, transposes or replaces).

    Args:
        word: The base string.
        letters: Characters eligible for insertion; defaults to the
            previously hard-coded set 'aeiouxyz'.

    Returns:
        A set containing one string per distinct (position, letter) result.
    """
    # len(word) + 1 insertion points cover positions 0..len(word); the
    # previous +2 bound only produced a duplicate of the final split.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    return {left + c + right for left, right in splits for c in letters}
def edits2(word):
    """Lazily yield every string obtained by applying `edits1` twice to `word`.

    The result is a generator and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
InputString.py
import filler
# Inputs to look up; each entry is passed to filler.candidates in turn.
h = ['prt']
for x in h:
    # Query the current entry (previously h[0] was used on every pass) and
    # keep the result under a name that does not shadow the builtin `input`.
    suggestions = filler.candidates(x)
    print(suggestions)
好的,我已经修改了你的代码。`Suggestor` 类接收两个参数,即 `max_times` 和 `letters`,以便您可以随时更改它们。
import re
from collections import Counter
def words(text):
    """Return every run of word characters in `text`, lowercased, in order."""
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)
# Word-frequency table built from the corpus file. The context manager
# closes the file handle as soon as its contents have been read.
with open('big.txt') as corpus:
    WORDS = Counter(words(corpus.read()))
class Suggestor:
    """Suggest vocabulary words reachable by inserting letters into an input.

    Args:
        max_times: Maximum number of single-letter insertions to apply.
        letters: Characters that may be inserted (e.g. 'aeiouxyz').
        vocabulary: Optional container of valid words. When omitted, the
            module-level ``WORDS`` is consulted at lookup time.
    """

    def __init__(self, max_times, letters, vocabulary=None):
        self.max_times = max_times
        self.letters = letters
        self.vocabulary = vocabulary

    def candidates(self, word):
        """Return the set of known words among `word` and its insertions."""
        return self.known(self.edited_word(word))

    def known(self, words):
        """Return the subset of `words` found in the vocabulary, as a set."""
        # Resolve the fallback lazily so a later rebinding of the global
        # WORDS is still honoured when no vocabulary was injected.
        vocabulary = WORDS if self.vocabulary is None else self.vocabulary
        return {w for w in words if w in vocabulary}

    def edit(self, word):
        """Return the distinct strings made by inserting one letter into `word`."""
        # len(word) + 1 insertion points cover positions 0..len(word); the
        # former +2 bound merely repeated the final (word, '') split.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        return list({left + c + right
                     for left, right in splits
                     for c in self.letters})

    def edited_word(self, raw_word):
        """Return `raw_word` plus all strings within `max_times` insertions.

        The list starts with `raw_word` itself, followed by the results of
        one insertion, then two, and so on up to `max_times`.
        """
        reachable = [raw_word]
        frontier = {raw_word}
        for _ in range(self.max_times):
            # Expand only the most recent generation, de-duplicated.
            frontier = {edited
                        for word in frontier
                        for edited in self.edit(word)}
            reachable.extend(frontier)
        return reachable
if __name__ == '__main__':
    # Demo: allow up to four insertions drawn from 'aeiouxyz' and print the
    # known words reachable from 'prt'.
    suggestor = Suggestor(letters='aeiouxyz', max_times=4)
    word = 'prt'
    print(suggestor.candidates(word))
以上测试的输出为:
{'partie', 'parity', 'purity', 'part', 'port', 'proto', 'porto', 'party', 'apart', 'parait', 'export', 'operate', 'expert', 'pirate'}
此外,我的建议是检查所有单词的概率,你可以用贝叶斯定理过滤其中的一些。
我正在寻求有关使用 Python 编写单词建议系统的帮助。 在给定随机字符序列输入时,我希望能够搜索单词列表并提供一些单词建议。
我找到的最接近的方案是一个拼写校正系统 (https://norvig.com/spell-correct.html)。分析其中的 "edits1" 函数后,它确实能产生一些结果,但这些结果只基于一次编辑(例如,向输入字符串中插入一个 'a')。
我想要实现的是一次可以插入多个字母(元音或辅音)。例如,给定字母 'prt',字典搜索应该推荐 'part' 和 'apart' 等。
Filler.py - https://norvig.com/spell-correct.html
import re
from collections import Counter
def words(text):
    """Return every run of word characters in `text`, lowercased, in order."""
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)
# Word list containing numerous words, e.g. 'prut', 'prot', 'port', 'part',
# 'prat', 'pert', 'pret', 'apart'.
# The path must be a raw string: in a normal literal the '\n' inside
# 'E:\new' is read as a newline escape and corrupts the path.
# The context manager closes the file once it has been read.
with open(r'E:\new\words.txt') as word_file:
    WORDS = Counter(words(word_file.read()))
def candidates(word):
    """Return the first non-empty tier of dictionary matches for `word`.

    Tiers are tried in order: the word itself, the strings produced by
    `edits1`, then the strings produced by `edits2`. When no tier yields a
    known word, the fallback is the one-element list ``[word]``.
    """
    exact = known([word])
    if exact:
        return exact
    one_away = known(edits1(word))
    if one_away:
        return one_away
    two_away = known(edits2(word))
    if two_away:
        return two_away
    return [word]
def known(words):
    """Return, as a set, the entries of `words` that are present in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}
def edits1(word, letters='aeiouxyz'):
    """Return every string formed by inserting one of `letters` into `word`.

    Only insertions are generated (no deletes, transposes or replaces).

    Args:
        word: The base string.
        letters: Characters eligible for insertion; defaults to the
            previously hard-coded set 'aeiouxyz'.

    Returns:
        A set containing one string per distinct (position, letter) result.
    """
    # len(word) + 1 insertion points cover positions 0..len(word); the
    # previous +2 bound only produced a duplicate of the final split.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    return {left + c + right for left, right in splits for c in letters}
def edits2(word):
    """Lazily yield every string obtained by applying `edits1` twice to `word`.

    The result is a generator and may contain duplicates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
InputString.py
import filler
# Inputs to look up; each entry is passed to filler.candidates in turn.
h = ['prt']
for x in h:
    # Query the current entry (previously h[0] was used on every pass) and
    # keep the result under a name that does not shadow the builtin `input`.
    suggestions = filler.candidates(x)
    print(suggestions)
好的,我已经修改了你的代码。`Suggestor` 类接收两个参数,即 `max_times` 和 `letters`,以便您可以随时更改它们。
import re
from collections import Counter
def words(text):
    """Return every run of word characters in `text`, lowercased, in order."""
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)
# Word-frequency table built from the corpus file. The context manager
# closes the file handle as soon as its contents have been read.
with open('big.txt') as corpus:
    WORDS = Counter(words(corpus.read()))
class Suggestor:
    """Suggest vocabulary words reachable by inserting letters into an input.

    Args:
        max_times: Maximum number of single-letter insertions to apply.
        letters: Characters that may be inserted (e.g. 'aeiouxyz').
        vocabulary: Optional container of valid words. When omitted, the
            module-level ``WORDS`` is consulted at lookup time.
    """

    def __init__(self, max_times, letters, vocabulary=None):
        self.max_times = max_times
        self.letters = letters
        self.vocabulary = vocabulary

    def candidates(self, word):
        """Return the set of known words among `word` and its insertions."""
        return self.known(self.edited_word(word))

    def known(self, words):
        """Return the subset of `words` found in the vocabulary, as a set."""
        # Resolve the fallback lazily so a later rebinding of the global
        # WORDS is still honoured when no vocabulary was injected.
        vocabulary = WORDS if self.vocabulary is None else self.vocabulary
        return {w for w in words if w in vocabulary}

    def edit(self, word):
        """Return the distinct strings made by inserting one letter into `word`."""
        # len(word) + 1 insertion points cover positions 0..len(word); the
        # former +2 bound merely repeated the final (word, '') split.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        return list({left + c + right
                     for left, right in splits
                     for c in self.letters})

    def edited_word(self, raw_word):
        """Return `raw_word` plus all strings within `max_times` insertions.

        The list starts with `raw_word` itself, followed by the results of
        one insertion, then two, and so on up to `max_times`.
        """
        reachable = [raw_word]
        frontier = {raw_word}
        for _ in range(self.max_times):
            # Expand only the most recent generation, de-duplicated.
            frontier = {edited
                        for word in frontier
                        for edited in self.edit(word)}
            reachable.extend(frontier)
        return reachable
if __name__ == '__main__':
    # Demo: allow up to four insertions drawn from 'aeiouxyz' and print the
    # known words reachable from 'prt'.
    suggestor = Suggestor(letters='aeiouxyz', max_times=4)
    word = 'prt'
    print(suggestor.candidates(word))
以上测试的输出为:
{'partie', 'parity', 'purity', 'part', 'port', 'proto', 'porto', 'party', 'apart', 'parait', 'export', 'operate', 'expert', 'pirate'}
此外,我的建议是检查所有单词的概率,你可以用贝叶斯定理过滤其中的一些。