Pandas 中的拼写检查器
Spell Checker in Pandas
我正在尝试在一个使用 pandas 的类(class)中实现 Peter Norvig 的拼写检查器(spell checker),所用的单词取自 SQL 数据库。数据由用户查询组成,这些查询常常包含拼写错误,我希望这个类能够返回最有可能的、拼写正确的查询。
该类用一个数据库查询进行初始化,该查询返回一个 pandas 数据框(DataFrame)。例如:
query count
0 foo bar 1864
1 super foo 73
2 bar of foos 1629
3 crazy foos 940
以下大部分内容直接摘自 Peter 的代码,但我对这个类所做的修改似乎无法正常工作。我猜测这与去掉了 Counter 功能有关(`WORDS = Counter(words(open('big.txt').read()))`),但我不确定从数据框中获得相同功能的最佳方法。
当前的类如下:
class _SpellCheckClient(object):
    """Suggest the most likely known query for a possibly misspelled one.

    Candidates are generated with the edit-distance scheme from Peter
    Norvig's spelling corrector and ranked by how often each known query
    occurs in ``self.df``.

    Attributes:
        df: Result of the SQL statement issued in ``__init__``; the methods
            below read its ``query`` and ``count`` columns.
    """

    # Alphabet used for the replace/insert edits (lowercase ASCII only).
    _LETTERS = 'abcdefghijklmnopqrstuvwxyz'

    def __init__(self, team, table, dremel_connection):
        """Load the query/count data through ``dremel_connection``.

        Args:
            team: Not used by this class.
            table: Not used by this class.
            dremel_connection: Object exposing ``ExecuteQuery(sql)``;
                presumably returns a pandas DataFrame -- confirm with caller.
        """
        # NOTE(review): ``table`` is not interpolated into the statement; the
        # SQL text names ``table`` literally. Confirm this is intended.
        self.df = dremel_connection.ExecuteQuery(
            'SELECT query, COUNT(query) AS count FROM table GROUP BY 1;')

    def expected_word(self, word):
        """Return the most probable spelling correction for ``word``."""
        return max(self._candidates(word), key=self._probability)

    def _probability(self, query):
        """Return the share of the total count that belongs to ``query``.

        The result is a plain float (0.0 for an unknown query or when the
        total count is zero) so that ``max`` can rank candidates.
        """
        total = self.df['count'].sum()
        if not total:
            return 0.0
        matched = self.df.loc[self.df['query'] == query, 'count'].sum()
        return float(matched) / float(total)

    def _candidates(self, word):
        """Generate possible spelling corrections for ``word``.

        Preference order: the word itself if known, then known strings one
        edit away, then known strings two edits away, else the word unchanged.
        """
        return (self._known([word])
                or self._known(self._one_edits_from_word(word))
                or self._known(self._two_edits_from_word(word))
                or [word])

    def _known(self, query):
        """Return the subset of the iterable ``query`` found in ``df['query']``."""
        # Build the lookup set once so each membership test is O(1), which
        # matters for the large two-edit candidate stream.
        known_queries = set(self.df['query'])
        return {w for w in query if w in known_queries}

    def _one_edits_from_word(self, word):
        """Return the set of strings exactly one edit away from ``word``.

        An edit is a single-character delete, adjacent transpose, replace or
        insert.
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in self._LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in self._LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_word(self, word):
        """Lazily yield every string two edits away from ``word``."""
        return (e2 for e1 in self._one_edits_from_word(word)
                for e2 in self._one_edits_from_word(e1))
提前致谢!
对于寻找此问题答案的任何人,以下是对我有用的方法:
def _words(df):
    """Return the total count of each word across all queries in ``df``.

    Each query is split on whitespace and its ``count`` is credited to every
    word occurrence, so a word repeated inside one query is counted once per
    occurrence.

    Args:
        df: DataFrame with a string ``query`` column and a numeric ``count``
            column.

    Returns:
        A Series indexed by word holding the summed counts.
    """
    # One row per (query, word) pair; this avoids materialising a dense
    # queries x vocabulary indicator matrix.
    per_word = df.assign(query=df['query'].str.split()).explode('query')
    # Rows from empty or missing queries carry a NaN word, which groupby drops.
    return per_word.groupby('query')['count'].sum()
class _SpellCheckClient(object):
    """Suggest the most likely known word for a possibly misspelled token.

    Candidates are generated with the edit-distance scheme from Peter
    Norvig's spelling corrector and ranked by their weighted frequency in
    ``self.words``.

    Attributes:
        df: The value passed as ``database_connection``.
        words: Series mapping each word to its total count, built by
            ``_words(self.df)``.
    """

    # Alphabet used for the replace/insert edits (lowercase ASCII only).
    _LETTERS = 'abcdefghijklmnopqrstuvwxyz'

    def __init__(self, team, table, database_connection):
        """Build the word-frequency table.

        Args:
            team: Not used by this class.
            table: Not used by this class.
            database_connection: Despite its name, this value is stored as
                ``self.df`` and handed straight to ``_words``, so it must be
                the query/count DataFrame itself.
        """
        self.df = database_connection
        self.words = _words(self.df)

    def expected_word(self, query):
        """Return the most probable spelling correction for ``query``."""
        return max(self._candidates(query), key=self._probability)

    def _probability(self, query):
        """Return the relative frequency of ``query`` among all words.

        Unknown words, and any lookup when the total count is zero, yield 0.0.
        """
        total = self.words.sum()
        if not total:
            return 0.0
        # Divide a single lookup instead of normalising the whole Series on
        # every call.
        return float(self.words.get(query, 0.0)) / float(total)

    def _candidates(self, query):
        """Generate possible spelling corrections for ``query``.

        Preference order: the token itself if known, then known words one
        edit away, then known words two edits away, else the token unchanged.
        """
        # Checking the token itself first keeps a correctly spelled word from
        # being replaced by a more frequent neighbour.
        return (self._known([query])
                or self._known(self._one_edits_from_query(query))
                or self._known(self._two_edits_from_query(query))
                or [query])

    def _known(self, query):
        """Return the subset of the iterable ``query`` with a non-zero count."""
        # Materialise the vocabulary once so each membership test is O(1),
        # which matters for the large two-edit candidate stream.
        vocabulary = set(self.words[self.words != 0].index)
        return {w for w in query if w in vocabulary}

    def _one_edits_from_query(self, query):
        """Return the set of strings exactly one edit away from ``query``.

        An edit is a single-character delete, adjacent transpose, replace or
        insert.
        """
        splits = [(query[:i], query[i:]) for i in range(len(query) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in self._LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in self._LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_query(self, query):
        """Lazily yield every string two edits away from ``query``."""
        return (e2 for e1 in self._one_edits_from_query(query)
                for e2 in self._one_edits_from_query(e1))
import pandas as pd
from spellchecker import SpellChecker
df = pd.Series(['Customir','Tast','Hlp'])
spell = SpellChecker(distance=1)


def Correct(x):
    """Return the spell checker's best correction for ``x``.

    Falls back to ``x`` itself when the checker offers no suggestion, so that
    ``apply`` never writes ``None`` into the Series.
    """
    # Recent pyspellchecker releases document ``correction`` as returning
    # None when no candidate is found.
    suggestion = spell.correction(x)
    return x if suggestion is None else suggestion


df = df.apply(Correct)
df
0 customer
1 last
2 help
dtype: object
我正在尝试在一个使用 pandas 的类(class)中实现 Peter Norvig 的拼写检查器(spell checker),所用的单词取自 SQL 数据库。数据由用户查询组成,这些查询常常包含拼写错误,我希望这个类能够返回最有可能的、拼写正确的查询。
该类用一个数据库查询进行初始化,该查询返回一个 pandas 数据框(DataFrame)。例如:
query count
0 foo bar 1864
1 super foo 73
2 bar of foos 1629
3 crazy foos 940
以下大部分内容直接摘自 Peter 的代码,但我对这个类所做的修改似乎无法正常工作。我猜测这与去掉了 Counter 功能有关(`WORDS = Counter(words(open('big.txt').read()))`),但我不确定从数据框中获得相同功能的最佳方法。
当前的类如下:
class _SpellCheckClient(object):
    """Suggest the most likely known query for a possibly misspelled one.

    Candidates are generated with the edit-distance scheme from Peter
    Norvig's spelling corrector and ranked by how often each known query
    occurs in ``self.df``.

    Attributes:
        df: Result of the SQL statement issued in ``__init__``; the methods
            below read its ``query`` and ``count`` columns.
    """

    # Alphabet used for the replace/insert edits (lowercase ASCII only).
    _LETTERS = 'abcdefghijklmnopqrstuvwxyz'

    def __init__(self, team, table, dremel_connection):
        """Load the query/count data through ``dremel_connection``.

        Args:
            team: Not used by this class.
            table: Not used by this class.
            dremel_connection: Object exposing ``ExecuteQuery(sql)``;
                presumably returns a pandas DataFrame -- confirm with caller.
        """
        # NOTE(review): ``table`` is not interpolated into the statement; the
        # SQL text names ``table`` literally. Confirm this is intended.
        self.df = dremel_connection.ExecuteQuery(
            'SELECT query, COUNT(query) AS count FROM table GROUP BY 1;')

    def expected_word(self, word):
        """Return the most probable spelling correction for ``word``."""
        return max(self._candidates(word), key=self._probability)

    def _probability(self, query):
        """Return the share of the total count that belongs to ``query``.

        The result is a plain float (0.0 for an unknown query or when the
        total count is zero) so that ``max`` can rank candidates.
        """
        total = self.df['count'].sum()
        if not total:
            return 0.0
        matched = self.df.loc[self.df['query'] == query, 'count'].sum()
        return float(matched) / float(total)

    def _candidates(self, word):
        """Generate possible spelling corrections for ``word``.

        Preference order: the word itself if known, then known strings one
        edit away, then known strings two edits away, else the word unchanged.
        """
        return (self._known([word])
                or self._known(self._one_edits_from_word(word))
                or self._known(self._two_edits_from_word(word))
                or [word])

    def _known(self, query):
        """Return the subset of the iterable ``query`` found in ``df['query']``."""
        # Build the lookup set once so each membership test is O(1), which
        # matters for the large two-edit candidate stream.
        known_queries = set(self.df['query'])
        return {w for w in query if w in known_queries}

    def _one_edits_from_word(self, word):
        """Return the set of strings exactly one edit away from ``word``.

        An edit is a single-character delete, adjacent transpose, replace or
        insert.
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in self._LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in self._LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_word(self, word):
        """Lazily yield every string two edits away from ``word``."""
        return (e2 for e1 in self._one_edits_from_word(word)
                for e2 in self._one_edits_from_word(e1))
提前致谢!
对于寻找此问题答案的任何人,以下是对我有用的方法:
def _words(df):
    """Return the total count of each word across all queries in ``df``.

    Each query is split on whitespace and its ``count`` is credited to every
    word occurrence, so a word repeated inside one query is counted once per
    occurrence.

    Args:
        df: DataFrame with a string ``query`` column and a numeric ``count``
            column.

    Returns:
        A Series indexed by word holding the summed counts.
    """
    # One row per (query, word) pair; this avoids materialising a dense
    # queries x vocabulary indicator matrix.
    per_word = df.assign(query=df['query'].str.split()).explode('query')
    # Rows from empty or missing queries carry a NaN word, which groupby drops.
    return per_word.groupby('query')['count'].sum()
class _SpellCheckClient(object):
    """Suggest the most likely known word for a possibly misspelled token.

    Candidates are generated with the edit-distance scheme from Peter
    Norvig's spelling corrector and ranked by their weighted frequency in
    ``self.words``.

    Attributes:
        df: The value passed as ``database_connection``.
        words: Series mapping each word to its total count, built by
            ``_words(self.df)``.
    """

    # Alphabet used for the replace/insert edits (lowercase ASCII only).
    _LETTERS = 'abcdefghijklmnopqrstuvwxyz'

    def __init__(self, team, table, database_connection):
        """Build the word-frequency table.

        Args:
            team: Not used by this class.
            table: Not used by this class.
            database_connection: Despite its name, this value is stored as
                ``self.df`` and handed straight to ``_words``, so it must be
                the query/count DataFrame itself.
        """
        self.df = database_connection
        self.words = _words(self.df)

    def expected_word(self, query):
        """Return the most probable spelling correction for ``query``."""
        return max(self._candidates(query), key=self._probability)

    def _probability(self, query):
        """Return the relative frequency of ``query`` among all words.

        Unknown words, and any lookup when the total count is zero, yield 0.0.
        """
        total = self.words.sum()
        if not total:
            return 0.0
        # Divide a single lookup instead of normalising the whole Series on
        # every call.
        return float(self.words.get(query, 0.0)) / float(total)

    def _candidates(self, query):
        """Generate possible spelling corrections for ``query``.

        Preference order: the token itself if known, then known words one
        edit away, then known words two edits away, else the token unchanged.
        """
        # Checking the token itself first keeps a correctly spelled word from
        # being replaced by a more frequent neighbour.
        return (self._known([query])
                or self._known(self._one_edits_from_query(query))
                or self._known(self._two_edits_from_query(query))
                or [query])

    def _known(self, query):
        """Return the subset of the iterable ``query`` with a non-zero count."""
        # Materialise the vocabulary once so each membership test is O(1),
        # which matters for the large two-edit candidate stream.
        vocabulary = set(self.words[self.words != 0].index)
        return {w for w in query if w in vocabulary}

    def _one_edits_from_query(self, query):
        """Return the set of strings exactly one edit away from ``query``.

        An edit is a single-character delete, adjacent transpose, replace or
        insert.
        """
        splits = [(query[:i], query[i:]) for i in range(len(query) + 1)]
        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits
                      if len(right) > 1]
        replaces = [left + center + right[1:]
                    for left, right in splits
                    if right for center in self._LETTERS]
        inserts = [left + center + right
                   for left, right in splits
                   for center in self._LETTERS]
        return set(deletes + transposes + replaces + inserts)

    def _two_edits_from_query(self, query):
        """Lazily yield every string two edits away from ``query``."""
        return (e2 for e1 in self._one_edits_from_query(query)
                for e2 in self._one_edits_from_query(e1))
import pandas as pd
from spellchecker import SpellChecker
df = pd.Series(['Customir','Tast','Hlp'])
spell = SpellChecker(distance=1)


def Correct(x):
    """Return the spell checker's best correction for ``x``.

    Falls back to ``x`` itself when the checker offers no suggestion, so that
    ``apply`` never writes ``None`` into the Series.
    """
    # Recent pyspellchecker releases document ``correction`` as returning
    # None when no candidate is found.
    suggestion = spell.correction(x)
    return x if suggestion is None else suggestion


df = df.apply(Correct)
df
0 customer
1 last
2 help
dtype: object