python, Stemmer not found
I got this code from GitHub, and it will be executed on a Windows 64-bit machine.
This is the error I get:

Traceback (most recent call last):
  File "new.py", line 2, in <module>
    import stemmer
ModuleNotFoundError: No module named 'stemmer'
import math
import stemmer

def irange(sequence):
    return zip(range(len(sequence)), sequence)

class CosineScore(object):
    def __init__(self, all_docs):
        self.documents = all_docs  #list all docs [doc1,doc2..]
        self.ndocs = len(all_docs)
        self.posting_list = {}  #term frequency list, don't care about term position
                                #term => {docId => freq}
        self.pstemmer = stemmer.PorterStemmer()
        self._term_indexer()

    def _term_indexer(self):
        #Create term frequency dict
        #Run each word through stemmer
        for doc_id, document in irange(self.documents):
            for word in document.split(' '):
                s_word = self.pstemmer.stem(word)
                if self.posting_list.has_key(s_word):
                    doc_id_mapping = self.posting_list[s_word]
                    if doc_id_mapping.has_key(doc_id):
                        doc_id_mapping[doc_id] += 1
                    else:
                        doc_id_mapping[doc_id] = 1
                else:
                    self.posting_list[s_word] = {doc_id: 1}

    def _term_frequency(self, term):
        if self.posting_list.has_key(term):
            return self.posting_list[term]
        else:
            return -1

    def _listToString(self, arg):
        if isinstance(arg, basestring):
            return arg.split(' ')

    def __qTermFrequency(self, term, bWords):
        count = 0
        for i, bWordsObj in irange(bWords):
            if bWordsObj == term:
                count = count + 1
        return count

    def _docListWeights(self):
        all_terms = self.posting_list.keys()
        doclist_weights = [0.0] * self.ndocs
        #for all terms in the corpus
        for i, term in irange(all_terms):
            #for all docs in corpus that contain this term
            docs = self.posting_list[term].keys()
            for j, doc_id in irange(docs):
                tf = self.posting_list[term][doc_id]
                tfSquared = (tf * tf)
                doclist_weights[doc_id] += tfSquared
        for k in range(self.ndocs):
            doclist_weights[k] = math.sqrt(doclist_weights[k])
        return doclist_weights

    def compute(self, query, mIDF=0):
        '''
        dft - document term frequency
        idf - inverse document frequency
        wTQ - weights for each query term
        mIDF - max tf normalization
        '''
        scores = [0.0] * self.ndocs
        bWords = self._listToString(query)
        normalizationFactor = self._docListWeights()
        for qterm in bWords:
            term = self.pstemmer.stem(qterm)
            #calculate WT
            #dft = __qTermFrequency(queryTerm,bWords)
            #wTQ = math.log10(int(N)/dft)
            term_posting_doclist = []
            if self._term_frequency(term) != -1:
                #Find all documents with this query term
                term_posting_doclist = self.posting_list[term].keys()
                #total_term_frequency_in_corpus = sum(self.posting_list[term].values())
                if (mIDF != 0):
                    dft = mIDF
                else:
                    dft = len(term_posting_doclist)
                _wTQ = float(self.ndocs) / float(dft)
                wTQ = math.log10(float(_wTQ))  #idf
                #cosinescore algorithm
                for doc_id in term_posting_doclist:
                    if normalizationFactor[doc_id] != 0:
                        #wFTD = termDocFrequencyList/ normalizationFactor(doc_id)
                        wFTD = self.posting_list[term][doc_id] / float(normalizationFactor[doc_id])
                    else:
                        wFTD = 0.0
                    scores[doc_id] += (wTQ * wFTD)
        return scores

if __name__ == "__main__":
    docs = ["mallya", "mallya mallya in hawaii", "sunil"]
    q = "hawaii mallya"
    cs = CosineScore(docs)
    print(cs.compute(q))
It's most likely nltk, which you can install with:

pip install nltk

Change import stemmer to import nltk.stem as stemmer and run the code. Note that this code is written for Python 2.7; it won't run as-is if you have Python 3.
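If you are on Python 3, a minimal sketch of the port (assuming nltk is installed) looks like the following; besides the import, dict.has_key() has to become the in operator and basestring has to become str:

import nltk.stem as stemmer  # replaces: import stemmer

pstemmer = stemmer.PorterStemmer()
posting_list = {}
docs = ["mallya", "mallya mallya in hawaii", "sunil"]
for doc_id, document in enumerate(docs):
    for word in document.split(' '):
        s_word = pstemmer.stem(word)
        # dict.has_key() was removed in Python 3; use `in` instead
        if s_word in posting_list:
            posting_list[s_word][doc_id] = posting_list[s_word].get(doc_id, 0) + 1
        else:
            posting_list[s_word] = {doc_id: 1}
print(posting_list)
# likewise, isinstance(arg, basestring) must become isinstance(arg, str)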
Use:

pip install stemmer

at the command prompt. If that doesn't work, follow the steps below.
First, manually download the textmining package: https://pypi.python.org/pypi/textmining/1.0
Unzip it (unzip textmining-1.0.zip); you will get a folder named textmining-1.0.
At the Anaconda prompt, type conda info and note the directory listed as

active env location : C:\ProgramData\Anaconda3

Copy the unzipped textmining-1.0 folder into this directory.
Convert the folder to Python 3: to do this, paste the line below into the Anaconda prompt and run it:

2to3 --output-dir=textmining-1.0_v3 -W -n textmining-1.0

This writes a Python 3 version of textmining-1.0 into a new folder named textmining-1.0_v3.
Finally, install it by typing the following at the Anaconda prompt:

cd textmining-1.0_v3

as in

C:\Users\user>cd textmining-1.0_v3

then type python setup.py install, as in

C:\Users\user\textmining-1.0_v3>python setup.py install

Now the error will be gone.
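Putting the manual route together, and assuming you run everything from the folder into which textmining-1.0 was copied, the whole sequence at the Anaconda prompt is:

2to3 --output-dir=textmining-1.0_v3 -W -n textmining-1.0
cd textmining-1.0_v3
python setup.py install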
Stemmer is a package that can be installed via pip as PyStemmer. It is only used here as a very rough "is this a real word" filter.

pip install PyStemmer

This version may still have some other issues at the moment.
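If you go this route, note that PyStemmer's import name is Stemmer (capital S), not stemmer, and it has no PorterStemmer class, so the snippet would still need adapting; a quick check that the install worked:

import Stemmer  # provided by PyStemmer; note the capital S

st = Stemmer.Stemmer('english')             # stemmer objects are created per language
print(st.stemWord('stemming'))              # -> 'stem'
print(st.stemWords(['running', 'hawaii']))  # stems a list of words at once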
To fix the above problem on Ubuntu, you need to install PyStemmer, but it won't install directly, so first install the gcc package:

sudo apt install gcc

then:

pip install PyStemmer

This worked for me.