添加停用词后,CountVectorizer 在 fit_transform 上抛出错误
CountVectorizer throws error on fit_transform after adding stop words
我有两段代码:一段可以正常运行,另一段不行。
以下代码按预期运行,没有错误:(注意:postrain、negtrain、postest 和 negtest 是先前定义的字符串列表。)
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
train_vector = vector.fit_transform(postrain+negtrain)
test_vector = vector.transform(postest+negtest)
print test_vector.shape
但是,此代码会引发错误:
import re
stop = [re.split('\n|\t', open('stop_words.txt').read())]
vector2 = CountVectorizer(stop_words=stop)
train_vector = vector2.fit_transform(postrain+negtrain) # <-- Error occurs here
test_vector = vector2.transform(postest+negtest)
print test_vector.shape
错误:
TypeError                                 Traceback (most recent call last)
<ipython-input-43-cf5f4754d58c> in <module>()
7
8 vector2 = CountVectorizer(stop_words=stop)
----> 9 train_vector = vector2.fit_transform(postrain+negtrain)
10 test_vector = vector2.transform(postest+negtest)
11
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in fit_transform(self, raw_documents, y)
815
816 vocabulary, X = self._count_vocab(raw_documents,
--> 817 self.fixed_vocabulary_)
818
819 if self.binary:
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
745 vocabulary.default_factory = vocabulary.__len__
746
--> 747 analyze = self.build_analyzer()
748 j_indices = _make_int_array()
749 indptr = _make_int_array()
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in build_analyzer(self)
232
233 elif self.analyzer == 'word':
--> 234 stop_words = self.get_stop_words()
235 tokenize = self.build_tokenizer()
236
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in get_stop_words(self)
215 def get_stop_words(self):
216 """Build or fetch the effective stop words list"""
--> 217 return _check_stop_list(self.stop_words)
218
219 def build_analyzer(self):
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in _check_stop_list(stop)
92 return None
93 else: # assume it's a collection
---> 94 return frozenset(stop)
95
96
TypeError: unhashable type: 'list'
添加停用词是如何导致错误的?
我很笨。应该是:
stop = re.split('\n|\t', open('stop_words.txt').read())
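也就是去掉外层的方括号。起初我不明白为什么错误到后面才抛出,原因是:re.split 本身已经返回一个字符串列表,再套一层方括号就变成了"包含一个列表的列表";CountVectorizer 在构造时只保存 stop_words 参数,直到 fit_transform 时才执行 frozenset(stop),此时需要对内层列表求哈希,而列表不可哈希,于是抛出 TypeError: unhashable type: 'list'。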
我有两段代码:一段可以正常运行,另一段不行。
以下代码按预期运行,没有错误:(注意:postrain、negtrain、postest 和 negtest 是先前定义的字符串列表。)
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
train_vector = vector.fit_transform(postrain+negtrain)
test_vector = vector.transform(postest+negtest)
print test_vector.shape
但是,此代码会引发错误:
import re
stop = [re.split('\n|\t', open('stop_words.txt').read())]
vector2 = CountVectorizer(stop_words=stop)
train_vector = vector2.fit_transform(postrain+negtrain) # <-- Error occurs here
test_vector = vector2.transform(postest+negtest)
print test_vector.shape
错误:
TypeError                                 Traceback (most recent call last)
<ipython-input-43-cf5f4754d58c> in <module>()
7
8 vector2 = CountVectorizer(stop_words=stop)
----> 9 train_vector = vector2.fit_transform(postrain+negtrain)
10 test_vector = vector2.transform(postest+negtest)
11
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in fit_transform(self, raw_documents, y)
815
816 vocabulary, X = self._count_vocab(raw_documents,
--> 817 self.fixed_vocabulary_)
818
819 if self.binary:
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
745 vocabulary.default_factory = vocabulary.__len__
746
--> 747 analyze = self.build_analyzer()
748 j_indices = _make_int_array()
749 indptr = _make_int_array()
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in build_analyzer(self)
232
233 elif self.analyzer == 'word':
--> 234 stop_words = self.get_stop_words()
235 tokenize = self.build_tokenizer()
236
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in get_stop_words(self)
215 def get_stop_words(self):
216 """Build or fetch the effective stop words list"""
--> 217 return _check_stop_list(self.stop_words)
218
219 def build_analyzer(self):
C:\Users\Nsth\Anaconda2\envs\cs489\lib\site-packages\sklearn\feature_extraction\text.pyc in _check_stop_list(stop)
92 return None
93 else: # assume it's a collection
---> 94 return frozenset(stop)
95
96
TypeError: unhashable type: 'list'
添加停用词是如何导致错误的?
我很笨。应该是:
stop = re.split('\n|\t', open('stop_words.txt').read())
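也就是去掉外层的方括号。起初我不明白为什么错误到后面才抛出,原因是:re.split 本身已经返回一个字符串列表,再套一层方括号就变成了"包含一个列表的列表";CountVectorizer 在构造时只保存 stop_words 参数,直到 fit_transform 时才执行 frozenset(stop),此时需要对内层列表求哈希,而列表不可哈希,于是抛出 TypeError: unhashable type: 'list'。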