empty document error: scikit learn
empty document error: scikit learn
我正在尝试为文本分类拟合 SVM 模型,但行 x = text_clf_svm.fit(file_name, target_file)
出现错误。试了各种方法都解决不了
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from io import StringIO
import numpy as np
count_vect = CountVectorizer(stop_words=None, input='file')
file_name = open('./svmtest.txt', 'r').read().splitlines()
target_file = open('./target.txt', 'r').read().splitlines()
file_name = [StringIO(x) for x in file_name]
X_train_counts = count_vect.fit_transform(file_name)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words=None,
input='file')),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, n_iter=5,
random_state=42)),
])
x = text_clf_svm.fit(file_name, target_file)
Python 错误回溯:
File "/Users/aravind/PycharmProjects/PycharmProjects!/minorproject/src/svmClassifier.py", line 27, in <module>
x = text_clf_svm.fit(file_name, target_file)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/externals/joblib/memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/feature_extraction/text.py", line 869, in
fit_transform
self.fixed_vocabulary_)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/feature_extraction/text.py", line 811, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop
words
我的svmtest.txt内容:
train is so bad it is very dirty
great and awesome train
我的target.txt内容:
0
1
我正在使用这个简单的数据进行测试。我收到上述错误。我不确定是什么问题。
在这一行
count_vect = CountVectorizer(stop_words=None, input='file')
您将 input
参数设置为 'file'。来自文档
If ‘file’, the sequence items must have a ‘read’ method (file-like object) that is called to fetch the bytes in memory.
您可以:
1. 向 fit_transform 方法传递文件对象
count_vect = CountVectorizer(stop_words=None, input='file')
file_name = open('./svmtest.txt', 'r')
X_train_counts = count_vect.fit_transform(file_name)
2. 使用 input='content' 选项
count_vect = CountVectorizer(stop_words=None, input='content')
file_name = open('./svmtest.txt', 'r').read().splitlines()
target_file = open('./target.txt', 'r').read().splitlines()
file_name = [StringIO(x) for x in file_name]
X_train_counts = count_vect.fit_transform(file_name)
几点:
- 您在输入数据中只给出了两个句子。我只能假设实际情况并非如此,但以防万一提到文档频率高的词可能会被考虑 "stop words"。例如,由于 "train" 出现在你上面的两个句子中,它的文档频率为 1.0,因此会被你的向量化器忽略。如果您有一个合理大小的数据集,这应该不是问题。
- 考虑在矢量化器中使用标准停用词字典:
('vect', CountVectorizer(stop_words='english'))
。 None
的默认值不等同于 no stop words
,而是 "make your best guess"。如果您希望没有停用词,请改用 stop_words=[]
。
- 不清楚您传入的应该是包含数据的文件名列表,还是包含数据本身的字符串列表。但无论哪种情况,都完全没有必要把字符串转换成
StringIO
对象。
- 要么使用输入数据的文件名并使用
input='filename'
,要么将其加载到内存中并使用 input='content'
。将它们转换为 StringIO
对象只是为了使用 input='file'
没有意义。
考虑以下版本的代码(我选择使用 input='filename'
,但如果需要可进行调整):
file_name = './svmtest.txt'
targets = [int(line.strip()) for line in open('./target.txt', 'r').read().splitlines()]
text_clf_svm = Pipeline([
# consider using stop_words='english'
('vect', CountVectorizer(stop_words=None, input='filename')),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])
text_clf_svm.fit(file_name, targets)
x = text_clf_svm.predict(file_name)
我正在尝试为文本分类拟合 SVM 模型,但行 x = text_clf_svm.fit(file_name, target_file)
出现错误。试了各种方法都解决不了
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from io import StringIO
import numpy as np
count_vect = CountVectorizer(stop_words=None, input='file')
file_name = open('./svmtest.txt', 'r').read().splitlines()
target_file = open('./target.txt', 'r').read().splitlines()
file_name = [StringIO(x) for x in file_name]
X_train_counts = count_vect.fit_transform(file_name)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words=None,
input='file')),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, n_iter=5,
random_state=42)),
])
x = text_clf_svm.fit(file_name, target_file)
Python 错误回溯:
File "/Users/aravind/PycharmProjects/PycharmProjects!/minorproject/src/svmClassifier.py", line 27, in <module>
x = text_clf_svm.fit(file_name, target_file)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/externals/joblib/memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/feature_extraction/text.py", line 869, in
fit_transform
self.fixed_vocabulary_)
File "/Users/aravind/venv/PycharmProjects!/lib/python3.6/site-
packages/sklearn/feature_extraction/text.py", line 811, in _count_vocab
raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop
words
我的svmtest.txt内容:
train is so bad it is very dirty
great and awesome train
我的target.txt内容:
0
1
我正在使用这个简单的数据进行测试。我收到上述错误。我不确定是什么问题。
在这一行
count_vect = CountVectorizer(stop_words=None, input='file')
您将 input
参数设置为 'file'。来自文档
If ‘file’, the sequence items must have a ‘read’ method (file-like object) that is called to fetch the bytes in memory.
您可以:
1. 向 fit_transform 方法传递文件对象
count_vect = CountVectorizer(stop_words=None, input='file')
file_name = open('./svmtest.txt', 'r')
X_train_counts = count_vect.fit_transform(file_name)
2. 使用 input='content' 选项
count_vect = CountVectorizer(stop_words=None, input='content')
file_name = open('./svmtest.txt', 'r').read().splitlines()
target_file = open('./target.txt', 'r').read().splitlines()
file_name = [StringIO(x) for x in file_name]
X_train_counts = count_vect.fit_transform(file_name)
几点:
- 您在输入数据中只给出了两个句子。我只能假设实际情况并非如此,但以防万一提到文档频率高的词可能会被考虑 "stop words"。例如,由于 "train" 出现在你上面的两个句子中,它的文档频率为 1.0,因此会被你的向量化器忽略。如果您有一个合理大小的数据集,这应该不是问题。
- 考虑在矢量化器中使用标准停用词字典:
('vect', CountVectorizer(stop_words='english'))
。None
的默认值不等同于no stop words
,而是 "make your best guess"。如果您希望没有停用词,请改用stop_words=[]
。
- 不清楚您传入的应该是包含数据的文件名列表,还是包含数据本身的字符串列表。但无论哪种情况,都完全没有必要把字符串转换成
StringIO
对象。 - 要么使用输入数据的文件名并使用
input='filename'
,要么将其加载到内存中并使用input='content'
。将它们转换为StringIO
对象只是为了使用input='file'
没有意义。
考虑以下版本的代码(我选择使用 input='filename'
,但如果需要可进行调整):
file_name = './svmtest.txt'
targets = [int(line.strip()) for line in open('./target.txt', 'r').read().splitlines()]
text_clf_svm = Pipeline([
# consider using stop_words='english'
('vect', CountVectorizer(stop_words=None, input='filename')),
('tfidf', TfidfTransformer()),
('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])
text_clf_svm.fit(file_name, targets)
x = text_clf_svm.predict(file_name)