skmultiLearn 分类器预测总是 return 0
skmultiLearn classifiers predictions always return 0
我是 skmultiLearn 的新手,现在我将其用于 'Chinese' 文档多标签分类。
训练数据集很小(大约 200 个句子),我总共设置了 6 类。即使我在训练数据集中使用句子,我也只能得到 [0,0,0,0,0,0] 作为预测结果,我能得到一些帮助吗?
谢谢!
我的代码:
# Import BinaryRelevance from skmultilearn
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from scipy import sparse
import jieba
import codecs
import numpy as np
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
Q_list = []
L_list = []
# Read Sentence file
with codecs.open('multi-label-Q.txt',encoding='utf-8') as infile:
for line in infile:
Q_list.append(line[:-2])
infile.close()
# Read Label file
with open('multi-label-L.txt') as infile:
for line in infile:
tmp_l = line[:-1].split(',')
L_list.append(tmp_l)
infile.close()
L_list = np.array(L_list)
L_Question_list = []
# Preprocess for Chinese sentences
for line in Q_list:
seg_list = jieba.lcut(line, cut_all=False)
q_addSpace = ''
for w in seg_list:
q_addSpace = q_addSpace + w + ' '
L_Question_list.append(q_addSpace[:-1])
cv = CountVectorizer()
cv_fit=cv.fit_transform(L_Question_list)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(cv_fit)
M = sparse.lil_matrix((len(L_list),6), dtype=int)
for i,row in enumerate(L_list):
count = 0
for col in row:
M[i, count] = col
count += 1
# Setup the classifier
clf = BinaryRelevance(classifier=SVC())
# Train
clf.fit(tfidf, M)
# A sentence in train dataset
x_test = '偏头痛多发于什么年龄层?'
# Preprocess for Chinese sentence
seg_list = jieba.lcut(x_test, cut_all=False)
q_addSpace = ''
for w in seg_list:
q_addSpace = q_addSpace + w + ' '
X_test = [q_addSpace]
cv_fit2=cv.transform(X_test)
tfidf2 = transformer.transform(cv_fit2)
# Predict
pred = clf.predict(tfidf2)
print(pred.todense())
现在明白了,原因是我的单标签数据太多了
我使用了一些高价值的数据集并得到了正确的结果。
所以,答案是:完善数据集。
我是 skmultiLearn 的新手,现在我将其用于 'Chinese' 文档多标签分类。 训练数据集很小(大约 200 个句子),我总共设置了 6 类。即使我在训练数据集中使用句子,我也只能得到 [0,0,0,0,0,0] 作为预测结果,我能得到一些帮助吗? 谢谢!
我的代码:
# Import BinaryRelevance from skmultilearn
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from scipy import sparse
import jieba
import codecs
import numpy as np
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
Q_list = []
L_list = []
# Read Sentence file
with codecs.open('multi-label-Q.txt',encoding='utf-8') as infile:
for line in infile:
Q_list.append(line[:-2])
infile.close()
# Read Label file
with open('multi-label-L.txt') as infile:
for line in infile:
tmp_l = line[:-1].split(',')
L_list.append(tmp_l)
infile.close()
L_list = np.array(L_list)
L_Question_list = []
# Preprocess for Chinese sentences
for line in Q_list:
seg_list = jieba.lcut(line, cut_all=False)
q_addSpace = ''
for w in seg_list:
q_addSpace = q_addSpace + w + ' '
L_Question_list.append(q_addSpace[:-1])
cv = CountVectorizer()
cv_fit=cv.fit_transform(L_Question_list)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(cv_fit)
M = sparse.lil_matrix((len(L_list),6), dtype=int)
for i,row in enumerate(L_list):
count = 0
for col in row:
M[i, count] = col
count += 1
# Setup the classifier
clf = BinaryRelevance(classifier=SVC())
# Train
clf.fit(tfidf, M)
# A sentence in train dataset
x_test = '偏头痛多发于什么年龄层?'
# Preprocess for Chinese sentence
seg_list = jieba.lcut(x_test, cut_all=False)
q_addSpace = ''
for w in seg_list:
q_addSpace = q_addSpace + w + ' '
X_test = [q_addSpace]
cv_fit2=cv.transform(X_test)
tfidf2 = transformer.transform(cv_fit2)
# Predict
pred = clf.predict(tfidf2)
print(pred.todense())
现在明白了,原因是我的单标签数据太多了
我使用了一些高价值的数据集并得到了正确的结果。
所以,答案是:完善数据集。