kNN 一直过度使用一个标签
kNN Consistently Overusing One Label
我正在使用 kNN 对标记图像进行一些分类。分类完成后,我输出一个混淆矩阵。我注意到一个标签 bottle
被错误应用的频率更高。
我移除了该标签并重新测试,结果发现另一个标签 shoe
又被错误应用了,而它在上一次测试中并没有问题。
应该没有规范化,所以我不确定是什么导致了这种行为。测试表明,无论我删除多少标签,它都会继续。
我不太确定应该贴出多少代码,所以先放上一些应该相关的部分,其余的代码放在 Pastebin 上。
def confusionMatrix(classifier, train_DS_X, train_DS_y, test_DS_X, test_DS_y):
    """Fit ``classifier``, predict the test set and plot its confusion matrices.

    Two figures are produced: the raw-count confusion matrix and a
    row-normalized one (each row divided by the number of test samples of
    that true class). Both are shown with ``plt.show()``.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        train_DS_X: Training samples passed to ``fit``.
        train_DS_y: Training labels passed to ``fit``.
        test_DS_X: Test samples passed to ``predict``.
        test_DS_y: True labels of the test samples.

    Returns:
        None. The function only prints headers and draws figures.
    """
    y_pred = classifier.fit(train_DS_X, train_DS_y).predict(test_DS_X)

    # A bare set has no defined order, so using it for the tick labels while
    # letting confusion_matrix pick its own (sorted, possibly smaller) label
    # set mislabels the axes. Build ONE ordered list and use it for both.
    labels = sorted(set(train_DS_y) | set(test_DS_y))

    def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
        """Draw ``cm`` as a heat map whose ticks follow ``labels``."""
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=45)
        plt.yticks(tick_marks, labels)
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    # Compute confusion matrix; passing labels pins row/column i to labels[i].
    cm = confusion_matrix(test_DS_y, y_pred, labels=labels)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    plt.figure()
    plot_confusion_matrix(cm)

    # Normalize the confusion matrix by row (i.e. by the number of samples
    # in each true class). A label that never occurs in the test set yields
    # an all-zero row; divide that row by 1 instead of 0 to avoid NaNs.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    cm_normalized = cm.astype('float') / safe_totals
    print('Normalized confusion matrix')
    plt.figure()
    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
    plt.show()
主要功能的相关代码:
# Select training and test data
PCA = decomposition.PCA(n_components=.95)
zscorer = ZScoreMapper(param_est=('targets', ['rest']), auto_train=False)
DS = getVoxels(1, .5)
train_DS = DS[0]
test_DS = DS[1]

# Apply PCA and ZScoring
# NOTE(review): the boolean presumably tells processVoxels whether to fit the
# zscorer/PCA (True for train) or only apply them (False for test) - confirm.
train_DS = processVoxels(train_DS, True, zscorer, PCA)
test_DS = processVoxels(test_DS, False, zscorer, PCA)
print(3 * "\n")

# Select the desired features
# If selecting samples or PCA, that must be the only feature
featuresOfInterest = ['pca']
trainDSFeat = selectFeatures(train_DS, featuresOfInterest)
testDSFeat = selectFeatures(test_DS, featuresOfInterest)
train_DS_X = trainDSFeat[0]
train_DS_y = trainDSFeat[1]
test_DS_X = testDSFeat[0]
test_DS_y = testDSFeat[1]

# Optimization of neighbors
# Naively searches for a local max: score k = numNeighbors first, then keep
# increasing k for as long as the score strictly improves.
# NOTE(review): k is chosen using the test-set score, so the accuracy shown
# afterwards is optimistically biased; a separate validation split would
# avoid that.
numNeighbors = 5
lastNeighbors = numNeighbors  # best k found so far
lastScore = 0                 # score achieved by lastNeighbors
while True:
    # Classification
    neigh = neighbors.KNeighborsClassifier(n_neighbors=numNeighbors, weights='distance')
    neigh.fit(train_DS_X, train_DS_y)
    # Testing
    score = neigh.score(test_DS_X, test_DS_y)
    if score <= lastScore:
        # No improvement: the previously recorded k is the local maximum.
        break
    lastScore = score
    lastNeighbors = numNeighbors
    numNeighbors += 1

# Confusion Matrix Output
neigh = neighbors.KNeighborsClassifier(n_neighbors=lastNeighbors, weights='distance')
confusionMatrix(neigh, train_DS_X, train_DS_y, test_DS_X, test_DS_y)
Pastebin:http://pastebin.com/U7yTs3vs
问题的部分原因是我的坐标轴标注有误:当我以为自己在删除出错的标签时,实际上删除的只是一个随机标签,这意味着出错的数据仍然留在分析中。修正坐标轴并删除真正出错的标签(实际上是 rest)
之后:
我改的代码是:
# Pass the ordered label list by keyword: row/column i of cm then corresponds
# to labels[i], and newer scikit-learn releases accept `labels` only as a
# keyword argument. `labels` must be an ordered sequence, not a set.
cm = confusion_matrix(test_DS_y, y_pred, labels=labels)
基本上我根据我的有序标签列表手动设置顺序。
我正在使用 kNN 对标记图像进行一些分类。分类完成后,我输出一个混淆矩阵。我注意到一个标签 bottle
被错误应用的频率更高。
我移除了该标签并重新测试,结果发现另一个标签 shoe
又被错误应用了,而它在上一次测试中并没有问题。
这里应该没有做任何归一化,所以我不确定是什么导致了这种行为。测试表明,无论我删除多少个标签,这种情况都会继续出现。 我不太确定应该贴出多少代码,所以先放上一些应该相关的部分,其余的代码放在 Pastebin 上。
def confusionMatrix(classifier, train_DS_X, train_DS_y, test_DS_X, test_DS_y):
    """Fit ``classifier``, predict the test set and plot its confusion matrices.

    Two figures are produced: the raw-count confusion matrix and a
    row-normalized one (each row divided by the number of test samples of
    that true class). Both are shown with ``plt.show()``.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        train_DS_X: Training samples passed to ``fit``.
        train_DS_y: Training labels passed to ``fit``.
        test_DS_X: Test samples passed to ``predict``.
        test_DS_y: True labels of the test samples.

    Returns:
        None. The function only prints headers and draws figures.
    """
    y_pred = classifier.fit(train_DS_X, train_DS_y).predict(test_DS_X)

    # A bare set has no defined order, so using it for the tick labels while
    # letting confusion_matrix pick its own (sorted, possibly smaller) label
    # set mislabels the axes. Build ONE ordered list and use it for both.
    labels = sorted(set(train_DS_y) | set(test_DS_y))

    def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
        """Draw ``cm`` as a heat map whose ticks follow ``labels``."""
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(labels))
        plt.xticks(tick_marks, labels, rotation=45)
        plt.yticks(tick_marks, labels)
        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    # Compute confusion matrix; passing labels pins row/column i to labels[i].
    cm = confusion_matrix(test_DS_y, y_pred, labels=labels)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    plt.figure()
    plot_confusion_matrix(cm)

    # Normalize the confusion matrix by row (i.e. by the number of samples
    # in each true class). A label that never occurs in the test set yields
    # an all-zero row; divide that row by 1 instead of 0 to avoid NaNs.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    cm_normalized = cm.astype('float') / safe_totals
    print('Normalized confusion matrix')
    plt.figure()
    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
    plt.show()
主要功能的相关代码:
# Select training and test data
PCA = decomposition.PCA(n_components=.95)
zscorer = ZScoreMapper(param_est=('targets', ['rest']), auto_train=False)
DS = getVoxels(1, .5)
train_DS = DS[0]
test_DS = DS[1]

# Apply PCA and ZScoring
# NOTE(review): the boolean presumably tells processVoxels whether to fit the
# zscorer/PCA (True for train) or only apply them (False for test) - confirm.
train_DS = processVoxels(train_DS, True, zscorer, PCA)
test_DS = processVoxels(test_DS, False, zscorer, PCA)
print(3 * "\n")

# Select the desired features
# If selecting samples or PCA, that must be the only feature
featuresOfInterest = ['pca']
trainDSFeat = selectFeatures(train_DS, featuresOfInterest)
testDSFeat = selectFeatures(test_DS, featuresOfInterest)
train_DS_X = trainDSFeat[0]
train_DS_y = trainDSFeat[1]
test_DS_X = testDSFeat[0]
test_DS_y = testDSFeat[1]

# Optimization of neighbors
# Naively searches for a local max: score k = numNeighbors first, then keep
# increasing k for as long as the score strictly improves.
# NOTE(review): k is chosen using the test-set score, so the accuracy shown
# afterwards is optimistically biased; a separate validation split would
# avoid that.
numNeighbors = 5
lastNeighbors = numNeighbors  # best k found so far
lastScore = 0                 # score achieved by lastNeighbors
while True:
    # Classification
    neigh = neighbors.KNeighborsClassifier(n_neighbors=numNeighbors, weights='distance')
    neigh.fit(train_DS_X, train_DS_y)
    # Testing
    score = neigh.score(test_DS_X, test_DS_y)
    if score <= lastScore:
        # No improvement: the previously recorded k is the local maximum.
        break
    lastScore = score
    lastNeighbors = numNeighbors
    numNeighbors += 1

# Confusion Matrix Output
neigh = neighbors.KNeighborsClassifier(n_neighbors=lastNeighbors, weights='distance')
confusionMatrix(neigh, train_DS_X, train_DS_y, test_DS_X, test_DS_y)
Pastebin:http://pastebin.com/U7yTs3vs
问题的部分原因是我的坐标轴标注有误:当我以为自己在删除出错的标签时,实际上删除的只是一个随机标签,这意味着出错的数据仍然留在分析中。修正坐标轴并删除真正出错的标签(实际上是 rest)
之后:
我改的代码是:
# Pass the ordered label list by keyword: row/column i of cm then corresponds
# to labels[i], and newer scikit-learn releases accept `labels` only as a
# keyword argument. `labels` must be an ordered sequence, not a set.
cm = confusion_matrix(test_DS_y, y_pred, labels=labels)
基本上我根据我的有序标签列表手动设置顺序。