confusionMatrix for knn classification in R

I want to run kNN classification with the optimal k value, predict the dependent variable diabetes in the test set from the training set, and compare the predictions with the true values.

I have already found the optimal k value and computed the accuracy. After that, I want to use confusionMatrix to compare the predictions with the actual values, but I run into a problem about different lengths.

I have checked that nrow and length are the same (74), but I still get the same error.

Can you help me solve this?

My code is as follows:

install.packages("mlbench")
install.packages("gbm")

library(mlbench)
library(gbm)

data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)

MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)

any(is.na(MLdata))
sum(is.na(MLdata))

MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)

set.seed(3333)

MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)

MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]

head(MLTrain)
head(MLValid)
head(MLTest)

str(MLTrain)
str(MLValid)
str(MLTest)

View(MLTestY)


MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]

MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])

View(MLTrainX)
View(MLTrainY)

library(caret)

NormValues <- preProcess(MLTrainX, method = c("center", "scale"))

TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)

head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)


install.packages('FNN')
library(FNN)
library(class)

set.seed(3333)

NN <- knn(train = TrainXNormDF, 
      test = ValidXNormDF,
      cl = MLTrainY$`MLTrain[, 9]`,
      k = 3)

NN

Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

Accuracy3

nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')

set.seed(3333)

AccuracyK <- NULL

for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
             test = ValidXNormDF,
             cl = MLTrainY$`MLTrain[, 9]`,
             k = kk)
AccuracyK <- c(AccuracyK, sum(Knn_K == MLTrainY$'MLTrain[, 9]') / length(MLTrainY$'MLTrain[, 9]'))


ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)

min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])

plot(formula = accuracy ~ k,
 data = ValidK,
 type = "o",
 pch = 5,
 main = "Optimal K Validation")

with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))

set.seed(3333)

NN120 <- knn(train = TrainXNormDF, 
      test = ValidXNormDF,
      cl = MLTrainY$`MLTrain[, 9]`,
      k = 120)

Accuracy120 <- sum(NN120 == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

Accuracy120

set.seed(3333)

FinalNN <- knn(train = TrainXNormDF, 
           test = TestXNormDF,
           cl = MLTrainY$`MLTrain[, 9]`,
           k = 120)

AccuracyFinal <- sum(FinalNN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)

AccuracyFinal

This is where I run into the problem:

Result <- confusionMatrix(FinalNN, TestXNormDF)

I think you are looking for this:

Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
Confusion Matrix and Statistics

          Reference
Prediction neg pos
       neg  49  22
       pos   0   3
                                          
               Accuracy : 0.7027          
                 95% CI : (0.5852, 0.8034)
    No Information Rate : 0.6622          
    P-Value [Acc > NIR] : 0.2724          
                                          
                  Kappa : 0.153           
                                          
 Mcnemar's Test P-Value : 7.562e-06       
                                          
            Sensitivity : 1.0000          
            Specificity : 0.1200          
         Pos Pred Value : 0.6901          
         Neg Pred Value : 1.0000          
             Prevalence : 0.6622          
         Detection Rate : 0.6622          
   Detection Prevalence : 0.9595          
      Balanced Accuracy : 0.5600          
                                          
       'Positive' Class : neg    
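
A quick check with the objects from your script shows where the "different lengths" error came from: confusionMatrix() compares a factor of predictions against a factor of reference labels, but TestXNormDF is the data frame of predictors, and length() of a data frame is its number of columns, not its number of rows.

length(FinalNN)                # 74 predictions
class(TestXNormDF)             # "data.frame" -- the predictors, not the labels
length(TestXNormDF)            # 8 columns, which is why the lengths differ
length(MLTestY$`MLTest[, 9]`)  # 74 true labels -- this is what the call needs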

Apart from that, I would suggest fixing the braces of your for loop and changing some of the code. A good refactor would help!
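
For example (just a sketch with illustrative names, reusing TrainXNormDF, ValidXNormDF, MLTrain and MLValid from your script): keeping the labels as plain factor vectors avoids the awkward backtick column names, and a properly closed loop is easier to read.

library(class)

# Keep the labels as factor vectors instead of one-column data frames
TrainY <- MLTrain[, 9]
ValidY <- MLValid[, 9]

AccuracyK <- numeric(nrow(TrainXNormDF))

for (kk in seq_len(nrow(TrainXNormDF))) {
  Knn_K <- knn(train = TrainXNormDF,
               test = ValidXNormDF,
               cl = TrainY,
               k = kk)
  AccuracyK[kk] <- mean(Knn_K == ValidY)  # accuracy on the validation set
}

ValidK <- data.frame(k = seq_len(nrow(TrainXNormDF)), accuracy = AccuracyK)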

AugPelle has already answered your question about how to get the confusion matrix.

I just want to point out that the accuracy you calculate throughout your code is incorrect. You should compute accuracy against the validation set (or the test set), not the training set. That is why you get warnings, and why you end up with a non-optimal k and an incorrect answer.
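
In short, the k = 3 accuracy should compare the validation predictions with the validation labels (both of length nrow(MLValid)), for example:

# Correct: validation predictions vs. validation labels
Accuracy3 <- mean(NN == MLValidY$`MLValid[, 9]`)

# Incorrect: validation predictions vs. training labels (different lengths, hence the warnings)
# Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)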

Below is the corrected code:

install.packages("mlbench")
install.packages("gbm")

library(mlbench)
library(gbm)

data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)

MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)

any(is.na(MLdata))
sum(is.na(MLdata))

MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)

set.seed(3333)

MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)

MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]

head(MLTrain)
head(MLValid)
head(MLTest)

str(MLTrain)
str(MLValid)
str(MLTest)

# View(MLTestY)  # MLTestY does not exist yet at this point, so this line would error


MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]

MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])

View(MLTrainX)
View(MLTrainY)

library(caret)

NormValues <- preProcess(MLTrainX, method = c("center", "scale"))

TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)

head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)


install.packages('FNN')
library(FNN)
library(class)

set.seed(3333)

NN <- knn(train = TrainXNormDF, 
          test = ValidXNormDF,
          cl = MLTrainY$`MLTrain[, 9]`,
          k = 3)

NN

Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)

Accuracy3

nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')

set.seed(3333)

AccuracyK <- NULL

for(kk in c(1:nrow(TrainXNormDF))){
  Knn_K <- knn(train = TrainXNormDF,
               test = ValidXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = kk)
  AccuracyK <- c(AccuracyK, sum(Knn_K == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`))
}

ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)

min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])

plot(formula = accuracy ~ k,
     data = ValidK,
     type = "o",
     pch = 5,
     main = "Optimal K Validation")

with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))

set.seed(3333)

NN36 <- knn(train = TrainXNormDF, 
            test = ValidXNormDF,
            cl = MLTrainY$`MLTrain[, 9]`,
            k = 36)

Accuracy36 <- sum(NN36 == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)

Accuracy36

set.seed(3333)

FinalNN <- knn(train = TrainXNormDF, 
               test = TestXNormDF,
               cl = MLTrainY$`MLTrain[, 9]`,
               k = 36)

AccuracyFinal <- sum(FinalNN == MLTestY$`MLTest[, 9]`) / length(MLTestY$`MLTest[, 9]`)

AccuracyFinal

Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
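
Once Result holds the confusionMatrix object, you can pull out the pieces you need directly, for example:

Result                         # prints the table and the summary statistics
Result$table                   # just the confusion matrix
Result$overall["Accuracy"]     # overall accuracy
Result$byClass["Sensitivity"]  # per-class statistics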