R 中用于 knn 分类的混淆矩阵
confusionMatrix for knn classification in R
我想用最优k值进行kNN分类,用训练集预测测试集中的因变量糖尿病,并将结果与真实值进行比较。
我已经得到了最优的k值并且已经得到了accruacy。之后,我想使用 confussionMatrix 将结果与实际值进行比较,但我遇到了不同长度的问题。
我已经检查过 nrow 和 length 数量相同(74),但它仍然有同样的问题。
你能帮我解决这个问题吗?
我的代码如下
install.packages("mlbench")
install.packages("gbm")
library(mlbench)
library(gbm)
data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)
MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)
any(is.na(MLdata))
sum(is.na(MLdata))
MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)
set.seed(3333)
MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]
head(MLTrain)
head(MLValid)
head(MLTest)
str(MLTrain)
str(MLValid)
str(MLTest)
View(MLTestY)
MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]
MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])
View(MLTrainX)
View(MLTrainY)
library(caret)
NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)
head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)
install.packages('FNN')
library(FNN)
library(class)
set.seed(3333)
NN <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 3)
NN
Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
Accuracy3
nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')
set.seed(3333)
AccuracyK <- NULL
for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = kk)
AccuracyK <- c(AccuracyK, sum(Knn_K == MLTrainY$'MLTrain[, 9]') / length(MLTrainY$'MLTrain[, 9]'))
ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)
min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
plot(formula = accuracy ~ k,
data = ValidK,
type = "o",
pch = 5,
main = "Optimal K Validation")
with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
set.seed(3333)
NN120 <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 120)
Accuracy120 <- sum(NN120 == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
Accuracy120
set.seed(3333)
FinalNN <- knn(train = TrainXNormDF,
test = TestXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 120)
AccuracyFinal <- sum(FinalNN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
AccuracyFinal
这里我遇到了问题。
Result <- confusionMatrix(FinalNN, TestXNormDF)
我想你正在看这个:
Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
Confusion Matrix and Statistics
Reference
Prediction neg pos
neg 49 22
pos 0 3
Accuracy : 0.7027
95% CI : (0.5852, 0.8034)
No Information Rate : 0.6622
P-Value [Acc > NIR] : 0.2724
Kappa : 0.153
Mcnemar's Test P-Value : 7.562e-06
Sensitivity : 1.0000
Specificity : 0.1200
Pos Pred Value : 0.6901
Neg Pred Value : 1.0000
Prevalence : 0.6622
Detection Rate : 0.6622
Detection Prevalence : 0.9595
Balanced Accuracy : 0.5600
'Positive' Class : neg
除此之外,我建议修复 for 的括号并更改一些代码。一个好的重构会有所帮助!
AugPelle 已经回答了您关于如何获得混淆矩阵的问题。
我只是想指出,您在整个代码中计算的准确性都不正确。您应该根据验证集或测试集而不是训练集来计算准确性。这就是为什么你会收到警告,并且它会用 non-optimal k.
给你不正确的答案
下面是更正后的代码:
install.packages("mlbench")
install.packages("gbm")
library(mlbench)
library(gbm)
data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)
MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)
any(is.na(MLdata))
sum(is.na(MLdata))
MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)
set.seed(3333)
MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]
head(MLTrain)
head(MLValid)
head(MLTest)
str(MLTrain)
str(MLValid)
str(MLTest)
View(MLTestY)
MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]
MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])
View(MLTrainX)
View(MLTrainY)
library(caret)
NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)
head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)
install.packages('FNN')
library(FNN)
library(class)
set.seed(3333)
NN <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 3)
NN
Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
Accuracy3
nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')
set.seed(3333)
AccuracyK <- NULL
for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = kk)
AccuracyK <- c(AccuracyK, sum(Knn_K == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`))}
ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)
min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
plot(formula = accuracy ~ k,
data = ValidK,
type = "o",
pch = 5,
main = "Optimal K Validation")
with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
set.seed(3333)
NN120 <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 36)
Accuracy36 <- sum(NN120 == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
Accuracy36
set.seed(3333)
FinalNN <- knn(train = TrainXNormDF,
test = TestXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 36)
AccuracyFinal <- sum(FinalNN == MLTestY$`MLTest[, 9]`) / length(MLTestY$`MLTest[, 9]`)
AccuracyFinal
Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
我想用最优k值进行kNN分类,用训练集预测测试集中的因变量糖尿病,并将结果与真实值进行比较。
我已经得到了最优的k值并且已经得到了accruacy。之后,我想使用 confussionMatrix 将结果与实际值进行比较,但我遇到了不同长度的问题。
我已经检查过 nrow 和 length 数量相同(74),但它仍然有同样的问题。
你能帮我解决这个问题吗?
我的代码如下
install.packages("mlbench")
install.packages("gbm")
library(mlbench)
library(gbm)
data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)
MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)
any(is.na(MLdata))
sum(is.na(MLdata))
MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)
set.seed(3333)
MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]
head(MLTrain)
head(MLValid)
head(MLTest)
str(MLTrain)
str(MLValid)
str(MLTest)
View(MLTestY)
MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]
MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])
View(MLTrainX)
View(MLTrainY)
library(caret)
NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)
head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)
install.packages('FNN')
library(FNN)
library(class)
set.seed(3333)
NN <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 3)
NN
Accuracy3 <- sum(NN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
Accuracy3
nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')
set.seed(3333)
AccuracyK <- NULL
for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = kk)
AccuracyK <- c(AccuracyK, sum(Knn_K == MLTrainY$'MLTrain[, 9]') / length(MLTrainY$'MLTrain[, 9]'))
ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)
min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
plot(formula = accuracy ~ k,
data = ValidK,
type = "o",
pch = 5,
main = "Optimal K Validation")
with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
set.seed(3333)
NN120 <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 120)
Accuracy120 <- sum(NN120 == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
Accuracy120
set.seed(3333)
FinalNN <- knn(train = TrainXNormDF,
test = TestXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 120)
AccuracyFinal <- sum(FinalNN == MLTrainY$`MLTrain[, 9]`) / length(MLTrainY$`MLTrain[, 9]`)
AccuracyFinal
这里我遇到了问题。
Result <- confusionMatrix(FinalNN, TestXNormDF)
我想你正在看这个:
Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )
Confusion Matrix and Statistics
Reference
Prediction neg pos
neg 49 22
pos 0 3
Accuracy : 0.7027
95% CI : (0.5852, 0.8034)
No Information Rate : 0.6622
P-Value [Acc > NIR] : 0.2724
Kappa : 0.153
Mcnemar's Test P-Value : 7.562e-06
Sensitivity : 1.0000
Specificity : 0.1200
Pos Pred Value : 0.6901
Neg Pred Value : 1.0000
Prevalence : 0.6622
Detection Rate : 0.6622
Detection Prevalence : 0.9595
Balanced Accuracy : 0.5600
'Positive' Class : neg
除此之外,我建议修复 for 的括号并更改一些代码。一个好的重构会有所帮助!
AugPelle 已经回答了您关于如何获得混淆矩阵的问题。
我只是想指出,您在整个代码中计算的准确性都不正确。您应该根据验证集或测试集而不是训练集来计算准确性。这就是为什么你会收到警告,并且它会用 non-optimal k.
给你不正确的答案下面是更正后的代码:
install.packages("mlbench")
install.packages("gbm")
library(mlbench)
library(gbm)
data("PimaIndiansDiabetes2")
head(PimaIndiansDiabetes2)
MLdata <- as.data.frame(PimaIndiansDiabetes2)
head(MLdata)
str(MLdata)
View(MLdata)
any(is.na(MLdata))
sum(is.na(MLdata))
MLdata2 <- na.omit(MLdata)
any(is.na(MLdata2))
sum(is.na(MLdata2))
View(MLdata2)
set.seed(3333)
MLIdx <- sample(1:3, size = nrow(MLdata2), prob = c(0.6, 0.2, 0.2), replace = TRUE)
MLTrain <- MLdata2[MLIdx == 1,]
MLValid <- MLdata2[MLIdx == 2,]
MLTest <- MLdata2[MLIdx == 3,]
head(MLTrain)
head(MLValid)
head(MLTest)
str(MLTrain)
str(MLValid)
str(MLTest)
View(MLTestY)
MLTrainX <- MLTrain[ , -9]
MLValidX <- MLValid[ , -9]
MLTestX <- MLTest[ , -9]
MLTrainY <- as.data.frame(MLTrain[ , 9])
MLValidY <- as.data.frame(MLValid[ , 9])
MLTestY <- as.data.frame(MLTest[ , 9])
View(MLTrainX)
View(MLTrainY)
library(caret)
NormValues <- preProcess(MLTrainX, method = c("center", "scale"))
TrainXNormDF <- predict(NormValues, MLTrainX)
ValidXNormDF <- predict(NormValues, MLValidX)
TestXNormDF <- predict(NormValues, MLTestX)
head(TrainXNormDF)
head(ValidXNormDF)
head(TestXNormDF)
install.packages('FNN')
library(FNN)
library(class)
set.seed(3333)
NN <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 3)
NN
Accuracy3 <- sum(NN == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
Accuracy3
nrow(TrainXNormDF)
length(MLTrainY$'MLTrain[, 9]')
set.seed(3333)
AccuracyK <- NULL
for(kk in c(1:nrow(TrainXNormDF))){
Knn_K <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = kk)
AccuracyK <- c(AccuracyK, sum(Knn_K == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`))}
ValidK <- data.frame(k = c(1:nrow(TrainXNormDF)), accuracy = AccuracyK)
min(ValidK[ValidK$accuracy %in% max(AccuracyK), "k"])
plot(formula = accuracy ~ k,
data = ValidK,
type = "o",
pch = 5,
main = "Optimal K Validation")
with(ValidK, text(accuracy ~ k, labels = rownames(ValidK), pos = 2, cex = 0.5))
set.seed(3333)
NN120 <- knn(train = TrainXNormDF,
test = ValidXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 36)
Accuracy36 <- sum(NN120 == MLValidY$`MLValid[, 9]`) / length(MLValidY$`MLValid[, 9]`)
Accuracy36
set.seed(3333)
FinalNN <- knn(train = TrainXNormDF,
test = TestXNormDF,
cl = MLTrainY$`MLTrain[, 9]`,
k = 36)
AccuracyFinal <- sum(FinalNN == MLTestY$`MLTest[, 9]`) / length(MLTestY$`MLTest[, 9]`)
AccuracyFinal
Result <- confusionMatrix(FinalNN, MLTestY$`MLTest[, 9]` )