R 语言中的单类分类(One-Class Classification)。生成混淆矩阵时我做错了什么?
One Class Classification in R language. What am I doing wrong when generating the confusion matrix?
我正在尝试基于 UCI 的多个数据集理解并在 R 中实现单类分类器(one-class classifier),其中一个数据集是 (http://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease)。
尝试打印混淆矩阵时出现错误“所有参数必须具有相同的长度”。
我做错了什么?
library(caret)
library(dplyr)
library(e1071)
library(NLP)
library(tm)
# Read the kidney disease data; header = TRUE takes the column names from the
# first line of the file.
ds = read.csv('kidney_disease.csv',
header = TRUE)
# Remove unusable columns.
# The unnamed third argument is matched to subset()'s row filter, so this both
# drops the age column and keeps only the rows whose classification is 'ckd'.
ds <- subset(ds, select = -c(age), classification =='ckd' )
x <- subset(ds, select = -classification) #make x variables
y <- ds$classification #make y variable(dependent)
# test on the whole set
#pred <- predict(model, subset(ds, select=-classification))
trainPositive<-x
# NOTE(review): y holds the labels of the 'ckd' rows kept above; no non-ckd
# rows remain in ds at this point, so this is not a set of negative cases.
testnegative<-y
# Draw a 60% sample of the row indices (list = FALSE returns them as a matrix)
inTrain<-createDataPartition(1:nrow(trainPositive),p=0.6,list=FALSE)
# The first four columns of the sampled rows are used as predictors
trainpredictors<-trainPositive[inTrain,1:4]
# NOTE(review): this takes column 6 of the predictor frame x, which no longer
# contains classification -- confirm this is the intended label column.
trainLabels<-trainPositive[inTrain,6]
# Rows that were not sampled for training
testPositive<-trainPositive[-inTrain,]
# NOTE(review): testnegative is a plain vector, so rbind() does not append a
# block of negative observations here -- confirm the intended test set.
testPosNeg<-rbind(testPositive,testnegative)
testpredictors<-testPosNeg[,1:4]
testLabels<-testPosNeg[,6]
# Fit a one-class SVM on the training predictors only (no response, y = NULL),
# with a radial kernel, nu = 0.10 and scaled inputs.
svm.model<-svm(trainpredictors,y=NULL,
type='one-classification',
nu=0.10,
scale=TRUE,
kernel="radial")
# Predict for the training rows and for the test rows
svm.predtrain<-predict(svm.model,trainpredictors)
svm.predtest<-predict(svm.model,testpredictors)
# confusionMatrixTable<-table(Predicted=svm.pred,Reference=testLabels)
# confusionMatrix(confusionMatrixTable,positive='TRUE')
# Cross-tabulate predictions against labels.
# NOTE(review): table() needs both vectors to have the same length; the
# predictions may be shorter than the labels if rows containing NA are
# dropped by predict() -- verify the lengths before tabulating.
confTrain <- table(Predicted=svm.predtrain,Reference=trainLabels)
confTest <- table(Predicted=svm.predtest,Reference=testLabels)
# Summary statistics for the test table, treating 'TRUE' as the positive class
confusionMatrix(confTest,positive='TRUE')
print(confTrain)
print(confTest)
#grid
以下是我正在使用的数据集的前几行:
id bp sg al su rbc pc pcc ba bgr bu sc sod pot hemo pcv wc
1 0 80 1.020 1 0 normal notpresent notpresent 121 36 1.2 NA NA 15.4 44 7800
2 1 50 1.020 4 0 normal notpresent notpresent NA 18 0.8 NA NA 11.3 38 6000
3 2 80 1.010 2 3 normal normal notpresent notpresent 423 53 1.8 NA NA 9.6 31 7500
4 3 70 1.005 4 0 normal abnormal present notpresent 117 56 3.8 111 2.5 11.2 32 6700
5 4 80 1.010 2 0 normal normal notpresent notpresent 106 26 1.4 NA NA 11.6 35 7300
6 5 90 1.015 3 0 notpresent notpresent 74 25 1.1 142 3.2 12.2 39 7800
rc htn dm cad appet pe ane classification
1 5.2 yes yes no good no no ckd
2 no no no good no no ckd
3 no yes no poor no yes ckd
4 3.9 yes no no poor yes yes ckd
5 4.6 no no no good no no ckd
6 4.4 yes yes no good yes no ckd
错误日志:
> confTrain <- table (Predicted = svm.predtrain, Reference = trainLabels)
Error in table(Predicted = svm.predtrain, Reference = trainLabels) :
all arguments must have the same length
> confTest <- table (Predicted = svm.predtest, Reference = testLabels)
Error in table(Predicted = svm.predtest, Reference = testLabels) :
all arguments must have the same length
>
> confusionMatrix (confTest, positive = 'TRUE')
Error in confusionMatrix(confTest, positive = "TRUE") :
object 'confTest' not found
>
>
> print (confTrain)
Error in print(confTrain) : object 'confTrain' not found
> print (confTest)
Error in print(confTest) : object 'confTest' not found
我看到了不少问题。首先,您的很多数据似乎是字符(character)类型,而不是分类器所要求的数值类型。让我们选择一些列并将其转换为数值。我会使用 data.table,因为 fread 很方便。
library(caret)
library(e1071)
library(data.table)
# Turn ds into a data.table by reference so that := can be used below
setDT(ds)
# Names of the columns that must hold numbers
mycols <- c("id", "bp", "sg", "al", "su")
# Overwrite each of those columns in place with its numeric version
ds[, (mycols) := lapply(.SD, as.numeric), .SDcols = mycols]
# Keep the four predictors and add a logical label that is TRUE for "ckd"
data <- ds[, list(bp, sg, al, su, classification = classification == "ckd")]
# Show the prepared table
data
bp sg al su classification
1: 80 1.020 1 0 TRUE
2: 50 1.020 4 0 TRUE
3: 80 1.010 2 3 TRUE
4: 70 1.005 4 0 TRUE
5: 80 1.010 2 0 TRUE
---
396: 80 1.020 0 0 FALSE
397: 70 1.025 0 0 FALSE
398: 80 1.020 0 0 FALSE
399: 60 1.025 0 0 FALSE
400: 80 1.025 0 0 FALSE
清理数据后,您可以像在原始代码中一样使用 createDataPartition
对训练和测试集进行采样。
#Sample data for training and test set
# Sample data for training and test set.
# seq_len() replaces 1:nrow(data): for a table with zero rows it gives an
# empty index vector instead of c(1, 0).
# p = 0.6 puts about 60% of the row indices in the training sample;
# list = FALSE returns the indices as a matrix rather than a list.
inTrain <- createDataPartition(seq_len(nrow(data)), p = 0.6, list = FALSE)
# Rows picked by the partition form the training set ...
train <- data[inTrain, ]
# ... and every remaining row goes to the test set
test <- data[-inTrain, ]
然后我们可以创建模型并进行预测。
# Fit a one-class SVM (radial kernel, nu = 0.10, scaled inputs) using the four
# numeric predictors of the training sample.
# NOTE(review): train is a plain row sample of data and therefore still holds
# both TRUE and FALSE rows; confirm whether the one-class model should be
# fitted on the TRUE rows only.
svm.model <- svm(
  classification ~ bp + sg + al + su,
  data = train,
  kernel = "radial",
  type = "one-classification",
  scale = TRUE,
  nu = 0.10
)
# Perform predictions on the training rows and on the held-out rows
svm.predtrain <- predict(svm.model, newdata = train)
svm.predtest <- predict(svm.model, newdata = test)
您的交叉表(table)的主要问题在于,模型只能对不含任何 NA 的行做出预测,因此必须对 classification 取子集,只保留那些有预测结果的行。然后就可以计算 confusionMatrix:
# The predictions carry the row numbers they belong to as names; those names
# are converted to integers and used to pick the matching reference labels,
# so both vectors passed to table() have the same length.
confTrain <- table(
  Predicted = svm.predtrain,
  Reference = train[["classification"]][as.integer(names(svm.predtrain))]
)
confTest <- table(
  Predicted = svm.predtest,
  Reference = test[["classification"]][as.integer(names(svm.predtest))]
)
# Summary statistics for the test table, with "TRUE" as the positive class
confusionMatrix(confTest, positive = "TRUE")
Confusion Matrix and Statistics
Reference
Predicted FALSE TRUE
FALSE 0 17
TRUE 55 64
Accuracy : 0.4706
95% CI : (0.3845, 0.558)
No Information Rate : 0.5956
P-Value [Acc > NIR] : 0.9988
Kappa : -0.2361
Mcnemar's Test P-Value : 1.298e-05
Sensitivity : 0.7901
Specificity : 0.0000
Pos Pred Value : 0.5378
Neg Pred Value : 0.0000
Prevalence : 0.5956
Detection Rate : 0.4706
Detection Prevalence : 0.8750
Balanced Accuracy : 0.3951
'Positive' Class : TRUE
数据
library(archive)
library(data.table)
# Temporary file that receives the downloaded .rar archive
tf1 <- tempfile(fileext = ".rar")
# Download data file.
# mode = "wb" forces a binary transfer: on Windows a text-mode download
# rewrites line endings and can corrupt a binary archive.
download.file(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/00336/Chronic_Kidney_Disease.rar",
  tf1,
  mode = "wb"
)
# Temporary path used as the extraction directory
tf2 <- tempfile()
# Un-rar file
archive_extract(tf1, tf2)
# Read in data.
# A character `skip` makes fread() start at the first line that contains the
# given text; presumably "48" first occurs on the first data row -- TODO confirm.
ds <- fread(
  file.path(tf2, "Chronic_Kidney_Disease", "chronic_kidney_disease.arff"),
  fill = TRUE,
  skip = "48"
)
# Remove erroneous last column
ds[, V26 := NULL]
# Set column names (from header)
setnames(ds, c("id","bp","sg","al","su","rbc","pc","pcc","ba","bgr","bu","sc","sod","pot","hemo","pcv","wc","rc","htn","dm","cad","appet","pe","ane","classification"))
# Replace "?" with NA
ds[ds == "?"] <- NA
我正在尝试基于 UCI 的多个数据集理解并在 R 中实现单类分类器(one-class classifier),其中一个数据集是 (http://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease)。
尝试打印混淆矩阵时出现错误“所有参数必须具有相同的长度”。
我做错了什么?
library(caret)
library(dplyr)
library(e1071)
library(NLP)
library(tm)
# Read the kidney disease data; header = TRUE takes the column names from the
# first line of the file.
ds = read.csv('kidney_disease.csv',
header = TRUE)
# Remove unusable columns.
# The unnamed third argument is matched to subset()'s row filter, so this both
# drops the age column and keeps only the rows whose classification is 'ckd'.
ds <- subset(ds, select = -c(age), classification =='ckd' )
x <- subset(ds, select = -classification) #make x variables
y <- ds$classification #make y variable(dependent)
# test on the whole set
#pred <- predict(model, subset(ds, select=-classification))
trainPositive<-x
# NOTE(review): y holds the labels of the 'ckd' rows kept above; no non-ckd
# rows remain in ds at this point, so this is not a set of negative cases.
testnegative<-y
# Draw a 60% sample of the row indices (list = FALSE returns them as a matrix)
inTrain<-createDataPartition(1:nrow(trainPositive),p=0.6,list=FALSE)
# The first four columns of the sampled rows are used as predictors
trainpredictors<-trainPositive[inTrain,1:4]
# NOTE(review): this takes column 6 of the predictor frame x, which no longer
# contains classification -- confirm this is the intended label column.
trainLabels<-trainPositive[inTrain,6]
# Rows that were not sampled for training
testPositive<-trainPositive[-inTrain,]
# NOTE(review): testnegative is a plain vector, so rbind() does not append a
# block of negative observations here -- confirm the intended test set.
testPosNeg<-rbind(testPositive,testnegative)
testpredictors<-testPosNeg[,1:4]
testLabels<-testPosNeg[,6]
# Fit a one-class SVM on the training predictors only (no response, y = NULL),
# with a radial kernel, nu = 0.10 and scaled inputs.
svm.model<-svm(trainpredictors,y=NULL,
type='one-classification',
nu=0.10,
scale=TRUE,
kernel="radial")
# Predict for the training rows and for the test rows
svm.predtrain<-predict(svm.model,trainpredictors)
svm.predtest<-predict(svm.model,testpredictors)
# confusionMatrixTable<-table(Predicted=svm.pred,Reference=testLabels)
# confusionMatrix(confusionMatrixTable,positive='TRUE')
# Cross-tabulate predictions against labels.
# NOTE(review): table() needs both vectors to have the same length; the
# predictions may be shorter than the labels if rows containing NA are
# dropped by predict() -- verify the lengths before tabulating.
confTrain <- table(Predicted=svm.predtrain,Reference=trainLabels)
confTest <- table(Predicted=svm.predtest,Reference=testLabels)
# Summary statistics for the test table, treating 'TRUE' as the positive class
confusionMatrix(confTest,positive='TRUE')
print(confTrain)
print(confTest)
#grid
以下是我正在使用的数据集的前几行:
id bp sg al su rbc pc pcc ba bgr bu sc sod pot hemo pcv wc
1 0 80 1.020 1 0 normal notpresent notpresent 121 36 1.2 NA NA 15.4 44 7800
2 1 50 1.020 4 0 normal notpresent notpresent NA 18 0.8 NA NA 11.3 38 6000
3 2 80 1.010 2 3 normal normal notpresent notpresent 423 53 1.8 NA NA 9.6 31 7500
4 3 70 1.005 4 0 normal abnormal present notpresent 117 56 3.8 111 2.5 11.2 32 6700
5 4 80 1.010 2 0 normal normal notpresent notpresent 106 26 1.4 NA NA 11.6 35 7300
6 5 90 1.015 3 0 notpresent notpresent 74 25 1.1 142 3.2 12.2 39 7800
rc htn dm cad appet pe ane classification
1 5.2 yes yes no good no no ckd
2 no no no good no no ckd
3 no yes no poor no yes ckd
4 3.9 yes no no poor yes yes ckd
5 4.6 no no no good no no ckd
6 4.4 yes yes no good yes no ckd
错误日志:
> confTrain <- table (Predicted = svm.predtrain, Reference = trainLabels)
Error in table(Predicted = svm.predtrain, Reference = trainLabels) :
all arguments must have the same length
> confTest <- table (Predicted = svm.predtest, Reference = testLabels)
Error in table(Predicted = svm.predtest, Reference = testLabels) :
all arguments must have the same length
>
> confusionMatrix (confTest, positive = 'TRUE')
Error in confusionMatrix(confTest, positive = "TRUE") :
object 'confTest' not found
>
>
> print (confTrain)
Error in print(confTrain) : object 'confTrain' not found
> print (confTest)
Error in print(confTest) : object 'confTest' not found
我看到了不少问题。首先,您的很多数据似乎是字符(character)类型,而不是分类器所要求的数值类型。让我们选择一些列并将其转换为数值。我会使用 data.table,因为 fread 很方便。
library(caret)
library(e1071)
library(data.table)
# Turn ds into a data.table by reference so that := can be used below
setDT(ds)
# Names of the columns that must hold numbers
mycols <- c("id", "bp", "sg", "al", "su")
# Overwrite each of those columns in place with its numeric version
ds[, (mycols) := lapply(.SD, as.numeric), .SDcols = mycols]
# Keep the four predictors and add a logical label that is TRUE for "ckd"
data <- ds[, list(bp, sg, al, su, classification = classification == "ckd")]
# Show the prepared table
data
bp sg al su classification
1: 80 1.020 1 0 TRUE
2: 50 1.020 4 0 TRUE
3: 80 1.010 2 3 TRUE
4: 70 1.005 4 0 TRUE
5: 80 1.010 2 0 TRUE
---
396: 80 1.020 0 0 FALSE
397: 70 1.025 0 0 FALSE
398: 80 1.020 0 0 FALSE
399: 60 1.025 0 0 FALSE
400: 80 1.025 0 0 FALSE
清理数据后,您可以像在原始代码中一样使用 createDataPartition
对训练和测试集进行采样。
#Sample data for training and test set
# Sample data for training and test set.
# seq_len() replaces 1:nrow(data): for a table with zero rows it gives an
# empty index vector instead of c(1, 0).
# p = 0.6 puts about 60% of the row indices in the training sample;
# list = FALSE returns the indices as a matrix rather than a list.
inTrain <- createDataPartition(seq_len(nrow(data)), p = 0.6, list = FALSE)
# Rows picked by the partition form the training set ...
train <- data[inTrain, ]
# ... and every remaining row goes to the test set
test <- data[-inTrain, ]
然后我们可以创建模型并进行预测。
# Fit a one-class SVM (radial kernel, nu = 0.10, scaled inputs) using the four
# numeric predictors of the training sample.
# NOTE(review): train is a plain row sample of data and therefore still holds
# both TRUE and FALSE rows; confirm whether the one-class model should be
# fitted on the TRUE rows only.
svm.model <- svm(
  classification ~ bp + sg + al + su,
  data = train,
  kernel = "radial",
  type = "one-classification",
  scale = TRUE,
  nu = 0.10
)
# Perform predictions on the training rows and on the held-out rows
svm.predtrain <- predict(svm.model, newdata = train)
svm.predtest <- predict(svm.model, newdata = test)
您的交叉表(table)的主要问题在于,模型只能对不含任何 NA 的行做出预测,因此必须对 classification 取子集,只保留那些有预测结果的行。然后就可以计算 confusionMatrix:
# The predictions carry the row numbers they belong to as names; those names
# are converted to integers and used to pick the matching reference labels,
# so both vectors passed to table() have the same length.
confTrain <- table(
  Predicted = svm.predtrain,
  Reference = train[["classification"]][as.integer(names(svm.predtrain))]
)
confTest <- table(
  Predicted = svm.predtest,
  Reference = test[["classification"]][as.integer(names(svm.predtest))]
)
# Summary statistics for the test table, with "TRUE" as the positive class
confusionMatrix(confTest, positive = "TRUE")
Confusion Matrix and Statistics
Reference
Predicted FALSE TRUE
FALSE 0 17
TRUE 55 64
Accuracy : 0.4706
95% CI : (0.3845, 0.558)
No Information Rate : 0.5956
P-Value [Acc > NIR] : 0.9988
Kappa : -0.2361
Mcnemar's Test P-Value : 1.298e-05
Sensitivity : 0.7901
Specificity : 0.0000
Pos Pred Value : 0.5378
Neg Pred Value : 0.0000
Prevalence : 0.5956
Detection Rate : 0.4706
Detection Prevalence : 0.8750
Balanced Accuracy : 0.3951
'Positive' Class : TRUE
数据
library(archive)
library(data.table)
# Temporary file that receives the downloaded .rar archive
tf1 <- tempfile(fileext = ".rar")
# Download data file.
# mode = "wb" forces a binary transfer: on Windows a text-mode download
# rewrites line endings and can corrupt a binary archive.
download.file(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/00336/Chronic_Kidney_Disease.rar",
  tf1,
  mode = "wb"
)
# Temporary path used as the extraction directory
tf2 <- tempfile()
# Un-rar file
archive_extract(tf1, tf2)
# Read in data.
# A character `skip` makes fread() start at the first line that contains the
# given text; presumably "48" first occurs on the first data row -- TODO confirm.
ds <- fread(
  file.path(tf2, "Chronic_Kidney_Disease", "chronic_kidney_disease.arff"),
  fill = TRUE,
  skip = "48"
)
# Remove erroneous last column
ds[, V26 := NULL]
# Set column names (from header)
setnames(ds, c("id","bp","sg","al","su","rbc","pc","pcc","ba","bgr","bu","sc","sod","pot","hemo","pcv","wc","rc","htn","dm","cad","appet","pe","ane","classification"))
# Replace "?" with NA
ds[ds == "?"] <- NA