如何在 R 中为具有三个类别的随机森林模型绘制 ROC 曲线?
How can I draw a ROC curve for a randomForest model with three classes in R?
我正在使用 R 包 randomForest 创建一个分为三组的模型。
# Fit the three-group forest; `bestm` is the mtry value chosen beforehand.
model <- randomForest(
  formula = condition ~ .,
  data = train,
  ntree = 2000,
  mtry = bestm,
  importance = TRUE,
  proximity = TRUE
)
Type of random forest: classification
Number of trees: 2000
No. of variables tried at each split: 3
OOB estimate of error rate: 5.71%
Confusion matrix:
lethal mock resistant class.error
lethal 20 1 0 0.04761905
mock 1 37 0 0.02631579
resistant 2 0 9 0.18181818
我试过几个库。例如,使用 ROCR,您不能进行三分类,只能进行两分类。看:
pred=prediction(predictions,train$condition)
Error in prediction(predictions, train$condition) :
Number of classes is not equal to 2.
ROCR currently supports only evaluation of binary classification
tasks.
来自 model$votes 的数据如下所示:
lethal mock resistant
3 0.04514364 0.952120383 0.002735978
89 0.32394366 0.147887324 0.528169014
16 0.02564103 0.973009447 0.001349528
110 0.55614973 0.433155080 0.010695187
59 0.06685633 0.903271693 0.029871977
43 0.13424658 0.865753425 0.000000000
41 0.82987552 0.033195021 0.136929461
86 0.32705249 0.468371467 0.204576043
87 0.37704918 0.341530055 0.281420765
........
我可以使用 pROC 包通过这种方式得到一些非常难看的 ROC 图:
# Hard class predictions coerced to numbers: the predictor can take only a
# handful of distinct values, which is why each curve has so few points.
predictions <- as.numeric(predict(model, test, type = 'response'))
roc.multi <- multiclass.roc(test$condition, predictions,
percent=TRUE)
rs <- roc.multi[['rocs']]
# Open the plot with the FIRST pairwise curve, then overlay every remaining
# one; starting both calls at index 2 would leave rs[[1]] undrawn.
plot.roc(rs[[1]])
# seq_along(rs)[-1] is empty when there is a single curve (2:length(rs) would
# count backwards), and a for loop suits a pure side effect better than sapply.
for (i in seq_along(rs)[-1]) {
  lines.roc(rs[[i]], col = i)
}
这些图看起来像这样:
但是这些线没有办法平滑,因为它们并不是真正的曲线:每条线只有 4 个左右的点。
我需要一种方法来为该模型绘制一条平滑的 ROC 曲线,但我似乎找不到。有谁知道一个好方法?非常感谢!
我在这里看到两个问题:1) ROC 曲线适用于二元分类器,因此您应该把性能评估拆分成一系列二元问题,下面我会展示具体做法。2) 对测试集进行预测时,您应该获取每个观测值属于各个类别的概率(而不仅仅是预测出的类别),这样才能绘制出漂亮的 ROC 曲线。代码如下:
#load libraries
library(randomForest)
library(pROC)
# generate some random data
set.seed(1111)

# Simulate `n` labelled observations. The class is sampled with replacement
# from the three conditions; each of the three features is drawn from a
# normal distribution (sd 1) whose mean depends on the class:
# mock = 0, lethal = 1.5, resist = -1.5.
# Returns a data.frame with columns condition (factor), feat01, feat02, feat03.
simulate_conditions <- function(n = 1000) {
  # randomForest() only fits a classifier when the response is a factor, and
  # data.frame() stopped converting strings to factors in R 4.0, so the
  # conversion is made explicit here.
  sim <- data.frame(
    condition = factor(
      sample(c("mock", "lethal", "resist"), replace = TRUE, size = n)
    )
  )
  class_means <- c(mock = 0, lethal = 1.5, resist = -1.5)
  # Per-row mean looked up by class name; rnorm() is vectorised over `mean`.
  mu <- unname(class_means[as.character(sim$condition)])
  for (feat in c("feat01", "feat02", "feat03")) {
    sim[[feat]] <- rnorm(n = n, mean = mu)
  }
  sim
}

train <- simulate_conditions()
head(train)
test <- simulate_conditions()
head(test)
现在我们有了一些数据,让我们像您一样训练随机森林模型
# model
# Deliberately small forest (10 trees, at most 100 terminal nodes per tree).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned; norm.votes = FALSE keeps the stored votes as raw counts.
model <- randomForest(formula = condition ~ ., data = train, ntree = 10,
                      maxnodes = 100, norm.votes = FALSE)
接下来使用模型对测试数据进行预测。不过,这里应该指定 type = "prob"。
# Score the test set, asking for per-class probabilities (type = "prob")
# rather than the predicted label.
predictions <- as.data.frame(
  predict(object = model, newdata = test, type = "prob")
)
既然有了概率,就可以用它们来得到每个观测值最可能的类别。
# Label every row with the name of its highest-probability column (the first
# three columns hold the class probabilities), then attach the true class.
predictions[["predict"]] <- colnames(predictions)[1:3][
  apply(predictions[, 1:3], MARGIN = 1, FUN = which.max)
]
predictions[["observed"]] <- test[["condition"]]
head(predictions)
lethal mock resist predict observed
1 0.0 0.0 1.0 resist resist
2 0.0 0.6 0.4 mock mock
3 1.0 0.0 0.0 lethal mock
4 0.0 0.0 1.0 resist resist
5 0.0 1.0 0.0 mock mock
6 0.7 0.3 0.0 lethal mock
现在,让我们看看如何绘制 ROC 曲线。对于每个类别,将多分类问题转化为一个二元问题(该类别 vs. 其余类别)。然后调用 roc() 函数并传入两个参数:i) 观测到的类别;ii) 该类别的预测概率(而不是预测出的类别标签)。
# One-vs-rest ROC curves. Each class must be scored with its OWN probability
# column. `levels` is c(control, case) and direction = "<" states that cases
# score higher, so pROC cannot silently flip a curve built from the wrong
# predictor.
# 1 ROC curve, mock vs non mock
roc.mock <- roc(ifelse(predictions$observed == "mock", "mock", "non-mock"),
                as.numeric(predictions$mock),
                levels = c("non-mock", "mock"), direction = "<")
plot(roc.mock, col = "gray60")
# others
roc.lethal <- roc(ifelse(predictions$observed == "lethal", "lethal", "non-lethal"),
                  as.numeric(predictions$lethal),
                  levels = c("non-lethal", "lethal"), direction = "<")
roc.resist <- roc(ifelse(predictions$observed == "resist", "resist", "non-resist"),
                  as.numeric(predictions$resist),
                  levels = c("non-resist", "resist"), direction = "<")
lines(roc.lethal, col = "blue")
lines(roc.resist, col = "red")
完成。这是结果。当然,你的测试集中的观察越多,你的曲线就会越平滑。
我正在使用 R 包 randomForest 创建一个分为三组的模型。
# Fit the three-group forest; `bestm` is the mtry value chosen beforehand.
model <- randomForest(
  formula = condition ~ .,
  data = train,
  ntree = 2000,
  mtry = bestm,
  importance = TRUE,
  proximity = TRUE
)
Type of random forest: classification
Number of trees: 2000
No. of variables tried at each split: 3
OOB estimate of error rate: 5.71%
Confusion matrix:
lethal mock resistant class.error
lethal 20 1 0 0.04761905
mock 1 37 0 0.02631579
resistant 2 0 9 0.18181818
我试过几个库。例如,使用 ROCR,您不能进行三分类,只能进行两分类。看:
pred=prediction(predictions,train$condition)
Error in prediction(predictions, train$condition) :
Number of classes is not equal to 2.
ROCR currently supports only evaluation of binary classification
tasks.
来自 model$votes 的数据如下所示:
lethal mock resistant
3 0.04514364 0.952120383 0.002735978
89 0.32394366 0.147887324 0.528169014
16 0.02564103 0.973009447 0.001349528
110 0.55614973 0.433155080 0.010695187
59 0.06685633 0.903271693 0.029871977
43 0.13424658 0.865753425 0.000000000
41 0.82987552 0.033195021 0.136929461
86 0.32705249 0.468371467 0.204576043
87 0.37704918 0.341530055 0.281420765
........
我可以使用 pROC 包通过这种方式得到一些非常难看的 ROC 图:
# Hard class predictions coerced to numbers: the predictor can take only a
# handful of distinct values, which is why each curve has so few points.
predictions <- as.numeric(predict(model, test, type = 'response'))
roc.multi <- multiclass.roc(test$condition, predictions,
percent=TRUE)
rs <- roc.multi[['rocs']]
# Open the plot with the FIRST pairwise curve, then overlay every remaining
# one; starting both calls at index 2 would leave rs[[1]] undrawn.
plot.roc(rs[[1]])
# seq_along(rs)[-1] is empty when there is a single curve (2:length(rs) would
# count backwards), and a for loop suits a pure side effect better than sapply.
for (i in seq_along(rs)[-1]) {
  lines.roc(rs[[i]], col = i)
}
这些图看起来像这样:
但是这些线没有办法平滑,因为它们并不是真正的曲线:每条线只有 4 个左右的点。
我需要一种方法来为该模型绘制一条平滑的 ROC 曲线,但我似乎找不到。有谁知道一个好方法?非常感谢!
我在这里看到两个问题:1) ROC 曲线适用于二元分类器,因此您应该把性能评估拆分成一系列二元问题,下面我会展示具体做法。2) 对测试集进行预测时,您应该获取每个观测值属于各个类别的概率(而不仅仅是预测出的类别),这样才能绘制出漂亮的 ROC 曲线。代码如下:
#load libraries
library(randomForest)
library(pROC)
# generate some random data
set.seed(1111)

# Simulate `n` labelled observations. The class is sampled with replacement
# from the three conditions; each of the three features is drawn from a
# normal distribution (sd 1) whose mean depends on the class:
# mock = 0, lethal = 1.5, resist = -1.5.
# Returns a data.frame with columns condition (factor), feat01, feat02, feat03.
simulate_conditions <- function(n = 1000) {
  # randomForest() only fits a classifier when the response is a factor, and
  # data.frame() stopped converting strings to factors in R 4.0, so the
  # conversion is made explicit here.
  sim <- data.frame(
    condition = factor(
      sample(c("mock", "lethal", "resist"), replace = TRUE, size = n)
    )
  )
  class_means <- c(mock = 0, lethal = 1.5, resist = -1.5)
  # Per-row mean looked up by class name; rnorm() is vectorised over `mean`.
  mu <- unname(class_means[as.character(sim$condition)])
  for (feat in c("feat01", "feat02", "feat03")) {
    sim[[feat]] <- rnorm(n = n, mean = mu)
  }
  sim
}

train <- simulate_conditions()
head(train)
test <- simulate_conditions()
head(test)
现在我们有了一些数据,让我们像您一样训练随机森林模型
# model
# Deliberately small forest (10 trees, at most 100 terminal nodes per tree).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned; norm.votes = FALSE keeps the stored votes as raw counts.
model <- randomForest(formula = condition ~ ., data = train, ntree = 10,
                      maxnodes = 100, norm.votes = FALSE)
接下来使用模型对测试数据进行预测。不过,这里应该指定 type = "prob"。
# Score the test set, asking for per-class probabilities (type = "prob")
# rather than the predicted label.
predictions <- as.data.frame(
  predict(object = model, newdata = test, type = "prob")
)
既然有了概率,就可以用它们来得到每个观测值最可能的类别。
# Label every row with the name of its highest-probability column (the first
# three columns hold the class probabilities), then attach the true class.
predictions[["predict"]] <- colnames(predictions)[1:3][
  apply(predictions[, 1:3], MARGIN = 1, FUN = which.max)
]
predictions[["observed"]] <- test[["condition"]]
head(predictions)
lethal mock resist predict observed
1 0.0 0.0 1.0 resist resist
2 0.0 0.6 0.4 mock mock
3 1.0 0.0 0.0 lethal mock
4 0.0 0.0 1.0 resist resist
5 0.0 1.0 0.0 mock mock
6 0.7 0.3 0.0 lethal mock
现在,让我们看看如何绘制 ROC 曲线。对于每个类别,将多分类问题转化为一个二元问题(该类别 vs. 其余类别)。然后调用 roc() 函数并传入两个参数:i) 观测到的类别;ii) 该类别的预测概率(而不是预测出的类别标签)。
# One-vs-rest ROC curves. Each class must be scored with its OWN probability
# column. `levels` is c(control, case) and direction = "<" states that cases
# score higher, so pROC cannot silently flip a curve built from the wrong
# predictor.
# 1 ROC curve, mock vs non mock
roc.mock <- roc(ifelse(predictions$observed == "mock", "mock", "non-mock"),
                as.numeric(predictions$mock),
                levels = c("non-mock", "mock"), direction = "<")
plot(roc.mock, col = "gray60")
# others
roc.lethal <- roc(ifelse(predictions$observed == "lethal", "lethal", "non-lethal"),
                  as.numeric(predictions$lethal),
                  levels = c("non-lethal", "lethal"), direction = "<")
roc.resist <- roc(ifelse(predictions$observed == "resist", "resist", "non-resist"),
                  as.numeric(predictions$resist),
                  levels = c("non-resist", "resist"), direction = "<")
lines(roc.lethal, col = "blue")
lines(roc.resist, col = "red")
完成。这是结果。当然,你的测试集中的观察越多,你的曲线就会越平滑。