prSummary in the R caret package for imbalanced data
I have an imbalanced data set, and I want to do stratified cross-validation with precision-recall AUC as my evaluation metric.

I am using prSummary from the R package caret together with stratified indices, and I get an error when the performance is computed.

Below is a reproducible example. I found that the data passed to the summary function contained only ten samples, and because of the imbalance they all belonged to a single class, so the P-R AUC could not be computed. (The way I discovered there were only ten samples is that I modified prSummary to force the function to print out its data.)
library(randomForest)
library(mlbench)
library(caret)

# load dataset
data(Sonar)
dataset <- Sonar
x <- dataset[, 1:60]
y <- dataset[, 61]

# make this data set extremely imbalanced: keep only three "R" cases
y[4:length(y)] <- "M"
y <- as.factor(y)
dataset$Class <- y

# create stratified index and indexOut
seed <- 1
set.seed(seed)
folds <- 2
idxAll <- 1:nrow(x)
cvIndex <- createFolds(y, k = folds, returnTrain = TRUE)
cvIndexOut <- lapply(seq_along(cvIndex), function(i) {
  idxAll[-cvIndex[[i]]]
})
names(cvIndexOut) <- names(cvIndex)

# pass the stratified indices and prSummary to trainControl
control <- trainControl(index = cvIndex, indexOut = cvIndexOut,
                        method = "cv", summaryFunction = prSummary,
                        classProbs = TRUE)
metric <- "AUC"

set.seed(seed)
mtry <- sqrt(ncol(x))
tunegrid <- expand.grid(.mtry = mtry)
rf_default <- train(Class ~ ., data = dataset, method = "rf", metric = metric,
                    tuneGrid = tunegrid, trControl = control)
The error message is as follows:
Error in ROCR::prediction(y_pred, y_true) :
Number of classes is not equal to 2.
ROCR currently supports only evaluation of binary classification tasks.
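The failure itself is easy to reproduce outside of caret. MLmetrics::PRAUC delegates to ROCR::prediction, which refuses to work when the observed labels contain only a single class. A minimal sketch (assuming MLmetrics and its ROCR dependency are installed):

library(MLmetrics)

y_pred <- runif(10)   # ten predicted probabilities...
y_true <- rep(1, 10)  # ...but every observation belongs to the positive class

# reproduces the error above: ROCR::prediction() needs both classes in y_true
MLmetrics::PRAUC(y_pred = y_pred, y_true = y_true)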
I think I found something strange...

Even though I specified the cross-validation indices, the summary function (whether prSummary or any other) is still called once on ten randomly selected samples. As far as I can tell from caret's source, this is a pre-flight check inside train(): before resampling starts, it draws up to ten random outcome values with dummy predictions and runs the summary function on them, just to verify that it works. With data this imbalanced, those ten rows frequently contain only one class, which is exactly the situation in which ROCR fails.
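That check can be mimicked by hand (a sketch based on my reading of caret's internals, so the details may differ between versions; fake is just an illustrative name):

set.seed(2)
# with only three "R" rows out of 208, ten random outcomes are usually all "M"
fake <- data.frame(obs  = sample(y, min(10, length(y))),
                   pred = sample(y, min(10, length(y))))
fake$M <- runif(nrow(fake))   # dummy class probabilities
fake$R <- 1 - fake$M

# errors with "Number of classes is not equal to 2." whenever fake$obs
# happens to contain only one class
prSummary(fake, lev = levels(y))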
My workaround is to define a summary function that wraps the AUC computation in tryCatch, so that this error does not abort training.
prSummaryCorrect <- function(data, lev = NULL, model = NULL) {
  # debugging: show what caret actually passes to the summary function
  print(dim(data))

  if (length(levels(data$obs)) != 2)
    stop(paste("Your outcome has", length(levels(data$obs)),
               "levels. The prSummaryCorrect() function isn't appropriate."))
  if (!all(levels(data[, "pred"]) == levels(data[, "obs"])))
    stop("levels of observed and predicted data do not match")

  # if this resample contains only one class, MLmetrics::PRAUC() errors
  # inside ROCR::prediction(); return NA for the AUC instead of aborting
  auc <- tryCatch(
    MLmetrics::PRAUC(y_pred = data[, lev[2]],
                     y_true = ifelse(data$obs == lev[2], 1, 0)),
    warning = function(w) NA_real_,
    error   = function(e) NA_real_
  )

  c(AUC = auc,
    Precision = caret::precision(data = data$pred, reference = data$obs, relevant = lev[2]),
    Recall = caret::recall(data = data$pred, reference = data$obs, relevant = lev[2]),
    F = caret::F_meas(data = data$pred, reference = data$obs, relevant = lev[2]))
}
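With that in place, the only change to the reproducible example above is pointing summaryFunction at the wrapper (a sketch reusing the objects defined earlier; control2 and rf_fixed are just illustrative names):

control2 <- trainControl(index = cvIndex, indexOut = cvIndexOut,
                         method = "cv", classProbs = TRUE,
                         summaryFunction = prSummaryCorrect)

set.seed(seed)
rf_fixed <- train(Class ~ ., data = dataset, method = "rf", metric = "AUC",
                  tuneGrid = tunegrid, trControl = control2)

# per-fold AUC, Precision, Recall and F; AUC is NA for any fold whose
# held-out samples contain only one class
rf_fixed$resample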