使用 caret 的交叉验证计算样本内预测准确性
Calculating in-sample predictive accuracy using caret's cross-validation
我想在使用 caret 的 k 折交叉验证的同时,计算某些指标的样本内和样本外预测准确性。
到目前为止我得到了
library(MASS)
library(leaps)
library(caret)
library(tidyverse)
# NOTE(review): `surgical` is not created anywhere in this snippet; presumably
# it is a data set supplied by an attached package (possibly olsrr) - confirm.
full_df <- surgical
# Fix the RNG seed so that subsequent random steps are reproducible.
set.seed(123)
# Summary function passed to trainControl(summaryFunction = ...).
#
# data:       data frame with numeric columns `obs` (observed values) and
#             `pred` (predicted values).
# lev, model: not used in the body; kept in the signature with NULL defaults.
# Returns a named numeric vector c(RMSE = ..., MAE = ...).
Performance_Summary <- function(data,
                                lev = NULL,
                                model = NULL) {
  err <- data$obs - data$pred
  c(RMSE = sqrt(mean(err^2)),
    # abs() is essential: without it positive and negative errors cancel and
    # the value is the mean signed error (bias), not the mean absolute error.
    MAE = mean(abs(err)))
}
# 10-fold cross-validation, scored with the custom summary function.
train.Control <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = Performance_Summary
)
# Linear model of y on all remaining columns, resampled with the scheme above.
cv_linear_model <- train(
  y ~ .,
  data = full_df,
  method = "lm",
  trControl = train.Control
)
# Show the fitted train object.
cv_linear_model
这应该给出 10 个样本外(测试)集的 RMSE 和 MAE 的平均值。
我现在想做的是:对 10 个样本内(训练)集分别计算 RMSE 和 MAE,并求其平均值。
这可以使用 caret 包吗?或者我是否需要手动实施 k 折交叉验证以获得样本内指标。
感谢您的帮助!
如果您不介意将模型拟合两次,可以先设置好测试折和训练折。这里使用示例数据集 BostonHousing,其中 medv 是因变量:
library(mlbench)
data(BostonHousing)
# Work on the first 400 rows of BostonHousing.
full_df <- BostonHousing[1:400, ]
# Create folds
set.seed(111)
# Row indices of each of the 10 folds; used as the held-out (test) rows.
testFolds <- createFolds(full_df$medv, k = 10)
# Training rows of a fold = every row that the fold does not hold out.
# seq_len() is used instead of 1:nrow() so a zero-row frame yields no indices.
trFolds <- lapply(
  testFolds,
  function(i) setdiff(seq_len(nrow(full_df)), i)
)
问题中 MAE 的计算有误:应先取误差的绝对值,再求平均值:
# Summary function for trainControl(): root mean squared error and mean
# absolute error of `data$pred` against `data$obs`.
# `lev` and `model` are accepted but not used in the body.
Performance_Summary <- function(data,
                                lev = NULL,
                                model = NULL) {
  residual <- data$obs - data$pred
  rmse <- sqrt(mean(residual^2))
  mae <- mean(abs(residual))
  c(RMSE = rmse, MAE = mae)
}
先对测试数据运行,就像通常在 caret 中所做的那样:
# Explicit resampling indices: fit on trFolds, evaluate on testFolds.
test.Control <- trainControl(
  method = "cv",
  summaryFunction = Performance_Summary,
  index = trFolds,
  indexOut = testFolds
)
results_test <- train(
  medv ~ .,
  data = full_df,
  method = "lm",
  trControl = test.Control
)
# First rows of the per-resample metric table.
head(results_test$resample)
RMSE MAE Resample
1 4.07 3.02 Fold01
2 4.10 3.04 Fold02
3 5.76 4.48 Fold03
4 4.16 2.97 Fold04
5 4.10 3.01 Fold05
6 6.14 4.25 Fold06
再用相同的训练折运行一次,但这次把训练索引同时用作评估索引,从而得到样本内指标:
# Same training folds, but the evaluation rows are the training rows
# themselves, so the per-resample metrics are in-sample.
train.Control <- trainControl(
  method = "cv",
  summaryFunction = Performance_Summary,
  index = trFolds,
  indexOut = trFolds
)
results_train <- train(
  medv ~ .,
  data = full_df,
  method = "lm",
  trControl = train.Control
)
# First rows of the per-resample metric table.
head(results_train$resample)
RMSE MAE Resample
1 4.80 3.35 Fold01
2 4.80 3.32 Fold02
3 4.63 3.19 Fold03
4 4.79 3.29 Fold04
5 4.80 3.31 Fold05
6 4.57 3.18 Fold06
下面是一个简单的实现,您可以看到我们得到了相同的结果。首先我们稍微改变度量函数:
# RMSE and MAE of predictions `pred` against observations `obs`.
# Returns a named numeric vector c(RMSE = ..., MAE = ...).
mets <- function(obs, pred) {
  err <- obs - pred
  c(RMSE = sqrt(mean(err^2)), MAE = mean(abs(err)))
}
然后:
# Manual k-fold check: refit lm() on each training fold and compute the
# in-sample and out-of-sample metrics with mets().
# seq_along() replaces 1:length(), which would iterate over c(1, 0) if
# testFolds were empty.
results <- lapply(seq_along(testFolds), function(i) {
  trData <- full_df[trFolds[[i]], ]
  testData <- full_df[testFolds[[i]], ]
  fit <- lm(medv ~ ., data = trData)
  # In-sample: training responses against the model's fitted values.
  inSample <- mets(trData$medv, fit$fitted.values)
  # Out-of-sample: held-out responses against predictions for those rows.
  outSample <- mets(testData$medv, predict(fit, testData))
  # Single-bracket indexing keeps the element names on purpose, so the
  # resulting row names stay as before (RMSE, RMSE1, ...).
  data.frame(
    folds = i,
    inSample_RMSE = inSample[1],
    inSample_MAE = inSample[2],
    outSample_RMSE = outSample[1],
    outSample_MAE = outSample[2]
  )
})
# Stack the one-row data frames into a single table, one row per fold.
results <- do.call(rbind, results)
folds inSample_RMSE inSample_MAE outSample_RMSE outSample_MAE
RMSE 1 4.80 3.35 4.07 3.02
RMSE1 2 4.80 3.32 4.10 3.04
RMSE2 3 4.63 3.19 5.76 4.48
RMSE3 4 4.79 3.29 4.16 2.97
RMSE4 5 4.80 3.31 4.10 3.01
RMSE5 6 4.57 3.18 6.14 4.25
我想在使用 caret 的 k 折交叉验证的同时,计算某些指标的样本内和样本外预测准确性。
到目前为止我得到了
library(MASS)
library(leaps)
library(caret)
library(tidyverse)
# NOTE(review): `surgical` is not created anywhere in this snippet; presumably
# it is a data set supplied by an attached package (possibly olsrr) - confirm.
full_df <- surgical
# Fix the RNG seed so that subsequent random steps are reproducible.
set.seed(123)
# Summary function passed to trainControl(summaryFunction = ...).
#
# data:       data frame with numeric columns `obs` (observed values) and
#             `pred` (predicted values).
# lev, model: not used in the body; kept in the signature with NULL defaults.
# Returns a named numeric vector c(RMSE = ..., MAE = ...).
Performance_Summary <- function(data,
                                lev = NULL,
                                model = NULL) {
  err <- data$obs - data$pred
  c(RMSE = sqrt(mean(err^2)),
    # abs() is essential: without it positive and negative errors cancel and
    # the value is the mean signed error (bias), not the mean absolute error.
    MAE = mean(abs(err)))
}
# 10-fold cross-validation, scored with the custom summary function.
train.Control <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = Performance_Summary
)
# Linear model of y on all remaining columns, resampled with the scheme above.
cv_linear_model <- train(
  y ~ .,
  data = full_df,
  method = "lm",
  trControl = train.Control
)
# Show the fitted train object.
cv_linear_model
这应该给出 10 个样本外(测试)集的 RMSE 和 MAE 的平均值。
我现在想做的是:对 10 个样本内(训练)集分别计算 RMSE 和 MAE,并求其平均值。
这可以使用 caret 包吗?或者我是否需要手动实施 k 折交叉验证以获得样本内指标。
感谢您的帮助!
如果您不介意将模型拟合两次,可以先设置好测试折和训练折。这里使用示例数据集 BostonHousing,其中 medv 是因变量:
library(mlbench)
data(BostonHousing)
# Work on the first 400 rows of BostonHousing.
full_df <- BostonHousing[1:400, ]
# Create folds
set.seed(111)
# Row indices of each of the 10 folds; used as the held-out (test) rows.
testFolds <- createFolds(full_df$medv, k = 10)
# Training rows of a fold = every row that the fold does not hold out.
# seq_len() is used instead of 1:nrow() so a zero-row frame yields no indices.
trFolds <- lapply(
  testFolds,
  function(i) setdiff(seq_len(nrow(full_df)), i)
)
问题中 MAE 的计算有误:应先取误差的绝对值,再求平均值:
# Summary function for trainControl(): root mean squared error and mean
# absolute error of `data$pred` against `data$obs`.
# `lev` and `model` are accepted but not used in the body.
Performance_Summary <- function(data,
                                lev = NULL,
                                model = NULL) {
  residual <- data$obs - data$pred
  rmse <- sqrt(mean(residual^2))
  mae <- mean(abs(residual))
  c(RMSE = rmse, MAE = mae)
}
先对测试数据运行,就像通常在 caret 中所做的那样:
# Explicit resampling indices: fit on trFolds, evaluate on testFolds.
test.Control <- trainControl(
  method = "cv",
  summaryFunction = Performance_Summary,
  index = trFolds,
  indexOut = testFolds
)
results_test <- train(
  medv ~ .,
  data = full_df,
  method = "lm",
  trControl = test.Control
)
# First rows of the per-resample metric table.
head(results_test$resample)
RMSE MAE Resample
1 4.07 3.02 Fold01
2 4.10 3.04 Fold02
3 5.76 4.48 Fold03
4 4.16 2.97 Fold04
5 4.10 3.01 Fold05
6 6.14 4.25 Fold06
再用相同的训练折运行一次,但这次把训练索引同时用作评估索引,从而得到样本内指标:
# Same training folds, but the evaluation rows are the training rows
# themselves, so the per-resample metrics are in-sample.
train.Control <- trainControl(
  method = "cv",
  summaryFunction = Performance_Summary,
  index = trFolds,
  indexOut = trFolds
)
results_train <- train(
  medv ~ .,
  data = full_df,
  method = "lm",
  trControl = train.Control
)
# First rows of the per-resample metric table.
head(results_train$resample)
RMSE MAE Resample
1 4.80 3.35 Fold01
2 4.80 3.32 Fold02
3 4.63 3.19 Fold03
4 4.79 3.29 Fold04
5 4.80 3.31 Fold05
6 4.57 3.18 Fold06
下面是一个简单的实现,您可以看到我们得到了相同的结果。首先我们稍微改变度量函数:
# RMSE and MAE of predictions `pred` against observations `obs`.
# Returns a named numeric vector c(RMSE = ..., MAE = ...).
mets <- function(obs, pred) {
  err <- obs - pred
  c(RMSE = sqrt(mean(err^2)), MAE = mean(abs(err)))
}
然后:
# Manual k-fold check: refit lm() on each training fold and compute the
# in-sample and out-of-sample metrics with mets().
# seq_along() replaces 1:length(), which would iterate over c(1, 0) if
# testFolds were empty.
results <- lapply(seq_along(testFolds), function(i) {
  trData <- full_df[trFolds[[i]], ]
  testData <- full_df[testFolds[[i]], ]
  fit <- lm(medv ~ ., data = trData)
  # In-sample: training responses against the model's fitted values.
  inSample <- mets(trData$medv, fit$fitted.values)
  # Out-of-sample: held-out responses against predictions for those rows.
  outSample <- mets(testData$medv, predict(fit, testData))
  # Single-bracket indexing keeps the element names on purpose, so the
  # resulting row names stay as before (RMSE, RMSE1, ...).
  data.frame(
    folds = i,
    inSample_RMSE = inSample[1],
    inSample_MAE = inSample[2],
    outSample_RMSE = outSample[1],
    outSample_MAE = outSample[2]
  )
})
# Stack the one-row data frames into a single table, one row per fold.
results <- do.call(rbind, results)
folds inSample_RMSE inSample_MAE outSample_RMSE outSample_MAE
RMSE 1 4.80 3.35 4.07 3.02
RMSE1 2 4.80 3.32 4.10 3.04
RMSE2 3 4.63 3.19 5.76 4.48
RMSE3 4 4.79 3.29 4.16 2.97
RMSE4 5 4.80 3.31 4.10 3.01
RMSE5 6 4.57 3.18 6.14 4.25