Repeated CV in an mlr3 ensemble model
I have a nice mlr3 ensemble model (combining glmnet and glm) for binary prediction; see the details below:
library("mlr3verse")
library("dplyr")
# get example data
data(PimaIndiansDiabetes, package="mlbench")
data <- PimaIndiansDiabetes
# add an additional predictor "superdoc" which is not entered in the glmnet but in the final glm
set.seed(2323)
data %>%
  rowwise() %>%
  mutate(superdoc = case_when(diabetes == "pos" ~ as.numeric(sample(0:2, 1)), TRUE ~ 0)) %>%
  ungroup() -> data
# make a rather small train set
set.seed(23)
test.data <- sample_n(data,70,replace=FALSE)
# create the elastic net regression learner
glmnet_lrn = lrn("classif.cv_glmnet", predict_type = "prob")
# wrap the learner so that its out-of-fold predictions are used downstream
glmnet_cv1 = po("learner_cv", glmnet_lrn, id = "glmnet")
# PipeOp that drops 'superdoc', i.e. selects all except 'superdoc'
# (ID given to avoid ID clash with other selector)
drop_superdoc = po("select", id = "drop.superdoc",
                   selector = selector_invert(selector_name("superdoc")))
# PipeOp that selects 'superdoc' (and drops all other columns)
select_superdoc = po("select", id = "select.superdoc",
                     selector = selector_name("superdoc"))
# 'superdoc' along one path, the glmnet predictions along the other
stacking_layer = gunion(list(
  select_superdoc,
  drop_superdoc %>>% glmnet_cv1
)) %>>% po("featureunion", id = "union1")
# final logistic regression
log_reg_lrn = lrn("classif.log_reg", predict_type = "prob")
# combine ensemble model
ensemble = stacking_layer %>>% log_reg_lrn
# define the training task
train.task <- TaskClassif$new("test.data", test.data, target = "diabetes")
# make ensemble learner
elearner = as_learner(ensemble)
ensemble$plot(html = FALSE)
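A note on po("learner_cv"): during training this PipeOp fits the wrapped learner with an internal cross-validation and passes the out-of-fold predictions on as a feature for the final log_reg, and that internal resampling is itself parameterized. A minimal sketch using PipeOpLearnerCV's resampling.folds hyperparameter (default 3):
# switch the out-of-fold prediction step of the wrapped glmnet to 5 folds
glmnet_cv1$param_set$values$resampling.folds = 5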
If I train it with different set.seed values, I get different coefficients. I think this is mainly caused by the random fold splits of the internal cross-validation and could be mitigated by repeated cross-validation.
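To see where the randomness enters, here is a minimal sketch outside of mlr3 (assuming only the glmnet package plus the test.data created above): cv.glmnet assigns observations to folds at random, so the selected lambda.min shifts with the seed.
library(glmnet)
# predictor matrix: drop the target and the 'superdoc' column
x <- as.matrix(test.data[, setdiff(names(test.data), c("diabetes", "superdoc"))])
y <- test.data$diabetes
set.seed(1)
fit1 <- cv.glmnet(x, y, family = "binomial")
set.seed(2)
fit2 <- cv.glmnet(x, y, family = "binomial")
c(fit1$lambda.min, fit2$lambda.min)  # typically two different values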
# Train the Learner:
# seed 1
elearner = as_learner(ensemble)
set.seed(22521136)
elearner$train(train.task) -> seed1
# seed 2
elearner = as_learner(ensemble)
set.seed(12354)
elearner$train(train.task) -> seed2
# different coefficients of the glmnet model
coef(seed1$model$glmnet$model, s ="lambda.min")
#> 9 x 1 sparse Matrix of class "dgCMatrix"
#> 1
#> (Intercept) -6.238598277
#> age .
#> glucose 0.023462376
#> insulin -0.001007037
#> mass 0.055587740
#> pedigree 0.322911217
#> pregnant 0.137419564
#> pressure .
#> triceps .
coef(seed2$model$glmnet$model, s ="lambda.min")
#> 9 x 1 sparse Matrix of class "dgCMatrix"
#> 1
#> (Intercept) -6.876802620
#> age .
#> glucose 0.025601712
#> insulin -0.001500856
#> mass 0.063029550
#> pedigree 0.464369417
#> pregnant 0.155971123
#> pressure .
#> triceps .
# different coefficients of the final regression model
seed1$model$classif.log_reg$model$coefficients
#> (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
#> -9.438452 23.710923 8.726956 NA
seed2$model$classif.log_reg$model$coefficients
#> (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
#> 0.3698143 23.5362542 -5.5514365 NA
Question: where and how can I enter repeated cross-validation into my mlr3 ensemble model to mitigate these differing results? Any help is very much appreciated.
Thanks to missuse (his comments and his great tutorial, Tuning a stacked learner) and to mb706, I think I could solve my problem: replace "classif.cv_glmnet" with "classif.glmnet" and tune the glmnet hyperparameters with repeated cross-validation.
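For illustration, a sketch of the rebuilt pipeline; I assume the only change implied by the text is swapping the learner, with all PipeOp ids kept as above so that the search-space ids glmnet.alpha and glmnet.s below resolve correctly:
# rebuild the stacking layer with "classif.glmnet" so that its
# 'alpha' and 's' become tunable hyperparameters of the graph
glmnet_lrn = lrn("classif.glmnet", predict_type = "prob")
glmnet_cv1 = po("learner_cv", glmnet_lrn, id = "glmnet")
stacking_layer = gunion(list(
  select_superdoc,
  drop_superdoc %>>% glmnet_cv1
)) %>>% po("featureunion", id = "union1")
ensemble = stacking_layer %>>% log_reg_lrn
elearner = as_learner(ensemble)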
# Add tuning
resampling = rsmp("repeated_cv")
resampling$param_set$values = list(repeats = 10, folds = 5)
ps_ens = ParamSet$new(list(
  ParamDbl$new("glmnet.alpha", lower = 0, upper = 1),
  ParamDbl$new("glmnet.s", lower = 0, upper = 1)))
auto1 = AutoTuner$new(
learner = elearner,
resampling = resampling,
measure = msr("classif.auc"),
search_space = ps_ens,
terminator = trm("evals", n_evals = 5), # to limit running time
tuner = tnr("random_search")
)
Training with different set.seed values now yields the same coefficients. With repeats = 10 and folds = 5, each candidate configuration is evaluated on 50 resampling splits, so the tuning result is far less sensitive to any single fold assignment:
# Train with different set.seed
# first
set.seed(22521136)
at1 = auto1
at1$train(train.task) -> seed1
# second
set.seed(12354)
at2 = auto1
at2$train(train.task) -> seed2
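As a usage note (a sketch relying on the standard mlr3tuning accessors), the hyperparameters picked by the random search can be inspected after training:
# best glmnet.alpha / glmnet.s found by the random search
at1$tuning_result
# all evaluated configurations with their AUC
data.table::as.data.table(at1$archive)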
# Compare coefficients of the learners
# classif.log_reg
seed1$model$learner$model$classif.log_reg$model$coefficients
# (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
# 2.467855 21.570766 -6.966693 NA
seed2$model$learner$model$classif.log_reg$model$coefficients
# (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
# 2.467855 21.570766 -6.966693 NA
# classif.glmnet
coef(at1$learner$model$glmnet$model, alpha=at1$tuning_result$glmnet.alpha,s=at1$tuning_result$glmnet.s)
# 9 x 1 sparse Matrix of class "dgCMatrix"
# 1
# (Intercept) -3.3066981659
# age 0.0076392198
# glucose 0.0077516975
# insulin 0.0003389759
# mass 0.0133955320
# pedigree 0.3256754612
# pregnant 0.0686746156
# pressure 0.0081338885
# triceps -0.0054976030
coef(at2$learner$model$glmnet$model, alpha=at2$tuning_result$glmnet.alpha,s=at2$tuning_result$glmnet.s)
# 9 x 1 sparse Matrix of class "dgCMatrix"
# 1
# (Intercept) -3.3066981659
# age 0.0076392198
# glucose 0.0077516975
# insulin 0.0003389759
# mass 0.0133955320
# pedigree 0.3256754612
# pregnant 0.0686746156
# pressure 0.0081338885
# triceps -0.0054976030
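Finally, a short usage sketch: the trained AutoTuner behaves like any other learner, so predictions can be obtained and scored directly (shown here on the training task for illustration):
preds <- at1$predict(train.task)
preds$score(msr("classif.auc"))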