Models failing while trying to tune xgboost hyperparameters in R Tidymodels
I'm not sure where I'm going wrong. When I run the code below, every model in the tuning grid fails and I get this warning message: 'All models failed in tune_grid()'.
Any help is greatly appreciated.
# PREPROCESSING -- RECIPE ---------------------------------------------------------------------
library(tidymodels)   # parsnip, dials, workflows, rsample, tune
library(recipes)
xgb_recipe <- recipe(EVENT ~ ., data = train_data) %>% # define target & data
#step_string2factor(all_nominal()) %>%
#step_dummy(all_predictors()) %>%
recipes::step_other(all_nominal(), threshold = 0.01) %>%
recipes::step_nzv(all_nominal()) %>%
#step_downsample(EVENT) %>%
prep()
> xgb_recipe
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor        272

Training data contained 2427 data points and no missing data.

Operations:

Collapsing factor levels for PROGRAM_TYPE_CODE, PREFERENCE_NUMBER, ... [trained]
Sparse, unbalanced variable filter removed PRIOR_PGRD_PRG_YR, PRIOR_TF_SC_PRG_YR, ETHNIC_GROUP_DESC, HASEMAIL, ... [trained]
# XGB SPEC ------------------------------------------------------------------------------------
xgb_spec <- boost_tree(
trees = 600, ## nround=6000
tree_depth = tune(), min_n = tune(), ## max_depth = 6
loss_reduction = tune(), ## first three: model complexity
sample_size = tune(), mtry = tune(), ## randomness
learn_rate = tune(), ## step size,
#num_class=4,
#objective = 'multi:softprob' #%>%
#nthreads=20 %>%
#print_every_n = 300
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_spec
Boosted Tree Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 600
  min_n = tune()
  tree_depth = tune()
  learn_rate = tune()
  loss_reduction = tune()
  sample_size = tune()

Computational engine: xgboost
# GRID ----------------------------------------------------------------------------------------
xgb_grid <- grid_latin_hypercube(
tree_depth(),
min_n(),
loss_reduction(),
sample_size = sample_prop(),
finalize(mtry(), train_data),
learn_rate(),
size = 20
)
xgb_grid
# A tibble: 20 x 6
   tree_depth min_n loss_reduction sample_size  mtry learn_rate
 1          4    15       1.71e- 6       0.256   110   2.14e- 9
 2          7    29       4.08e- 8       0.836    97   2.07e-10
 3         10    26       6.44e- 7       0.883    66   7.59e- 8
 4          8    28       9.77e- 1       0.964   270   3.90e- 8
 5          1    19       4.27e- 4       0.733   208   8.00e- 4
 6          3     5       1.61e+ 1       0.392   220   4.04e-10
 7          5     9       1.48e- 9       0.673   163   1.63e- 7
 8         11    34       4.20e- 5       0.569   178   1.69e- 8
 9         12    38       7.80e+ 0       0.143    79   8.67e- 7
10          4    12       5.58e- 9       0.946   173   1.17e- 2
11         14     2       1.30e- 4       0.805   202   1.10e- 4
12         15    21       9.15e- 3       0.454   134   3.82e- 3
13          9    21       4.99e- 6       0.500    10   2.91e- 9
14          7    17       7.60e-10       0.232   248   1.57e- 6
15         12    11       4.85e- 1       0.297    21   1.23e- 5
16          7    35       7.63e- 8       0.516    95   9.60e- 2
17          2     6       1.01e- 1       0.353    48   3.57e- 6
18         10    23       2.57e-10       0.161    33   1.46e- 2
19         13    40       2.00e- 3       0.715   150   3.44e- 5
20          5    32       1.25e- 2       0.610   234   4.95e- 4
# WORKFLOW ------------------------------------------------------------------------------------
xgb_wf <- workflow() %>%
add_recipe(xgb_recipe) %>%
add_model(xgb_spec)
xgb_wf
══ Workflow ═══════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()

── Preprocessor ────────────────────────────────────────────────────
2 Recipe Steps

● step_other()
● step_nzv()

── Model ───────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 600
  min_n = tune()
  tree_depth = tune()
  learn_rate = tune()
  loss_reduction = tune()
  sample_size = tune()

Computational engine: xgboost
# CROSS-VALIDATION Resamples -----------------------------------------------------------------
# 10 fold CV
set.seed(123)
cv_folds <- vfold_cv(train_data)
cv_folds
#  10-fold cross-validation
# A tibble: 10 x 2
   splits id
 1        Fold01
 2        Fold02
 3        Fold03
 4        Fold04
 5        Fold05
 6        Fold06
 7        Fold07
 8        Fold08
 9        Fold09
10        Fold10
# TUNING --------------------------------------------------------------------------------------
all_cores <- parallel::detectCores(logical = FALSE)
library(doParallel)
cl <- makePSOCKcluster(all_cores)
registerDoParallel(cl)
s <- Sys.time()
set.seed(2020)
xgb_res <- tune_grid(
xgb_wf,
resamples = cv_folds,
grid = xgb_grid,
control = control_grid(save_pred = TRUE,
verbose = TRUE)
)
Sys.time() - s
xgb_res
# 10-fold cross-validation
# A tibble: 10 x 5
   splits id     .metrics .notes .predictions
 1        Fold01
 2        Fold02
 3        Fold03
 4        Fold04
 5        Fold05
 6        Fold06
 7        Fold07
 8        Fold08
 9        Fold09
10        Fold10
Warning message:
All models failed in tune_grid(). See the .notes column.
The xgboost algorithm is quite picky about its data: every predictor has to be numeric, so that is worth checking first.
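For instance, a minimal sketch of your recipe with the step_dummy() line put back in (and -all_outcomes() added so EVENT itself is left alone) would hand xgboost only numeric predictor columns:

# Sketch only: dummy-code nominal predictors so xgboost receives numeric columns.
xgb_recipe <- recipe(EVENT ~ ., data = train_data) %>%
  recipes::step_other(all_nominal(), -all_outcomes(), threshold = 0.01) %>%
  recipes::step_nzv(all_predictors()) %>%
  recipes::step_dummy(all_nominal(), -all_outcomes()) %>%  # xgboost needs numeric predictors
  prep()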
Without a reproducible data sample to go with your code it is hard to pin down exactly where the problem is.
If the data are sensitive, create some fake data instead.
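For example, something along these lines (entirely made-up columns, just standing in for the real train_data) is usually enough for others to reproduce the error:

# Hypothetical stand-in for train_data: factor outcome plus mixed predictors.
set.seed(1)
train_data <- tibble::tibble(
  EVENT  = factor(sample(c("yes", "no"), 500, replace = TRUE)),
  x_num1 = rnorm(500),
  x_num2 = runif(500),
  x_cat1 = factor(sample(letters[1:5], 500, replace = TRUE)),
  x_cat2 = factor(sample(c("A", "B", "C"), 500, replace = TRUE))
)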
You could also try running a small model without parallelization and see whether that clears things up.
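A minimal sketch of that idea (assuming xgb_wf and xgb_grid are defined as in your question); running sequentially lets the real xgboost error surface in the .notes column:

parallel::stopCluster(cl)        # shut down the PSOCK cluster if it is still running
foreach::registerDoSEQ()         # drop back to sequential execution
set.seed(2020)
small_res <- tune_grid(
  xgb_wf,
  resamples = vfold_cv(train_data, v = 2),   # just two folds for a quick check
  grid      = dplyr::slice(xgb_grid, 1:2),   # just two candidate parameter sets
  control   = control_grid(verbose = TRUE)
)
small_res$.notes[[1]]            # the actual error message is stored here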