将变量列表传递给 tidymodels 中的配方会导致模型错误

passing a list of variables to recipe in tidymodels causes model error

我有一个用于训练模型的简单配方(recipe)。我的分类变量会随时间变化,有时我希望把数值型变量(例如邮政编码)当作分类变量处理,因此我在定义配方之前先定义了一个包含这些变量名的列表。(这里仅作示例,实际的列表要长得多。)

配方本身可以正常运行,但随后训练模型(3 折交叉验证)时出现了错误。

 model_tuned$.notes
[[1]]
# A tibble: 1 x 1
  .notes                                                     
  <chr>                                                      
1 preprocessor 1/1: Error: object 'my_categorical' not found 

[[2]]
# A tibble: 1 x 1
  .notes                                                     
  <chr>                                                      
1 preprocessor 1/1: Error: object 'my_categorical' not found 

[[3]]
# A tibble: 1 x 1
  .notes                                                     
  <chr>                                                      
1 preprocessor 1/1: Error: object 'my_categorical' not found 

是否有任何正确的方法将变量列表传递给配方而不会使模型崩溃?

REPREX

    # Reproducible example from the question: tune an xgboost workflow whose
    # recipe selects columns through an external character vector.
    library(recipes)
    library(magrittr)
    library(tidyverse)
    library(xgboost)
    library(tidymodels)
    
    # Work on a copy of the built-in mtcars data.
    mtcars1 <- mtcars
    
    
    # Add three integer columns drawn at random with replacement (32 values
    # each); they stand in for numeric codes that should be treated as
    # categories. No seed is set, so the values differ between runs.
    mtcars1 %<>% dplyr::mutate(new1 = sample.int(200, 32, replace = TRUE),
                              new2 = sample.int(100, 32, replace = TRUE),
                              new3 = sample.int(50, 32, replace = TRUE))
    
    # Names of the columns to convert to factors; defined outside the recipe.
    my_categorical <- c("new1", "new2", "new3")
    
    # Train/test split stratified on the outcome column drat.
    mtcars_split <- initial_split(mtcars1, strata = drat)
    train <- training(mtcars_split)
    test  <- testing(mtcars_split)
    
    # Recipe: drat is the outcome, every other column is a predictor. The
    # columns named in my_categorical are cast to character and then to factor.
    # NOTE(review): the recipe is prep()-ed here before being added to the
    # workflow below -- confirm whether an unprepped recipe should be used.
    # NOTE(review): no step_dummy() follows, so the factor columns stay
    # non-numeric -- confirm this is acceptable for the xgboost engine.
    recipe <-
      recipes::recipe(drat ~ ., data = train) %>%
      recipes::step_mutate_at(all_of(my_categorical), fn = ~as.character(.)) %>%
      recipes::step_string2factor(all_of(my_categorical)) %>% 
      prep()
    
    
    # 3-fold cross-validation on the training set, stratified on drat.
    cv_folds <-
      vfold_cv(train, 
               v = 3, 
               strata = drat)
    
    
    # Boosted-tree specification with 100 trees; the remaining hyperparameters
    # are marked with tune() so they are chosen during tuning.
    # NOTE(review): the mode is "classification" (set twice) although drat looks
    # like a numeric outcome -- confirm the intended mode.
    xgboost_model <-
      parsnip::boost_tree(
        mode = "classification",
        trees = 100,
        min_n = tune(),
        tree_depth = tune(),
        learn_rate = tune(),
        loss_reduction = tune(),
        mtry = tune()
      ) %>%
      set_engine("xgboost") %>% 
      set_mode("classification")
    
    
    # Bundle the recipe and the model specification into one workflow.
    xgboost_workflow <-
      workflows::workflow() %>%
      add_recipe(recipe) %>% 
      add_model(xgboost_model) 
    
    
    # Candidate grid of 100 parameter combinations. finalize() is given the
    # training predictors (drat removed); presumably this resolves the
    # data-dependent range of mtry -- TODO confirm.
    xgboost_grid <-
      parameters(xgboost_model) %>%
      finalize(select(training(mtcars_split), -drat)) %>%
      grid_max_entropy(size = 100)
    
    
    # Metrics to compute for each candidate.
    # NOTE(review): gain_capture and roc_auc look like classification metrics --
    # confirm they match the outcome type.
    model_metrics <- yardstick::metric_set(gain_capture,roc_auc)
    
    
    # Evaluate every grid candidate on every fold, keeping the out-of-sample
    # predictions and the workflow in the result object.
    xgboost_tuned <-
      tune::tune_grid(
        object = xgboost_workflow,
        resamples = cv_folds,
        grid = xgboost_grid,
        metrics = model_metrics,
        control = tune::control_grid(save_pred = TRUE, save_workflow = TRUE)
      )

 xgboost_tuned$.notes
[[1]]
# A tibble: 1 x 1
  .notes                                                        
  <chr>                                                         
1 preprocessor 1/1: Error: object 'my_categorical' not found

[[2]]
# A tibble: 1 x 1
  .notes                                                        
  <chr>                                                         
1 preprocessor 1/1: Error: object 'my_categorical' not found

[[3]]
# A tibble: 1 x 1
  .notes                                                        
  <chr>                                                         
1 preprocessor 1/1: Error: object 'my_categorical' not found


sessioninfo::session_info()
- Session info -------------------------------------------------------------------------------
 setting  value                       
 version  R version 4.0.5 (2021-03-31)
 os       Windows 10 x64              
 system   x86_64, mingw32             
 ui       RStudio                     
 language (EN)                        
 collate  Spanish_Spain.1252          
 ctype    Spanish_Spain.1252          
 tz       Europe/Paris                
 date     2021-06-25                  

- Packages -----------------------------------------------------------------------------------
 package      * version    date       lib source                               
 askpass        1.1        2019-01-13 [1] CRAN (R 4.0.5)                       
 assertthat     0.2.1      2019-03-21 [1] CRAN (R 4.0.5)                       
 backports      1.2.1      2020-12-09 [1] CRAN (R 4.0.3)                       
 base64enc      0.1-3      2015-07-28 [1] CRAN (R 4.0.3)                       
 BBmisc         1.11       2017-03-10 [1] CRAN (R 4.0.5)                       
 broom        * 0.7.6      2021-04-05 [1] CRAN (R 4.0.5)                       
 butcher        0.1.4      2021-03-19 [1] CRAN (R 4.0.5)                       
 cachem         1.0.4      2021-02-13 [1] CRAN (R 4.0.5)                       
 cellranger     1.1.0      2016-07-27 [1] CRAN (R 4.0.5)                       
 checkmate      2.0.0      2020-02-06 [1] CRAN (R 4.0.5)                       
 class          7.3-18     2021-01-24 [2] CRAN (R 4.0.5)                       
 cli            2.5.0      2021-04-26 [1] CRAN (R 4.0.5)                       
 cluster        2.1.1      2021-02-14 [2] CRAN (R 4.0.5)                       
 codetools      0.2-18     2020-11-04 [2] CRAN (R 4.0.5)                       
 colorspace     2.0-1      2021-05-04 [1] CRAN (R 4.0.5)                       
 crayon         1.4.1      2021-02-08 [1] CRAN (R 4.0.5)                       
 credentials    1.3.0      2020-07-21 [1] CRAN (R 4.0.5)                       
 curl           4.3.1      2021-04-30 [1] CRAN (R 4.0.5)                       
 data.table     1.14.0     2021-02-21 [1] CRAN (R 4.0.5)                       
 DBI            1.1.1      2021-01-15 [1] CRAN (R 4.0.5)                       
 dbplyr         2.1.1      2021-04-06 [1] CRAN (R 4.0.5)                       
 dials        * 0.0.9      2020-09-16 [1] CRAN (R 4.0.5)                       
 DiceDesign     1.9        2021-02-13 [1] CRAN (R 4.0.5)                       
 digest         0.6.27     2020-10-24 [1] CRAN (R 4.0.5)                       
 doParallel     1.0.16     2020-10-16 [1] CRAN (R 4.0.5)                       
 dplyr        * 1.0.6      2021-05-05 [1] CRAN (R 4.0.3)                       
 ellipsis       0.3.2      2021-04-29 [1] CRAN (R 4.0.5)                       
 fansi          0.4.2      2021-01-15 [1] CRAN (R 4.0.5)                       
 fastmap        1.1.0      2021-01-25 [1] CRAN (R 4.0.5)                       
 fastmatch      1.1-0      2017-01-28 [1] CRAN (R 4.0.3)                       
 FNN            1.1.3      2019-02-15 [1] CRAN (R 4.0.5)                       
 forcats      * 0.5.1      2021-01-27 [1] CRAN (R 4.0.5)                       
 foreach        1.5.1      2020-10-15 [1] CRAN (R 4.0.5)                       
 foreign        0.8-81     2020-12-22 [2] CRAN (R 4.0.5)                       
 Formula        1.2-4      2020-10-16 [1] CRAN (R 4.0.3)                       
 fs             1.5.0      2020-07-31 [1] CRAN (R 4.0.5)                       
 furrr          0.2.2      2021-01-29 [1] CRAN (R 4.0.5)                       
 future         1.21.0     2020-12-10 [1] CRAN (R 4.0.5)                       
 generics       0.1.0      2020-10-31 [1] CRAN (R 4.0.5)                       
 gert           1.3.0      2021-03-29 [1] CRAN (R 4.0.5)                       
 ggplot2      * 3.3.3      2020-12-30 [1] CRAN (R 4.0.5)                       
 globals        0.14.0     2020-11-22 [1] CRAN (R 4.0.3)                       
 glue           1.4.2      2020-08-27 [1] CRAN (R 4.0.5)                       
 gower          0.2.2      2020-06-23 [1] CRAN (R 4.0.3)                       
 GPfit          1.0-8      2019-02-08 [1] CRAN (R 4.0.5)                       
 gridExtra      2.3        2017-09-09 [1] CRAN (R 4.0.5)                       
 gtable         0.3.0      2019-03-25 [1] CRAN (R 4.0.5)                       
 hardhat        0.1.5      2020-11-09 [1] CRAN (R 4.0.5)                       
 haven          2.4.1      2021-04-23 [1] CRAN (R 4.0.5)                       
 Hmisc          4.5-0      2021-02-28 [1] CRAN (R 4.0.5)                       
 hms            1.1.0      2021-05-17 [1] CRAN (R 4.0.5)                       
 htmlTable      2.1.0      2020-09-16 [1] CRAN (R 4.0.5)                       
 htmltools      0.5.1.1    2021-01-22 [1] CRAN (R 4.0.5)                       
 htmlwidgets    1.5.3      2020-12-10 [1] CRAN (R 4.0.5)                       
 httr           1.4.2      2020-07-20 [1] CRAN (R 4.0.5)                       
 infer        * 0.5.4      2021-01-13 [1] CRAN (R 4.0.5)                       
 ipred          0.9-11     2021-03-12 [1] CRAN (R 4.0.5)                       
 iterators      1.0.13     2020-10-15 [1] CRAN (R 4.0.5)                       
 jpeg           0.1-8.1    2019-10-24 [1] CRAN (R 4.0.3)                       
 jsonlite       1.7.2      2020-12-09 [1] CRAN (R 4.0.5)                       
 knitr          1.33       2021-04-24 [1] CRAN (R 4.0.5)                       
 lattice        0.20-41    2020-04-02 [2] CRAN (R 4.0.5)                       
 latticeExtra   0.6-29     2019-12-19 [1] CRAN (R 4.0.5)                       
 lava           1.6.9      2021-03-11 [1] CRAN (R 4.0.5)                       
 lhs            1.1.1      2020-10-05 [1] CRAN (R 4.0.5)                       
 lifecycle      1.0.0      2021-02-15 [1] CRAN (R 4.0.5)                       
 listenv        0.8.0      2019-12-05 [1] CRAN (R 4.0.5)                       
 lubridate      1.7.10     2021-02-26 [1] CRAN (R 4.0.5)                       
 magrittr     * 2.0.1      2020-11-17 [1] CRAN (R 4.0.5)                       
 MASS           7.3-53.1   2021-02-12 [2] CRAN (R 4.0.5)                       
 Matrix         1.3-2      2021-01-06 [2] CRAN (R 4.0.5)                       
 memoise        2.0.0      2021-01-26 [1] CRAN (R 4.0.5)                       
 memuse         4.1-0      2020-02-17 [1] CRAN (R 4.0.3)                       
 mlr            2.19.0     2021-02-22 [1] CRAN (R 4.0.5)                       
 modeldata    * 0.1.0      2020-10-22 [1] CRAN (R 4.0.5)                       
 modelr         0.1.8      2020-05-19 [1] CRAN (R 4.0.5)                       
 munsell        0.5.0      2018-06-12 [1] CRAN (R 4.0.5)                       
 nnet           7.3-15     2021-01-24 [2] CRAN (R 4.0.5)                       
 openssl        1.4.4      2021-04-30 [1] CRAN (R 4.0.5)                       
 openxlsx       4.2.3      2020-10-27 [1] CRAN (R 4.0.5)                       
 pacman       * 0.5.1      2019-03-11 [1] CRAN (R 4.0.5)                       
 parallelly     1.25.0     2021-04-30 [1] CRAN (R 4.0.5)                       
 parallelMap    1.5.0      2020-03-26 [1] CRAN (R 4.0.5)                       
 ParamHelpers   1.14       2020-03-24 [1] CRAN (R 4.0.5)                       
 parsnip      * 0.1.5      2021-01-19 [1] CRAN (R 4.0.5)                       
 pillar         1.6.1      2021-05-16 [1] CRAN (R 4.0.5)                       
 pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.0.5)                       
 plyr           1.8.6      2020-03-03 [1] CRAN (R 4.0.5)                       
 png            0.1-7      2013-12-03 [1] CRAN (R 4.0.3)                       
 prettycode     1.1.0      2019-12-16 [1] CRAN (R 4.0.5)                       
 pROC           1.17.0.1   2021-01-13 [1] CRAN (R 4.0.5)                       
 prodlim        2019.11.13 2019-11-17 [1] CRAN (R 4.0.5)                       
 prompt         1.0.1      2021-03-12 [1] CRAN (R 4.0.5)                       
 purrr        * 0.3.4      2020-04-17 [1] CRAN (R 4.0.5)                       
 R6             2.5.0      2020-10-28 [1] CRAN (R 4.0.5)                       
 RANN           2.6.1      2019-01-08 [1] CRAN (R 4.0.5)                       
 rappdirs       0.3.3      2021-01-31 [1] CRAN (R 4.0.5)                       
 RColorBrewer   1.1-2      2014-12-07 [1] CRAN (R 4.0.3)                       
 Rcpp           1.0.6      2021-01-15 [1] CRAN (R 4.0.5)                       
 readr        * 1.4.0      2020-10-05 [1] CRAN (R 4.0.5)                       
 readxl         1.3.1      2019-03-13 [1] CRAN (R 4.0.5)                       
 recipes      * 0.1.16     2021-04-16 [1] CRAN (R 4.0.5)                       
 remotes        2.3.0      2021-04-01 [1] CRAN (R 4.0.5)                       
 reprex         2.0.0      2021-04-02 [1] CRAN (R 4.0.5)                       
 rio            0.5.26     2021-03-01 [1] CRAN (R 4.0.5)                       
 rlang        * 0.4.11     2021-04-30 [1] CRAN (R 4.0.5)                       
 ROSE           0.0-3      2014-07-15 [1] CRAN (R 4.0.5)                       
 rpart          4.1-15     2019-04-12 [2] CRAN (R 4.0.5)                       
 rprofile       0.1.7      2021-05-10 [1] Github (csgillespie/rprofile@61dca21)
 rsample      * 0.1.0      2021-05-08 [1] CRAN (R 4.0.3)                       
 rsthemes       0.2.1.9000 2021-05-13 [1] Github (gadenbuie/rsthemes@19299e5)  
 rstudioapi     0.13       2020-11-12 [1] CRAN (R 4.0.5)                       
 rvest          1.0.0      2021-03-09 [1] CRAN (R 4.0.5)                       
 scales       * 1.1.1      2020-05-11 [1] CRAN (R 4.0.5)                       
 sessioninfo    1.1.1      2018-11-05 [1] CRAN (R 4.0.5)                       
 stringi        1.5.3      2020-09-09 [1] CRAN (R 4.0.3)                       
 stringr      * 1.4.0      2019-02-10 [1] CRAN (R 4.0.5)                       
 survival       3.2-10     2021-03-16 [2] CRAN (R 4.0.5)                       
 sys            3.4        2020-07-23 [1] CRAN (R 4.0.5)                       
 themis         0.1.3      2020-11-12 [1] CRAN (R 4.0.5)                       
 tibble       * 3.1.1      2021-04-18 [1] CRAN (R 4.0.5)                       
 tidymodels   * 0.1.3      2021-04-19 [1] CRAN (R 4.0.5)                       
 tidyr        * 1.1.3      2021-03-03 [1] CRAN (R 4.0.5)                       
 tidyselect     1.1.1      2021-04-30 [1] CRAN (R 4.0.5)                       
 tidyverse    * 1.3.1      2021-04-15 [1] CRAN (R 4.0.5)                       
 timeDate       3043.102   2018-02-21 [1] CRAN (R 4.0.5)                       
 tune         * 0.1.5      2021-04-23 [1] CRAN (R 4.0.5)                       
 unbalanced     2.0        2015-06-26 [1] CRAN (R 4.0.5)                       
 usethis        2.0.1      2021-02-10 [1] CRAN (R 4.0.5)                       
 utf8           1.2.1      2021-03-12 [1] CRAN (R 4.0.5)                       
 vctrs        * 0.3.8      2021-04-29 [1] CRAN (R 4.0.5)                       
 withr          2.4.2      2021-04-18 [1] CRAN (R 4.0.5)                       
 workflows    * 0.2.2      2021-03-10 [1] CRAN (R 4.0.5)                       
 workflowsets * 0.0.2      2021-04-16 [1] CRAN (R 4.0.5)                       
 xaringan       0.20       2021-03-04 [1] CRAN (R 4.0.5)                       
 xfun           0.22       2021-03-11 [1] CRAN (R 4.0.5)                       
 xgboost      * 1.4.1.1    2021-04-22 [1] CRAN (R 4.0.5)                       
 xml2           1.3.2      2020-04-23 [1] CRAN (R 4.0.5)                       
 yardstick    * 0.0.8      2021-03-28 [1] CRAN (R 4.0.5)                       
 zip            2.1.1      2020-08-27 [1] CRAN (R 4.0.5)                       

[1] C:/Users/Joe/R/win-library/4.0
[2] C:/Program Files/R/R-4.0.5/library

您确实将变量向量正确地传递给了配方——没问题!

不过,您在模型拟合时遇到了其他问题。xgboost 模型要求所有预测变量都是数值型,因此如果您将邮政编码之类的变量转换为因子,就需要使用 step_dummy()。如果您有邮政编码这类高基数变量,可能还需要处理新水平(novel levels)或未知水平(unknown levels)。

library(magrittr)
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip

# Fix the RNG state so the random columns, the train/test split and the
# cross-validation folds are the same on every run of this example.
set.seed(123)

# Copy mtcars and add two integer-coded columns that play the role of
# numeric codes (e.g. postal codes) to be treated as categorical predictors.
# dplyr::n() supplies the row count, so nothing is hard-coded to 32 rows.
mtcars1 <- mtcars %>%
  dplyr::mutate(
    new1 = sample.int(10, dplyr::n(), replace = TRUE),
    new2 = sample.int(5, dplyr::n(), replace = TRUE)
  )

# Names of the columns the recipe should convert to factors.
my_categorical <- c("new1", "new2")

# Hold out a test set, then build 3-fold cross-validation on the training set.
mtcars_split <- initial_split(mtcars1)
train <- training(mtcars_split)
test <- testing(mtcars_split)
cv_folds <- vfold_cv(train, v = 3)


# Preprocessing recipe: model drat from every other column of `train`.
rec <- recipe(drat ~ ., data = train) %>%
  # Cast the columns named in `my_categorical` to text, then to factors.
  step_mutate_at(all_of(my_categorical), fn = ~ as.character(.x)) %>%
  step_string2factor(all_of(my_categorical)) %>%
  # Give nominal predictors a slot for previously unseen levels and for
  # missing values before they are encoded.
  step_novel(all_nominal_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  # xgboost needs numeric predictors, so expand factors into dummy columns.
  step_dummy(all_nominal_predictors())

# Boosted-tree specification fitted with the xgboost engine. The outcome
# (drat) is numeric, so the mode is "regression"; it is declared once here
# instead of being set to "classification" and then overridden by set_mode().
# The number of trees is left as tune() so it is chosen during tuning.
xgboost_model <-
  boost_tree(
    mode = "regression",
    trees = tune()
  ) %>%
  set_engine("xgboost")


# Combine the preprocessing recipe and the model specification into a single
# workflow object that can be tuned as one unit.
xgboost_workflow <- workflows::workflow() %>%
  workflows::add_recipe(rec) %>%
  workflows::add_model(xgboost_model)

# Tune the workflow on the cross-validation folds; grid = 5 asks tune to
# generate 5 candidate parameter sets automatically. The result is not
# assigned, so it is printed.
xgboost_workflow %>%
  tune_grid(resamples = cv_folds, grid = 5)
#> # Tuning results
#> # 3-fold cross-validation 
#> # A tibble: 3 x 4
#>   splits         id    .metrics          .notes          
#>   <list>         <chr> <list>            <list>          
#> 1 <split [16/8]> Fold1 <tibble [10 × 5]> <tibble [0 × 1]>
#> 2 <split [16/8]> Fold2 <tibble [10 × 5]> <tibble [0 × 1]>
#> 3 <split [16/8]> Fold3 <tibble [10 × 5]> <tibble [0 × 1]>

reprex package (v2.0.0)

于 2021-06-25 创建

为了让您的示例能够运行,我不得不修改其中的其他一些内容,例如因为 drat 是数值型变量而改用 "regression" 等。我建议您了解一下 reprex 包,这样您就可以在全新的 R 会话中运行这样的示例,从而更高效地获得帮助。