创建食谱并动态传递列名

Create Recipes and passing column names dynamically

我有一个函数可以简单地创建几个食谱对象。问题是在函数内部我必须重命名传递的 data.frame/tibble 的列,以便我可以使 recipes.

出于显而易见的原因,我不想这样做,主要是,列名必须是 data.frame 本身中的名称,否则它们将无法工作。

简单示例:

library(tidyverse)

data_tbl <- tibble(
  visit_date = seq(
    from = as.Date("2021-01-01"), 
    to   = as.Date("2021-10-15"),
    by = 7,
  ),
  visits = rnbinom(
    n = 42,
    size = 100,
    mu = 66
  )
)

ts_auto_recipe <- function(.data, .date_col, .pred_col){
  
  # * Tidyeval ----
  date_col_var <- rlang::enquo(.date_col)
  pred_col_var <- rlang::enquo(.pred_col)
  
  # * Checks ----
  if(!is.data.frame(.data)){
    stop(call. = FALSE, "You must supply a data.frame/tibble.")
  }
  
  if(rlang::quo_is_missing(date_col_var)){
    stop(call. = FALSE, "The (.date_col) must be supplied.")
  }
  
  if(rlang::quo_is_missing(pred_col_var)){
    stop(call. = FALSE, "The (.pred_col) must be supplied.")
  }
  
  # * Data ----
  data_tbl <- tibble::as_tibble(.data)
  
  data_tbl <- data_tbl %>%
    dplyr::select(
      {{ date_col_var }}, {{ pred_col_var }}, dplyr::everything()
    ) %>%
    dplyr::rename(
      date_col    = {{ date_col_var }}
      , value_col = {{ pred_col_var }}
    )
  
  # * Recipe Objects ----
  # ** Base recipe ----
  rec_base_obj <- recipes::recipe(
    formula = date_col ~ . # I have to do the above so I can do this, which I don't like
    , data = data_tbl
  )
  
  # * Add Steps ----
  # ** ts signature and normalize ----
  rec_date_obj <- rec_base_obj %>%
    timetk::step_timeseries_signature(date_col) %>%
    recipes::step_normalize(
      dplyr::contains("index.num")
      , dplyr::contains("date_col_year")
    )
  
  # * Recipe List ----
  rec_lst <- list(
    rec_base = rec_base_obj,
    rec_date = rec_date_obj
  )
  
  # * Return ----
  return(rec_lst)
  
}

rec_objs <- ts_auto_recipe(data_tbl, visit_date, visits)

我这样做的原因是因为我不能在配方函数本身内部使用动态名称,所以像 rlang::sym(names(data_tbl)[[1]]) 这样的东西将不起作用,像 data_tbl[[1]] 这样的东西也不会。我正在考虑使用 step_rename() 之类的东西,但这需要您提前知道名称,并且它不能是配方步骤中的变量。但是,您可以将变量传递给 timetk::step_time_series_signature

我唯一能想到的另一件事是强制用户使用特定的列名,就像在 dsy

的 Facebook Prophet R 库中一样

我还注意到,当我 运行 rec_objs 我得到以下结果时,我在终端上得到了一些奇怪的输出:

> rec_objs
$rec_base
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

$rec_date
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Operations:

Timeseries signature features from date_col
Centering and scaling for dplyr::contains("ÿþindex.numÿþ"), dplyr::contains("ÿþdate_col...

然而当我这样做时:

> rec_objs[[2]]
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Operations:

Timeseries signature features from date_col
Centering and scaling for dplyr::contains("index.num"), dplyr::contains("date_col_year")

不会发生。

谢谢,

我想我已经找到了解决这个问题的方法,请参阅以下自定义函数:

ts_auto_recipe_b <- function(.data
                           , .date_col
                           , .pred_col
                           , .step_ts_sig = TRUE
                           , .step_ts_rm_misc = TRUE
                           , .step_ts_dummy = TRUE
                           , .step_ts_fourier = TRUE
                           , .step_ts_fourier_period = 1
                           , .K = 1
                           , .step_ts_yeo = TRUE
                           , .step_ts_nzv = TRUE) {
  
  # * Tidyeval ----
  date_col_var_expr      <- rlang::enquo(.date_col)
  pred_col_var_expr      <- rlang::enquo(.pred_col)
  step_ts_sig            <- .step_ts_sig
  step_ts_rm_misc        <- .step_ts_rm_misc
  step_ts_dummy          <- .step_ts_dummy
  step_ts_fourier        <- .step_ts_fourier
  step_ts_fourier_k      <- .K
  step_ts_fourier_period <- .step_ts_fourier_period
  step_ts_yeo            <- .step_ts_yeo
  step_ts_nzv            <- .step_ts_nzv
  
  # * Checks ----
  if(!is.data.frame(.data)){
    stop(call. = FALSE, "You must supply a data.frame/tibble.")
  }
  
  if(rlang::quo_is_missing(date_col_var_expr)){
    stop(call. = FALSE, "The (.date_col) must be supplied.")
  }
  
  if(rlang::quo_is_missing(pred_col_var_expr)){
    stop(call. = FALSE, "The (.pred_col) must be supplied.")
  }
  
  # * Data ----
  data_tbl <- tibble::as_tibble(.data)
  
  data_tbl <- data_tbl %>%
    dplyr::select(
      {{ date_col_var_expr }}
      , {{ pred_col_var_expr }}
      , dplyr::everything()
    ) 
  # %>%
  #   dplyr::rename(
  #     date_col    = {{ date_col_var_expr }}
  #     , value_col = {{ pred_col_var_expr }}
  #   )
  
  # Original Col names ----
  ds <- rlang::sym(names(data_tbl)[[1]])
  v  <- rlang::sym(names(data_tbl)[[2]])
  f <- as.formula(paste(v, " ~ ."))
  
  # * Recipe Objects ----
  # ** Base recipe ----
  rec_base_obj <- recipes::recipe(
    formula = f
    , data = data_tbl
  )
  
  # * Add Steps ----
  # ** ts signature and normalize ----
  if(step_ts_sig){
    rec_date_obj <- rec_base_obj %>%
      timetk::step_timeseries_signature(ds) %>%
      recipes::step_normalize(
        dplyr::contains("index.num")
        , dplyr::contains("date_col_year")
      )
  }
  
  # ** Step rm ----
  if(step_ts_rm_misc){
    rec_date_obj <- rec_date_obj %>%
      recipes::step_rm(dplyr::matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
  }
  
  # ** Step Dummy ----
  if(step_ts_dummy){
    rec_date_obj <- rec_date_obj %>%
      recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = TRUE)
  }
  
  # ** Step Fourier ----
  if(step_ts_fourier){
    rec_date_fourier_obj <- rec_date_obj %>%
      timetk::step_fourier(
        ds
        , period = 1#step_ts_fourier_period
        , K      = 1#step_ts_fourier_k
      )
  }
  # ** Step YeoJohnson ----
  if(step_ts_yeo){
    rec_date_fourier_obj <- rec_date_fourier_obj %>%
      recipes::step_YeoJohnson(!!v, limits = c(0, 1))
  }
  
  # ** Step NZV ----
  if(step_ts_nzv){
    rec_date_fourier_nzv_obj <- rec_date_fourier_obj %>%
      recipes::step_nzv(recipes::all_predictors())
  }
  
  # * Recipe List ----
  rec_lst <- list(
    rec_base             = rec_base_obj,
    rec_date             = rec_date_obj,
    rec_date_fourier     = rec_date_fourier_obj,
    rec_date_fourier_nzv = rec_date_fourier_nzv_obj
  )
  
  # * Return ----
  return(rec_lst)
  
}

然后 运行 以下内容:

> rec_objs <- ts_auto_recipe_b(.data = data_tbl, .date_col = visit_date, .pred_col = visits)
> rec_objs[[1]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits"    
> rec_objs[[2]] %>% prep() %>% juice() %>% names()
 [1] "visit_date"              "visits"                  "visit_date_index.num"   
 [4] "visit_date_year"         "visit_date_half"         "visit_date_quarter"     
 [7] "visit_date_month"        "visit_date_day"          "visit_date_wday"        
[10] "visit_date_mday"         "visit_date_qday"         "visit_date_yday"        
[13] "visit_date_mweek"        "visit_date_week"         "visit_date_week2"       
[16] "visit_date_week3"        "visit_date_week4"        "visit_date_mday7"       
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1"   "visit_date_wday.lbl_2"   "visit_date_wday.lbl_3"  
[34] "visit_date_wday.lbl_4"   "visit_date_wday.lbl_5"   "visit_date_wday.lbl_6"  
[37] "visit_date_wday.lbl_7"  
> rec_objs[[3]] %>% prep() %>% juice() %>% names()
 [1] "visit_date"              "visits"                  "visit_date_index.num"   
 [4] "visit_date_year"         "visit_date_half"         "visit_date_quarter"     
 [7] "visit_date_month"        "visit_date_day"          "visit_date_wday"        
[10] "visit_date_mday"         "visit_date_qday"         "visit_date_yday"        
[13] "visit_date_mweek"        "visit_date_week"         "visit_date_week2"       
[16] "visit_date_week3"        "visit_date_week4"        "visit_date_mday7"       
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1"   "visit_date_wday.lbl_2"   "visit_date_wday.lbl_3"  
[34] "visit_date_wday.lbl_4"   "visit_date_wday.lbl_5"   "visit_date_wday.lbl_6"  
[37] "visit_date_wday.lbl_7"   "visit_date_sin1_K1"      "visit_date_cos1_K1"     
> rec_objs[[4]] %>% prep() %>% juice() %>% names()
 [1] "visit_date"              "visits"                  "visit_date_index.num"   
 [4] "visit_date_half"         "visit_date_quarter"      "visit_date_month"       
 [7] "visit_date_day"          "visit_date_mday"         "visit_date_qday"        
[10] "visit_date_yday"         "visit_date_mweek"        "visit_date_week"        
[13] "visit_date_week2"        "visit_date_week3"        "visit_date_week4"       
[16] "visit_date_mday7"        "visit_date_month.lbl_01" "visit_date_month.lbl_02"
[19] "visit_date_month.lbl_03" "visit_date_month.lbl_04" "visit_date_month.lbl_05"
[22] "visit_date_month.lbl_06" "visit_date_month.lbl_07" "visit_date_month.lbl_08"
[25] "visit_date_month.lbl_09" "visit_date_month.lbl_10" "visit_date_sin1_K1"     
[28] "visit_date_cos1_K1"   

将显示 visit_datevisits 通过使用 !!v 用于配方函数来根据需要传递给函数,而 timetk 允许传递变量.