创建食谱并动态传递列名
Create Recipes and passing column names dynamically
我有一个函数可以简单地创建几个食谱对象。问题是在函数内部我必须重命名传递的 data.frame
/tibble
的列,以便我可以使 recipes
.
出于显而易见的原因,我不想这样做,主要是,列名必须是 data.frame
本身中的名称,否则它们将无法工作。
简单示例:
library(tidyverse)
data_tbl <- tibble(
visit_date = seq(
from = as.Date("2021-01-01"),
to = as.Date("2021-10-15"),
by = 7,
),
visits = rnbinom(
n = 42,
size = 100,
mu = 66
)
)
ts_auto_recipe <- function(.data, .date_col, .pred_col){
# * Tidyeval ----
date_col_var <- rlang::enquo(.date_col)
pred_col_var <- rlang::enquo(.pred_col)
# * Checks ----
if(!is.data.frame(.data)){
stop(call. = FALSE, "You must supply a data.frame/tibble.")
}
if(rlang::quo_is_missing(date_col_var)){
stop(call. = FALSE, "The (.date_col) must be supplied.")
}
if(rlang::quo_is_missing(pred_col_var)){
stop(call. = FALSE, "The (.pred_col) must be supplied.")
}
# * Data ----
data_tbl <- tibble::as_tibble(.data)
data_tbl <- data_tbl %>%
dplyr::select(
{{ date_col_var }}, {{ pred_col_var }}, dplyr::everything()
) %>%
dplyr::rename(
date_col = {{ date_col_var }}
, value_col = {{ pred_col_var }}
)
# * Recipe Objects ----
# ** Base recipe ----
rec_base_obj <- recipes::recipe(
formula = date_col ~ . # I have to do the above so I can do this, which I don't like
, data = data_tbl
)
# * Add Steps ----
# ** ts signature and normalize ----
rec_date_obj <- rec_base_obj %>%
timetk::step_timeseries_signature(date_col) %>%
recipes::step_normalize(
dplyr::contains("index.num")
, dplyr::contains("date_col_year")
)
# * Recipe List ----
rec_lst <- list(
rec_base = rec_base_obj,
rec_date = rec_date_obj
)
# * Return ----
return(rec_lst)
}
rec_objs <- ts_auto_recipe(data_tbl, visit_date, visits)
我这样做的原因是因为我不能在配方函数本身内部使用动态名称,所以像 rlang::sym(names(data_tbl)[[1]])
这样的东西将不起作用,像 data_tbl[[1]]
这样的东西也不会。我正在考虑使用 step_rename()
之类的东西,但这需要您提前知道名称,并且它不能是配方步骤中的变量。但是,您可以将变量传递给 timetk::step_time_series_signature
我唯一能想到的另一件事是强制用户使用特定的列名,就像在 ds
和 y
的 Facebook Prophet R 库中一样
我还注意到,当我 运行 rec_objs
我得到以下结果时,我在终端上得到了一些奇怪的输出:
> rec_objs
$rec_base
Recipe
Inputs:
role #variables
outcome 1
predictor 1
$rec_date
Recipe
Inputs:
role #variables
outcome 1
predictor 1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("ÿþindex.numÿþ"), dplyr::contains("ÿþdate_col...
然而当我这样做时:
> rec_objs[[2]]
Recipe
Inputs:
role #variables
outcome 1
predictor 1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("index.num"), dplyr::contains("date_col_year")
不会发生。
谢谢,
我想我已经找到了解决这个问题的方法,请参阅以下自定义函数:
ts_auto_recipe_b <- function(.data
, .date_col
, .pred_col
, .step_ts_sig = TRUE
, .step_ts_rm_misc = TRUE
, .step_ts_dummy = TRUE
, .step_ts_fourier = TRUE
, .step_ts_fourier_period = 1
, .K = 1
, .step_ts_yeo = TRUE
, .step_ts_nzv = TRUE) {
# * Tidyeval ----
date_col_var_expr <- rlang::enquo(.date_col)
pred_col_var_expr <- rlang::enquo(.pred_col)
step_ts_sig <- .step_ts_sig
step_ts_rm_misc <- .step_ts_rm_misc
step_ts_dummy <- .step_ts_dummy
step_ts_fourier <- .step_ts_fourier
step_ts_fourier_k <- .K
step_ts_fourier_period <- .step_ts_fourier_period
step_ts_yeo <- .step_ts_yeo
step_ts_nzv <- .step_ts_nzv
# * Checks ----
if(!is.data.frame(.data)){
stop(call. = FALSE, "You must supply a data.frame/tibble.")
}
if(rlang::quo_is_missing(date_col_var_expr)){
stop(call. = FALSE, "The (.date_col) must be supplied.")
}
if(rlang::quo_is_missing(pred_col_var_expr)){
stop(call. = FALSE, "The (.pred_col) must be supplied.")
}
# * Data ----
data_tbl <- tibble::as_tibble(.data)
data_tbl <- data_tbl %>%
dplyr::select(
{{ date_col_var_expr }}
, {{ pred_col_var_expr }}
, dplyr::everything()
)
# %>%
# dplyr::rename(
# date_col = {{ date_col_var_expr }}
# , value_col = {{ pred_col_var_expr }}
# )
# Original Col names ----
ds <- rlang::sym(names(data_tbl)[[1]])
v <- rlang::sym(names(data_tbl)[[2]])
f <- as.formula(paste(v, " ~ ."))
# * Recipe Objects ----
# ** Base recipe ----
rec_base_obj <- recipes::recipe(
formula = f
, data = data_tbl
)
# * Add Steps ----
# ** ts signature and normalize ----
if(step_ts_sig){
rec_date_obj <- rec_base_obj %>%
timetk::step_timeseries_signature(ds) %>%
recipes::step_normalize(
dplyr::contains("index.num")
, dplyr::contains("date_col_year")
)
}
# ** Step rm ----
if(step_ts_rm_misc){
rec_date_obj <- rec_date_obj %>%
recipes::step_rm(dplyr::matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
}
# ** Step Dummy ----
if(step_ts_dummy){
rec_date_obj <- rec_date_obj %>%
recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = TRUE)
}
# ** Step Fourier ----
if(step_ts_fourier){
rec_date_fourier_obj <- rec_date_obj %>%
timetk::step_fourier(
ds
, period = 1#step_ts_fourier_period
, K = 1#step_ts_fourier_k
)
}
# ** Step YeoJohnson ----
if(step_ts_yeo){
rec_date_fourier_obj <- rec_date_fourier_obj %>%
recipes::step_YeoJohnson(!!v, limits = c(0, 1))
}
# ** Step NZV ----
if(step_ts_nzv){
rec_date_fourier_nzv_obj <- rec_date_fourier_obj %>%
recipes::step_nzv(recipes::all_predictors())
}
# * Recipe List ----
rec_lst <- list(
rec_base = rec_base_obj,
rec_date = rec_date_obj,
rec_date_fourier = rec_date_fourier_obj,
rec_date_fourier_nzv = rec_date_fourier_nzv_obj
)
# * Return ----
return(rec_lst)
}
然后 运行 以下内容:
> rec_objs <- ts_auto_recipe_b(.data = data_tbl, .date_col = visit_date, .pred_col = visits)
> rec_objs[[1]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits"
> rec_objs[[2]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7"
> rec_objs[[3]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7" "visit_date_sin1_K1" "visit_date_cos1_K1"
> rec_objs[[4]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_half" "visit_date_quarter" "visit_date_month"
[7] "visit_date_day" "visit_date_mday" "visit_date_qday"
[10] "visit_date_yday" "visit_date_mweek" "visit_date_week"
[13] "visit_date_week2" "visit_date_week3" "visit_date_week4"
[16] "visit_date_mday7" "visit_date_month.lbl_01" "visit_date_month.lbl_02"
[19] "visit_date_month.lbl_03" "visit_date_month.lbl_04" "visit_date_month.lbl_05"
[22] "visit_date_month.lbl_06" "visit_date_month.lbl_07" "visit_date_month.lbl_08"
[25] "visit_date_month.lbl_09" "visit_date_month.lbl_10" "visit_date_sin1_K1"
[28] "visit_date_cos1_K1"
将显示 visit_date
和 visits
通过使用 !!v
用于配方函数来根据需要传递给函数,而 timetk
允许传递变量.
我有一个函数可以简单地创建几个食谱对象。问题是在函数内部我必须重命名传递的 data.frame
/tibble
的列,以便我可以使 recipes
.
出于显而易见的原因,我不想这样做,主要是,列名必须是 data.frame
本身中的名称,否则它们将无法工作。
简单示例:
library(tidyverse)
data_tbl <- tibble(
visit_date = seq(
from = as.Date("2021-01-01"),
to = as.Date("2021-10-15"),
by = 7,
),
visits = rnbinom(
n = 42,
size = 100,
mu = 66
)
)
ts_auto_recipe <- function(.data, .date_col, .pred_col){
# * Tidyeval ----
date_col_var <- rlang::enquo(.date_col)
pred_col_var <- rlang::enquo(.pred_col)
# * Checks ----
if(!is.data.frame(.data)){
stop(call. = FALSE, "You must supply a data.frame/tibble.")
}
if(rlang::quo_is_missing(date_col_var)){
stop(call. = FALSE, "The (.date_col) must be supplied.")
}
if(rlang::quo_is_missing(pred_col_var)){
stop(call. = FALSE, "The (.pred_col) must be supplied.")
}
# * Data ----
data_tbl <- tibble::as_tibble(.data)
data_tbl <- data_tbl %>%
dplyr::select(
{{ date_col_var }}, {{ pred_col_var }}, dplyr::everything()
) %>%
dplyr::rename(
date_col = {{ date_col_var }}
, value_col = {{ pred_col_var }}
)
# * Recipe Objects ----
# ** Base recipe ----
rec_base_obj <- recipes::recipe(
formula = date_col ~ . # I have to do the above so I can do this, which I don't like
, data = data_tbl
)
# * Add Steps ----
# ** ts signature and normalize ----
rec_date_obj <- rec_base_obj %>%
timetk::step_timeseries_signature(date_col) %>%
recipes::step_normalize(
dplyr::contains("index.num")
, dplyr::contains("date_col_year")
)
# * Recipe List ----
rec_lst <- list(
rec_base = rec_base_obj,
rec_date = rec_date_obj
)
# * Return ----
return(rec_lst)
}
rec_objs <- ts_auto_recipe(data_tbl, visit_date, visits)
我这样做的原因是因为我不能在配方函数本身内部使用动态名称,所以像 rlang::sym(names(data_tbl)[[1]])
这样的东西将不起作用,像 data_tbl[[1]]
这样的东西也不会。我正在考虑使用 step_rename()
之类的东西,但这需要您提前知道名称,并且它不能是配方步骤中的变量。但是,您可以将变量传递给 timetk::step_time_series_signature
我唯一能想到的另一件事是强制用户使用特定的列名,就像在 ds
和 y
我还注意到,当我 运行 rec_objs
我得到以下结果时,我在终端上得到了一些奇怪的输出:
> rec_objs
$rec_base
Recipe
Inputs:
role #variables
outcome 1
predictor 1
$rec_date
Recipe
Inputs:
role #variables
outcome 1
predictor 1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("ÿþindex.numÿþ"), dplyr::contains("ÿþdate_col...
然而当我这样做时:
> rec_objs[[2]]
Recipe
Inputs:
role #variables
outcome 1
predictor 1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("index.num"), dplyr::contains("date_col_year")
不会发生。
谢谢,
我想我已经找到了解决这个问题的方法,请参阅以下自定义函数:
ts_auto_recipe_b <- function(.data
, .date_col
, .pred_col
, .step_ts_sig = TRUE
, .step_ts_rm_misc = TRUE
, .step_ts_dummy = TRUE
, .step_ts_fourier = TRUE
, .step_ts_fourier_period = 1
, .K = 1
, .step_ts_yeo = TRUE
, .step_ts_nzv = TRUE) {
# * Tidyeval ----
date_col_var_expr <- rlang::enquo(.date_col)
pred_col_var_expr <- rlang::enquo(.pred_col)
step_ts_sig <- .step_ts_sig
step_ts_rm_misc <- .step_ts_rm_misc
step_ts_dummy <- .step_ts_dummy
step_ts_fourier <- .step_ts_fourier
step_ts_fourier_k <- .K
step_ts_fourier_period <- .step_ts_fourier_period
step_ts_yeo <- .step_ts_yeo
step_ts_nzv <- .step_ts_nzv
# * Checks ----
if(!is.data.frame(.data)){
stop(call. = FALSE, "You must supply a data.frame/tibble.")
}
if(rlang::quo_is_missing(date_col_var_expr)){
stop(call. = FALSE, "The (.date_col) must be supplied.")
}
if(rlang::quo_is_missing(pred_col_var_expr)){
stop(call. = FALSE, "The (.pred_col) must be supplied.")
}
# * Data ----
data_tbl <- tibble::as_tibble(.data)
data_tbl <- data_tbl %>%
dplyr::select(
{{ date_col_var_expr }}
, {{ pred_col_var_expr }}
, dplyr::everything()
)
# %>%
# dplyr::rename(
# date_col = {{ date_col_var_expr }}
# , value_col = {{ pred_col_var_expr }}
# )
# Original Col names ----
ds <- rlang::sym(names(data_tbl)[[1]])
v <- rlang::sym(names(data_tbl)[[2]])
f <- as.formula(paste(v, " ~ ."))
# * Recipe Objects ----
# ** Base recipe ----
rec_base_obj <- recipes::recipe(
formula = f
, data = data_tbl
)
# * Add Steps ----
# ** ts signature and normalize ----
if(step_ts_sig){
rec_date_obj <- rec_base_obj %>%
timetk::step_timeseries_signature(ds) %>%
recipes::step_normalize(
dplyr::contains("index.num")
, dplyr::contains("date_col_year")
)
}
# ** Step rm ----
if(step_ts_rm_misc){
rec_date_obj <- rec_date_obj %>%
recipes::step_rm(dplyr::matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
}
# ** Step Dummy ----
if(step_ts_dummy){
rec_date_obj <- rec_date_obj %>%
recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = TRUE)
}
# ** Step Fourier ----
if(step_ts_fourier){
rec_date_fourier_obj <- rec_date_obj %>%
timetk::step_fourier(
ds
, period = 1#step_ts_fourier_period
, K = 1#step_ts_fourier_k
)
}
# ** Step YeoJohnson ----
if(step_ts_yeo){
rec_date_fourier_obj <- rec_date_fourier_obj %>%
recipes::step_YeoJohnson(!!v, limits = c(0, 1))
}
# ** Step NZV ----
if(step_ts_nzv){
rec_date_fourier_nzv_obj <- rec_date_fourier_obj %>%
recipes::step_nzv(recipes::all_predictors())
}
# * Recipe List ----
rec_lst <- list(
rec_base = rec_base_obj,
rec_date = rec_date_obj,
rec_date_fourier = rec_date_fourier_obj,
rec_date_fourier_nzv = rec_date_fourier_nzv_obj
)
# * Return ----
return(rec_lst)
}
然后 运行 以下内容:
> rec_objs <- ts_auto_recipe_b(.data = data_tbl, .date_col = visit_date, .pred_col = visits)
> rec_objs[[1]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits"
> rec_objs[[2]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7"
> rec_objs[[3]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_year" "visit_date_half" "visit_date_quarter"
[7] "visit_date_month" "visit_date_day" "visit_date_wday"
[10] "visit_date_mday" "visit_date_qday" "visit_date_yday"
[13] "visit_date_mweek" "visit_date_week" "visit_date_week2"
[16] "visit_date_week3" "visit_date_week4" "visit_date_mday7"
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1" "visit_date_wday.lbl_2" "visit_date_wday.lbl_3"
[34] "visit_date_wday.lbl_4" "visit_date_wday.lbl_5" "visit_date_wday.lbl_6"
[37] "visit_date_wday.lbl_7" "visit_date_sin1_K1" "visit_date_cos1_K1"
> rec_objs[[4]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits" "visit_date_index.num"
[4] "visit_date_half" "visit_date_quarter" "visit_date_month"
[7] "visit_date_day" "visit_date_mday" "visit_date_qday"
[10] "visit_date_yday" "visit_date_mweek" "visit_date_week"
[13] "visit_date_week2" "visit_date_week3" "visit_date_week4"
[16] "visit_date_mday7" "visit_date_month.lbl_01" "visit_date_month.lbl_02"
[19] "visit_date_month.lbl_03" "visit_date_month.lbl_04" "visit_date_month.lbl_05"
[22] "visit_date_month.lbl_06" "visit_date_month.lbl_07" "visit_date_month.lbl_08"
[25] "visit_date_month.lbl_09" "visit_date_month.lbl_10" "visit_date_sin1_K1"
[28] "visit_date_cos1_K1"
将显示 visit_date
和 visits
通过使用 !!v
用于配方函数来根据需要传递给函数,而 timetk
允许传递变量.