有没有办法在 R(或 ml3)的食谱包中对行(尤其是虚拟变量)进行分组
Is there a way to group rows (especially dummy variables) in the recipes package in R (or ml3)
# Packages
library(dplyr)
library(recipes)
# toy dataset, with A being multicolored
df <- tibble(name = c("A", "A", "A", "B", "C"), color = c("green", "yellow", "purple", "green", "blue"))
#> # A tibble: 5 x 2
#> name color
#> <chr> <chr>
#> 1 A green
#> 2 A yellow
#> 3 A purple
#> 4 B green
#> 5 C blue
食谱步骤效果很好
dummified_df <- recipe(. ~ ., data = df) %>%
step_dummy(color, one_hot = TRUE) %>%
prep(training = df) %>%
juice()
#> # A tibble: 5 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 0 0
#> 2 A 0 0 0 1
#> 3 A 0 0 1 0
#> 4 B 0 1 0 0
#> 5 C 1 0 0 0
但我真正想要获得的结果是下面的结果,每行一个观察,现在彩色项目不再需要多行。
summarized_dummified_df <- dummified_df %>%
group_by(name) %>%
summarise_all(~ifelse(max(.) > 0, 1, 0)) %>%
ungroup()
#> # A tibble: 3 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 1 1
#> 2 B 0 1 0 0
#> 3 C 1 0 0 0
显然,我可以这样做。
但是为了将我的食谱步骤完全集成到 tidymodels
生态系统中,例如通过工作流,如果我可以将不再需要重复的行分组,这会更好,这要归功于直接在食谱中的虚拟变量.
是否有任何 tidymodels-sanctioned 方法来获得此结果?
我也尝试用 mlr3
来做这个,但没有成功,因为我找不到任何合适的 PipeOp
来聚合行。
library("mlr3")
library("mlr3pipelines")
task = TaskClassif$new("task",
data.table::data.table(
name = c("A", "A", "A", "B", "C"),
color = as.factor(c("green", "yellow", "purple", "green", "blue")),
price = as.factor(c("low", "low", "low", "high", "low"))),
"price"
)
poe = po("encode")
poe$train(list(task))[[1]]$data()
#> price name color.blue color.green color.purple color.yellow
#> 1: low A 0 1 0 0
#> 2: low A 0 0 0 1
#> 3: low A 0 0 1 0
#> 4: high B 0 1 0 0
#> 5: low C 1 0 0 0
我正在研究 custom step_
functions or custom PipeOp
的创建,但我仍然觉得我遗漏了一些东西,因为我的数据类型并不 不常见我.
我为食谱包编写了以下自定义步骤。
step_summarize <- function(
recipe,
...,
role = NA,
trained = FALSE,
col_names = NULL,
skip = FALSE,
id = rand_id("summarize")
){
terms <- ellipse_check(...)
add_step(
recipe,
step_summarize_new(
terms = terms,
role = role,
trained = trained,
col_names = col_names,
skip = skip,
id = id
)
)
}
step_summarize_new <-
function(terms, role, trained, col_names, skip, id) {
step(
subclass = "summarize",
terms = terms,
role = role,
trained = trained,
col_names = col_names,
skip = skip,
id = id
)
}
prep.step_summarize <- function(x, training, info = NULL, ...) {
col_names <- terms_select(terms = x$terms, info = info)
step_summarize_new(
terms = x$terms,
trained = TRUE,
role = x$role,
col_names = col_names,
skip = x$skip,
id = x$id
)
}
bake.step_summarize <- function(object, new_data, ...) {
vars <- object$col_names
new_data <- new_data %>%
group_by(across(- any_of(vars))) %>%
summarise(across(any_of(vars), ~ifelse(max(.) > 0, 1, 0)))
## Always convert to tibbles on the way out
tibble::as_tibble(new_data)
}
它在我的真实数据集上作为预处理步骤正常工作,但在使用 tune
时进一步中断了该行。
这可能链接到 this issue
虚拟变量或指示变量在概念上映射为 one-to-one,而不是 one-to-many,我认为这就是您 运行 参与其中的原因。不过,和你一样,我也想在现实世界中的某个时候 one-to-many 绘制它们的地图。我通常在开始模型预处理工作流之前在数据整理步骤中执行此操作,如下所示:
library(tidyverse)
# toy dataset, with A being multicolored
df <- tibble(name = c("A", "A", "A", "B", "C"), color = c("green", "yellow", "purple", "green", "blue"))
df %>%
mutate(value = 1) %>%
pivot_wider(names_from = "color", names_prefix = "color_", values_from = "value", values_fill = 0)
#> # A tibble: 3 x 5
#> name color_green color_yellow color_purple color_blue
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 1 1 0
#> 2 B 1 0 0 0
#> 3 C 0 0 0 1
由 reprex package (v0.3.0.9001)
于 2020-08-18 创建
# Packages
library(dplyr)
library(recipes)
# toy dataset, with A being multicolored
df <- tibble(name = c("A", "A", "A", "B", "C"), color = c("green", "yellow", "purple", "green", "blue"))
#> # A tibble: 5 x 2
#> name color
#> <chr> <chr>
#> 1 A green
#> 2 A yellow
#> 3 A purple
#> 4 B green
#> 5 C blue
食谱步骤效果很好
dummified_df <- recipe(. ~ ., data = df) %>%
step_dummy(color, one_hot = TRUE) %>%
prep(training = df) %>%
juice()
#> # A tibble: 5 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 0 0
#> 2 A 0 0 0 1
#> 3 A 0 0 1 0
#> 4 B 0 1 0 0
#> 5 C 1 0 0 0
但我真正想要获得的结果是下面的结果,每行一个观察,现在彩色项目不再需要多行。
summarized_dummified_df <- dummified_df %>%
group_by(name) %>%
summarise_all(~ifelse(max(.) > 0, 1, 0)) %>%
ungroup()
#> # A tibble: 3 x 5
#> name color_blue color_green color_purple color_yellow
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 A 0 1 1 1
#> 2 B 0 1 0 0
#> 3 C 1 0 0 0
显然,我可以这样做。
但是为了将我的食谱步骤完全集成到 tidymodels
生态系统中,例如通过工作流,如果我可以将不再需要重复的行分组,这会更好,这要归功于直接在食谱中的虚拟变量.
是否有任何 tidymodels-sanctioned 方法来获得此结果?
我也尝试用 mlr3
来做这个,但没有成功,因为我找不到任何合适的 PipeOp
来聚合行。
library("mlr3")
library("mlr3pipelines")
task = TaskClassif$new("task",
data.table::data.table(
name = c("A", "A", "A", "B", "C"),
color = as.factor(c("green", "yellow", "purple", "green", "blue")),
price = as.factor(c("low", "low", "low", "high", "low"))),
"price"
)
poe = po("encode")
poe$train(list(task))[[1]]$data()
#> price name color.blue color.green color.purple color.yellow
#> 1: low A 0 1 0 0
#> 2: low A 0 0 0 1
#> 3: low A 0 0 1 0
#> 4: high B 0 1 0 0
#> 5: low C 1 0 0 0
我正在研究 custom step_
functions or custom PipeOp
的创建,但我仍然觉得我遗漏了一些东西,因为我的数据类型并不 不常见我.
我为食谱包编写了以下自定义步骤。
step_summarize <- function(
recipe,
...,
role = NA,
trained = FALSE,
col_names = NULL,
skip = FALSE,
id = rand_id("summarize")
){
terms <- ellipse_check(...)
add_step(
recipe,
step_summarize_new(
terms = terms,
role = role,
trained = trained,
col_names = col_names,
skip = skip,
id = id
)
)
}
step_summarize_new <-
function(terms, role, trained, col_names, skip, id) {
step(
subclass = "summarize",
terms = terms,
role = role,
trained = trained,
col_names = col_names,
skip = skip,
id = id
)
}
prep.step_summarize <- function(x, training, info = NULL, ...) {
col_names <- terms_select(terms = x$terms, info = info)
step_summarize_new(
terms = x$terms,
trained = TRUE,
role = x$role,
col_names = col_names,
skip = x$skip,
id = x$id
)
}
bake.step_summarize <- function(object, new_data, ...) {
vars <- object$col_names
new_data <- new_data %>%
group_by(across(- any_of(vars))) %>%
summarise(across(any_of(vars), ~ifelse(max(.) > 0, 1, 0)))
## Always convert to tibbles on the way out
tibble::as_tibble(new_data)
}
它在我的真实数据集上作为预处理步骤正常工作,但在使用 tune
时进一步中断了该行。
这可能链接到 this issue
虚拟变量或指示变量在概念上映射为 one-to-one,而不是 one-to-many,我认为这就是您 运行 参与其中的原因。不过,和你一样,我也想在现实世界中的某个时候 one-to-many 绘制它们的地图。我通常在开始模型预处理工作流之前在数据整理步骤中执行此操作,如下所示:
library(tidyverse)
# toy dataset, with A being multicolored
df <- tibble(name = c("A", "A", "A", "B", "C"), color = c("green", "yellow", "purple", "green", "blue"))
df %>%
mutate(value = 1) %>%
pivot_wider(names_from = "color", names_prefix = "color_", values_from = "value", values_fill = 0)
#> # A tibble: 3 x 5
#> name color_green color_yellow color_purple color_blue
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 1 1 0
#> 2 B 1 0 0 0
#> 3 C 0 0 0 1
由 reprex package (v0.3.0.9001)
于 2020-08-18 创建