Tidymodels:在 Date 列中估算缺失值的正确方法是什么?
Tidymodels: What is the correct way to impute missing values in a Date column?
我在处理日期列中的缺失值时遇到了一些困难。
在我的预处理管道(recipe
-对象)中,我使用 step_impute_knn
函数来填充我所有日期列中的缺失值。不幸的是我收到以下错误:
Assigned data `pred_vals` must be compatible with existing data. ℹ Error occurred for column `avg_begin_first_contract`. ✖ Can't convert <double> to <date>.
下面是一个 reprex,其中我对多个列(包括 Date 列)进行插补。即使只对 Date 列进行插补也没有区别,结果是一样的。再往下还有一个 reprex,它不会抛出错误,因为其中没有使用 Date 列。
以前有人遇到过这个问题吗?
# Reprex: imputing a Date column with step_impute_knn() (fails, see output).
library(tidyverse)
library(tidymodels)

# Add a Date column of 150 days drawn from 1999-01-01 .. 2000-01-01.
# NOTE: the draw happens before set.seed(), exactly as in the original post.
candidate_days <- seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day")
iris <- mutate(iris, Plucked = sample(candidate_days, size = 150))

# Blank out one cell in each of columns 2-6 (column 6 is the new Date column).
iris[45, 2] <- NA_real_
iris[37, 3] <- NA_real_
iris[78, 4] <- NA_real_
iris[9, 5] <- NA_real_
iris[15, 6] <- as.factor(NA)

# Stratified train/test split.
set.seed(456)
iris_split <- initial_split(iris, strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# Random forest regression specification (ranger engine).
iris_rf_model <- rand_forest(mtry = 10, min_n = 10, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Preprocessing: including the Date column `Plucked` in the knn imputation is
# what triggers the error reported below.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species, Plucked) %>%
  step_date(Plucked) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training set and evaluate on the test set.
iris_rf_wkfl_fit <- last_fit(iris_workflow, iris_split)
#> x train/test split: preprocessor 1/1: Error: Assigned data `pred_vals` must be compatible wi...
#> Warning: All models failed. See the `.notes` column.
Created on 2021-06-15 by the reprex package (v2.0.0)
下面是不会抛出错误的 reprex:
# Reprex: the same pipeline without a Date column (runs without error).
library(tidyverse)
library(tidymodels)

# Blank out one cell in each of columns 2-5 of the built-in iris data.
iris[45, 2] <- NA_real_
iris[37, 3] <- NA_real_
iris[78, 4] <- NA_real_
iris[9, 5] <- NA_real_

# Stratified train/test split.
set.seed(123)
iris_split <- initial_split(iris, strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# Random forest regression specification (ranger engine).
iris_rf_model <- rand_forest(mtry = 5, min_n = 5, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Preprocessing: knn-impute the numeric and factor predictors, then dummy-code
# Species.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training set and evaluate on the test set.
iris_rf_wkfl_fit <- last_fit(iris_workflow, split = iris_split)
Created on 2021-06-15 by the reprex package (v2.0.0)
提前致谢!
M.
我想我找到了答案,并想与大家分享。关键是先将日期转换为数值,这样插补就很容易了。下面是一个 reprex。
# Reprex: impute a Date column by converting it to a number first.
library(tidyverse)
library(tidymodels)

# Seed before the first random draw so the sampled dates are reproducible too
# (the original seeded only after sample() had already run).
set.seed(456)

# Add a Date column of 150 days drawn from 1999-01-01 .. 2000-01-01.
iris <- iris %>%
  mutate(
    Plucked = sample(
      seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
      size = 150
    )
  )

# Blank out one cell in each of columns 2-6 (column 6 is the new Date column).
iris[45, 2] <- as.numeric(NA)
iris[37, 3] <- as.numeric(NA)
iris[78, 4] <- as.numeric(NA)
iris[9, 5] <- as.numeric(NA)
iris[15, 6] <- as.factor(NA)

# Stratified train/test split.
iris_split <- iris %>%
  initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

iris_rf_model <- rand_forest(
  # Only 6 predictors remain after preprocessing, so mtry must not exceed 6.
  # The original `mtry = 10` produced this warning:
  #> ! train/test split: preprocessor 1/1, model 1/1: 10 columns were requested but there were 6 ...
  mtry = 6,
  min_n = 10,
  trees = 500
) %>%
  set_engine("ranger") %>%
  set_mode("regression")

base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  # Turn every Date column into its numeric representation so the imputation
  # steps can handle it. The selected columns are already Dates, so no
  # re-parsing with lubridate::ymd() is needed.
  step_mutate_at(
    where(lubridate::is.Date),
    fn = ~ as.numeric(.x)
  ) %>%
  step_impute_bag(Plucked) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training set and evaluate on the test set.
iris_rf_wkfl_fit <- iris_workflow %>%
  last_fit(iris_split)
Created on 2021-06-29 by the reprex package (v2.0.0)
如果您想在拟合之前从数字恢复为日期,可以通过在代码中添加以下行来实现:
# Recipe step: convert the numeric `Plucked` column back into a Date.
step_mutate_at(
  "Plucked",
  fn = function(day_count) as.Date(day_count, origin = "1970-01-01 UTC")
)
再次感谢,
M.
我在处理日期列中的缺失值时遇到了一些困难。
在我的预处理管道(recipe
-对象)中,我使用 step_impute_knn
函数来填充我所有日期列中的缺失值。不幸的是我收到以下错误:
Assigned data `pred_vals` must be compatible with existing data. ℹ Error occurred for column `avg_begin_first_contract`. ✖ Can't convert <double> to <date>.
下面是一个 reprex,其中我对多个列(包括 Date 列)进行插补。即使只对 Date 列进行插补也没有区别,结果是一样的。再往下还有一个 reprex,它不会抛出错误,因为其中没有使用 Date 列。
以前有人遇到过这个问题吗?
# Reprex: imputing a Date column with step_impute_knn() (fails, see output).
library(tidyverse)
library(tidymodels)

# Add a Date column of 150 days drawn from 1999-01-01 .. 2000-01-01.
# NOTE: the draw happens before set.seed(), exactly as in the original post.
candidate_days <- seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day")
iris <- mutate(iris, Plucked = sample(candidate_days, size = 150))

# Blank out one cell in each of columns 2-6 (column 6 is the new Date column).
iris[45, 2] <- NA_real_
iris[37, 3] <- NA_real_
iris[78, 4] <- NA_real_
iris[9, 5] <- NA_real_
iris[15, 6] <- as.factor(NA)

# Stratified train/test split.
set.seed(456)
iris_split <- initial_split(iris, strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# Random forest regression specification (ranger engine).
iris_rf_model <- rand_forest(mtry = 10, min_n = 10, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Preprocessing: including the Date column `Plucked` in the knn imputation is
# what triggers the error reported below.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species, Plucked) %>%
  step_date(Plucked) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training set and evaluate on the test set.
iris_rf_wkfl_fit <- last_fit(iris_workflow, iris_split)
#> x train/test split: preprocessor 1/1: Error: Assigned data `pred_vals` must be compatible wi...
#> Warning: All models failed. See the `.notes` column.
Created on 2021-06-15 by the reprex package (v2.0.0)
下面是不会抛出错误的 reprex:
# Reprex: the same pipeline without a Date column (runs without error).
library(tidyverse)
library(tidymodels)

# Blank out one cell in each of columns 2-5 of the built-in iris data.
iris[45, 2] <- NA_real_
iris[37, 3] <- NA_real_
iris[78, 4] <- NA_real_
iris[9, 5] <- NA_real_

# Stratified train/test split.
set.seed(123)
iris_split <- initial_split(iris, strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

# Random forest regression specification (ranger engine).
iris_rf_model <- rand_forest(mtry = 5, min_n = 5, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Preprocessing: knn-impute the numeric and factor predictors, then dummy-code
# Species.
base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training set and evaluate on the test set.
iris_rf_wkfl_fit <- last_fit(iris_workflow, split = iris_split)
Created on 2021-06-15 by the reprex package (v2.0.0)
提前致谢! M.
我想我找到了答案,并想与大家分享。关键是先将日期转换为数值,这样插补就很容易了。下面是一个 reprex。
# Reprex: impute a Date column by converting it to a number first.
library(tidyverse)
library(tidymodels)

# Seed before the first random draw so the sampled dates are reproducible too
# (the original seeded only after sample() had already run).
set.seed(456)

# Add a Date column of 150 days drawn from 1999-01-01 .. 2000-01-01.
iris <- iris %>%
  mutate(
    Plucked = sample(
      seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
      size = 150
    )
  )

# Blank out one cell in each of columns 2-6 (column 6 is the new Date column).
iris[45, 2] <- as.numeric(NA)
iris[37, 3] <- as.numeric(NA)
iris[78, 4] <- as.numeric(NA)
iris[9, 5] <- as.numeric(NA)
iris[15, 6] <- as.factor(NA)

# Stratified train/test split.
iris_split <- iris %>%
  initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)

iris_rf_model <- rand_forest(
  # Only 6 predictors remain after preprocessing, so mtry must not exceed 6.
  # The original `mtry = 10` produced this warning:
  #> ! train/test split: preprocessor 1/1, model 1/1: 10 columns were requested but there were 6 ...
  mtry = 6,
  min_n = 10,
  trees = 500
) %>%
  set_engine("ranger") %>%
  set_mode("regression")

base_rec <- recipe(Sepal.Length ~ ., data = iris_training) %>%
  # Turn every Date column into its numeric representation so the imputation
  # steps can handle it. The selected columns are already Dates, so no
  # re-parsing with lubridate::ymd() is needed.
  step_mutate_at(
    where(lubridate::is.Date),
    fn = ~ as.numeric(.x)
  ) %>%
  step_impute_bag(Plucked) %>%
  step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species) %>%
  step_dummy(Species)

iris_workflow <- workflow() %>%
  add_model(iris_rf_model) %>%
  add_recipe(base_rec)

# Fit on the training set and evaluate on the test set.
iris_rf_wkfl_fit <- iris_workflow %>%
  last_fit(iris_split)
Created on 2021-06-29 by the reprex package (v2.0.0)
如果您想在拟合之前从数字恢复为日期,可以通过在代码中添加以下行来实现:
# Recipe step: convert the numeric `Plucked` column back into a Date.
step_mutate_at(
  "Plucked",
  fn = function(day_count) as.Date(day_count, origin = "1970-01-01 UTC")
)
再次感谢, M.