在工作流中预测该列不存在
Predict in workflow throws that column doesn't exist
给定以下代码
library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)
df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")
df <- df %>%
mutate(Date = dmy(Date),
Change_Percent = str_replace(Change_Percent, "%", ""),
Change_Percent = as.double(Change_Percent)
) %>%
filter(year(Date) > 2017)
int <- interval(ymd("2020-01-20"),
ymd("2022-01-15"))
df <- df %>%
mutate(covid = ifelse(Date %within% int, T, F))
df %>%
ggplot(aes(x = Date, y = Price, color = covid)) +
geom_line()
df <- df %>%
arrange(Date) %>%
mutate(lag1 = lag(Price),
lag2 = lag(lag1),
lag3 = lag(lag2),
profit_next_day = lead(Profit))
# modelatge
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day))
set.seed(42)
data_split <- initial_split(df_mod) # 3/4
train_data <- training(data_split)
test_data <- testing(data_split)
bitcoin_rec <-
recipe(profit_next_day ~ ., data = train_data) %>%
step_naomit(all_outcomes(), all_predictors()) %>%
step_normalize(all_numeric_predictors())
bitcoin_prep <-
prep(bitcoin_rec)
bitcoin_train <- juice(bitcoin_prep)
bitcoin_test <- bake(bitcoin_prep, test_data)
rf_spec <-
rand_forest(trees = 200) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
bitcoin_wflow <-
workflow() %>%
add_model(rf_spec) %>%
add_recipe(bitcoin_prep)
bitcoin_fit <-
bitcoin_wflow %>%
fit(data = train_data)
final_model <- last_fit(bitcoin_wflow, data_split)
collect_metrics(final_model)
final_model %>%
extract_workflow() %>%
predict(test_data)
提取工作流并预测 test_data
的最后一段代码抛出错误:
Error in stop_subscript()
: ! Can't subset columns that don't exist.
x Column profit_next_day
doesn't exist.
但是 profit_next_day
已经存在于 test_data
中,因为我检查了多次,所以我不知道发生了什么。在使用 tidymodels
.
之前从未出现过此错误
这里的问题来自对结果使用 step_naomit()
。通常,当需要重新采样或预测新数据时,更改 行 的步骤(例如删除它们)可能会非常棘手。您可以 read more in detail in our book,但我建议您从食谱中完全删除 step_naomit()
,并将之前的代码更改为:
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day)) %>%
na.omit()
给定以下代码
library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)
df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")
df <- df %>%
mutate(Date = dmy(Date),
Change_Percent = str_replace(Change_Percent, "%", ""),
Change_Percent = as.double(Change_Percent)
) %>%
filter(year(Date) > 2017)
int <- interval(ymd("2020-01-20"),
ymd("2022-01-15"))
df <- df %>%
mutate(covid = ifelse(Date %within% int, T, F))
df %>%
ggplot(aes(x = Date, y = Price, color = covid)) +
geom_line()
df <- df %>%
arrange(Date) %>%
mutate(lag1 = lag(Price),
lag2 = lag(lag1),
lag3 = lag(lag2),
profit_next_day = lead(Profit))
# modelatge
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day))
set.seed(42)
data_split <- initial_split(df_mod) # 3/4
train_data <- training(data_split)
test_data <- testing(data_split)
bitcoin_rec <-
recipe(profit_next_day ~ ., data = train_data) %>%
step_naomit(all_outcomes(), all_predictors()) %>%
step_normalize(all_numeric_predictors())
bitcoin_prep <-
prep(bitcoin_rec)
bitcoin_train <- juice(bitcoin_prep)
bitcoin_test <- bake(bitcoin_prep, test_data)
rf_spec <-
rand_forest(trees = 200) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
bitcoin_wflow <-
workflow() %>%
add_model(rf_spec) %>%
add_recipe(bitcoin_prep)
bitcoin_fit <-
bitcoin_wflow %>%
fit(data = train_data)
final_model <- last_fit(bitcoin_wflow, data_split)
collect_metrics(final_model)
final_model %>%
extract_workflow() %>%
predict(test_data)
提取工作流并预测 test_data
的最后一段代码抛出错误:
Error in
stop_subscript()
: ! Can't subset columns that don't exist. x Columnprofit_next_day
doesn't exist.
但是 profit_next_day
已经存在于 test_data
中,因为我检查了多次,所以我不知道发生了什么。在使用 tidymodels
.
这里的问题来自对结果使用 step_naomit()
。通常,当需要重新采样或预测新数据时,更改 行 的步骤(例如删除它们)可能会非常棘手。您可以 read more in detail in our book,但我建议您从食谱中完全删除 step_naomit()
,并将之前的代码更改为:
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day)) %>%
na.omit()