在工作流中预测时报错：列不存在

Question

给定以下代码

library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)

# Download the Bitcoin price table.
df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")

# Parse the day-month-year dates, strip the "%" sign so Change_Percent can be
# stored as a double, and keep only rows dated after 2017.
df <- df %>%
  mutate(
    Date = dmy(Date),
    Change_Percent = as.double(str_replace(Change_Percent, "%", ""))
  ) %>%
  filter(year(Date) > 2017)

# Date window used to build the `covid` flag.
int <- interval(ymd("2020-01-20"), ymd("2022-01-15"))

# `%within%` already yields a logical vector, so it is stored directly.
# Wrapping it in ifelse(..., T, F) added nothing and relied on the
# reassignable `T`/`F` shorthands.
df <- df %>%
  mutate(covid = Date %within% int)

# Price over time, coloured by whether the day falls inside the window.
df %>%
  ggplot(aes(x = Date, y = Price, color = covid)) +
  geom_line()

# Sort chronologically, then add the prices from one, two and three rows
# earlier plus the Profit value of the following row.
df <- df %>%
  arrange(Date) %>%
  mutate(
    lag1 = lag(Price, n = 1),
    lag2 = lag(Price, n = 2),
    lag3 = lag(Price, n = 3),
    profit_next_day = lead(Profit, n = 1)
  )

# modelling
# Turn the outcome into a factor and drop the columns not used by the model.
df_mod <- df %>%
  mutate(profit_next_day = as.factor(profit_next_day)) %>%
  select(-c(covid, Date, Vol_K, Profit))

# Seeded random partition: 3/4 of the rows for training, the rest for testing.
set.seed(42)
data_split <- initial_split(df_mod, prop = 3 / 4)
train_data <- training(data_split)
test_data <- testing(data_split)

# Recipe: model profit_next_day from every other column of train_data.
bitcoin_rec <- 
  recipe(profit_next_day ~ ., data = train_data) %>%
  # Row-removing step applied to the outcome as well as the predictors.
  # The answer further down names this step as the cause of the
  # "Column profit_next_day doesn't exist" error at predict time.
  step_naomit(all_outcomes(), all_predictors()) %>%
  # Normalize all numeric predictors.
  step_normalize(all_numeric_predictors())

# Prep the recipe (no training data is passed to prep() here).
bitcoin_prep <-
  prep(bitcoin_rec)

# Processed training and test sets.
# NOTE(review): bitcoin_train and bitcoin_test are not referenced again in
# the code shown below.
bitcoin_train <- juice(bitcoin_prep)
bitcoin_test  <- bake(bitcoin_prep, test_data)

# Random-forest classification spec: 200 trees, ranger engine, with
# importance = "impurity" passed to the engine.
rf_spec <- 
  rand_forest(trees = 200) %>% 
  set_engine("ranger", importance = "impurity") %>% 
  set_mode("classification")

# Workflow combining the model spec with the recipe object.
# NOTE(review): the already-prepped recipe (bitcoin_prep) is added here rather
# than the unprepped bitcoin_rec; confirm this is intended.
bitcoin_wflow <- 
  workflow() %>% 
  add_model(rf_spec) %>% 
  add_recipe(bitcoin_prep)

# Fit the workflow on the training data.
bitcoin_fit <-
  bitcoin_wflow %>%
  fit(data = train_data)

# Run last_fit() on the workflow with the train/test split object.
final_model <- last_fit(bitcoin_wflow, data_split)

# Show the metrics collected from that final fit.
collect_metrics(final_model)

# Pull the workflow back out of the last_fit() result and predict on
# test_data. This is the call reported to fail with
# "Column profit_next_day doesn't exist".
final_model %>%
  extract_workflow() %>%
  predict(test_data)

提取工作流并预测 test_data 的最后一段代码抛出错误：

Error in stop_subscript(): ! Can't subset columns that don't exist. x Column profit_next_day doesn't exist.

但是 profit_next_day 确实存在于 test_data 中，我已经检查了多次，所以不知道发生了什么。之前使用 tidymodels 时从未出现过此错误。

Answer 1

这里的问题来自对结果变量使用 step_naomit()。通常，在重采样或对新数据进行预测时，会改变行数的步骤（例如删除行）可能会非常棘手。您可以在我们的书中阅读更详细的说明，但我建议您从 recipe 中完全删除 step_naomit()，并将前面的代码改为：

# Remove incomplete rows here, while building the modelling data, instead of
# with step_naomit() inside the recipe.
df_mod <- df %>%
  mutate(profit_next_day = as.factor(profit_next_day)) %>%
  select(-c(covid, Date, Vol_K, Profit)) %>%
  na.omit()

在工作流中预测时报错：列不存在

Predict in workflow throws that column doesn't exist

r

tidymodels