Error: Can't subset columns that don't exist when running prediction using {Tidymodels}

Error: Can't subset columns that don't exist when running prediction using {Tidymodels}

我正在尝试在 R 中使用 Tidymodels 预测房地产价格。我正在按照这个教程(this tutorial)操作。一切都很顺利,直到我尝试对测试数据运行预测。

请看下面的代码示例和最后的错误。

我查看了两个类似的问题(链接在此处缺失),但看起来我已经定义了变量角色,并且提供给工作流(workflow)的是未经 prep 的配方(recipe)。

    # libraries ---------------------------------------------------------------
    library(tidymodels)
    #> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
    #> ✓ broom     0.7.3      ✓ recipes   0.1.15
    #> ✓ dials     0.0.9      ✓ rsample   0.0.8 
    #> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
    #> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
    #> ✓ infer     0.5.4      ✓ tune      0.1.2 
    #> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
    #> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
    #> ✓ purrr     0.3.4
    #> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
    #> x purrr::discard() masks scales::discard()
    #> x dplyr::filter()  masks stats::filter()
    #> x dplyr::lag()     masks stats::lag()
    #> x recipes::step()  masks stats::step()
    library(data.table)
    
    library(purrr)
    
    
    # data --------------------------------------------------------------------
    # 're' means real estate
    # I'm using data.table in general. Using tribble below for cleaner data definition.
    # Toy data set of 20 listings: a character ID (`re_id`), the outcome
    # (`price_per_sqm_huf_mil` -- presumably million HUF per square metre, judging
    # by the column name), the district number and the number of rooms.
    real_estate_data <- tibble::tribble(
        ~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
        "30876343",      0.534722222222222,        1,         3,
        "31914489",      0.476119402985075,        1,         1,
        "30972289",      0.507352941176471,        1,         2,
        "31739730",      0.472972972972973,        1,         3,
        "31783137",                0.49875,        2,         3,
        "31809435",      0.439705882352941,        2,         2,
        "31943408",      0.469117647058824,        2,         3,
        "31944348",       0.56231884057971,        2,         1,
        "31961146",      0.472972972972973,        3,         3,
        "24314388",      0.649550561797753,        3,         2,
        "29840270",      0.719178082191781,        3,         3,
        "29840429",      0.719178082191781,        3,         3,
        "30873484",      0.822857142857143,        4,         3,
        "30969673",      0.533802816901408,        4,         3,
        "31333120",      0.741511627906977,        4,         3,
        "31788730",      0.527142857142857,        4,         2,
        "31948441",      0.734848484848485,        5,         2,
        "31962350",                    0.8,        5,         3,
        "31962779",      0.670454545454545,        5,         3,
        "31979128",      0.689054054054054,        5,         1
    )
    
    # Convert the tibble to a data.table and recode `district` from a number to
    # a factor, so that it is handled as a categorical (nominal) variable.
    real_estate_data <- as.data.table(real_estate_data) %>% .[, district := factor(district)]
    
    # train/test split --------------------------------------------------------
    # Fix the RNG seed so the random split is reproducible. No `prop` is given,
    # so initial_split() uses its default train/test proportion.
    set.seed(123)
    re_split <- initial_split(real_estate_data)
    re_train <- training(re_split)
    re_test  <- testing(re_split)
    
    # workflow (w/ recipe) ----------------------------------------------------
    # Preprocessing recipe: model the price on every other column, then
    #   * give `re_id` the "ID" role so it is kept but not used as a predictor;
    #   * center and scale the numeric columns;
    #   * dummy-encode `district` and drop zero-variance predictors.
    #
    # NOTE: all_numeric() selects columns by type regardless of role, so the
    # numeric outcome `price_per_sqm_huf_mil` is part of step_center(). In
    # step_scale() the comma-separated selectors are combined as a union, so
    # the outcome is selected there as well (and `- district` is what keeps the
    # factor column, pulled in by all_predictors(), out of the step). Steps
    # that touch the outcome are what make predict() fail at the end of this
    # script, where the outcome column is not available to the recipe.
    re_rec <- recipe(re_train,
                     formula = price_per_sqm_huf_mil ~ .) %>%
        update_role(re_id, new_role = "ID") %>%
        step_center(all_numeric(), - district) %>%
        step_scale(all_predictors(), all_numeric(), - district) %>%
        step_dummy(district) %>%
        step_zv(all_predictors())
    
    # Inspect the variable types and roles recorded by the (unprepped) recipe.
    summary(re_rec)
    #> # A tibble: 4 x 4
    #>   variable              type    role      source  
    #>   <chr>                 <chr>   <chr>     <chr>   
    #> 1 re_id                 nominal ID        original
    #> 2 district              nominal predictor original
    #> 3 num_room              numeric predictor original
    #> 4 price_per_sqm_huf_mil numeric outcome   original
    
    # Model specification: ordinary linear regression fitted with the "lm" engine.
    lr_model <-
        linear_reg() %>%
        set_engine("lm")
    
    # Bundle the model and the unprepped recipe into a single workflow, so that
    # preprocessing and model fitting are handled together by fit().
    re_wflow <-
        workflow() %>%
        add_model(lr_model) %>%
        add_recipe(re_rec)
    
    # model training and prediction -------------------------------------------
    # Fitting on the training set succeeds.
    re_fit <-
        re_wflow %>%
        fit(data = re_train)
    
    # Predicting on the test set fails: the recipe steps above reference the
    # outcome column, which is not available when the recipe is applied for
    # prediction (see the error message below).
    re_pred <- predict(re_fit, re_test)
    #> Error: Can't subset columns that don't exist.
    #> x Column `price_per_sqm_huf_mil` doesn't exist.

reprex package (v0.3.0)

于 2021-01-25 创建

非常感谢!

这里的问题是,您用 step_center() 转换了结果变量(price_per_sqm_huf_mil),而在预测时并没有可用的结果变量。您可以改为只对 all_predictors() & all_numeric() 选中的列进行中心化,如下所示:

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom     0.7.3      ✓ recipes   0.1.15
#> ✓ dials     0.0.9      ✓ rsample   0.0.8 
#> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
#> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
#> ✓ infer     0.5.4      ✓ tune      0.1.2 
#> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
#> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(dplyr)

# Same 20-row toy data set as in the question: listing ID, outcome
# (`price_per_sqm_huf_mil`), district number and number of rooms.
real_estate_data <- tibble::tribble(
  ~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343",      0.534722222222222,        1,         3,
  "31914489",      0.476119402985075,        1,         1,
  "30972289",      0.507352941176471,        1,         2,
  "31739730",      0.472972972972973,        1,         3,
  "31783137",                0.49875,        2,         3,
  "31809435",      0.439705882352941,        2,         2,
  "31943408",      0.469117647058824,        2,         3,
  "31944348",       0.56231884057971,        2,         1,
  "31961146",      0.472972972972973,        3,         3,
  "24314388",      0.649550561797753,        3,         2,
  "29840270",      0.719178082191781,        3,         3,
  "29840429",      0.719178082191781,        3,         3,
  "30873484",      0.822857142857143,        4,         3,
  "30969673",      0.533802816901408,        4,         3,
  "31333120",      0.741511627906977,        4,         3,
  "31788730",      0.527142857142857,        4,         2,
  "31948441",      0.734848484848485,        5,         2,
  "31962350",                    0.8,        5,         3,
  "31962779",      0.670454545454545,        5,         3,
  "31979128",      0.689054054054054,        5,         1
) %>%
  # Recode `district` as a factor so it is treated as a nominal variable.
  mutate(district = factor(district))


# Seed the RNG for a reproducible split; initial_split() is called without
# `prop`, so its default train/test proportion applies.
set.seed(123)
re_split <- initial_split(real_estate_data)
re_train <- training(re_split)
re_test  <- testing(re_split)

# Corrected recipe. `all_predictors() & all_numeric()` is the intersection of
# the two selectors, so centering and scaling apply only to numeric predictors
# (here `num_room`) and never to the outcome. The recipe therefore no longer
# needs the outcome column when it is applied to new data at prediction time.
re_rec <- recipe(re_train,
                 formula = price_per_sqm_huf_mil ~ .) %>%
  # Keep `re_id` in the data but exclude it from the predictors.
  update_role(re_id, new_role = "ID") %>%
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  # Dummy-encode the factor, then drop any zero-variance predictors.
  step_dummy(district) %>%
  step_zv(all_predictors())

# Variable types and roles recorded by the (unprepped) recipe.
summary(re_rec)
#> # A tibble: 4 x 4
#>   variable              type    role      source  
#>   <chr>                 <chr>   <chr>     <chr>   
#> 1 re_id                 nominal ID        original
#> 2 district              nominal predictor original
#> 3 num_room              numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome   original

# Model specification: linear regression using the "lm" engine.
lr_model <-
  linear_reg() %>%
  set_engine("lm")

# Combine the model and the unprepped recipe into one workflow.
re_wflow <-
  workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# Fit the workflow (recipe + model) on the training data.
re_fit <-
  re_wflow %>%
  fit(data = re_train)

# Prediction now succeeds and returns one `.pred` value per test row.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#>   .pred
#>   <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768

reprex package (v0.3.0)

于 2021-01-25 创建

被这个问题绊倒的不止您一个人,所以我们正在添加一组新的选择器(a new set of selectors),很快就会合并进来。如果您确实想对结果变量做转换,另一个可以考虑的选项是了解一下 skip = TRUE 的用法。