Error: Can't subset columns that don't exist when running prediction using {Tidymodels}
Error: Can't subset columns that don't exist when running prediction using {Tidymodels}
我正在尝试在 R 中使用 Tidymodels 预测房地产价格。我正在按照 this tutorial 进行操作。一切顺利,直到我尝试对测试数据运行预测。
请看下面的代码示例和最后的错误。
我查看了两个类似的问题,但我似乎已经定义了变量角色,并且提供给工作流(workflow)的也是一个未经 prep 的配方(recipe)。
# libraries ---------------------------------------------------------------
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(data.table)
library(purrr)
# data --------------------------------------------------------------------
# 're' means real estate
# I'm using data.table in general. Using tribble below for cleaner data definition.
# Toy data set of 20 listings, written row-wise for readability.
real_estate_data <- tibble::tribble(
  ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343", 0.534722222222222,      1,         3,
  "31914489", 0.476119402985075,      1,         1,
  "30972289", 0.507352941176471,      1,         2,
  "31739730", 0.472972972972973,      1,         3,
  "31783137", 0.49875,                2,         3,
  "31809435", 0.439705882352941,      2,         2,
  "31943408", 0.469117647058824,      2,         3,
  "31944348", 0.56231884057971,       2,         1,
  "31961146", 0.472972972972973,      3,         3,
  "24314388", 0.649550561797753,      3,         2,
  "29840270", 0.719178082191781,      3,         3,
  "29840429", 0.719178082191781,      3,         3,
  "30873484", 0.822857142857143,      4,         3,
  "30969673", 0.533802816901408,      4,         3,
  "31333120", 0.741511627906977,      4,         3,
  "31788730", 0.527142857142857,      4,         2,
  "31948441", 0.734848484848485,      5,         2,
  "31962350", 0.8,                    5,         3,
  "31962779", 0.670454545454545,      5,         3,
  "31979128", 0.689054054054054,      5,         1
)

# Switch to data.table, then recode `district` as a factor by reference.
real_estate_data <- as.data.table(real_estate_data)
real_estate_data[, district := factor(district)]
# train/test split --------------------------------------------------------
# Seed the RNG so the random train/test partition is reproducible.
set.seed(123)
# Partition the rows at random; no proportion is passed, so the function's
# default applies.
re_split <- initial_split(real_estate_data)
# Pull the two partitions out of the split object.
re_train <- training(re_split)
re_test <- testing(re_split)
# workflow (w/ recipe) ----------------------------------------------------
# Preprocessing recipe: price_per_sqm_huf_mil is the outcome, every other
# column starts as a predictor, and re_id is then re-labelled with role "ID".
# NOTE(review): all_numeric() is not restricted to predictors, so the
# centering/scaling steps also select the numeric outcome. That column is not
# available at predict time, which is what produces the error shown after the
# predict() call in this script.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  step_center(all_numeric(), -district) %>%
  step_scale(all_predictors(), all_numeric(), -district) %>%
  step_dummy(district) %>%
  step_zv(all_predictors())
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification backed by the "lm" engine.
lr_model <- set_engine(linear_reg(), engine = "lm")

# Bundle the model specification and the recipe into a single workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# model training and prediction -------------------------------------------
# Fit the workflow on the training rows, then predict on the held-out rows.
re_fit <- fit(re_wflow, data = re_train)
re_pred <- predict(re_fit, new_data = re_test)
#> Error: Can't subset columns that don't exist.
#> x Column `price_per_sqm_huf_mil` doesn't exist.
由 reprex package (v0.3.0) 于 2021-01-25 创建
非常感谢!
这里的问题在于,您使用 step_center() 时把结果变量(price_per_sqm_huf_mil)也一并转换了,而在预测时并没有可用的结果变量。您可以改为只对 all_predictors() & all_numeric() 选中的列进行中心化,如下所示:
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(dplyr)
# Toy data set of 20 listings, written row-wise for readability.
real_estate_data <- tibble::tribble(
  ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343", 0.534722222222222,      1,         3,
  "31914489", 0.476119402985075,      1,         1,
  "30972289", 0.507352941176471,      1,         2,
  "31739730", 0.472972972972973,      1,         3,
  "31783137", 0.49875,                2,         3,
  "31809435", 0.439705882352941,      2,         2,
  "31943408", 0.469117647058824,      2,         3,
  "31944348", 0.56231884057971,       2,         1,
  "31961146", 0.472972972972973,      3,         3,
  "24314388", 0.649550561797753,      3,         2,
  "29840270", 0.719178082191781,      3,         3,
  "29840429", 0.719178082191781,      3,         3,
  "30873484", 0.822857142857143,      4,         3,
  "30969673", 0.533802816901408,      4,         3,
  "31333120", 0.741511627906977,      4,         3,
  "31788730", 0.527142857142857,      4,         2,
  "31948441", 0.734848484848485,      5,         2,
  "31962350", 0.8,                    5,         3,
  "31962779", 0.670454545454545,      5,         3,
  "31979128", 0.689054054054054,      5,         1
)

# Recode `district` from a number to a factor so it is treated as nominal.
real_estate_data <- mutate(real_estate_data, district = factor(district))
# Seed the RNG so the random train/test partition is reproducible.
set.seed(123)
# Partition the rows at random; no proportion is passed, so the function's
# default applies.
re_split <- initial_split(real_estate_data)
# Pull the two partitions out of the split object.
re_train <- training(re_split)
re_test <- testing(re_split)
# Preprocessing recipe: price_per_sqm_huf_mil is the outcome, every other
# column starts as a predictor, and re_id is then re-labelled with role "ID".
# Centering and scaling are limited to columns that are both predictors and
# numeric, so the outcome is never touched by these steps.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  step_dummy(district) %>%
  step_zv(all_predictors())
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification backed by the "lm" engine.
lr_model <- set_engine(linear_reg(), engine = "lm")

# Bundle the model specification and the recipe into a single workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# Fit the workflow on the training rows.
re_fit <- fit(re_wflow, data = re_train)

# Predict on the held-out rows; the result is printed, not stored.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#> .pred
#> <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768
由 reprex package (v0.3.0) 于 2021-01-25 创建
被这个问题绊倒的人比你想象的要多,所以我们正在努力添加 new set of selectors,它们很快就会被合并进来。如果您确实想要转换结果变量,另一个可以考虑的选项是使用 skip = TRUE。
我正在尝试在 R 中使用 Tidymodels 预测房地产价格。我正在按照 this tutorial 进行操作。一切顺利,直到我尝试对测试数据运行预测。
请看下面的代码示例和最后的错误。
我查看了两个类似的问题,但我似乎已经定义了变量角色,并且提供给工作流(workflow)的也是一个未经 prep 的配方(recipe)。
# libraries ---------------------------------------------------------------
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(data.table)
library(purrr)
# data --------------------------------------------------------------------
# 're' means real estate
# I'm using data.table in general. Using tribble below for cleaner data definition.
# Toy data set of 20 listings, written row-wise for readability.
real_estate_data <- tibble::tribble(
  ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343", 0.534722222222222,      1,         3,
  "31914489", 0.476119402985075,      1,         1,
  "30972289", 0.507352941176471,      1,         2,
  "31739730", 0.472972972972973,      1,         3,
  "31783137", 0.49875,                2,         3,
  "31809435", 0.439705882352941,      2,         2,
  "31943408", 0.469117647058824,      2,         3,
  "31944348", 0.56231884057971,       2,         1,
  "31961146", 0.472972972972973,      3,         3,
  "24314388", 0.649550561797753,      3,         2,
  "29840270", 0.719178082191781,      3,         3,
  "29840429", 0.719178082191781,      3,         3,
  "30873484", 0.822857142857143,      4,         3,
  "30969673", 0.533802816901408,      4,         3,
  "31333120", 0.741511627906977,      4,         3,
  "31788730", 0.527142857142857,      4,         2,
  "31948441", 0.734848484848485,      5,         2,
  "31962350", 0.8,                    5,         3,
  "31962779", 0.670454545454545,      5,         3,
  "31979128", 0.689054054054054,      5,         1
)

# Switch to data.table, then recode `district` as a factor by reference.
real_estate_data <- as.data.table(real_estate_data)
real_estate_data[, district := factor(district)]
# train/test split --------------------------------------------------------
# Seed the RNG so the random train/test partition is reproducible.
set.seed(123)
# Partition the rows at random; no proportion is passed, so the function's
# default applies.
re_split <- initial_split(real_estate_data)
# Pull the two partitions out of the split object.
re_train <- training(re_split)
re_test <- testing(re_split)
# workflow (w/ recipe) ----------------------------------------------------
# Preprocessing recipe: price_per_sqm_huf_mil is the outcome, every other
# column starts as a predictor, and re_id is then re-labelled with role "ID".
# NOTE(review): all_numeric() is not restricted to predictors, so the
# centering/scaling steps also select the numeric outcome. That column is not
# available at predict time, which is what produces the error shown after the
# predict() call in this script.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  step_center(all_numeric(), -district) %>%
  step_scale(all_predictors(), all_numeric(), -district) %>%
  step_dummy(district) %>%
  step_zv(all_predictors())
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification backed by the "lm" engine.
lr_model <- set_engine(linear_reg(), engine = "lm")

# Bundle the model specification and the recipe into a single workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# model training and prediction -------------------------------------------
# Fit the workflow on the training rows, then predict on the held-out rows.
re_fit <- fit(re_wflow, data = re_train)
re_pred <- predict(re_fit, new_data = re_test)
#> Error: Can't subset columns that don't exist.
#> x Column `price_per_sqm_huf_mil` doesn't exist.
由 reprex package (v0.3.0) 于 2021-01-25 创建
非常感谢!
这里的问题在于,您使用 step_center() 时把结果变量(price_per_sqm_huf_mil)也一并转换了,而在预测时并没有可用的结果变量。您可以改为只对 all_predictors() & all_numeric() 选中的列进行中心化,如下所示:
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(dplyr)
# Toy data set of 20 listings, written row-wise for readability.
real_estate_data <- tibble::tribble(
  ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343", 0.534722222222222,      1,         3,
  "31914489", 0.476119402985075,      1,         1,
  "30972289", 0.507352941176471,      1,         2,
  "31739730", 0.472972972972973,      1,         3,
  "31783137", 0.49875,                2,         3,
  "31809435", 0.439705882352941,      2,         2,
  "31943408", 0.469117647058824,      2,         3,
  "31944348", 0.56231884057971,       2,         1,
  "31961146", 0.472972972972973,      3,         3,
  "24314388", 0.649550561797753,      3,         2,
  "29840270", 0.719178082191781,      3,         3,
  "29840429", 0.719178082191781,      3,         3,
  "30873484", 0.822857142857143,      4,         3,
  "30969673", 0.533802816901408,      4,         3,
  "31333120", 0.741511627906977,      4,         3,
  "31788730", 0.527142857142857,      4,         2,
  "31948441", 0.734848484848485,      5,         2,
  "31962350", 0.8,                    5,         3,
  "31962779", 0.670454545454545,      5,         3,
  "31979128", 0.689054054054054,      5,         1
)

# Recode `district` from a number to a factor so it is treated as nominal.
real_estate_data <- mutate(real_estate_data, district = factor(district))
# Seed the RNG so the random train/test partition is reproducible.
set.seed(123)
# Partition the rows at random; no proportion is passed, so the function's
# default applies.
re_split <- initial_split(real_estate_data)
# Pull the two partitions out of the split object.
re_train <- training(re_split)
re_test <- testing(re_split)
# Preprocessing recipe: price_per_sqm_huf_mil is the outcome, every other
# column starts as a predictor, and re_id is then re-labelled with role "ID".
# Centering and scaling are limited to columns that are both predictors and
# numeric, so the outcome is never touched by these steps.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  step_dummy(district) %>%
  step_zv(all_predictors())
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification backed by the "lm" engine.
lr_model <- set_engine(linear_reg(), engine = "lm")

# Bundle the model specification and the recipe into a single workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# Fit the workflow on the training rows.
re_fit <- fit(re_wflow, data = re_train)

# Predict on the held-out rows; the result is printed, not stored.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#> .pred
#> <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768
由 reprex package (v0.3.0) 于 2021-01-25 创建
被这个问题绊倒的人比你想象的要多,所以我们正在努力添加 new set of selectors,它们很快就会被合并进来。如果您确实想要转换结果变量,另一个可以考虑的选项是使用 skip = TRUE。