如何预测保持回归变量不变的回归结果?
How to predict the outcome of a regression holding regressor constant?
大家好,根据工资数据集(工资是因变量)和下面创建的工作流,我想了解以下内容:
- 对于每个分段模型,
age
等于 30 的人的预测 wage
是多少?
- 考虑到灵活的
pw6_wf_fit
模型配置,特别是上面的六个断点:超过 age
的哪个(近似)值与 wage
相关性最强?
我尝试使用 extract
的版本,但到目前为止我不知道如何在 R 中应用它。对任何评论都有帮助
我使用的代码如下:
if (!require("pacman")) install.packages("pacman")
# load (or install if pacman cannot find an existing installation) the relevant packages
pacman::p_load(
tidyverse, tidymodels, ISLR, patchwork,
rpart, rpart.plot, randomForest, gbm, kernlab, parsnip, skimr
)
data(Wage, package = "ISLR")
Wage %>%
tibble::as_tibble() %>%
skimr::skim()
lin_rec <- recipe(wage ~ age, data = Wage)
# Specify as linear regression
lm_spec <-
linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
plot_model <- function(wf_fit, data) {
predictions <-
tibble::tibble(age = seq(min(data$age), max(data$age))) %>%
dplyr::bind_cols(
predict(wf_fit, new_data = .),
predict(wf_fit, new_data = ., type = "conf_int")
)
p <- ggplot2::ggplot(aes(age, wage), data = data) +
geom_point(alpha = 0.05) +
geom_line(aes(y = .pred),
data = predictions, color = "darkgreen") +
geom_line(aes(y = .pred_lower),
data = predictions, linetype = "dashed", color = "blue") +
geom_line(aes(y = .pred_upper),
data = predictions, linetype = "dashed", color = "blue") +
scale_x_continuous(breaks = seq(20, 80, 5)) +
labs(title = substitute(wf_fit)) +
theme_classic()
return(p)
}
pw3_rec <- lin_rec %>% step_discretize(age, num_breaks = 3, min_unique = 5)
pw4_rec <- lin_rec %>% step_discretize(age, num_breaks = 4, min_unique = 5)
pw5_rec <- lin_rec %>% step_discretize(age, num_breaks = 5, min_unique = 5)
pw6_rec <- lin_rec %>% step_discretize(age, num_breaks = 6, min_unique = 5)
pw3_wf_fit <- workflow(pw3_rec, lm_spec) %>% fit(data = Wage)
pw4_wf_fit <- workflow(pw4_rec, lm_spec) %>% fit(data = Wage)
pw5_wf_fit <- workflow(pw5_rec, lm_spec) %>% fit(data = Wage)
pw6_wf_fit <- workflow(pw6_rec, lm_spec) %>% fit(data = Wage)
(plot_model(pw3_wf_fit, Wage) + plot_model(pw4_wf_fit, Wage)) /
(plot_model(pw5_wf_fit, Wage) + plot_model(pw6_wf_fit, Wage))
第一个问题的答案很简单:
map(list(pw3_wf_fit, pw4_wf_fit, pw5_wf_fit, pw6_wf_fit),
~predict(.x, new_data=tibble(age=30))) %>%
bind_rows()
# # A tibble: 4 × 1
# .pred
# <dbl>
# 1 99.3
# 2 94.2
# 3 92.3
# 4 89.5
大家好,根据工资数据集(工资是因变量)和下面创建的工作流,我想了解以下内容:
- 对于每个分段模型,
age
等于 30 的人的预测wage
是多少? - 考虑到灵活的
pw6_wf_fit
模型配置,特别是上面的六个断点:超过age
的哪个(近似)值与wage
相关性最强?
我尝试使用 extract
的版本,但到目前为止我不知道如何在 R 中应用它。对任何评论都有帮助
我使用的代码如下:
if (!require("pacman")) install.packages("pacman")
# load (or install if pacman cannot find an existing installation) the relevant packages
pacman::p_load(
tidyverse, tidymodels, ISLR, patchwork,
rpart, rpart.plot, randomForest, gbm, kernlab, parsnip, skimr
)
data(Wage, package = "ISLR")
Wage %>%
tibble::as_tibble() %>%
skimr::skim()
lin_rec <- recipe(wage ~ age, data = Wage)
# Specify as linear regression
lm_spec <-
linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
plot_model <- function(wf_fit, data) {
predictions <-
tibble::tibble(age = seq(min(data$age), max(data$age))) %>%
dplyr::bind_cols(
predict(wf_fit, new_data = .),
predict(wf_fit, new_data = ., type = "conf_int")
)
p <- ggplot2::ggplot(aes(age, wage), data = data) +
geom_point(alpha = 0.05) +
geom_line(aes(y = .pred),
data = predictions, color = "darkgreen") +
geom_line(aes(y = .pred_lower),
data = predictions, linetype = "dashed", color = "blue") +
geom_line(aes(y = .pred_upper),
data = predictions, linetype = "dashed", color = "blue") +
scale_x_continuous(breaks = seq(20, 80, 5)) +
labs(title = substitute(wf_fit)) +
theme_classic()
return(p)
}
pw3_rec <- lin_rec %>% step_discretize(age, num_breaks = 3, min_unique = 5)
pw4_rec <- lin_rec %>% step_discretize(age, num_breaks = 4, min_unique = 5)
pw5_rec <- lin_rec %>% step_discretize(age, num_breaks = 5, min_unique = 5)
pw6_rec <- lin_rec %>% step_discretize(age, num_breaks = 6, min_unique = 5)
pw3_wf_fit <- workflow(pw3_rec, lm_spec) %>% fit(data = Wage)
pw4_wf_fit <- workflow(pw4_rec, lm_spec) %>% fit(data = Wage)
pw5_wf_fit <- workflow(pw5_rec, lm_spec) %>% fit(data = Wage)
pw6_wf_fit <- workflow(pw6_rec, lm_spec) %>% fit(data = Wage)
(plot_model(pw3_wf_fit, Wage) + plot_model(pw4_wf_fit, Wage)) /
(plot_model(pw5_wf_fit, Wage) + plot_model(pw6_wf_fit, Wage))
第一个问题的答案很简单:
map(list(pw3_wf_fit, pw4_wf_fit, pw5_wf_fit, pw6_wf_fit),
~predict(.x, new_data=tibble(age=30))) %>%
bind_rows()
# # A tibble: 4 × 1
# .pred
# <dbl>
# 1 99.3
# 2 94.2
# 3 92.3
# 4 89.5