Tidymodels 包:模型预测以使用 R 中的 juice() 和 bake() 函数找到最佳模型拟合
Tidymodels package: Model predictions to find the best model fit using the juice() and bake() functions in R
概览
我使用 tidymodels 包 和 数据框 FID 生成了四个模型(见下文):
- 一般线性模型
- 袋装树
- 随机森林
- 增强树
数据框包含三个预测变量:
- 年份(数字)
- 月份(因子)
- 天(数字)
因变量是频率(数值)
瞄准
我的目标是进行模型预测以提取所有 的 class 和 概率 值拟合模型,它们都经历了 10 倍 cross-validation.
我正在尝试使用函数 prep()、juice() 和 bake() 来为模型预测生成正确的数据 objects objects 按照下面的教程进行操作。
教程(见下面的截图)
https://meghan.rbind.io/post/tidymodels-intro/
在为所有四个模型生成模型预测值(即class和概率)后,最终目标是生成混淆矩阵和接受者操作曲线 (ROC) 来评估所有模型。因此,我需要将测试数据中的真实值与class和概率 从这些模型预测中提取的列。
问题
我正在尝试 运行 predict() 函数来生成教程中的 class 和概率值 (请参见下面的屏幕截图和 link 以上),但我在下面遇到此错误消息。
错误消息
##Class prediction object
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"
##Prob
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"
如果有人能提供帮助,我将不胜感激
非常感谢。
Screen-shots 来自教程
R-code
##################################################
##Model Prediction
###################################################
##Open the tidymodels package
library(tidymodels)
library(tidyverse)
library(glmnet)
library(parsnip)
library(rpart)
library(tidyverse) # manipulating data
library(skimr) # data visualization
library(baguette) # bagged trees
library(future) # parallel processing & decrease computation time
library(xgboost) # boosted trees
library(ranger)
library(yardstick)
library(purrr)
library(forcats)
###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=10)
###########################################################
##Produce the recipe
rec <- recipe(Frequency ~ ., data = FID) %>%
step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels
step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>% # replaces missing numeric observations with the median
step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
###########################################################
##Create Models
###########################################################
##########################################################
##General Linear Models
#########################################################
##glm
mod_glm<-linear_reg(mode="regression",
penalty = 0.1,
mixture = 1) %>%
set_engine("glmnet")
##Create workflow
wflow_glm <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_glm)
##Fit the model
plan(multisession)
fit_glm <- fit_resamples(
wflow_glm,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
##########################################################
##Bagged Trees
##########################################################
#####Bagged Trees
mod_bag <- bag_tree() %>%
set_mode("regression") %>%
set_engine("rpart", times = 10) #10 bootstrap resamples
##Create workflow
wflow_bag <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_bag)
##Fit the model
plan(multisession)
fit_bag <- fit_resamples(
wflow_bag,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
###################################################
##Random forests
###################################################
mod_rf <-rand_forest(trees = 1e3) %>%
set_engine("ranger",
num.threads = parallel::detectCores(),
importance = "permutation",
verbose = TRUE) %>%
set_mode("regression")
##Create Workflow
wflow_rf <- workflow() %>%
add_model(mod_rf) %>%
add_recipe(rec)
##Fit the model
plan(multisession)
fit_rf<-fit_resamples(
wflow_rf,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
############################################################
##Boosted Trees
############################################################
mod_boost <- boost_tree() %>%
set_engine("xgboost", nthreads = parallel::detectCores()) %>%
set_mode("regression")
##Create Workflow
wflow_boost <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_boost)
##Fit model
plan(multisession)
fit_boost <-fit_resamples(
wflow_boost,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
##################################################
##Prep the models for model prediction
##################################################
# Extract our prepped training data
# and "bake" our testing data
prep<-prep(rec)
training_baked<-juice(prep)
testing_baked <- prep %>% bake(test_data)
# Run the model with our training data
# Find the class predictions from our testing data
# And add back in the true values from testing data
predictions_class <- %>% fit_glm %>%
predict(new_data = testing_baked) %>%
bind_cols(testing_baked %>% dplyr::select(Frequency))
##Error message
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"
# Find the probability predictions
# And add all together
predictions_Prob <- fit_glm %>%
predict(testing_baked, type = "prob") %>%
bind_cols(predictions_class)
##Error message
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"
数据框 - FID
structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March",
"April", "May", "June", "July", "August", "September", "October",
"November", "December"), class = "factor"), Frequency = c(36,
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9,
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27,
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15,
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31,
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA,
-36L), class = "data.frame")
如果你的结果或因变量是数字,那么你将不会从预测中得到类或概率;你会得到结果的预测值。回归问题不宜做ROC曲线或混淆矩阵;这些仅适用于分类问题。
相反,您可以制作图表,在其中绘制 x 轴上的真实值和 y 轴上的预测值,如 this chapter 所示。
回答灵感来自 Julia Silge
##################################################
##Model Prediction
###################################################
##Open the tidymodels package
library(tidymodels)
library(tidyverse)
library(glmnet)
library(parsnip)
library(rpart)
library(tidyverse) # manipulating data
library(skimr) # data visualization
library(baguette) # bagged trees
library(future) # parallel processing & decrease computation time
library(xgboost) # boosted trees
library(ranger)
library(yardstick)
library(purrr)
library(forcats)
###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=10)
###########################################################
##Produce the recipe
rec <- recipe(Frequency ~ ., data = FID) %>%
step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels
step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>% # replaces missing numeric observations with the median
step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
###########################################################
##Create Models
###########################################################
##########################################################
##General Linear Models
#########################################################
##############################################################################
############################# Model Training/Tuning ###########################
###############################################################################
## Define a regularized regression and explicitly leave the tuning parameters
## empty for later tuning.
glm_mod_1 <-
parsnip::linear_reg(penalty = tune::tune(), mixture = tune::tune()) %>%
parsnip::set_engine("glmnet")
## Construct a workflow that combines your recipe and your model
ml_wflow <-
workflows::workflow() %>%
workflows::add_recipe(rec) %>%
workflows::add_model(glm_mod_1)
# Find best tuned model
res <-
ml_wflow %>%
tune::tune_grid(resamples = cv,
grid = 10,
metrics = yardstick::metric_set(yardstick::rmse))
############################# Validation ######################################
###############################################################################
best_params <-
res %>%
tune::select_best(metric = "rmse", maximize = FALSE)
#Refit using the entire training data
reg_res <-
ml_wflow %>%
tune::finalize_workflow(best_params) %>%
parsnip::fit(data = train_data)
blue_test_res<-predict(reg_res, new_data=test_data %>% dplyr::select(-
Frequency))
blue_test_res <- bind_cols(blue_test_res, test_data %>%
dplyr::select(Frequency))
##Open a plotting window
dev.new()
##Plot model predictions
ggplot(blue_test_res, aes(x = Frequency, y = .pred)) +
# Create a diagonal line:
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") +
# Scale and size the x- and y-axis uniformly:
coord_obs_pred()
情节
概览
我使用 tidymodels 包 和 数据框 FID 生成了四个模型(见下文):
- 一般线性模型
- 袋装树
- 随机森林
- 增强树
数据框包含三个预测变量:
- 年份(数字)
- 月份(因子)
- 天(数字)
因变量是频率(数值)
瞄准
我的目标是进行模型预测以提取所有 的 class 和 概率 值拟合模型,它们都经历了 10 倍 cross-validation.
我正在尝试使用函数 prep()、juice() 和 bake() 来为模型预测生成正确的数据 objects objects 按照下面的教程进行操作。
教程(见下面的截图)
https://meghan.rbind.io/post/tidymodels-intro/
在为所有四个模型生成模型预测值(即class和概率)后,最终目标是生成混淆矩阵和接受者操作曲线 (ROC) 来评估所有模型。因此,我需要将测试数据中的真实值与class和概率 从这些模型预测中提取的列。
问题
我正在尝试 运行 predict() 函数来生成教程中的 class 和概率值 (请参见下面的屏幕截图和 link 以上),但我在下面遇到此错误消息。
错误消息
##Class prediction object
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"
##Prob
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"
如果有人能提供帮助,我将不胜感激
非常感谢。
Screen-shots 来自教程
R-code
##################################################
##Model Prediction
###################################################
##Open the tidymodels package
library(tidymodels)
library(tidyverse)
library(glmnet)
library(parsnip)
library(rpart)
library(tidyverse) # manipulating data
library(skimr) # data visualization
library(baguette) # bagged trees
library(future) # parallel processing & decrease computation time
library(xgboost) # boosted trees
library(ranger)
library(yardstick)
library(purrr)
library(forcats)
###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=10)
###########################################################
##Produce the recipe
rec <- recipe(Frequency ~ ., data = FID) %>%
step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels
step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>% # replaces missing numeric observations with the median
step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
###########################################################
##Create Models
###########################################################
##########################################################
##General Linear Models
#########################################################
##glm
mod_glm<-linear_reg(mode="regression",
penalty = 0.1,
mixture = 1) %>%
set_engine("glmnet")
##Create workflow
wflow_glm <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_glm)
##Fit the model
plan(multisession)
fit_glm <- fit_resamples(
wflow_glm,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
##########################################################
##Bagged Trees
##########################################################
#####Bagged Trees
mod_bag <- bag_tree() %>%
set_mode("regression") %>%
set_engine("rpart", times = 10) #10 bootstrap resamples
##Create workflow
wflow_bag <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_bag)
##Fit the model
plan(multisession)
fit_bag <- fit_resamples(
wflow_bag,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
###################################################
##Random forests
###################################################
mod_rf <-rand_forest(trees = 1e3) %>%
set_engine("ranger",
num.threads = parallel::detectCores(),
importance = "permutation",
verbose = TRUE) %>%
set_mode("regression")
##Create Workflow
wflow_rf <- workflow() %>%
add_model(mod_rf) %>%
add_recipe(rec)
##Fit the model
plan(multisession)
fit_rf<-fit_resamples(
wflow_rf,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
############################################################
##Boosted Trees
############################################################
mod_boost <- boost_tree() %>%
set_engine("xgboost", nthreads = parallel::detectCores()) %>%
set_mode("regression")
##Create Workflow
wflow_boost <- workflow() %>%
add_recipe(rec) %>%
add_model(mod_boost)
##Fit model
plan(multisession)
fit_boost <-fit_resamples(
wflow_boost,
cv,
metrics = metric_set(rmse, rsq),
control = control_resamples(save_pred = TRUE,
extract = function(x) extract_model(x)))
##################################################
##Prep the models for model prediction
##################################################
# Extract our prepped training data
# and "bake" our testing data
prep<-prep(rec)
training_baked<-juice(prep)
testing_baked <- prep %>% bake(test_data)
# Run the model with our training data
# Find the class predictions from our testing data
# And add back in the true values from testing data
predictions_class <- %>% fit_glm %>%
predict(new_data = testing_baked) %>%
bind_cols(testing_baked %>% dplyr::select(Frequency))
##Error message
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"
# Find the probability predictions
# And add all together
predictions_Prob <- fit_glm %>%
predict(testing_baked, type = "prob") %>%
bind_cols(predictions_class)
##Error message
Error in UseMethod("predict") :
no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"
数据框 - FID
structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March",
"April", "May", "June", "July", "August", "September", "October",
"November", "December"), class = "factor"), Frequency = c(36,
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9,
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27,
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15,
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31,
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA,
-36L), class = "data.frame")
如果你的结果或因变量是数字,那么你将不会从预测中得到类或概率;你会得到结果的预测值。回归问题不宜做ROC曲线或混淆矩阵;这些仅适用于分类问题。
相反,您可以制作图表,在其中绘制 x 轴上的真实值和 y 轴上的预测值,如 this chapter 所示。
回答灵感来自 Julia Silge
##################################################
##Model Prediction
###################################################
##Open the tidymodels package
library(tidymodels)
library(tidyverse)
library(glmnet)
library(parsnip)
library(rpart)
library(tidyverse) # manipulating data
library(skimr) # data visualization
library(baguette) # bagged trees
library(future) # parallel processing & decrease computation time
library(xgboost) # boosted trees
library(ranger)
library(yardstick)
library(purrr)
library(forcats)
###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data <- testing(data_split)
# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=10)
###########################################################
##Produce the recipe
rec <- recipe(Frequency ~ ., data = FID) %>%
step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels
step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>% # replaces missing numeric observations with the median
step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
###########################################################
##Create Models
###########################################################
##########################################################
##General Linear Models
#########################################################
##############################################################################
############################# Model Training/Tuning ###########################
###############################################################################
## Define a regularized regression and explicitly leave the tuning parameters
## empty for later tuning.
glm_mod_1 <-
parsnip::linear_reg(penalty = tune::tune(), mixture = tune::tune()) %>%
parsnip::set_engine("glmnet")
## Construct a workflow that combines your recipe and your model
ml_wflow <-
workflows::workflow() %>%
workflows::add_recipe(rec) %>%
workflows::add_model(glm_mod_1)
# Find best tuned model
res <-
ml_wflow %>%
tune::tune_grid(resamples = cv,
grid = 10,
metrics = yardstick::metric_set(yardstick::rmse))
############################# Validation ######################################
###############################################################################
best_params <-
res %>%
tune::select_best(metric = "rmse", maximize = FALSE)
#Refit using the entire training data
reg_res <-
ml_wflow %>%
tune::finalize_workflow(best_params) %>%
parsnip::fit(data = train_data)
blue_test_res<-predict(reg_res, new_data=test_data %>% dplyr::select(-
Frequency))
blue_test_res <- bind_cols(blue_test_res, test_data %>%
dplyr::select(Frequency))
##Open a plotting window
dev.new()
##Plot model predictions
ggplot(blue_test_res, aes(x = Frequency, y = .pred)) +
# Create a diagonal line:
geom_abline(lty = 2) +
geom_point(alpha = 0.5) +
labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") +
# Scale and size the x- and y-axis uniformly:
coord_obs_pred()
情节