Tidymodels package: Model predictions to find the best model fit using the juice() and bake() functions in R

Overview

Using the tidymodels package, I have produced four models with the data frame FID (see below):

  1. General linear model
  2. Bagged tree
  3. Random forest
  4. Boosted tree

The data frame contains three predictor variables:

  1. Year (numeric)
  2. Month (factor)
  3. Days (numeric)

The dependent variable is Frequency (numeric).

Aim

My aim is to make model predictions in order to extract the class and probability values from all of the fitted models, each of which underwent 10-fold cross-validation.

I am trying to use the prep(), juice(), and bake() functions to produce the correct data objects for the model predictions, following the tutorial below.
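For reference, the general pattern the tutorial follows is roughly this (a minimal sketch, assuming the recipe rec and the train_data/test_data split defined in the code further below):

    # Estimate the preprocessing steps on the training data
    rec_prepped <- prep(rec, training = train_data)

    # juice() returns the preprocessed training set...
    train_baked <- juice(rec_prepped)

    # ...while bake() applies the same, already-estimated steps to new data
    test_baked <- bake(rec_prepped, new_data = test_data)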

Tutorial (see the screenshots below)

https://meghan.rbind.io/post/tidymodels-intro/

After producing the model prediction values (i.e., classes and probabilities) for all four models, the ultimate aim is to produce confusion matrices and receiver operating characteristic (ROC) curves to evaluate all of the models. I therefore need to bind the true values from the testing data together with the class and probability columns extracted from these model predictions.
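For a classification outcome, that evaluation would look roughly like the sketch below. Note that this is purely hypothetical: it assumes a fitted classification model fit_cls, a factor outcome Class with a level "Yes", and the probability column named accordingly, none of which exist in the code here (and, as the answer below explains, Frequency is numeric, so this does not apply to these models).

    # Hypothetical sketch: confusion matrix and ROC curve with yardstick
    preds <- predict(fit_cls, new_data = test_data, type = "prob") %>%
        bind_cols(predict(fit_cls, new_data = test_data)) %>%
        bind_cols(test_data %>% dplyr::select(Class)) # add back the true values

    conf_mat(preds, truth = Class, estimate = .pred_class)
    roc_curve(preds, truth = Class, .pred_Yes) %>% autoplot()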

Issue

I am trying to run the predict() function to produce the class and probability values as in the tutorial (please see the screenshots and link above), but I am experiencing the error messages below.

Error messages

##Class prediction object
Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"

##Prob
Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"

If anyone could help, I would be deeply appreciative.

Many thanks in advance.

Screenshots from the tutorial

R-code

    ##################################################
    ##Model Prediction
    ###################################################
    ##Open the tidymodels package
    library(tidymodels)
    library(tidyverse) # data manipulation
    library(glmnet)
    library(parsnip)
    library(rpart)
    library(skimr) # summary statistics
    library(baguette) # bagged trees
    library(future) # parallel processing to decrease computation time
    library(xgboost) # boosted trees
    library(ranger)
    library(yardstick)
    library(purrr)
    library(forcats)    

###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=10)

###########################################################
##Produce the recipe

rec <- recipe(Frequency ~ ., data = FID) %>% 
          step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
          step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
          step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
          step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
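# NOTE: step_medianimpute() has since been renamed step_impute_median() in
# newer versions of recipes; the old name still works but is deprecated.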

###########################################################
##Create Models
###########################################################

##########################################################
##General Linear Models
#########################################################

##glm
mod_glm<-linear_reg(mode="regression",
                       penalty = 0.1, 
                       mixture = 1) %>% 
                            set_engine("glmnet")

##Create workflow
wflow_glm <- workflow() %>% 
                add_recipe(rec) %>%
                      add_model(mod_glm)

##Fit the model
plan(multisession)

fit_glm <- fit_resamples(
                        wflow_glm,
                        cv,
                        metrics = metric_set(rmse, rsq),
                        control = control_resamples(save_pred = TRUE,
                              extract = function(x) extract_model(x)))
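# NOTE: extract_model() is superseded by extract_fit_engine() in more recent
# tidymodels releases; the older helper still works with a deprecation warning.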

##########################################################
##Bagged Trees
##########################################################

#####Bagged Trees
mod_bag <- bag_tree() %>%
            set_mode("regression") %>%
              set_engine("rpart", times = 10) #10 bootstrap resamples
                

##Create workflow
wflow_bag <- workflow() %>% 
                   add_recipe(rec) %>%
                       add_model(mod_bag)

##Fit the model
plan(multisession)

fit_bag <- fit_resamples(
                      wflow_bag,
                      cv,
                      metrics = metric_set(rmse, rsq),
                      control = control_resamples(save_pred = TRUE,
                              extract = function(x) extract_model(x)))
###################################################
##Random forests
###################################################

mod_rf <-rand_forest(trees = 1e3) %>%
                              set_engine("ranger",
                              num.threads = parallel::detectCores(), 
                              importance = "permutation", 
                              verbose = TRUE) %>% 
                              set_mode("regression") 
                              
##Create Workflow

wflow_rf <- workflow() %>% 
               add_model(mod_rf) %>% 
                     add_recipe(rec)

##Fit the model

plan(multisession)

fit_rf<-fit_resamples(
             wflow_rf,
             cv,
             metrics = metric_set(rmse, rsq),
             control = control_resamples(save_pred = TRUE,
                                         extract = function(x) extract_model(x)))

############################################################
##Boosted Trees
############################################################

mod_boost <- boost_tree() %>% 
                 set_engine("xgboost", nthreads = parallel::detectCores()) %>% 
                      set_mode("regression")

##Create Workflow

wflow_boost <- workflow() %>% 
                  add_recipe(rec) %>% 
                    add_model(mod_boost)

##Fit model

plan(multisession)

fit_boost <-fit_resamples(
                       wflow_boost,
                       cv,
                       metrics = metric_set(rmse, rsq),
                       control = control_resamples(save_pred = TRUE,
                                         extract = function(x) extract_model(x)))

##################################################
##Prep the models for model prediction
##################################################

# Extract our prepped training data 
# and "bake" our testing data

rec_prep <- prep(rec)

training_baked <- juice(rec_prep)

testing_baked <- rec_prep %>% bake(new_data = test_data)

# Run the model with our training data
# Find the class predictions from our testing data
# And add back in the true values from testing data

predictions_class <- fit_glm %>%
                         predict(new_data = testing_baked) %>%
                             bind_cols(testing_baked %>% dplyr::select(Frequency))

##Error message

  Error in UseMethod("predict") : 
  no applicable method for 'predict' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"
    
# Find the probability predictions
# And add all together

predictions_Prob <- fit_glm %>%
                        predict(testing_baked, type = "prob") %>%
                              bind_cols(predictions_class)

     ##Error message
     Error in UseMethod("predict") : 
       no applicable method for 'predict' applied to an object of class "c('resample_results', 'tune_results', 'tbl_df', 'tbl', 'data.frame')"

Data frame - FID

structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March", 
"April", "May", "June", "July", "August", "September", "October", 
"November", "December"), class = "factor"), Frequency = c(36, 
28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9, 
7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27, 
43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15, 
29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31, 
28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA, 
-36L), class = "data.frame")

If your outcome or dependent variable is numeric, you will not get classes or probabilities out of prediction; you will get predicted values of the outcome. It is not appropriate to build ROC curves or confusion matrices for a regression problem; those only apply to classification problems.
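One way to see this with the objects from the question (a minimal sketch, assuming the fit_glm result above, fitted with save_pred = TRUE) is to collect the held-out cross-validation predictions and compute regression metrics with yardstick:

    # fit_resamples() returns a tibble of results, not a fitted model --
    # this is why predict() has no applicable method for it
    cv_preds <- collect_predictions(fit_glm)

    # .pred contains numeric predictions of Frequency; there are no
    # classes or probabilities to extract
    cv_preds %>% metrics(truth = Frequency, estimate = .pred) # rmse, rsq, mae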

Instead, you can make a plot with the true values on the x-axis and the predicted values on the y-axis, as shown in this chapter.

Answer inspired by Julia Silge

    ##################################################
    ##Model Prediction
    ###################################################
    ##Open the tidymodels package
    library(tidymodels)
    library(tidyverse) # data manipulation
    library(glmnet)
    library(parsnip)
    library(rpart)
    library(skimr) # summary statistics
    library(baguette) # bagged trees
    library(future) # parallel processing to decrease computation time
    library(xgboost) # boosted trees
    library(ranger)
    library(yardstick)
    library(purrr)
    library(forcats)    

###########################################################
#split this single dataset into two: a training set and a testing set
data_split <- initial_split(FID)
# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

# resample the data with 10-fold cross-validation (10-fold by default)
cv <- vfold_cv(train_data, v=10)

###########################################################
##Produce the recipe

rec <- recipe(Frequency ~ ., data = FID) %>% 
          step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
          step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
          step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
          step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables

###########################################################
##Create Models
###########################################################

##########################################################
##General Linear Models
#########################################################

    ##############################################################################
    ############################# Model Training/Tuning ###########################
    ###############################################################################
    
    ## Define a regularized regression and explicitly leave the tuning parameters
    ## empty for later tuning.
    glm_mod_1 <- 
           parsnip::linear_reg(penalty = tune::tune(), mixture = tune::tune()) %>%
          parsnip::set_engine("glmnet")
    
    ## Construct a workflow that combines your recipe and your model
    ml_wflow <-
              workflows::workflow() %>%
                   workflows::add_recipe(rec) %>%
                        workflows::add_model(glm_mod_1)
    
    # Find best tuned model
    res <-
         ml_wflow %>%
               tune::tune_grid(resamples = cv,
                               grid = 10,
                               metrics = yardstick::metric_set(yardstick::rmse))
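    # NOTE: grid = 10 asks tune_grid() to build a space-filling grid of 10
    # candidate penalty/mixture combinations automatically.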
    
    ############################# Validation ######################################
    ###############################################################################
    
    best_params <-
             res %>%
                   tune::select_best(metric = "rmse") # the old 'maximize' argument is deprecated
    
    #Refit using the entire training data
    reg_res <-
          ml_wflow %>%
              tune::finalize_workflow(best_params) %>%
              parsnip::fit(data = train_data)
    
    blue_test_res <- predict(reg_res, new_data = test_data %>%
                                          dplyr::select(-Frequency))
    
    blue_test_res <- bind_cols(blue_test_res, test_data %>% 
                   dplyr::select(Frequency))
    
    ##Open a plotting window
    
    dev.new()
    
    ##Plot model predictions
    ggplot(blue_test_res, aes(x = Frequency, y = .pred)) + 
      # Create a diagonal line:
        geom_abline(lty = 2) + 
        geom_point(alpha = 0.5) + 
        labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") +
      # Scale and size the x- and y-axis uniformly:
      coord_obs_pred()
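In this plot, points near the dashed diagonal are observations where the predicted Frequency is close to the observed value; coord_obs_pred() scales the x- and y-axes identically, so deviations from the diagonal are comparable across the whole range.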

Plot