如何使用 VIP 包和 tidymodels(包括配方)计算 FIRM 重要性度量

How to compute FIRM importance measure using VIP package and tidymodels (including recipe)

我想计算由 tidymodels 工作流(workflow)构建的模型的 FIRM 重要性分数。作为可复现示例(reprex),我将使用鸢尾花(iris)数据集,并尝试预测某个观测是否属于 setosa(山鸢尾)。

library(tidymodels)
library(readr)
library(vip)

# Clean data: derive a two-level outcome `class` ("setosa" vs. "other") from
# Species, convert it to a factor, then drop the original Species column.
iris <- iris %>%
  mutate(class  = case_when(Species == 'setosa' ~ 'setosa',
                            TRUE ~ 'other'))
iris$class = as.factor(iris$class)
iris <- subset(iris, select = -c(Species))

# Split data into training (prop = 0.8) and testing sets, then build 5-fold
# cross-validation resamples from the training portion only.
iris_split = initial_split(iris, prop = 0.8)
cv_splits = vfold_cv(training(iris_split), v = 5)

# Preprocessing: recipe that models `class` from all other columns and centers
# Sepal.Length.
# NOTE(review): the recipe is defined on the full `iris` data (not the training
# split) and is prep()'d here, before it is added to the workflow -- a workflow
# is normally given an untrained recipe; confirm this is intended.
iris_recipe = recipe(class ~., data = iris) %>%
  step_center(Sepal.Length) %>%
  prep()

# Specify the model: a random forest classifier (ranger engine, 50 trees).
# `mtry` is left as tune() so that it is chosen during tuning; the engine is
# asked to record impurity-based importance.
model = rand_forest(
  mode = "classification",
  mtry = tune(),
  trees = 50
) %>% 
  set_engine("ranger", importance = "impurity")

# Tuning parameters: regular grid over mtry in the range 1 to 4 (levels = 4).
tuning_grid = grid_regular(mtry(range=c(1,4)), levels = 4)

# Combine the recipe and the model specification into one workflow.
iris_wkfl = workflow() %>%
  add_recipe(iris_recipe) %>%
  add_model(model) 
  
# Evaluate each mtry candidate from the grid on the cross-validation folds,
# using accuracy as the only metric.
iris_tune = tune_grid(iris_wkfl,
            resamples = cv_splits,
            grid = tuning_grid,
            metrics = metric_set(accuracy))

# Pick the parameter combination with the best accuracy.
best_params = iris_tune %>%
  select_best(metric = "accuracy")

# Finalize the workflow with the selected parameters, fit it on the training
# split, and pull the fitted model object out of the workflow.
best_model = finalize_workflow(iris_wkfl, best_params) %>%
  parsnip::fit(data = training(iris_split)) %>%
  pull_workflow_fit()

# Request FIRM-based importance for the extracted fit. No `train` data is
# supplied here; this is the call that raises the pdp error quoted below.
vip(best_model, method = "firm")

最后一行产生了来自 pdp 包的错误。

get_training_data.default(object) 中出错:无法从对象中提取训练数据。请在调用 `partial` 时通过 `train` 参数提供原始训练数据。

下面这行是否正确?还是说我需要先用我的配方(recipe)对训练数据进行转换,再把转换后的数据传进去?我想确保 vip 在计算重要性分数时应用了我的配方。我知道错误信息要求提供“原始训练数据”,但我不确定 pdp 是否了解我的工作流。

# Variant proposed in the question: same call, but passing the training split
# as-is (not processed by the recipe) through `train`.
vip(best_model, method = "firm", train = training(iris_split))

您需要采用与我在这个回答中概述的相同的方法。

首先对模型进行调优(tune),然后在训练数据上训练您的模型:

library(tidymodels)

# Clean data: recode Species into a two-level factor outcome `class`
# ("setosa" vs. "other") and drop Species in the same mutate() call.
iris <- iris %>%
  mutate(
    class = factor(case_when(Species == "setosa" ~ "setosa", TRUE ~ "other")),
    Species = NULL
  )

# Split data into training and testing sets.
# Fix the RNG seed first: initial_split() and vfold_cv() sample rows at random,
# so without a seed the split, the folds, and everything fitted on them change
# from run to run and the example cannot be reproduced.
set.seed(123)
iris_split <- initial_split(iris, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
# 5-fold cross-validation resamples drawn from the training set only
cv_splits <- vfold_cv(iris_train, v = 5)

# Preprocessing: define (without prepping) a recipe on the training data that
# models `class` from every other column, then add centering of Sepal.Length.
iris_recipe <- recipe(class ~ ., data = iris_train)
iris_recipe <- step_center(iris_recipe, Sepal.Length)

# Specify the ranger random forest: classification mode, 50 trees, with mtry
# left as tune() so it is selected during tuning.
rf_spec <- rand_forest(mtry = tune(), trees = 50) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "impurity")
## don't need any importance here if you will do it another way; probably remove

# Tuning parameters: regular grid over mtry in the range 1 to 4 (levels = 4)
tuning_grid <- mtry(range = c(1, 4)) %>%
  grid_regular(levels = 4)

# Bundle the model specification and the (untrained) recipe into one workflow
iris_wkfl <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(iris_recipe)

# Evaluate every mtry candidate on the cross-validation folds, scored by accuracy
iris_tune <- iris_wkfl %>%
  tune_grid(
    resamples = cv_splits,
    grid = tuning_grid,
    metrics = metric_set(accuracy)
  )
#> 
#> Attaching package: 'rlang'
#> The following objects are masked from 'package:purrr':
#> 
#>     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#>     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#>     splice
#> 
#> Attaching package: 'vctrs'
#> The following object is masked from 'package:tibble':
#> 
#>     data_frame
#> The following object is masked from 'package:dplyr':
#> 
#>     data_frame

# Keep the parameter combination with the best cross-validated accuracy
best_params <- select_best(iris_tune, metric = "accuracy")

# Plug the selected parameters into the workflow and fit it on the training set
rf_fit <- iris_wkfl %>%
  finalize_workflow(best_params) %>%
  fit(data = iris_train)

您的模型现已训练完成,可以计算像 FIRM 这样与模型无关的变量重要性分数。有几个步骤:

  • 从工作流中取出(pull)拟合好的模型。
  • 您必须指定 target/outcome 变量,class
  • 在这种情况下,我们需要同时传入原始训练数据(此处必须使用经过预处理的训练数据,也就是经配方处理后得到的数据)以及用于从 ranger 获取预测值的正确底层函数(对于大多数模型是 predict(),但遗憾的是,对于 ranger 则是 predictions())。
library(vip)
#> 
#> Attaching package: 'vip'
#> The following object is masked from 'package:utils':
#> 
#>     vi
# FIRM variable importance for the fitted model pulled out of the workflow.
# `train` is the recipe's own training data after preprocessing: the recipe is
# prepped and then baked with new_data = NULL.
# NOTE(review): later workflows releases appear to deprecate
# pull_workflow_fit() in favour of extract_fit_parsnip() -- confirm against
# the installed version.
vip(
  pull_workflow_fit(rf_fit),
  method = "firm",
  target = "class",
  metric = "accuracy",
  pred_wrapper = ranger::predictions,
  train = bake(prep(iris_recipe), new_data = NULL)
)

由 reprex 包(v0.3.0.9001)于 2020-12-10 创建