tidymodels:在列中发现了新的因子水平

tidymodels Novel levels found in column

我正在使用 tidymodels 创建随机森林(random forest)预测模型。我的测试数据包含训练数据中不存在的新因子水平,这会导致以下警告和错误:

1: Novel levels found in column 'Siblings': '4'. The levels have been removed, and values have been coerced to 'NA'. 
2: There are new levels in a factor: NA 
> test_predict
Fehler: Objekt 'test_predict' nicht gefunden

我尝试在配方中对 Siblings 列使用 step_novel() 和 step_dummy(),但这并没有解决错误。我应该如何处理训练数据中不存在的新因子水平?

library(tidyverse)
library(tidymodels)

# Training data: the outcome (Survived) and three factor predictors.
# Siblings only takes the values 0, 1 and 3 here.
data <- data.frame(
  Survived = factor(c(0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0)),
  Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 3)),
  Class = factor(c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)),
  Embarked = factor(c(
    "s", "c", "m", "m", "s", "c", "s", "m",
    "m", "s", "s", "s", "s", "s", "s", "s"
  ))
)

# Data to predict on: same predictors, no outcome column.
# The last Siblings value (4) never occurs in the training data.
test <- data.frame(
  Siblings = factor(c(1, 1, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 0, 0, 0, 4)),
  Class = factor(c(0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)),
  Embarked = factor(c(
    "s", "c", "m", "m", "s", "c", "s", "m",
    "m", "s", "s", "s", "s", "s", "s", "s"
  ))
)

# Model specification: random forest, ranger engine, classification mode.
rf_model <- rand_forest(mtry = 3, trees = 1000, min_n = 15) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

# Recipe: step_novel() followed by step_dummy(), both on Siblings.
data_recipe <- recipe(Survived ~ Siblings + Class + Embarked, data = data) %>%
  step_novel(Siblings) %>%
  step_dummy(Siblings)

# Workflow: bundle the recipe and the model specification.
rf_workflow <- workflow() %>%
  add_recipe(data_recipe) %>%
  add_model(rf_model)

# Fit on the training data and print the fitted workflow.
final_model <- fit(rf_workflow, data)
final_model

# Predicting on `test` is the call for which the question reports the
# novel-level warnings; `test_predict` is then reported as not found.
test_predict <- predict(final_model, test)
test_predict

回答我自己的问题:

我们需要先应用 step_novel,然后应用 step_unknown。据我从文档中了解到,step_novel 会将数据中出现的任何新因子水平标记为 "new"。这样在应用配方后检查数据时,就可以轻松识别此类水平。step_unknown 会从数据中移除任何此类水平,并在应用模型时将其值转换为 NA:

# Proposed recipe: apply step_novel() and then step_unknown() to Siblings.
data_recipe <- recipe(Survived ~ Siblings + Class + Embarked, data = data) %>%
  step_novel(Siblings) %>%
  step_unknown(Siblings)

如果您查看 step_novel() 的文档,会注意到其中写道:

When fitting a model that can deal with new factor levels, consider using workflows::add_recipe() with allow_novel_levels = TRUE set in hardhat::default_recipe_blueprint(). This will allow your model to handle new levels at prediction time, instead of throwing warnings or errors.

所以你想这样做:

# Reprex for the answer: the same data, model and recipe as in the question,
# but the recipe is added to the workflow with a blueprint that sets
# allow_novel_levels = TRUE. Lines starting with `#>` are captured output.
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip

# Training data: the outcome (Survived) and three factor predictors.
# Siblings only takes the values 0, 1 and 3 here.
data <-
  data.frame(
    Survived = as.factor(c(0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0)),
    Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,3)),
    Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
    Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s")) 
  )

# Data to predict on: same predictors, no outcome column.
test <-
  data.frame(
    Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,4)), # New factor level: 4 is absent from `data`
    Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
    Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s")) 
  )

# Model: random forest specification using the ranger engine in
# classification mode, with impurity-based variable importance requested.
rf_model <-
  rand_forest() %>%
  set_args(
    mtry = 3,
    trees = 1000,
    min_n = 15
  ) %>%
  set_engine("ranger", 
             importance = "impurity") %>%
  set_mode("classification")

# Recipe: step_novel() followed by step_dummy(), both applied to Siblings.
data_recipe <- 
  recipe(Survived ~Siblings + Class + Embarked, data=data) %>%
  step_novel(Siblings) %>%
  step_dummy(Siblings)

# Workflow: the recipe is added with a hardhat blueprint that has
# allow_novel_levels = TRUE; this blueprint argument is the change this
# answer introduces.
rf_workflow <- 
  workflow() %>%
  add_recipe(data_recipe, 
             blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE)) %>%
  add_model(rf_model)

# Fit the workflow on the training data and print the fitted object.
final_model <- fit(rf_workflow, data)
final_model
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#> 
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 2 Recipe Steps
#> 
#> • step_novel()
#> • step_dummy()
#> 
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Ranger result
#> 
#> Call:
#>  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~3,      x), num.trees = ~1000, min.node.size = min_rows(~15, x),      importance = ~"impurity", num.threads = 1, verbose = FALSE,      seed = sample.int(10^5, 1), probability = TRUE) 
#> 
#> Type:                             Probability estimation 
#> Number of trees:                  1000 
#> Sample size:                      16 
#> Number of independent variables:  5 
#> Mtry:                             3 
#> Target node size:                 15 
#> Variable importance mode:         impurity 
#> Splitrule:                        gini 
#> OOB prediction error (Brier s.):  0.254242

# Predict on the data containing the unseen Siblings level; the captured
# output below shows a class prediction for all 16 rows, including row 16.
test_predict <- predict(final_model, test)
test_predict
#> # A tibble: 16 x 1
#>    .pred_class
#>    <fct>      
#>  1 0          
#>  2 1          
#>  3 0          
#>  4 1          
#>  5 0          
#>  6 0          
#>  7 0          
#>  8 0          
#>  9 0          
#> 10 1          
#> 11 0          
#> 12 1          
#> 13 0          
#> 14 0          
#> 15 0          
#> 16 0

由 reprex 包 (v2.0.0) 于 2021-07-09 创建

工作流函数对新数据的因子水平和其他方面非常严格,确保它们与训练数据匹配。