尝试使用 tidymodels 预测二进制变量等于 1 的概率

Trying to predict the probability of a binary variable being equal to 1 using tidymodels

我试图通过估计一个逻辑回归(没有惩罚)来预测 two_year_recid 的概率,其中包括一个灵活的控制列表,不包括 decile_scorerace_factor,但是我一直收到错误提示

Error in eval_tidy(f[[2]], dat) : object '.' not found

这显示在下面代码块的 fit_full 开头的行上

rec_full <- recipe(
  two_year_recid ~ ., 
  data = train
  ) %>% 
  step_dummy(all_nominal()) %>% 
  step_interact(~ all_predictors() * all_predictors()) %>% 
  step_poly(age, degree = 3) %>% 
  step_normalize(all_predictors()) %>% 
  step_nzv(all_predictors())

mod_lm <- logistic_reg() %>% 
  set_engine('glm')

wf_full <- workflow() %>% 
  add_recipe(rec_full) %>% 
  add_model(mod_lm)

fit_full <- wf_full %>% fit(data = train)

test <- test %>%
  select(two_year_recid) %>% 
  bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))

我正在使用的数据和我所做的清理

raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")

## Main working data
df <- raw %>% 
        filter(days_b_screening_arrest <= 30) %>%
        filter(days_b_screening_arrest >= -30) %>%
        filter(is_recid != -1) %>%
        filter(c_charge_degree != "O") %>%
        filter(score_text != 'N/A')

## clean main working data a bit more
df <- df %>% 
  mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
         charge_factor = fct_explicit_na(c_charge_desc),
         race_factor = fct_explicit_na(race),
         race_factor = fct_relevel(race_factor, "Caucasian"),
         charge_factor = fct_lump_min(charge_factor, 30),
         sex_factor = factor(sex, levels = c("Female","Male")),
         priors_factor = ifelse(priors_count > 20, 20, priors_count),
         priors_factor = factor(priors_factor),
         two_year_recid = factor(two_year_recid)) %>% 
  select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor,  decile_score, length_of_stay) 

feature_names <- names(df)[-c(1,10,11)]


dfn = subset(df, select = -c(decile_score, race_factor))

set.seed(5281110)

split <- initial_split(dfn, p = 0.75)
train <- training(split)
test  <- testing(split)

以及我正在使用的库

library(tidyverse)
library(tidymodels)
library(AER)

当您添加步骤 step_dummy(all_nominal()) 时,select 编辑了您的 结果 two_year_recid 并将其变成了虚拟变量,因为它是名义变量。一定要说你不想 select 它,可以通过 -two_year_recid 或使用 -all_outcomes() 明确添加它。然后你的模型将适合并预测:

library(tidymodels)
library(tidyverse)

raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
#> Warning: Duplicated column names deduplicated: 'decile_score' =>
#> 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   .default = col_character(),
#>   id = col_double(),
#>   compas_screening_date = col_date(format = ""),
#>   dob = col_date(format = ""),
#>   age = col_double(),
#>   juv_fel_count = col_double(),
#>   decile_score = col_double(),
#>   juv_misd_count = col_double(),
#>   juv_other_count = col_double(),
#>   priors_count = col_double(),
#>   days_b_screening_arrest = col_double(),
#>   c_jail_in = col_datetime(format = ""),
#>   c_jail_out = col_datetime(format = ""),
#>   c_offense_date = col_date(format = ""),
#>   c_arrest_date = col_date(format = ""),
#>   c_days_from_compas = col_double(),
#>   is_recid = col_double(),
#>   r_days_from_arrest = col_double(),
#>   r_offense_date = col_date(format = ""),
#>   r_jail_in = col_date(format = ""),
#>   r_jail_out = col_date(format = "")
#>   # ... with 14 more columns
#> )
#> ℹ Use `spec()` for the full column specifications.

## Main working data
df <- raw %>% 
  filter(days_b_screening_arrest <= 30) %>%
  filter(days_b_screening_arrest >= -30) %>%
  filter(is_recid != -1) %>%
  filter(c_charge_degree != "O") %>%
  filter(score_text != 'N/A')

## clean main working data a bit more
df <- df %>% 
  mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
         charge_factor = fct_explicit_na(c_charge_desc),
         race_factor = fct_explicit_na(race),
         race_factor = fct_relevel(race_factor, "Caucasian"),
         charge_factor = fct_lump_min(charge_factor, 30),
         sex_factor = factor(sex, levels = c("Female","Male")),
         priors_factor = ifelse(priors_count > 20, 20, priors_count),
         priors_factor = factor(priors_factor),
         two_year_recid = factor(two_year_recid)) %>% 
  select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor,  decile_score, length_of_stay) 

feature_names <- names(df)[-c(1,10,11)]


dfn = subset(df, select = -c(decile_score, race_factor))

set.seed(5281110)

split <- initial_split(dfn, p = 0.75)
train <- training(split)
test  <- testing(split)

rec_full <- recipe(
  two_year_recid ~ ., 
  data = train
) %>% 
  step_dummy(all_nominal(), -two_year_recid) %>% 
  step_interact(~ all_predictors() * all_predictors()) %>% 
  step_poly(age, degree = 3) %>% 
  step_normalize(all_predictors()) %>% 
  step_nzv(all_predictors())

mod_lm <- logistic_reg() %>% 
  set_engine('glm')

wf_full <- workflow() %>% 
  add_recipe(rec_full) %>% 
  add_model(mod_lm)

fit_full <- wf_full %>% fit(data = train)

test %>%
  select(two_year_recid) %>% 
  bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred_class))
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> # A tibble: 1,543 x 2
#>    two_year_recid full 
#>    <fct>          <fct>
#>  1 1              0    
#>  2 0              0    
#>  3 0              0    
#>  4 1              1    
#>  5 1              1    
#>  6 1              1    
#>  7 1              1    
#>  8 1              0    
#>  9 0              0    
#> 10 1              0    
#> # … with 1,533 more rows

reprex package (v0.3.0.9001)

于 2020-12-09 创建