尝试使用 tidymodels 预测二进制变量等于 1 的概率
Trying to predict the probability of a binary variable being equal to 1 using tidymodels
我试图通过估计一个逻辑回归(没有惩罚)来预测 two_year_recid
的概率,其中包括一个灵活的控制列表,不包括 decile_score
和 race_factor
,但是我一直收到错误提示
Error in eval_tidy(f[[2]], dat) : object '.' not found
这显示在下面代码块的 fit_full
开头的行上
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal()) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test <- test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))
我正在使用的数据和我所做的清理
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
以及我正在使用的库
library(tidyverse)
library(tidymodels)
library(AER)
当您添加步骤 step_dummy(all_nominal())
时,select 编辑了您的 结果 two_year_recid
并将其变成了虚拟变量,因为它是名义变量。一定要说你不想 select 它,可以通过 -two_year_recid
或使用 -all_outcomes()
明确添加它。然后你的模型将适合并预测:
library(tidymodels)
library(tidyverse)
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
#> Warning: Duplicated column names deduplicated: 'decile_score' =>
#> 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> .default = col_character(),
#> id = col_double(),
#> compas_screening_date = col_date(format = ""),
#> dob = col_date(format = ""),
#> age = col_double(),
#> juv_fel_count = col_double(),
#> decile_score = col_double(),
#> juv_misd_count = col_double(),
#> juv_other_count = col_double(),
#> priors_count = col_double(),
#> days_b_screening_arrest = col_double(),
#> c_jail_in = col_datetime(format = ""),
#> c_jail_out = col_datetime(format = ""),
#> c_offense_date = col_date(format = ""),
#> c_arrest_date = col_date(format = ""),
#> c_days_from_compas = col_double(),
#> is_recid = col_double(),
#> r_days_from_arrest = col_double(),
#> r_offense_date = col_date(format = ""),
#> r_jail_in = col_date(format = ""),
#> r_jail_out = col_date(format = "")
#> # ... with 14 more columns
#> )
#> ℹ Use `spec()` for the full column specifications.
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal(), -two_year_recid) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred_class))
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> # A tibble: 1,543 x 2
#> two_year_recid full
#> <fct> <fct>
#> 1 1 0
#> 2 0 0
#> 3 0 0
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 0
#> 9 0 0
#> 10 1 0
#> # … with 1,533 more rows
由 reprex package (v0.3.0.9001)
于 2020-12-09 创建
我试图通过估计一个逻辑回归(没有惩罚)来预测 two_year_recid
的概率,其中包括一个灵活的控制列表,不包括 decile_score
和 race_factor
,但是我一直收到错误提示
Error in eval_tidy(f[[2]], dat) : object '.' not found
这显示在下面代码块的 fit_full
开头的行上
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal()) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test <- test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))
我正在使用的数据和我所做的清理
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
以及我正在使用的库
library(tidyverse)
library(tidymodels)
library(AER)
当您添加步骤 step_dummy(all_nominal())
时,select 编辑了您的 结果 two_year_recid
并将其变成了虚拟变量,因为它是名义变量。一定要说你不想 select 它,可以通过 -two_year_recid
或使用 -all_outcomes()
明确添加它。然后你的模型将适合并预测:
library(tidymodels)
library(tidyverse)
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
#> Warning: Duplicated column names deduplicated: 'decile_score' =>
#> 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> .default = col_character(),
#> id = col_double(),
#> compas_screening_date = col_date(format = ""),
#> dob = col_date(format = ""),
#> age = col_double(),
#> juv_fel_count = col_double(),
#> decile_score = col_double(),
#> juv_misd_count = col_double(),
#> juv_other_count = col_double(),
#> priors_count = col_double(),
#> days_b_screening_arrest = col_double(),
#> c_jail_in = col_datetime(format = ""),
#> c_jail_out = col_datetime(format = ""),
#> c_offense_date = col_date(format = ""),
#> c_arrest_date = col_date(format = ""),
#> c_days_from_compas = col_double(),
#> is_recid = col_double(),
#> r_days_from_arrest = col_double(),
#> r_offense_date = col_date(format = ""),
#> r_jail_in = col_date(format = ""),
#> r_jail_out = col_date(format = "")
#> # ... with 14 more columns
#> )
#> ℹ Use `spec()` for the full column specifications.
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal(), -two_year_recid) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred_class))
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> # A tibble: 1,543 x 2
#> two_year_recid full
#> <fct> <fct>
#> 1 1 0
#> 2 0 0
#> 3 0 0
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 0
#> 9 0 0
#> 10 1 0
#> # … with 1,533 more rows
由 reprex package (v0.3.0.9001)
于 2020-12-09 创建