如何提取 Tidymodels 产生的 GLMNET 系数
How to extract GLMNET coefficients produced by Tidymodels
我使用 tidymodels 估计了 glmnet 逻辑回归。但是我无法在 tidymodels:
中找出两件密切相关的事情
- a) 如何提取估计系数
- b) 保存估计模型以供将来生产使用。
下面是伪模型的代码。我尝试了 tidy()
、coef()
和 predict()
,但都失败了。任何帮助都感激不尽。谢谢。
library(tidymodels)
#> -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.1.0 --
#> v broom 0.7.0 v recipes 0.1.13
#> v dials 0.0.8 v rsample 0.0.7
#> v dplyr 1.0.0 v tibble 3.0.3
#> v ggplot2 3.3.2 v tune 0.1.1
#> v infer 0.5.2 v workflows 0.1.2
#> v parsnip 0.1.2 v yardstick 0.0.7
#> v purrr 0.3.4
#> -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
set.seed(1234)
train <- tibble(y = factor(sample(c(0,1), 1000, replace = TRUE)),
x1 = rnorm(1000),
x2 = rnorm(1000)
)
test <- tibble(y = factor(sample(c(0,1), 300, replace = TRUE)),
x1 = rnorm(300),
x2 = rnorm(300)
)
lr_cv <- vfold_cv(train)
lr_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
lr_wf <- workflow() %>%
add_model(lr_mod) %>%
add_formula(y ~ .)
lr_grid <- tibble(penalty = 10^seq(-4,1, length.out = 5))
lr_tune <- tune_grid(lr_wf, resamples = lr_cv, grid = lr_grid, metrics = metric_set(roc_auc))
lr_best <- select_best(lr_tune)
lr_best
#> # A tibble: 1 x 2
#> penalty .config
#> <dbl> <chr>
#> 1 0.0001 Model1
lr_wf_final <- lr_wf %>% finalize_workflow(lr_best)
final_mod <- lr_wf_final %>% fit(train)
# the followings were tried but didn't work
lambda <- lr_best[1]
lambda
#> # A tibble: 1 x 1
#> penalty
#> <dbl>
#> 1 0.0001
predict(final.mod , s = lambda, type ="coefficients")[1:3 ,]
#> Error in predict(final.mod, s = lambda, type = "coefficients"): object 'final.mod' not found
tidy(final_mod)
#> # A tibble: 35 x 5
#> term step estimate lambda dev.ratio
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 1 -0.0560 0.0183 1.71e-14
#> 2 (Intercept) 2 -0.0561 0.0167 1.65e- 4
#> 3 (Intercept) 3 -0.0562 0.0152 3.02e- 4
#> 4 (Intercept) 4 -0.0562 0.0139 4.16e- 4
#> 5 (Intercept) 5 -0.0563 0.0126 5.10e- 4
#> 6 (Intercept) 6 -0.0564 0.0115 5.89e- 4
#> 7 (Intercept) 7 -0.0564 0.0105 6.54e- 4
#> 8 (Intercept) 8 -0.0565 0.00956 7.08e- 4
#> 9 (Intercept) 9 -0.0565 0.00871 7.53e- 4
#> 10 (Intercept) 10 -0.0566 0.00794 7.90e- 4
#> # ... with 25 more rows
coef(final_mod)
#> NULL
由 reprex package (v0.3.0)
于 2020-07-25 创建
我们最近添加了一些更新的支持来获取估计系数。
library(tidymodels)
#> ── Attaching packages ─────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom 0.7.0 ✓ recipes 0.1.13
#> ✓ dials 0.0.8 ✓ rsample 0.0.7
#> ✓ dplyr 1.0.0 ✓ tibble 3.0.3
#> ✓ ggplot2 3.3.2 ✓ tidyr 1.1.0
#> ✓ infer 0.5.3 ✓ tune 0.1.1.9000
#> ✓ modeldata 0.0.2 ✓ workflows 0.1.2
#> ✓ parsnip 0.1.2.9000 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
set.seed(1234)
train <- tibble(y = factor(sample(c(0,1), 1000, replace = TRUE)),
x1 = as.numeric(y) + rnorm(1000),
x2 = rnorm(1000),
x3 = rnorm(1000)
)
lr_cv <- vfold_cv(train)
lr_grid <- tibble(penalty = 10^seq(-4,1, length.out = 5))
lr_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
lr_wf <- workflow() %>%
add_model(lr_mod) %>%
add_formula(y ~ .)
lr_best <- lr_wf %>%
tune_grid(
resamples = lr_cv,
grid = lr_grid
) %>%
select_best("roc_auc")
lr_wf %>%
finalize_workflow(lr_best) %>%
fit(train) %>%
pull_workflow_fit() %>%
tidy()
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#> Loaded glmnet 4.0-2
#> # A tibble: 4 x 3
#> term estimate penalty
#> <chr> <dbl> <dbl>
#> 1 (Intercept) -1.30 0.0316
#> 2 x1 0.834 0.0316
#> 3 x2 0 0.0316
#> 4 x3 0 0.0316
由 reprex package (v0.3.0.9001)
于 2020 年 7 月 30 日创建
提取系数的步骤是拉出拟合的对象,然后整理它。
如果你想保存这个对象以供将来使用,你可以只保存这些系数(毕竟它是一个线性模型),或者你可以将最终和拟合的工作流保存为一个二进制对象,这可以预测自。
我使用 tidymodels 估计了 glmnet 逻辑回归。但是我无法在 tidymodels:
中找出两件密切相关的事情- a) 如何提取估计系数
- b) 保存估计模型以供将来生产使用。
下面是伪模型的代码。我尝试了 tidy()
、coef()
和 predict()
,但都失败了。任何帮助都感激不尽。谢谢。
library(tidymodels)
#> -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.1.0 --
#> v broom 0.7.0 v recipes 0.1.13
#> v dials 0.0.8 v rsample 0.0.7
#> v dplyr 1.0.0 v tibble 3.0.3
#> v ggplot2 3.3.2 v tune 0.1.1
#> v infer 0.5.2 v workflows 0.1.2
#> v parsnip 0.1.2 v yardstick 0.0.7
#> v purrr 0.3.4
#> -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
set.seed(1234)
train <- tibble(y = factor(sample(c(0,1), 1000, replace = TRUE)),
x1 = rnorm(1000),
x2 = rnorm(1000)
)
test <- tibble(y = factor(sample(c(0,1), 300, replace = TRUE)),
x1 = rnorm(300),
x2 = rnorm(300)
)
lr_cv <- vfold_cv(train)
lr_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
lr_wf <- workflow() %>%
add_model(lr_mod) %>%
add_formula(y ~ .)
lr_grid <- tibble(penalty = 10^seq(-4,1, length.out = 5))
lr_tune <- tune_grid(lr_wf, resamples = lr_cv, grid = lr_grid, metrics = metric_set(roc_auc))
lr_best <- select_best(lr_tune)
lr_best
#> # A tibble: 1 x 2
#> penalty .config
#> <dbl> <chr>
#> 1 0.0001 Model1
lr_wf_final <- lr_wf %>% finalize_workflow(lr_best)
final_mod <- lr_wf_final %>% fit(train)
# the followings were tried but didn't work
lambda <- lr_best[1]
lambda
#> # A tibble: 1 x 1
#> penalty
#> <dbl>
#> 1 0.0001
predict(final.mod , s = lambda, type ="coefficients")[1:3 ,]
#> Error in predict(final.mod, s = lambda, type = "coefficients"): object 'final.mod' not found
tidy(final_mod)
#> # A tibble: 35 x 5
#> term step estimate lambda dev.ratio
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 1 -0.0560 0.0183 1.71e-14
#> 2 (Intercept) 2 -0.0561 0.0167 1.65e- 4
#> 3 (Intercept) 3 -0.0562 0.0152 3.02e- 4
#> 4 (Intercept) 4 -0.0562 0.0139 4.16e- 4
#> 5 (Intercept) 5 -0.0563 0.0126 5.10e- 4
#> 6 (Intercept) 6 -0.0564 0.0115 5.89e- 4
#> 7 (Intercept) 7 -0.0564 0.0105 6.54e- 4
#> 8 (Intercept) 8 -0.0565 0.00956 7.08e- 4
#> 9 (Intercept) 9 -0.0565 0.00871 7.53e- 4
#> 10 (Intercept) 10 -0.0566 0.00794 7.90e- 4
#> # ... with 25 more rows
coef(final_mod)
#> NULL
由 reprex package (v0.3.0)
于 2020-07-25 创建我们最近添加了一些更新的支持来获取估计系数。
library(tidymodels)
#> ── Attaching packages ─────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom 0.7.0 ✓ recipes 0.1.13
#> ✓ dials 0.0.8 ✓ rsample 0.0.7
#> ✓ dplyr 1.0.0 ✓ tibble 3.0.3
#> ✓ ggplot2 3.3.2 ✓ tidyr 1.1.0
#> ✓ infer 0.5.3 ✓ tune 0.1.1.9000
#> ✓ modeldata 0.0.2 ✓ workflows 0.1.2
#> ✓ parsnip 0.1.2.9000 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
set.seed(1234)
train <- tibble(y = factor(sample(c(0,1), 1000, replace = TRUE)),
x1 = as.numeric(y) + rnorm(1000),
x2 = rnorm(1000),
x3 = rnorm(1000)
)
lr_cv <- vfold_cv(train)
lr_grid <- tibble(penalty = 10^seq(-4,1, length.out = 5))
lr_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_mode("classification") %>%
set_engine("glmnet")
lr_wf <- workflow() %>%
add_model(lr_mod) %>%
add_formula(y ~ .)
lr_best <- lr_wf %>%
tune_grid(
resamples = lr_cv,
grid = lr_grid
) %>%
select_best("roc_auc")
lr_wf %>%
finalize_workflow(lr_best) %>%
fit(train) %>%
pull_workflow_fit() %>%
tidy()
#> Loading required package: Matrix
#>
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#>
#> expand, pack, unpack
#> Loaded glmnet 4.0-2
#> # A tibble: 4 x 3
#> term estimate penalty
#> <chr> <dbl> <dbl>
#> 1 (Intercept) -1.30 0.0316
#> 2 x1 0.834 0.0316
#> 3 x2 0 0.0316
#> 4 x3 0 0.0316
由 reprex package (v0.3.0.9001)
于 2020 年 7 月 30 日创建提取系数的步骤是拉出拟合的对象,然后整理它。
如果你想保存这个对象以供将来使用,你可以只保存这些系数(毕竟它是一个线性模型),或者你可以将最终和拟合的工作流保存为一个二进制对象,这可以预测自。