在 R 中使用 tidymodels 对数据框列表进行 bootstrap(自助抽样)
Bootstrapping using tidymodels from a list of dataframes in R
我正在使用 tidymodels 构建一个模型:先按组拆分数据,再对每个单独的数据框分别运行回归。这很好用。但是,现在我还需要对结果进行 bootstrap。我不确定如何将它整合到现有的代码中。
我的原始代码如下所示:
library(dplyr)

# Fix the RNG seed so the simulated data below is reproducible.
set.seed(2021)

# Simulated data: 10,000 rows with a cycling year, a random group label (0-6),
# a random integer value, and random 0/1 indicators `female` and `smoker`.
n_obs <- 10000
year <- rep(2014:2018, length.out = n_obs)
group <- sample(c(0, 1, 2, 3, 4, 5, 6), replace = TRUE, size = n_obs)
value <- sample(n_obs, replace = TRUE)
female <- sample(c(0, 1), replace = TRUE, size = n_obs)
smoker <- sample(c(0, 1), replace = TRUE, size = n_obs)
dta <- data.frame(
  year = year, group = group, value = value,
  female = female, smoker = smoker
)

# Cut the dataset into a list holding one data frame per year-group combination.
table_list <- dta %>%
  group_by(year, group) %>%
  group_split()

# Fit a probit model of `smoker` on `female` within each subgroup.
model_list <- lapply(table_list, function(x) {
  glm(smoker ~ female, data = x, family = binomial(link = "probit"))
})

# In-sample predicted probabilities for each subgroup model. Dispatch through
# the predict() generic rather than calling the S3 method predict.glm directly.
pred_list <- lapply(model_list, function(fit) predict(fit, type = "response"))
我想通过有放回的 bootstrap 抽样来获得 bootstrap 预测值。我的直觉是,我应该在创建 table_list 时通过创建随机样本来进一步拆分数据集。但我该怎么做呢?
感谢您的帮助。
这相当复杂,既有分组又有 bootstrapping,所以我可能会像下面这样处理:使用两层嵌套的 map():
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip

# Fix the RNG seed so both the simulated data and the bootstrap resamples
# are reproducible from run to run.
set.seed(2021)

year <- rep(2014:2018, length.out = 10000)
group <- sample(c(0, 1, 2, 3, 4, 5, 6), replace = TRUE, size = 10000)
value <- sample(10000, replace = TRUE)
female <- sample(c(0, 1), replace = TRUE, size = 10000)
smoker <- sample(c(0, 1), replace = TRUE, size = 10000)
dta <- tibble(
  year = year, group = group, value = value,
  female = female, smoker = smoker
)

# For one group's data: draw `times` bootstrap resamples, fit a probit model on
# the analysis set of each resample, and predict on that resample's
# assessment (held-out) rows.
fit_boot_models <- function(group_data, times = 20) {
  bootstraps(group_data, times = times) %>%
    mutate(
      model = map(splits, function(rs) {
        glm(
          smoker ~ female,
          data = analysis(rs),
          family = binomial(link = "probit")
        )
      }),
      # type = "response" returns predicted probabilities; the predict.glm
      # default ("link") would return values on the probit scale instead.
      preds = map2(model, splits, function(fit, rs) {
        predict(fit, newdata = assessment(rs), type = "response")
      })
    )
}

# Nest the data by year and group, then bootstrap within each nested group.
glm_boot_mods <-
  dta %>%
  nest(data = c(-year, -group)) %>%
  mutate(boots = map(data, fit_boot_models))
glm_boot_mods
#> # A tibble: 35 × 4
#> year group data boots
#> <int> <dbl> <list> <list>
#> 1 2014 1 <tibble [288 × 3]> <bootstraps [20 × 4]>
#> 2 2015 4 <tibble [273 × 3]> <bootstraps [20 × 4]>
#> 3 2016 3 <tibble [301 × 3]> <bootstraps [20 × 4]>
#> 4 2017 2 <tibble [282 × 3]> <bootstraps [20 × 4]>
#> 5 2018 0 <tibble [276 × 3]> <bootstraps [20 × 4]>
#> 6 2014 3 <tibble [279 × 3]> <bootstraps [20 × 4]>
#> 7 2016 2 <tibble [314 × 3]> <bootstraps [20 × 4]>
#> 8 2018 1 <tibble [296 × 3]> <bootstraps [20 × 4]>
#> 9 2014 0 <tibble [304 × 3]> <bootstraps [20 × 4]>
#> 10 2015 6 <tibble [288 × 3]> <bootstraps [20 × 4]>
#> # … with 25 more rows
第一个 map() 为每个分组创建 bootstrap 重采样;然后再深入一层,对每个重采样拟合模型,并对该重采样中的留出(held-out)观测值进行预测。您可以在这里看到第一组的内部结构:
# Show the nested bootstrap results stored in the first row of glm_boot_mods.
pull(head(glm_boot_mods, 1), boots)
#> [[1]]
#> # Bootstrap sampling
#> # A tibble: 20 × 4
#> splits id model preds
#> <list> <chr> <list> <list>
#> 1 <split [288/111]> Bootstrap01 <glm> <dbl [111]>
#> 2 <split [288/93]> Bootstrap02 <glm> <dbl [93]>
#> 3 <split [288/103]> Bootstrap03 <glm> <dbl [103]>
#> 4 <split [288/106]> Bootstrap04 <glm> <dbl [106]>
#> 5 <split [288/109]> Bootstrap05 <glm> <dbl [109]>
#> 6 <split [288/109]> Bootstrap06 <glm> <dbl [109]>
#> 7 <split [288/92]> Bootstrap07 <glm> <dbl [92]>
#> 8 <split [288/111]> Bootstrap08 <glm> <dbl [111]>
#> 9 <split [288/99]> Bootstrap09 <glm> <dbl [99]>
#> 10 <split [288/111]> Bootstrap10 <glm> <dbl [111]>
#> 11 <split [288/102]> Bootstrap11 <glm> <dbl [102]>
#> 12 <split [288/104]> Bootstrap12 <glm> <dbl [104]>
#> 13 <split [288/115]> Bootstrap13 <glm> <dbl [115]>
#> 14 <split [288/111]> Bootstrap14 <glm> <dbl [111]>
#> 15 <split [288/108]> Bootstrap15 <glm> <dbl [108]>
#> 16 <split [288/110]> Bootstrap16 <glm> <dbl [110]>
#> 17 <split [288/110]> Bootstrap17 <glm> <dbl [110]>
#> 18 <split [288/111]> Bootstrap18 <glm> <dbl [111]>
#> 19 <split [288/103]> Bootstrap19 <glm> <dbl [103]>
#> 20 <split [288/109]> Bootstrap20 <glm> <dbl [109]>
创建于 2021-11-02,由 reprex package (v2.0.1) 生成
请注意,每个重采样的留出观测值都有对应的预测。根据您接下来要做的事情,您可以对 glm_boot_mods 中需要处理的列使用 unnest()。
我正在使用 tidymodels 构建一个模型:先按组拆分数据,再对每个单独的数据框分别运行回归。这很好用。但是,现在我还需要对结果进行 bootstrap。我不确定如何将它整合到现有的代码中。
我的原始代码如下所示:
library(dplyr)

# Fix the RNG seed so the simulated data below is reproducible.
set.seed(2021)

# Simulated data: 10,000 rows with a cycling year, a random group label (0-6),
# a random integer value, and random 0/1 indicators `female` and `smoker`.
n_obs <- 10000
year <- rep(2014:2018, length.out = n_obs)
group <- sample(c(0, 1, 2, 3, 4, 5, 6), replace = TRUE, size = n_obs)
value <- sample(n_obs, replace = TRUE)
female <- sample(c(0, 1), replace = TRUE, size = n_obs)
smoker <- sample(c(0, 1), replace = TRUE, size = n_obs)
dta <- data.frame(
  year = year, group = group, value = value,
  female = female, smoker = smoker
)

# Cut the dataset into a list holding one data frame per year-group combination.
table_list <- dta %>%
  group_by(year, group) %>%
  group_split()

# Fit a probit model of `smoker` on `female` within each subgroup.
model_list <- lapply(table_list, function(x) {
  glm(smoker ~ female, data = x, family = binomial(link = "probit"))
})

# In-sample predicted probabilities for each subgroup model. Dispatch through
# the predict() generic rather than calling the S3 method predict.glm directly.
pred_list <- lapply(model_list, function(fit) predict(fit, type = "response"))
我想通过有放回的 bootstrap 抽样来获得 bootstrap 预测值。我的直觉是,我应该在创建 table_list 时通过创建随机样本来进一步拆分数据集。但我该怎么做呢?
感谢您的帮助。
这相当复杂,既有分组又有 bootstrapping,所以我可能会像下面这样处理:使用两层嵌套的 map():
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip

# Fix the RNG seed so both the simulated data and the bootstrap resamples
# are reproducible from run to run.
set.seed(2021)

year <- rep(2014:2018, length.out = 10000)
group <- sample(c(0, 1, 2, 3, 4, 5, 6), replace = TRUE, size = 10000)
value <- sample(10000, replace = TRUE)
female <- sample(c(0, 1), replace = TRUE, size = 10000)
smoker <- sample(c(0, 1), replace = TRUE, size = 10000)
dta <- tibble(
  year = year, group = group, value = value,
  female = female, smoker = smoker
)

# For one group's data: draw `times` bootstrap resamples, fit a probit model on
# the analysis set of each resample, and predict on that resample's
# assessment (held-out) rows.
fit_boot_models <- function(group_data, times = 20) {
  bootstraps(group_data, times = times) %>%
    mutate(
      model = map(splits, function(rs) {
        glm(
          smoker ~ female,
          data = analysis(rs),
          family = binomial(link = "probit")
        )
      }),
      # type = "response" returns predicted probabilities; the predict.glm
      # default ("link") would return values on the probit scale instead.
      preds = map2(model, splits, function(fit, rs) {
        predict(fit, newdata = assessment(rs), type = "response")
      })
    )
}

# Nest the data by year and group, then bootstrap within each nested group.
glm_boot_mods <-
  dta %>%
  nest(data = c(-year, -group)) %>%
  mutate(boots = map(data, fit_boot_models))
glm_boot_mods
#> # A tibble: 35 × 4
#> year group data boots
#> <int> <dbl> <list> <list>
#> 1 2014 1 <tibble [288 × 3]> <bootstraps [20 × 4]>
#> 2 2015 4 <tibble [273 × 3]> <bootstraps [20 × 4]>
#> 3 2016 3 <tibble [301 × 3]> <bootstraps [20 × 4]>
#> 4 2017 2 <tibble [282 × 3]> <bootstraps [20 × 4]>
#> 5 2018 0 <tibble [276 × 3]> <bootstraps [20 × 4]>
#> 6 2014 3 <tibble [279 × 3]> <bootstraps [20 × 4]>
#> 7 2016 2 <tibble [314 × 3]> <bootstraps [20 × 4]>
#> 8 2018 1 <tibble [296 × 3]> <bootstraps [20 × 4]>
#> 9 2014 0 <tibble [304 × 3]> <bootstraps [20 × 4]>
#> 10 2015 6 <tibble [288 × 3]> <bootstraps [20 × 4]>
#> # … with 25 more rows
第一个 map() 为每个分组创建 bootstrap 重采样;然后再深入一层,对每个重采样拟合模型,并对该重采样中的留出(held-out)观测值进行预测。您可以在这里看到第一组的内部结构:
# Show the nested bootstrap results stored in the first row of glm_boot_mods.
pull(head(glm_boot_mods, 1), boots)
#> [[1]]
#> # Bootstrap sampling
#> # A tibble: 20 × 4
#> splits id model preds
#> <list> <chr> <list> <list>
#> 1 <split [288/111]> Bootstrap01 <glm> <dbl [111]>
#> 2 <split [288/93]> Bootstrap02 <glm> <dbl [93]>
#> 3 <split [288/103]> Bootstrap03 <glm> <dbl [103]>
#> 4 <split [288/106]> Bootstrap04 <glm> <dbl [106]>
#> 5 <split [288/109]> Bootstrap05 <glm> <dbl [109]>
#> 6 <split [288/109]> Bootstrap06 <glm> <dbl [109]>
#> 7 <split [288/92]> Bootstrap07 <glm> <dbl [92]>
#> 8 <split [288/111]> Bootstrap08 <glm> <dbl [111]>
#> 9 <split [288/99]> Bootstrap09 <glm> <dbl [99]>
#> 10 <split [288/111]> Bootstrap10 <glm> <dbl [111]>
#> 11 <split [288/102]> Bootstrap11 <glm> <dbl [102]>
#> 12 <split [288/104]> Bootstrap12 <glm> <dbl [104]>
#> 13 <split [288/115]> Bootstrap13 <glm> <dbl [115]>
#> 14 <split [288/111]> Bootstrap14 <glm> <dbl [111]>
#> 15 <split [288/108]> Bootstrap15 <glm> <dbl [108]>
#> 16 <split [288/110]> Bootstrap16 <glm> <dbl [110]>
#> 17 <split [288/110]> Bootstrap17 <glm> <dbl [110]>
#> 18 <split [288/111]> Bootstrap18 <glm> <dbl [111]>
#> 19 <split [288/103]> Bootstrap19 <glm> <dbl [103]>
#> 20 <split [288/109]> Bootstrap20 <glm> <dbl [109]>
创建于 2021-11-02,由 reprex package (v2.0.1) 生成
请注意,每个重采样的留出观测值都有对应的预测。根据您接下来要做的事情,您可以对 glm_boot_mods 中需要处理的列使用 unnest()。