Bootstrap 使用 grouped/nested 数据重采样和整理回归模型
Bootstrap resampling and tidy regression models with grouped/nested data
我正在尝试使用 bootstrapping 估计回归斜率及其置信区间。我想为分组数据做这件事。我正在按照本网站 (https://www.tidymodels.org/learn/statistics/bootstrap/) 上的示例进行操作,但我无法弄清楚如何让它与 grouped/nested 数据一起使用。我不断收到以下信息:
错误:mutate()
列 model
有问题。
ℹ model = map(splits, ~lm(conc ~ yday, data = .))
。
x 对象 'conc' 未找到
library(tidyverse)
library(tidymodels)
dat <-
structure(list(site = c("mb", "mb", "mb", "mb", "mb", "mb", "mb",
"mb", "sp", "sp", "sp", "sp", "sp", "sp", "sp", "sp"), year = c(2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015), yday = c(15, 15, 35, 35, 48, 48, 69,
69, 15, 15, 37, 37, 49, 49, 69, 69), samp_depth_cat2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Mid-2",
"Bottom"), class = "factor"), analyte = c("NO3", "NO3", "NO3",
"NO3", "NO3", "NO3", "NO3", "NO3", "NH4", "NH4", "NH4", "NH4",
"NH4", "NH4", "NH4", "NH4"), conc = c(44.8171069465267, 44.7775358035268,
33.3678662097523, 33.0710828871279, 25.8427604055115, 26.9309658742058,
23.7585524380667, 17.5240386949382, 8.35832733633183, 9.29280745341615,
10.0797380595417, 10.2322058970515, 13.7930668951239, 15.6226805882773,
25.3003042764332, 16.8723637466981)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
set.seed(27)
# This is where I get the error
lm_boot <-
dat %>%
group_by(site, year, samp_depth_cat2, analyte) %>%
nest() %>%
bootstraps(., times = 1000, apparent = TRUE) %>%
mutate(model = map(splits, ~lm(conc ~ yday, data = .)),
coef_info = map(model, tidy))
boot_coefs <-
lm_boot %>%
unnest(coef_info)
percentile_intervals <- int_pctl(lm_boot, coef_info)
percentile_intervals
更新
我尝试映射 bootstrap 函数,然后对该列表列中的拆分进行线性回归。它生成了一个名为 model
的新列,但其中似乎没有任何模型元素。
lm_boot <-
dat %>%
group_by(site, year, samp_depth_cat2, analyte) %>%
nest() %>%
mutate(boots = map(data, ~bootstraps(., times = 1000, apparent = TRUE)),
model = map(boots, "splits", ~lm(conc ~ yday, data = .x)))
有什么想法吗?
我可以帮你分道扬镳。为了模仿 Bootstrap resampling and tidy regression models 中的示例,我添加了 nest()
、mutate()
、unnest()
序列以在每个组中生成引导程序。
您将无法在此结果上直接使用 int_pctl()
,因为它不是从 bootstraps()
.
生成的 rset
对象
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
dat <-
structure(list(site = c("mb", "mb", "mb", "mb", "mb", "mb", "mb",
"mb", "sp", "sp", "sp", "sp", "sp", "sp", "sp", "sp"), year = c(2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015), yday = c(15, 15, 35, 35, 48, 48, 69,
69, 15, 15, 37, 37, 49, 49, 69, 69), samp_depth_cat2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Mid-2",
"Bottom"), class = "factor"), analyte = c("NO3", "NO3", "NO3",
"NO3", "NO3", "NO3", "NO3", "NO3", "NH4", "NH4", "NH4", "NH4",
"NH4", "NH4", "NH4", "NH4"), conc = c(44.8171069465267, 44.7775358035268,
33.3678662097523, 33.0710828871279, 25.8427604055115, 26.9309658742058,
23.7585524380667, 17.5240386949382, 8.35832733633183, 9.29280745341615,
10.0797380595417, 10.2322058970515, 13.7930668951239, 15.6226805882773,
25.3003042764332, 16.8723637466981)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
set.seed(27)
lm_boot <- dat %>%
nest(data = -c(site, year, samp_depth_cat2, analyte)) %>%
mutate(boots = map(data, ~bootstraps(.x, times = 10, apparent = TRUE))) %>%
unnest(boots) %>%
mutate(model = map(splits, ~lm(conc ~ yday, data = analysis(.))),
coef_info = map(model, tidy))
lm_boot %>%
unnest(coef_info)
#> # A tibble: 44 × 13
#> site year samp_depth_cat2 analyte data splits id model term estimate
#> <chr> <dbl> <fct> <chr> <list> <list> <chr> <lis> <chr> <dbl>
#> 1 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 52.0
#> 2 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.512
#> 3 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.5
#> 4 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.438
#> 5 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.7
#> 6 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.462
#> 7 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.5
#> 8 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.445
#> 9 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.0
#> 10 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.452
#> # … with 34 more rows, and 3 more variables: std.error <dbl>, statistic <dbl>,
#> # p.value <dbl>
由 reprex package (v2.0.1)
于 2021-08-30 创建
您可以将引导程序包装在 group_modify
中以将其应用于每个组。
library(tidyverse)
library(tidymodels)
dat <-
structure(list(site = c("mb", "mb", "mb", "mb", "mb", "mb", "mb",
"mb", "sp", "sp", "sp", "sp", "sp", "sp", "sp", "sp"), year = c(2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015), yday = c(15, 15, 35, 35, 48, 48, 69,
69, 15, 15, 37, 37, 49, 49, 69, 69), samp_depth_cat2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Mid-2",
"Bottom"), class = "factor"), analyte = c("NO3", "NO3", "NO3",
"NO3", "NO3", "NO3", "NO3", "NO3", "NH4", "NH4", "NH4", "NH4",
"NH4", "NH4", "NH4", "NH4"), conc = c(44.8171069465267, 44.7775358035268,
33.3678662097523, 33.0710828871279, 25.8427604055115, 26.9309658742058,
23.7585524380667, 17.5240386949382, 8.35832733633183, 9.29280745341615,
10.0797380595417, 10.2322058970515, 13.7930668951239, 15.6226805882773,
25.3003042764332, 16.8723637466981)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
set.seed(27)
dat %>%
group_by(site, year, samp_depth_cat2, analyte) %>%
group_modify(
~ bootstraps(., times = 100, apparent = TRUE) %>%
mutate(
model = map(splits, ~ lm(conc ~ yday, data = .)),
coefs = map(model, tidy)
) %>%
int_pctl(coefs)
)
#> Warning: Recommend at least 1000 non-missing bootstrap resamples for terms:
#> `(Intercept)`, `yday`.
#> Warning: Recommend at least 1000 non-missing bootstrap resamples for terms:
#> `(Intercept)`, `yday`.
#> # A tibble: 4 × 10
#> # Groups: site, year, samp_depth_cat2, analyte [2]
#> site year samp_depth_cat2 analyte term .lower .estimate .upper .alpha
#> <chr> <dbl> <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 mb 2015 Mid-2 NO3 (Intercept) 39.1 49.5 53.2 0.05
#> 2 mb 2015 Mid-2 NO3 yday -0.563 -0.443 -0.241 0.05
#> 3 sp 2015 Bottom NH4 (Intercept) -5.60 3.40 6.68 0.05
#> 4 sp 2015 Bottom NH4 yday 0.138 0.236 0.420 0.05
#> # … with 1 more variable: .method <chr>
我正在尝试使用 bootstrapping 估计回归斜率及其置信区间。我想为分组数据做这件事。我正在按照本网站 (https://www.tidymodels.org/learn/statistics/bootstrap/) 上的示例进行操作,但我无法弄清楚如何让它与 grouped/nested 数据一起使用。我不断收到以下信息:
错误:mutate()
列 model
有问题。
ℹ model = map(splits, ~lm(conc ~ yday, data = .))
。
x 对象 'conc' 未找到
library(tidyverse)
library(tidymodels)
dat <-
structure(list(site = c("mb", "mb", "mb", "mb", "mb", "mb", "mb",
"mb", "sp", "sp", "sp", "sp", "sp", "sp", "sp", "sp"), year = c(2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015), yday = c(15, 15, 35, 35, 48, 48, 69,
69, 15, 15, 37, 37, 49, 49, 69, 69), samp_depth_cat2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Mid-2",
"Bottom"), class = "factor"), analyte = c("NO3", "NO3", "NO3",
"NO3", "NO3", "NO3", "NO3", "NO3", "NH4", "NH4", "NH4", "NH4",
"NH4", "NH4", "NH4", "NH4"), conc = c(44.8171069465267, 44.7775358035268,
33.3678662097523, 33.0710828871279, 25.8427604055115, 26.9309658742058,
23.7585524380667, 17.5240386949382, 8.35832733633183, 9.29280745341615,
10.0797380595417, 10.2322058970515, 13.7930668951239, 15.6226805882773,
25.3003042764332, 16.8723637466981)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
set.seed(27)
# This is where I get the error
lm_boot <-
dat %>%
group_by(site, year, samp_depth_cat2, analyte) %>%
nest() %>%
bootstraps(., times = 1000, apparent = TRUE) %>%
mutate(model = map(splits, ~lm(conc ~ yday, data = .)),
coef_info = map(model, tidy))
boot_coefs <-
lm_boot %>%
unnest(coef_info)
percentile_intervals <- int_pctl(lm_boot, coef_info)
percentile_intervals
更新
我尝试映射 bootstrap 函数,然后对该列表列中的拆分进行线性回归。它生成了一个名为 model
的新列,但其中似乎没有任何模型元素。
lm_boot <-
dat %>%
group_by(site, year, samp_depth_cat2, analyte) %>%
nest() %>%
mutate(boots = map(data, ~bootstraps(., times = 1000, apparent = TRUE)),
model = map(boots, "splits", ~lm(conc ~ yday, data = .x)))
有什么想法吗?
我可以帮你分道扬镳。为了模仿 Bootstrap resampling and tidy regression models 中的示例,我添加了 nest()
、mutate()
、unnest()
序列以在每个组中生成引导程序。
您将无法在此结果上直接使用 int_pctl()
,因为它不是从 bootstraps()
.
rset
对象
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
dat <-
structure(list(site = c("mb", "mb", "mb", "mb", "mb", "mb", "mb",
"mb", "sp", "sp", "sp", "sp", "sp", "sp", "sp", "sp"), year = c(2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015), yday = c(15, 15, 35, 35, 48, 48, 69,
69, 15, 15, 37, 37, 49, 49, 69, 69), samp_depth_cat2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Mid-2",
"Bottom"), class = "factor"), analyte = c("NO3", "NO3", "NO3",
"NO3", "NO3", "NO3", "NO3", "NO3", "NH4", "NH4", "NH4", "NH4",
"NH4", "NH4", "NH4", "NH4"), conc = c(44.8171069465267, 44.7775358035268,
33.3678662097523, 33.0710828871279, 25.8427604055115, 26.9309658742058,
23.7585524380667, 17.5240386949382, 8.35832733633183, 9.29280745341615,
10.0797380595417, 10.2322058970515, 13.7930668951239, 15.6226805882773,
25.3003042764332, 16.8723637466981)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
set.seed(27)
lm_boot <- dat %>%
nest(data = -c(site, year, samp_depth_cat2, analyte)) %>%
mutate(boots = map(data, ~bootstraps(.x, times = 10, apparent = TRUE))) %>%
unnest(boots) %>%
mutate(model = map(splits, ~lm(conc ~ yday, data = analysis(.))),
coef_info = map(model, tidy))
lm_boot %>%
unnest(coef_info)
#> # A tibble: 44 × 13
#> site year samp_depth_cat2 analyte data splits id model term estimate
#> <chr> <dbl> <fct> <chr> <list> <list> <chr> <lis> <chr> <dbl>
#> 1 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 52.0
#> 2 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.512
#> 3 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.5
#> 4 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.438
#> 5 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.7
#> 6 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.462
#> 7 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.5
#> 8 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.445
#> 9 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> (Int… 50.0
#> 10 mb 2015 Mid-2 NO3 <tibb… <split… Boot… <lm> yday -0.452
#> # … with 34 more rows, and 3 more variables: std.error <dbl>, statistic <dbl>,
#> # p.value <dbl>
由 reprex package (v2.0.1)
于 2021-08-30 创建您可以将引导程序包装在 group_modify
中以将其应用于每个组。
library(tidyverse)
library(tidymodels)
dat <-
structure(list(site = c("mb", "mb", "mb", "mb", "mb", "mb", "mb",
"mb", "sp", "sp", "sp", "sp", "sp", "sp", "sp", "sp"), year = c(2015,
2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
2015, 2015, 2015, 2015), yday = c(15, 15, 35, 35, 48, 48, 69,
69, 15, 15, 37, 37, 49, 49, 69, 69), samp_depth_cat2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Mid-2",
"Bottom"), class = "factor"), analyte = c("NO3", "NO3", "NO3",
"NO3", "NO3", "NO3", "NO3", "NO3", "NH4", "NH4", "NH4", "NH4",
"NH4", "NH4", "NH4", "NH4"), conc = c(44.8171069465267, 44.7775358035268,
33.3678662097523, 33.0710828871279, 25.8427604055115, 26.9309658742058,
23.7585524380667, 17.5240386949382, 8.35832733633183, 9.29280745341615,
10.0797380595417, 10.2322058970515, 13.7930668951239, 15.6226805882773,
25.3003042764332, 16.8723637466981)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
set.seed(27)
dat %>%
group_by(site, year, samp_depth_cat2, analyte) %>%
group_modify(
~ bootstraps(., times = 100, apparent = TRUE) %>%
mutate(
model = map(splits, ~ lm(conc ~ yday, data = .)),
coefs = map(model, tidy)
) %>%
int_pctl(coefs)
)
#> Warning: Recommend at least 1000 non-missing bootstrap resamples for terms:
#> `(Intercept)`, `yday`.
#> Warning: Recommend at least 1000 non-missing bootstrap resamples for terms:
#> `(Intercept)`, `yday`.
#> # A tibble: 4 × 10
#> # Groups: site, year, samp_depth_cat2, analyte [2]
#> site year samp_depth_cat2 analyte term .lower .estimate .upper .alpha
#> <chr> <dbl> <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 mb 2015 Mid-2 NO3 (Intercept) 39.1 49.5 53.2 0.05
#> 2 mb 2015 Mid-2 NO3 yday -0.563 -0.443 -0.241 0.05
#> 3 sp 2015 Bottom NH4 (Intercept) -5.60 3.40 6.68 0.05
#> 4 sp 2015 Bottom NH4 yday 0.138 0.236 0.420 0.05
#> # … with 1 more variable: .method <chr>