在嵌套数据列中对统计量进行 bootstrap,并以整洁(tidy)格式提取结果
Bootstrapping a statistic in a nested data column and retrieving results in tidy format
我正在尝试以更整洁的方式进行一些 bootstrapping(我知道如何在 base R 中进行此操作并获得结果,但我想知道如何将所有内容放入一个更整洁的管道)。
首先我定义了两个函数:一个用于计算要 bootstrap 的统计量,另一个用于执行 bootstrap 本身:
library(boot)
library(tidyverse)
#' Bootstrap statistic: each column's share of the summed column means
#'
#' Has the `(data, i)` signature expected of a `statistic` function by
#' `boot::boot()`.
#'
#' @param data A data frame (or tibble) whose columns are averaged.
#' @param i Row indices selecting the bootstrap resample.
#' @return An unnamed numeric vector with one element per column of `data`:
#'   the column mean over rows `i` divided by the sum of all column means.
share <- function(data, i) {
  # drop = FALSE keeps a one-column base data.frame two-dimensional.
  resample <- data[i, , drop = FALSE]
  col_means <- vapply(resample, mean, numeric(1))
  # The earlier summarize(value / sum(value)) returned several rows from
  # summarize(), which dplyr 1.1.0 deprecated; plain vector arithmetic
  # yields the same values without the warning on every replicate.
  unname(col_means / sum(col_means))
}
#' Run a bootstrap by delegating to `boot()`
#'
#' @param data Data forwarded to `boot()` as its `data` argument.
#' @param statistic Function forwarded to `boot()` as its `statistic` argument.
#' @param R Value forwarded to `boot()` as its `R` argument.
#' @return The object returned by `boot()`.
boot_results <- function(data, statistic, R) {
  boot(data = data, statistic = statistic, R = R)
}
然后我想对一些嵌套数据 bootstrap 这个统计量,基本上就是对每一行分别做 bootstrap:
# Build the toy data set
set.seed(1)
df <- tibble(
  country = rep(1:3, each = 20),
  group = rep(rep(1:2, each = 10), 3),
  value1 = runif(60),
  value2 = runif(60),
  value3 = runif(60)
)

# Run the bootstrap once per country/group cell, then keep only the `t0`
# and `t` components of each result
df2 <- df %>%
  nest(data = -c(country, group)) %>%
  mutate(results = map(data, ~ boot_results(.x, share, 5))) %>%
  hoist(results, "t0", "t") %>%
  select(-results)
这为我提供了 t、t0 和 data 这三个嵌套的列表列。
# A tibble: 6 x 5
country group data t0 t
<int> <int> <list> <list> <list>
1 1 1 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
2 1 2 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
3 2 1 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
4 2 2 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
5 3 1 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
6 3 2 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
我现在想做几件事:
- 对这三列做类似 unnest_longer 的操作,这样每个 country/group 组合最终得到三行。
- 从 data 列中提取列名。
我可能把事情复杂化了很多,但我卡在了最后一步,我不知道如何 pivot/unnest 列表列。
预期结果:
# A tibble: 18 x 5
country group data_names t0 t
<int> <int> <chr> <dbl> <list>
1 1 1 value1 0.355 <dbl [5 x 1]>
2 1 1 value2 0.329 <dbl [5 x 1]>
3 1 1 value3 0.315 <dbl [5 x 1]>
4 1 2 value1 0.324 <dbl [5 x 1]>
5 1 2 value2 0.361 <dbl [5 x 1]>
6 1 2 value3 0.315 <dbl [5 x 1]>
7 2 1 value1 0.320 <dbl [5 x 1]>
8 2 1 value2 0.310 <dbl [5 x 1]>
9 2 1 value3 0.371 <dbl [5 x 1]>
10 2 2 value1 0.360 <dbl [5 x 1]>
11 2 2 value2 0.386 <dbl [5 x 1]>
12 2 2 value3 0.254 <dbl [5 x 1]>
13 3 1 value1 0.368 <dbl [5 x 1]>
14 3 1 value2 0.319 <dbl [5 x 1]>
15 3 1 value3 0.314 <dbl [5 x 1]>
16 3 2 value1 0.263 <dbl [5 x 1]>
17 3 2 value2 0.293 <dbl [5 x 1]>
18 3 2 value3 0.443 <dbl [5 x 1]>
使用 map 提取列名,按列把矩阵拆分成若干个单列矩阵,然后把它们一起 unnest。
library(tidyverse)
# Replace each nested tibble by its column names, split every bootstrap
# matrix into a list of one-column matrices, then unnest the three
# list-columns in parallel so each statistic gets its own row
df2 %>%
  mutate(
    data = map(data, colnames),
    t = map(t, function(m) {
      lapply(seq_len(ncol(m)), function(j) m[, j, drop = FALSE])
    })
  ) %>%
  unnest(c(data, t0, t))
# country group data t0 t
# <int> <int> <chr> <dbl> <list>
# 1 1 1 value1 0.355 <dbl [5 × 1]>
# 2 1 1 value2 0.329 <dbl [5 × 1]>
# 3 1 1 value3 0.315 <dbl [5 × 1]>
# 4 1 2 value1 0.324 <dbl [5 × 1]>
# 5 1 2 value2 0.361 <dbl [5 × 1]>
# 6 1 2 value3 0.315 <dbl [5 × 1]>
# 7 2 1 value1 0.320 <dbl [5 × 1]>
# 8 2 1 value2 0.310 <dbl [5 × 1]>
# 9 2 1 value3 0.371 <dbl [5 × 1]>
#10 2 2 value1 0.360 <dbl [5 × 1]>
#11 2 2 value2 0.386 <dbl [5 × 1]>
#12 2 2 value3 0.254 <dbl [5 × 1]>
#13 3 1 value1 0.368 <dbl [5 × 1]>
#14 3 1 value2 0.319 <dbl [5 × 1]>
#15 3 1 value3 0.314 <dbl [5 × 1]>
#16 3 2 value1 0.263 <dbl [5 × 1]>
#17 3 2 value2 0.293 <dbl [5 × 1]>
#18 3 2 value3 0.443 <dbl [5 × 1]>
我正在尝试以更整洁的方式进行一些 bootstrapping(我知道如何在 base R 中进行此操作并获得结果,但我想知道如何将所有内容放入一个更整洁的管道)。
首先我定义了两个函数:一个用于计算要 bootstrap 的统计量,另一个用于执行 bootstrap 本身:
library(boot)
library(tidyverse)
#' Bootstrap statistic: each column's share of the summed column means
#'
#' Has the `(data, i)` signature expected of a `statistic` function by
#' `boot::boot()`.
#'
#' @param data A data frame (or tibble) whose columns are averaged.
#' @param i Row indices selecting the bootstrap resample.
#' @return An unnamed numeric vector with one element per column of `data`:
#'   the column mean over rows `i` divided by the sum of all column means.
share <- function(data, i) {
  # drop = FALSE keeps a one-column base data.frame two-dimensional.
  resample <- data[i, , drop = FALSE]
  col_means <- vapply(resample, mean, numeric(1))
  # The earlier summarize(value / sum(value)) returned several rows from
  # summarize(), which dplyr 1.1.0 deprecated; plain vector arithmetic
  # yields the same values without the warning on every replicate.
  unname(col_means / sum(col_means))
}
#' Run a bootstrap by delegating to `boot()`
#'
#' @param data Data forwarded to `boot()` as its `data` argument.
#' @param statistic Function forwarded to `boot()` as its `statistic` argument.
#' @param R Value forwarded to `boot()` as its `R` argument.
#' @return The object returned by `boot()`.
boot_results <- function(data, statistic, R) {
  boot(data = data, statistic = statistic, R = R)
}
然后我想对一些嵌套数据 bootstrap 这个统计量,基本上就是对每一行分别做 bootstrap:
# Build the toy data set
set.seed(1)
df <- tibble(
  country = rep(1:3, each = 20),
  group = rep(rep(1:2, each = 10), 3),
  value1 = runif(60),
  value2 = runif(60),
  value3 = runif(60)
)

# Run the bootstrap once per country/group cell, then keep only the `t0`
# and `t` components of each result
df2 <- df %>%
  nest(data = -c(country, group)) %>%
  mutate(results = map(data, ~ boot_results(.x, share, 5))) %>%
  hoist(results, "t0", "t") %>%
  select(-results)
这为我提供了 t、t0 和 data 这三个嵌套的列表列。
# A tibble: 6 x 5
country group data t0 t
<int> <int> <list> <list> <list>
1 1 1 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
2 1 2 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
3 2 1 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
4 2 2 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
5 3 1 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
6 3 2 <tibble [10 x 3]> <dbl [3]> <dbl [5 x 3]>
我现在想做几件事:
- 对这三列做类似 unnest_longer 的操作,这样每个 country/group 组合最终得到三行。
- 从 data 列中提取列名。
我可能把事情复杂化了很多,但我卡在了最后一步,我不知道如何 pivot/unnest 列表列。
预期结果:
# A tibble: 18 x 5
country group data_names t0 t
<int> <int> <chr> <dbl> <list>
1 1 1 value1 0.355 <dbl [5 x 1]>
2 1 1 value2 0.329 <dbl [5 x 1]>
3 1 1 value3 0.315 <dbl [5 x 1]>
4 1 2 value1 0.324 <dbl [5 x 1]>
5 1 2 value2 0.361 <dbl [5 x 1]>
6 1 2 value3 0.315 <dbl [5 x 1]>
7 2 1 value1 0.320 <dbl [5 x 1]>
8 2 1 value2 0.310 <dbl [5 x 1]>
9 2 1 value3 0.371 <dbl [5 x 1]>
10 2 2 value1 0.360 <dbl [5 x 1]>
11 2 2 value2 0.386 <dbl [5 x 1]>
12 2 2 value3 0.254 <dbl [5 x 1]>
13 3 1 value1 0.368 <dbl [5 x 1]>
14 3 1 value2 0.319 <dbl [5 x 1]>
15 3 1 value3 0.314 <dbl [5 x 1]>
16 3 2 value1 0.263 <dbl [5 x 1]>
17 3 2 value2 0.293 <dbl [5 x 1]>
18 3 2 value3 0.443 <dbl [5 x 1]>
使用 map 提取列名,按列把矩阵拆分成若干个单列矩阵,然后把它们一起 unnest。
library(tidyverse)
# Replace each nested tibble by its column names, split every bootstrap
# matrix into a list of one-column matrices, then unnest the three
# list-columns in parallel so each statistic gets its own row
df2 %>%
  mutate(
    data = map(data, colnames),
    t = map(t, function(m) {
      lapply(seq_len(ncol(m)), function(j) m[, j, drop = FALSE])
    })
  ) %>%
  unnest(c(data, t0, t))
# country group data t0 t
# <int> <int> <chr> <dbl> <list>
# 1 1 1 value1 0.355 <dbl [5 × 1]>
# 2 1 1 value2 0.329 <dbl [5 × 1]>
# 3 1 1 value3 0.315 <dbl [5 × 1]>
# 4 1 2 value1 0.324 <dbl [5 × 1]>
# 5 1 2 value2 0.361 <dbl [5 × 1]>
# 6 1 2 value3 0.315 <dbl [5 × 1]>
# 7 2 1 value1 0.320 <dbl [5 × 1]>
# 8 2 1 value2 0.310 <dbl [5 × 1]>
# 9 2 1 value3 0.371 <dbl [5 × 1]>
#10 2 2 value1 0.360 <dbl [5 × 1]>
#11 2 2 value2 0.386 <dbl [5 × 1]>
#12 2 2 value3 0.254 <dbl [5 × 1]>
#13 3 1 value1 0.368 <dbl [5 × 1]>
#14 3 1 value2 0.319 <dbl [5 × 1]>
#15 3 1 value3 0.314 <dbl [5 × 1]>
#16 3 2 value1 0.263 <dbl [5 × 1]>
#17 3 2 value2 0.293 <dbl [5 × 1]>
#18 3 2 value3 0.443 <dbl [5 × 1]>