为分组变量计算 bootstrap 重采样

Question

我有以下数据集来计算 Soil_N 和 Soil_P 的标准化效果大小，为此我对每个复制使用了下面的代码。

  df <- tibble(
  Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
  Diversity = rep(c("high", "low"), each = 10),
  Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89), 
  Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
  Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))

由于 Soil_N 和 Soil_P 的一个复制值缺失，我想计算剩余 4 个复制的值以创建一个根据以下代码丢失的副本的值。

library(dplyr)
library(tidyr)
df %>%
 pivot_wider(id_cols = c(Diversity, Replicate),
 names_from = Soilwater,
 values_from = c(Soil_N, Soil_P) %>%
 mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))), 
  ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced), 
  ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced))%>% 
  group_by(Diversity)

我确定此代码需要在各自的列中分别替换 Soil_N 和 Soil_P 的平均值，但我不知道如何替换。
接下来，我愿意从分组的重复中计算 bootstrap 样本，按每个 Diversity 分组，迭代 1000 次 （这意味着 每个 ES 计算使用 5 进行 1000 次使用公式 ES = (optimal-reduced)/(optimal+reduced)) 复制 。 Soil_N 和 Soil_P.

都应该这样做

对于bootstrap，我使用的是代码

df_bs<- bootstraps(df, times = 1000, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
 as.data.frame(x) %>%
  group_by(Diversity) %>%
  summarise(meanN = mean(ES_N), 
  meanP = mean(ES_P))
  })) %>%
  unnest(means)

这里的输出，我会pivot_longer如下在ggplot2中制作图表。

 df_final<- df_bs%>%
 select(meanN, meanP)%>%
 pivot_longer(!Diversity, 
           names_to = "variables", 
           values_to = "ES")
 df_final

非常感谢您的帮助！谢谢！

Answer 1

我想你的意思是 bootstrap 从分组的重复中抽样，按每个 Diversity 类别分组？ bootstraps 将整个数据帧作为从中采样的参数。借用 the examples here 中的代码，您可以从每个多样性类别（有替换）中随机抽取 5 个重复样本，并为每个 bootstrap 重采样提取每个类别的均值：

library(tidyverse)
library(rsample)

df <- tibble(
  Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
  Diversity = rep(c("high", "low"), each = 10),
  Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
  Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
)


df_stand <- df %>%
  pivot_wider(
    id_cols = c(Diversity, Replicate),
    names_from = Soilwater,
    values_from = Soil_N
  ) %>%
  mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
    ES = (optimal - reduced) / (optimal + reduced)
  )


bootstraps(df_stand, times = 50, strata = "Diversity") %>%
  mutate(means = map(splits, function(x) {
    as.data.frame(x) %>%
      group_by(Diversity) %>%
      summarise(mean = mean(ES))
  })) %>%
  unnest(means)


#> # A tibble: 100 x 4
#>    splits         id          Diversity    mean
#>    <list>         <chr>       <chr>       <dbl>
#>  1 <split [10/3]> Bootstrap01 high      -0.165 
#>  2 <split [10/3]> Bootstrap01 low       -0.0868
#>  3 <split [10/3]> Bootstrap02 high      -0.173 
#>  4 <split [10/3]> Bootstrap02 low       -0.0906
#>  5 <split [10/4]> Bootstrap03 high      -0.167 
#>  6 <split [10/4]> Bootstrap03 low       -0.103 
#>  7 <split [10/3]> Bootstrap04 high      -0.184 
#>  8 <split [10/3]> Bootstrap04 low       -0.0881
#>  9 <split [10/4]> Bootstrap05 high      -0.159 
#> 10 <split [10/4]> Bootstrap05 low       -0.0633
#> # ... with 90 more rows

编辑

您的解决方案就快完成了。添加新的 Soil_P 列后，您的 across 函数需要 select 枢轴后的所有新列。您可以使用 across(starts_with("Soil")) 来获得所有正确的：

library(tidyverse)
library(rsample)

df <- tibble(
  Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
  Diversity = rep(c("high", "low"), each = 10),
  Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89), 
  Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
  Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))



df_stand <- df %>%
  pivot_wider(
    id_cols = c(Diversity, Replicate),
    names_from = Soilwater,
    values_from = c(Soil_N, Soil_P)
  ) %>%
  mutate(
    across(starts_with("Soil"), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
    ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
    ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced)
  )


df_bs <- bootstraps(df_stand, times = 1000, strata = 'Diversity') %>%
  mutate(means = map(splits, function(x) {
    as.data.frame(x) %>%
      group_by(Diversity) %>%
      summarise(meanN = mean(ES_N),
                meanP = mean(ES_P))
  })) %>%
  unnest(means)

df_final <- df_bs %>% 
  select(Diversity, meanN, meanP) %>% 
  pivot_longer(-Diversity,
               names_to = "variables", 
               values_to = "ES")

df_final

#> # A tibble: 4,000 x 3
#>    Diversity variables      ES
#>    <chr>     <chr>       <dbl>
#>  1 high      meanN     -0.177 
#>  2 high      meanP     -0.48  
#>  3 low       meanN     -0.0611
#>  4 low       meanP     -0.241 
#>  5 high      meanN     -0.164 
#>  6 high      meanP     -0.68  
#>  7 low       meanN     -0.0787
#>  8 low       meanP     -0.299 
#>  9 high      meanN     -0.187 
#> 10 high      meanP     -0.44  
#> # ... with 3,990 more rows

这将输出数据帧，其中每个 Diversity 值的 1,000 bootstrap 个样本的平均值。然后这只是根据需要进行绘图的情况:)。

^{由 reprex package (v2.0.1)}

于 2022-03-18 创建

为分组变量计算 bootstrap 重采样

calculating bootstrap resampling for grouped variables

dplyr

purrr

tidyverse

rsample

编辑