为分组变量计算 bootstrap 重采样
calculating bootstrap resampling for grouped variables
我有以下数据集来计算 Soil_N 和 Soil_P 的标准化效果大小,为此我对每个复制使用了下面的代码。
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
由于 Soil_N 和 Soil_P 的一个复制值缺失,我想计算剩余 4 个复制的值以创建一个根据以下代码丢失的副本的值。
library(dplyr)
library(tidyr)
df %>%
pivot_wider(id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = c(Soil_N, Soil_P) %>%
mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced))%>%
group_by(Diversity)
我确定此代码需要在各自的列中分别替换 Soil_N 和 Soil_P 的平均值,但我不知道如何替换。
接下来,我愿意从分组的重复中计算 bootstrap 样本,按每个 Diversity 分组,迭代 1000 次 (这意味着 每个 ES 计算使用 5 进行 1000 次使用公式 ES = (optimal-reduced)/(optimal+reduced)) 复制 。 Soil_N 和 Soil_P.
都应该这样做
对于bootstrap,我使用的是代码
df_bs<- bootstraps(df, times = 1000, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(meanN = mean(ES_N),
meanP = mean(ES_P))
})) %>%
unnest(means)
这里的输出,我会pivot_longer
如下在ggplot2中制作图表。
df_final<- df_bs%>%
select(meanN, meanP)%>%
pivot_longer(!Diversity,
names_to = "variables",
values_to = "ES")
df_final
非常感谢您的帮助!谢谢!
我想你的意思是 bootstrap 从分组的重复中抽样,按每个 Diversity
类别分组? bootstraps
将整个数据帧作为从中采样的参数。借用 the examples here 中的代码,您可以从每个多样性类别(有替换)中随机抽取 5 个重复样本,并为每个 bootstrap 重采样提取每个类别的均值:
library(tidyverse)
library(rsample)
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
)
df_stand <- df %>%
pivot_wider(
id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = Soil_N
) %>%
mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES = (optimal - reduced) / (optimal + reduced)
)
bootstraps(df_stand, times = 50, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(mean = mean(ES))
})) %>%
unnest(means)
#> # A tibble: 100 x 4
#> splits id Diversity mean
#> <list> <chr> <chr> <dbl>
#> 1 <split [10/3]> Bootstrap01 high -0.165
#> 2 <split [10/3]> Bootstrap01 low -0.0868
#> 3 <split [10/3]> Bootstrap02 high -0.173
#> 4 <split [10/3]> Bootstrap02 low -0.0906
#> 5 <split [10/4]> Bootstrap03 high -0.167
#> 6 <split [10/4]> Bootstrap03 low -0.103
#> 7 <split [10/3]> Bootstrap04 high -0.184
#> 8 <split [10/3]> Bootstrap04 low -0.0881
#> 9 <split [10/4]> Bootstrap05 high -0.159
#> 10 <split [10/4]> Bootstrap05 low -0.0633
#> # ... with 90 more rows
编辑
您的解决方案就快完成了。添加新的 Soil_P
列后,您的 across
函数需要 select 枢轴后的所有新列。您可以使用 across(starts_with("Soil"))
来获得所有正确的:
library(tidyverse)
library(rsample)
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
df_stand <- df %>%
pivot_wider(
id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = c(Soil_N, Soil_P)
) %>%
mutate(
across(starts_with("Soil"), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced)
)
df_bs <- bootstraps(df_stand, times = 1000, strata = 'Diversity') %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(meanN = mean(ES_N),
meanP = mean(ES_P))
})) %>%
unnest(means)
df_final <- df_bs %>%
select(Diversity, meanN, meanP) %>%
pivot_longer(-Diversity,
names_to = "variables",
values_to = "ES")
df_final
#> # A tibble: 4,000 x 3
#> Diversity variables ES
#> <chr> <chr> <dbl>
#> 1 high meanN -0.177
#> 2 high meanP -0.48
#> 3 low meanN -0.0611
#> 4 low meanP -0.241
#> 5 high meanN -0.164
#> 6 high meanP -0.68
#> 7 low meanN -0.0787
#> 8 low meanP -0.299
#> 9 high meanN -0.187
#> 10 high meanP -0.44
#> # ... with 3,990 more rows
这将输出数据帧,其中每个 Diversity
值的 1,000 bootstrap 个样本的平均值。然后这只是根据需要进行绘图的情况:)。
由 reprex package (v2.0.1)
于 2022-03-18 创建
我有以下数据集来计算 Soil_N 和 Soil_P 的标准化效果大小,为此我对每个复制使用了下面的代码。
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
由于 Soil_N 和 Soil_P 的一个复制值缺失,我想计算剩余 4 个复制的值以创建一个根据以下代码丢失的副本的值。
library(dplyr)
library(tidyr)
df %>%
pivot_wider(id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = c(Soil_N, Soil_P) %>%
mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced))%>%
group_by(Diversity)
我确定此代码需要在各自的列中分别替换 Soil_N 和 Soil_P 的平均值,但我不知道如何替换。
接下来,我愿意从分组的重复中计算 bootstrap 样本,按每个 Diversity 分组,迭代 1000 次 (这意味着 每个 ES 计算使用 5 进行 1000 次使用公式 ES = (optimal-reduced)/(optimal+reduced)) 复制 。 Soil_N 和 Soil_P.
对于bootstrap,我使用的是代码
df_bs<- bootstraps(df, times = 1000, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(meanN = mean(ES_N),
meanP = mean(ES_P))
})) %>%
unnest(means)
这里的输出,我会pivot_longer
如下在ggplot2中制作图表。
df_final<- df_bs%>%
select(meanN, meanP)%>%
pivot_longer(!Diversity,
names_to = "variables",
values_to = "ES")
df_final
非常感谢您的帮助!谢谢!
我想你的意思是 bootstrap 从分组的重复中抽样,按每个 Diversity
类别分组? bootstraps
将整个数据帧作为从中采样的参数。借用 the examples here 中的代码,您可以从每个多样性类别(有替换)中随机抽取 5 个重复样本,并为每个 bootstrap 重采样提取每个类别的均值:
library(tidyverse)
library(rsample)
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)
)
df_stand <- df %>%
pivot_wider(
id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = Soil_N
) %>%
mutate(across(c(optimal, reduced), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES = (optimal - reduced) / (optimal + reduced)
)
bootstraps(df_stand, times = 50, strata = "Diversity") %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(mean = mean(ES))
})) %>%
unnest(means)
#> # A tibble: 100 x 4
#> splits id Diversity mean
#> <list> <chr> <chr> <dbl>
#> 1 <split [10/3]> Bootstrap01 high -0.165
#> 2 <split [10/3]> Bootstrap01 low -0.0868
#> 3 <split [10/3]> Bootstrap02 high -0.173
#> 4 <split [10/3]> Bootstrap02 low -0.0906
#> 5 <split [10/4]> Bootstrap03 high -0.167
#> 6 <split [10/4]> Bootstrap03 low -0.103
#> 7 <split [10/3]> Bootstrap04 high -0.184
#> 8 <split [10/3]> Bootstrap04 low -0.0881
#> 9 <split [10/4]> Bootstrap05 high -0.159
#> 10 <split [10/4]> Bootstrap05 low -0.0633
#> # ... with 90 more rows
编辑
您的解决方案就快完成了。添加新的 Soil_P
列后,您的 across
函数需要 select 枢轴后的所有新列。您可以使用 across(starts_with("Soil"))
来获得所有正确的:
library(tidyverse)
library(rsample)
df <- tibble(
Soilwater = rep(rep(c("optimal", "reduced"), each = 5), times = 2),
Diversity = rep(c("high", "low"), each = 10),
Soil_N = c(50, 45, 49, 48, 49, 69, 68, 69, 70, 67, 79, 78, 79, NA_integer_, 77, 89, 89, 87, 88, 89),
Soil_P = c(2, 2, 1, 3, 4, 8, 8, 9, 7, 6, 11, 12, 11, NA_integer_, 10, 18, 18, 17, 21, 19),
Replicate = c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5))
df_stand <- df %>%
pivot_wider(
id_cols = c(Diversity, Replicate),
names_from = Soilwater,
values_from = c(Soil_N, Soil_P)
) %>%
mutate(
across(starts_with("Soil"), ~ replace_na(.x, mean(.x, na.rm = TRUE))),
ES_N = (Soil_N_optimal - Soil_N_reduced) / (Soil_N_optimal + Soil_N_reduced),
ES_P = (Soil_P_optimal - Soil_P_reduced) / (Soil_P_optimal + Soil_P_reduced)
)
df_bs <- bootstraps(df_stand, times = 1000, strata = 'Diversity') %>%
mutate(means = map(splits, function(x) {
as.data.frame(x) %>%
group_by(Diversity) %>%
summarise(meanN = mean(ES_N),
meanP = mean(ES_P))
})) %>%
unnest(means)
df_final <- df_bs %>%
select(Diversity, meanN, meanP) %>%
pivot_longer(-Diversity,
names_to = "variables",
values_to = "ES")
df_final
#> # A tibble: 4,000 x 3
#> Diversity variables ES
#> <chr> <chr> <dbl>
#> 1 high meanN -0.177
#> 2 high meanP -0.48
#> 3 low meanN -0.0611
#> 4 low meanP -0.241
#> 5 high meanN -0.164
#> 6 high meanP -0.68
#> 7 low meanN -0.0787
#> 8 low meanP -0.299
#> 9 high meanN -0.187
#> 10 high meanP -0.44
#> # ... with 3,990 more rows
这将输出数据帧,其中每个 Diversity
值的 1,000 bootstrap 个样本的平均值。然后这只是根据需要进行绘图的情况:)。
由 reprex package (v2.0.1)
于 2022-03-18 创建