跨多个列循环计算描述性统计信息
Calculate the descriptive statistics in a loop across several columns
我正在为一个变量生成随机数,并将该过程重复若干次。我想在循环的每次迭代中,分别按每个分组变量(group1、group2、group3)计算 value 的平均值。我想把结果存储下来,以便之后可以估计在循环的所有迭代中每个组的平均份额。
# Load the tidyverse. library() stops with an error when the package is
# missing, whereas require() only returns FALSE and lets the script continue.
library(tidyverse)

# Fix the RNG seed so the simulated data are reproducible.
set.seed(21)

# Three categorical grouping variables, 1000 draws each, with replacement.
group1 <- sample(c("A1", "A2", "B1", "B2", "C1", "C2"), 1000, replace = TRUE)
group2 <- sample(c("G1", "G2", "G4"), 1000, replace = TRUE)
group3 <- sample(c("D1", "D2"), 1000, replace = TRUE)

# Per-observation success probability, uniform on [0, 1].
prob <- runif(1000, 0, 1)

# Build the data frame directly from the vectors. Going through cbind()
# would create a character matrix first, so `prob` would be converted to
# text and back (losing precision, and yielding factor codes on R < 4.0).
df <- data.frame(group1, group2, group3, prob, stringsAsFactors = FALSE)

for (i in seq_len(15)) {
  # One Bernoulli draw per row, using that row's own probability.
  df <- df %>%
    mutate(value = rbinom(n(), 1, prob = prob))
  # [INSERT CALCULATION OF MEAN FOR EACH GROUP VARIABLE AND STORE IT]
}
# [INSERT CALCULATION OF MEAN ACROSS ALL ITERATIONS]
我的主要问题是:如何高效地按多个分组变量计算 value 的平均值,并以简洁的方式存储结果。
提前致谢。
澄清一下:
我希望最终结果看起来像这样:
col "group1_A1" "group1_A2" "group1_B1" "group1_B2" "group1_C1" "group1_C2" "group2_G1" "group2_G2" "group2_G4" "group3_D1" "group3_D2"
x1 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x2 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x3 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x4 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x4 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x5 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x6 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x7 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x8 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x9 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x10 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
其中 "x_bar" 处应替换为三个分组变量各子组的均值,每一行是一次迭代计算得到的均值。一个简单的解决方案是使用 dplyr 的 group_by,但我想找到一种能够一次性遍历全部三个分组变量的解决方案。
换句话说:假设变量 prob 是死亡的概率。group1 是表示 6 个年龄段的变量,group2 表示社会经济地位,group3 表示性别。然后我想看看谁最有可能死去。为此,我随机生成了一个依赖于概率 prob 的伯努利变量。为了去除一些随机性,我重复这个过程 15 次,然后想看看每个社会人口群体中死亡者(即变量 value 取值为 1 的个体)所占的份额有多大。对于每次迭代,我想计算死者的群体归属(即有多少男性死亡、有多少老人死亡)。很抱歉没有想出一个更令人愉快的例子。
library(dplyr)

# Draw one Bernoulli outcome per row, then take the overall mean of every
# numeric column (here `prob` and `value`).
# The result is stored under its own name: assigning it back to `df` would
# replace the data with a one-row summary and drop the group columns.
# across(everything(), ...) also works if all columns are numeric anyway.
df_summary <- df %>%
  mutate(value = rbinom(n(), 1, prob = prob)) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
这是一种使用若干 tidyverse 函数的方法。
library(dplyr)
library(tidyr)

# Seed table: one row per distinct "<variable>_<level>" label
# (e.g. "group1_A1"). Keeping only distinct labels makes every join in the
# loop one-to-one instead of fanning out over all observations.
# values_to = "val" avoids a name clash if `df` already has a `value` column.
df2 <- df %>%
  pivot_longer(starts_with("group"), values_to = "val") %>%
  mutate(group = paste0(name, "_", val)) %>%
  distinct(group)

for (i in seq_len(15)) {
  df2 <- df %>%
    # Fresh Bernoulli draw per row for this trial.
    mutate(value = rbinom(n(), 1, prob = prob)) %>%
    # Stack the three grouping variables so one group_by() covers them all.
    pivot_longer(starts_with("group"), values_to = "val") %>%
    mutate(group = paste0(name, "_", val)) %>%
    group_by(group) %>%
    summarise(mean = mean(value, na.rm = TRUE)) %>%
    # Store this trial's means in its own column: mean1, mean2, ...
    rename_with(.cols = mean, .fn = ~ paste0("mean", i)) %>%
    # Append the columns collected in earlier trials.
    inner_join(df2, by = "group")
}

# Reshape to one row per trial and one column per group label.
# The id column is `trial`; no column called `mean` exists at this point.
df2 %>%
  pivot_longer(
    starts_with("mean"),
    names_to = "trial",
    names_prefix = "mean"
  ) %>%
  distinct() %>%
  pivot_wider(id_cols = trial, names_from = "group", values_from = "value")
# A tibble: 15 x 12
trial group1_A1 group1_A2 group1_B1 group1_B2 group1_C1 group1_C2 group2_G1 group2_G2 group2_G4 group3_D1 group3_D2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 15 0.519 0.514 0.516 0.519 0.551 0.533 0.529 0.542 0.507 0.518 0.533
2 14 0.481 0.486 0.503 0.536 0.487 0.550 0.526 0.493 0.507 0.495 0.520
3 13 0.506 0.541 0.477 0.470 0.572 0.556 0.575 0.499 0.496 0.486 0.552
4 12 0.519 0.534 0.497 0.557 0.604 0.509 0.549 0.522 0.548 0.548 0.531
5 11 0.5 0.568 0.458 0.481 0.497 0.562 0.542 0.496 0.496 0.467 0.548
6 10 0.525 0.466 0.503 0.503 0.535 0.580 0.581 0.490 0.496 0.488 0.548
7 9 0.494 0.547 0.490 0.448 0.610 0.598 0.578 0.504 0.519 0.501 0.560
8 8 0.538 0.554 0.471 0.530 0.599 0.538 0.545 0.516 0.559 0.565 0.518
9 7 0.525 0.588 0.548 0.475 0.535 0.568 0.601 0.499 0.522 0.507 0.565
10 6 0.513 0.527 0.529 0.546 0.561 0.503 0.562 0.513 0.522 0.510 0.550
11 5 0.462 0.493 0.503 0.508 0.513 0.568 0.513 0.493 0.522 0.507 0.510
12 4 0.506 0.5 0.452 0.481 0.599 0.556 0.545 0.516 0.496 0.520 0.516
13 3 0.525 0.466 0.503 0.525 0.567 0.556 0.529 0.550 0.499 0.497 0.552
14 2 0.462 0.554 0.471 0.514 0.519 0.574 0.536 0.516 0.499 0.520 0.512
15 1 0.506 0.541 0.497 0.470 0.519 0.544 0.510 0.519 0.507 0.510 0.514
这就是您要的第一部分:一个 data.frame,其中每一行是一次试验,每一列是一个组的均值。
你的第二部分如下:
# Collapse the per-trial columns of df2 (mean1, mean2, ...) into a single
# overall mean per group label.
df2 %>%
  pivot_longer(
    cols = starts_with("mean"),
    names_prefix = "mean",
    names_to = "trial"
  ) %>%
  # Drop repeated (group, trial, value) rows before averaging.
  distinct() %>%
  group_by(group) %>%
  summarise(mean = mean(value))
# A tibble: 11 x 2
group mean
<chr> <dbl>
1 group1_A1 0.505
2 group1_A2 0.525
3 group1_B1 0.495
4 group1_B2 0.504
5 group1_C1 0.551
6 group1_C2 0.553
7 group2_G1 0.548
8 group2_G2 0.511
9 group2_G4 0.513
10 group3_D1 0.509
11 group3_D2 0.535
我正在为一个变量生成随机数,并将该过程重复若干次。我想在循环的每次迭代中,分别按每个分组变量(group1、group2、group3)计算 value 的平均值。我想把结果存储下来,以便之后可以估计在循环的所有迭代中每个组的平均份额。
# Load the tidyverse. library() stops with an error when the package is
# missing, whereas require() only returns FALSE and lets the script continue.
library(tidyverse)

# Fix the RNG seed so the simulated data are reproducible.
set.seed(21)

# Three categorical grouping variables, 1000 draws each, with replacement.
group1 <- sample(c("A1", "A2", "B1", "B2", "C1", "C2"), 1000, replace = TRUE)
group2 <- sample(c("G1", "G2", "G4"), 1000, replace = TRUE)
group3 <- sample(c("D1", "D2"), 1000, replace = TRUE)

# Per-observation success probability, uniform on [0, 1].
prob <- runif(1000, 0, 1)

# Build the data frame directly from the vectors. Going through cbind()
# would create a character matrix first, so `prob` would be converted to
# text and back (losing precision, and yielding factor codes on R < 4.0).
df <- data.frame(group1, group2, group3, prob, stringsAsFactors = FALSE)

for (i in seq_len(15)) {
  # One Bernoulli draw per row, using that row's own probability.
  df <- df %>%
    mutate(value = rbinom(n(), 1, prob = prob))
  # [INSERT CALCULATION OF MEAN FOR EACH GROUP VARIABLE AND STORE IT]
}
# [INSERT CALCULATION OF MEAN ACROSS ALL ITERATIONS]
我的主要问题是:如何高效地按多个分组变量计算 value 的平均值,并以简洁的方式存储结果。
提前致谢。
澄清一下:
我希望最终结果看起来像这样:
col "group1_A1" "group1_A2" "group1_B1" "group1_B2" "group1_C1" "group1_C2" "group2_G1" "group2_G2" "group2_G4" "group3_D1" "group3_D2"
x1 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x2 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x3 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x4 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x4 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x5 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x6 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x7 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x8 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x9 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
x10 "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar" "x_bar"
其中 "x_bar" 处应替换为三个分组变量各子组的均值,每一行是一次迭代计算得到的均值。一个简单的解决方案是使用 dplyr 的 group_by,但我想找到一种能够一次性遍历全部三个分组变量的解决方案。
换句话说:假设变量 prob 是死亡的概率。group1 是表示 6 个年龄段的变量,group2 表示社会经济地位,group3 表示性别。然后我想看看谁最有可能死去。为此,我随机生成了一个依赖于概率 prob 的伯努利变量。为了去除一些随机性,我重复这个过程 15 次,然后想看看每个社会人口群体中死亡者(即变量 value 取值为 1 的个体)所占的份额有多大。对于每次迭代,我想计算死者的群体归属(即有多少男性死亡、有多少老人死亡)。很抱歉没有想出一个更令人愉快的例子。
library(dplyr)

# Draw one Bernoulli outcome per row, then take the overall mean of every
# numeric column (here `prob` and `value`).
# The result is stored under its own name: assigning it back to `df` would
# replace the data with a one-row summary and drop the group columns.
# across(everything(), ...) also works if all columns are numeric anyway.
df_summary <- df %>%
  mutate(value = rbinom(n(), 1, prob = prob)) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
这是一种使用若干 tidyverse 函数的方法。
library(dplyr)
library(tidyr)

# Seed table: one row per distinct "<variable>_<level>" label
# (e.g. "group1_A1"). Keeping only distinct labels makes every join in the
# loop one-to-one instead of fanning out over all observations.
# values_to = "val" avoids a name clash if `df` already has a `value` column.
df2 <- df %>%
  pivot_longer(starts_with("group"), values_to = "val") %>%
  mutate(group = paste0(name, "_", val)) %>%
  distinct(group)

for (i in seq_len(15)) {
  df2 <- df %>%
    # Fresh Bernoulli draw per row for this trial.
    mutate(value = rbinom(n(), 1, prob = prob)) %>%
    # Stack the three grouping variables so one group_by() covers them all.
    pivot_longer(starts_with("group"), values_to = "val") %>%
    mutate(group = paste0(name, "_", val)) %>%
    group_by(group) %>%
    summarise(mean = mean(value, na.rm = TRUE)) %>%
    # Store this trial's means in its own column: mean1, mean2, ...
    rename_with(.cols = mean, .fn = ~ paste0("mean", i)) %>%
    # Append the columns collected in earlier trials.
    inner_join(df2, by = "group")
}

# Reshape to one row per trial and one column per group label.
# The id column is `trial`; no column called `mean` exists at this point.
df2 %>%
  pivot_longer(
    starts_with("mean"),
    names_to = "trial",
    names_prefix = "mean"
  ) %>%
  distinct() %>%
  pivot_wider(id_cols = trial, names_from = "group", values_from = "value")
# A tibble: 15 x 12
trial group1_A1 group1_A2 group1_B1 group1_B2 group1_C1 group1_C2 group2_G1 group2_G2 group2_G4 group3_D1 group3_D2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 15 0.519 0.514 0.516 0.519 0.551 0.533 0.529 0.542 0.507 0.518 0.533
2 14 0.481 0.486 0.503 0.536 0.487 0.550 0.526 0.493 0.507 0.495 0.520
3 13 0.506 0.541 0.477 0.470 0.572 0.556 0.575 0.499 0.496 0.486 0.552
4 12 0.519 0.534 0.497 0.557 0.604 0.509 0.549 0.522 0.548 0.548 0.531
5 11 0.5 0.568 0.458 0.481 0.497 0.562 0.542 0.496 0.496 0.467 0.548
6 10 0.525 0.466 0.503 0.503 0.535 0.580 0.581 0.490 0.496 0.488 0.548
7 9 0.494 0.547 0.490 0.448 0.610 0.598 0.578 0.504 0.519 0.501 0.560
8 8 0.538 0.554 0.471 0.530 0.599 0.538 0.545 0.516 0.559 0.565 0.518
9 7 0.525 0.588 0.548 0.475 0.535 0.568 0.601 0.499 0.522 0.507 0.565
10 6 0.513 0.527 0.529 0.546 0.561 0.503 0.562 0.513 0.522 0.510 0.550
11 5 0.462 0.493 0.503 0.508 0.513 0.568 0.513 0.493 0.522 0.507 0.510
12 4 0.506 0.5 0.452 0.481 0.599 0.556 0.545 0.516 0.496 0.520 0.516
13 3 0.525 0.466 0.503 0.525 0.567 0.556 0.529 0.550 0.499 0.497 0.552
14 2 0.462 0.554 0.471 0.514 0.519 0.574 0.536 0.516 0.499 0.520 0.512
15 1 0.506 0.541 0.497 0.470 0.519 0.544 0.510 0.519 0.507 0.510 0.514
这就是您要的第一部分:一个 data.frame,其中每一行是一次试验,每一列是一个组的均值。
你的第二部分如下:
# Collapse the per-trial columns of df2 (mean1, mean2, ...) into a single
# overall mean per group label.
df2 %>%
  pivot_longer(
    cols = starts_with("mean"),
    names_prefix = "mean",
    names_to = "trial"
  ) %>%
  # Drop repeated (group, trial, value) rows before averaging.
  distinct() %>%
  group_by(group) %>%
  summarise(mean = mean(value))
# A tibble: 11 x 2
group mean
<chr> <dbl>
1 group1_A1 0.505
2 group1_A2 0.525
3 group1_B1 0.495
4 group1_B2 0.504
5 group1_C1 0.551
6 group1_C2 0.553
7 group2_G1 0.548
8 group2_G2 0.511
9 group2_G4 0.513
10 group3_D1 0.509
11 group3_D2 0.535