t-test 一组与 tidyverse 中的许多组
t-test of one group versus many groups in tidyverse
我有以下小标题
test_tbl <- tibble(name = rep(c("John", "Allan", "George", "Peter", "Paul"), each = 12),
category = rep(rep(LETTERS[1:4], each = 3), 5),
replicate = rep(1:3, 20),
value = sample.int(n = 1e5, size = 60, replace = T))
# A tibble: 60 x 4
name category replicate value
<chr> <chr> <int> <int>
1 John A 1 71257
2 John A 2 98887
3 John A 3 87354
4 John B 1 25352
5 John B 2 69913
6 John B 3 43086
7 John C 1 24957
8 John C 2 33928
9 John C 3 79854
10 John D 1 32842
11 John D 2 19156
12 John D 3 50283
13 Allan A 1 98188
14 Allan A 2 26208
15 Allan A 3 69329
16 Allan B 1 32696
17 Allan B 2 81240
18 Allan B 3 54689
19 Allan C 1 77044
20 Allan C 2 97776
# … with 40 more rows
我想 group_by(name, category)
并执行 3 个 t.test 调用,将 category
B、C 和 D 与 category
A 进行比较。
我想存储输出中的 estimate
和 p.value
。预期的结果是这样的:
# A tibble: 5 x 7
name B_vs_A_estimate B_vs_A_p_value C_vs_A_estimate C_vs_A_p_value D_vs_A_estimate D_vs_A_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 John -0.578 0.486 0.198 0.309 0.631 0.171
2 Allan 0.140 0.644 0.728 0.283 0.980 0.485
3 George -0.778 0.320 -0.424 0.391 -0.154 0.589
4 Peter -0.435 0.470 -0.156 0.722 0.315 0.0140
5 Paul 0.590 0.0150 -0.473 0.475 0.681 0.407
我更喜欢使用 tidyverse
and/or broom
.
的解决方案
编辑:清洁代码
map(unique(test_tbl$name),function(nm){test_tbl %>% filter(name == nm)}) %>%
map2(unique(test_tbl$name),function(dat,nm){
map(LETTERS[2:4],function(cat){
dat %>%
filter(category == "A") %>%
pull %>%
t.test(dat %>% filter(category == cat) %>% pull)
}) %>%
map_dfr(broom::glance) %>%
select(statistic,p.value) %>%
mutate(
name = nm,
cross_cat = paste0(LETTERS[2:4]," versus A")
)
}) %>%
{do.call(rbind,.)}
test_tbl %>%
dplyr::group_by(name) %>%
dplyr::summarise(estimate_AB =
t.test(value[category == "A"| category == "B"] ~ category[category == "A" | category == "B"]) %>% (function(x){x$estimate[1] - x$estimate[2]}),
pvalue_AB = t.test(value[category == "A"| category == "B"] ~ category[category == "A" | category == "B"]) %>% (function(x){x$p.value})
)
这是我按组测试 A 和 B 的方法。我认为您可以扩展我的方法,或者尝试合并第一个解决方案中的代码。
有很多方法可以实现所需的输出,但也许这个是更直观且易于调试的方法(您可以在任何地方放置 browser()
)
test_tbl %>%
group_by(name) %>%
do({
sub_tbl <- .
expand.grid(g1="A", g2=c("B", "C", "D"), stringsAsFactors = FALSE) %>%
mutate(test=as.character(glue::glue("{g1}_vs_{g2}"))) %>%
rowwise() %>%
do({
gs <- .
t_res <- t.test(sub_tbl %>% filter(category == gs$g1) %>% pull(value),
sub_tbl %>% filter(category == gs$g2) %>% pull(value))
data.frame(test=gs$test, estimate=t_res$statistic, p_value=t_res$p.value,
stringsAsFactors = FALSE)
})
}) %>%
ungroup() %>%
gather(key="statistic", value="val", -name, -test) %>%
mutate(test_statistic = paste(test, statistic, sep = "_")) %>%
select(-test, -statistic) %>%
spread(key="test_statistic", value="val")
结果
# A tibble: 5 x 7
name A_vs_B_estimate A_vs_B_p_value A_vs_C_estimate A_vs_C_p_value A_vs_D_estimate A_vs_D_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Allan -0.270 0.803 -1.03 0.396 1.55 0.250
2 George 0.201 0.855 0.221 0.838 1.07 0.380
3 John -1.59 0.249 0.0218 0.984 -0.410 0.704
4 Paul 0.116 0.918 -1.62 0.215 -1.53 0.212
5 Peter 0.471 0.664 0.551 0.611 0.466 0.680
它按名称对记录进行分组,然后应用函数 (do
#1)。将子数据框保存在 sub_tbl
中,展开所有测试用例 (expand.grid
) 并创建一个包含两个字母的 test
名称。现在,对于每个组合,将函数应用于 运行 t 检验 (do
#2)。该匿名函数在第 1 组 (g1
) 和第 2 组 (g2
) 和 returns 数据帧之间执行测试,并带有结果。
第二部分基本上重新排列列以获得最终输出。
我们可以使用
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
test_tbl %>%
split(.$name) %>%
map_dfr(~ {
Avalue <- .x$value[.x$category == 'A']
.x %>%
filter(category != 'A') %>%
group_by(category) %>%
summarise(out = t.test(value, Avalue)$p.value) %>%
mutate(category = str_c(category, '_vs_A_p_value'))}, .id = 'name') %>%
pivot_wider(names_from = category, values_from = out)
我有以下小标题
test_tbl <- tibble(name = rep(c("John", "Allan", "George", "Peter", "Paul"), each = 12),
category = rep(rep(LETTERS[1:4], each = 3), 5),
replicate = rep(1:3, 20),
value = sample.int(n = 1e5, size = 60, replace = T))
# A tibble: 60 x 4
name category replicate value
<chr> <chr> <int> <int>
1 John A 1 71257
2 John A 2 98887
3 John A 3 87354
4 John B 1 25352
5 John B 2 69913
6 John B 3 43086
7 John C 1 24957
8 John C 2 33928
9 John C 3 79854
10 John D 1 32842
11 John D 2 19156
12 John D 3 50283
13 Allan A 1 98188
14 Allan A 2 26208
15 Allan A 3 69329
16 Allan B 1 32696
17 Allan B 2 81240
18 Allan B 3 54689
19 Allan C 1 77044
20 Allan C 2 97776
# … with 40 more rows
我想 group_by(name, category)
并执行 3 个 t.test 调用,将 category
B、C 和 D 与 category
A 进行比较。
我想存储输出中的 estimate
和 p.value
。预期的结果是这样的:
# A tibble: 5 x 7
name B_vs_A_estimate B_vs_A_p_value C_vs_A_estimate C_vs_A_p_value D_vs_A_estimate D_vs_A_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 John -0.578 0.486 0.198 0.309 0.631 0.171
2 Allan 0.140 0.644 0.728 0.283 0.980 0.485
3 George -0.778 0.320 -0.424 0.391 -0.154 0.589
4 Peter -0.435 0.470 -0.156 0.722 0.315 0.0140
5 Paul 0.590 0.0150 -0.473 0.475 0.681 0.407
我更喜欢使用 tidyverse
and/or broom
.
编辑:清洁代码
map(unique(test_tbl$name),function(nm){test_tbl %>% filter(name == nm)}) %>%
map2(unique(test_tbl$name),function(dat,nm){
map(LETTERS[2:4],function(cat){
dat %>%
filter(category == "A") %>%
pull %>%
t.test(dat %>% filter(category == cat) %>% pull)
}) %>%
map_dfr(broom::glance) %>%
select(statistic,p.value) %>%
mutate(
name = nm,
cross_cat = paste0(LETTERS[2:4]," versus A")
)
}) %>%
{do.call(rbind,.)}
test_tbl %>%
dplyr::group_by(name) %>%
dplyr::summarise(estimate_AB =
t.test(value[category == "A"| category == "B"] ~ category[category == "A" | category == "B"]) %>% (function(x){x$estimate[1] - x$estimate[2]}),
pvalue_AB = t.test(value[category == "A"| category == "B"] ~ category[category == "A" | category == "B"]) %>% (function(x){x$p.value})
)
这是我按组测试 A 和 B 的方法。我认为您可以扩展我的方法,或者尝试合并第一个解决方案中的代码。
有很多方法可以实现所需的输出,但也许这个是更直观且易于调试的方法(您可以在任何地方放置 browser()
)
test_tbl %>%
group_by(name) %>%
do({
sub_tbl <- .
expand.grid(g1="A", g2=c("B", "C", "D"), stringsAsFactors = FALSE) %>%
mutate(test=as.character(glue::glue("{g1}_vs_{g2}"))) %>%
rowwise() %>%
do({
gs <- .
t_res <- t.test(sub_tbl %>% filter(category == gs$g1) %>% pull(value),
sub_tbl %>% filter(category == gs$g2) %>% pull(value))
data.frame(test=gs$test, estimate=t_res$statistic, p_value=t_res$p.value,
stringsAsFactors = FALSE)
})
}) %>%
ungroup() %>%
gather(key="statistic", value="val", -name, -test) %>%
mutate(test_statistic = paste(test, statistic, sep = "_")) %>%
select(-test, -statistic) %>%
spread(key="test_statistic", value="val")
结果
# A tibble: 5 x 7
name A_vs_B_estimate A_vs_B_p_value A_vs_C_estimate A_vs_C_p_value A_vs_D_estimate A_vs_D_p_value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Allan -0.270 0.803 -1.03 0.396 1.55 0.250
2 George 0.201 0.855 0.221 0.838 1.07 0.380
3 John -1.59 0.249 0.0218 0.984 -0.410 0.704
4 Paul 0.116 0.918 -1.62 0.215 -1.53 0.212
5 Peter 0.471 0.664 0.551 0.611 0.466 0.680
它按名称对记录进行分组,然后应用函数 (do
#1)。将子数据框保存在 sub_tbl
中,展开所有测试用例 (expand.grid
) 并创建一个包含两个字母的 test
名称。现在,对于每个组合,将函数应用于 运行 t 检验 (do
#2)。该匿名函数在第 1 组 (g1
) 和第 2 组 (g2
) 和 returns 数据帧之间执行测试,并带有结果。
第二部分基本上重新排列列以获得最终输出。
我们可以使用
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
test_tbl %>%
split(.$name) %>%
map_dfr(~ {
Avalue <- .x$value[.x$category == 'A']
.x %>%
filter(category != 'A') %>%
group_by(category) %>%
summarise(out = t.test(value, Avalue)$p.value) %>%
mutate(category = str_c(category, '_vs_A_p_value'))}, .id = 'name') %>%
pivot_wider(names_from = category, values_from = out)