如何编写对数据框中所有 group/variable 组合进行配对 t 检验的函数
How to write a function that conducts paired t-tests on all group/variable combinations in a data frame
我有一个数据框,与下面创建的 data 类似:
# Example data set: 10 IDs x 5 measured variables (50 rows), three locations,
# and ten integer measurement columns (FirstPt ... Last10) sampled from 1:100.
set.seed(1) # fix the RNG state so the sampled columns are reproducible

ids <- c(12, 122, 242, 329, 595, 130, 145, 245, 654, 878)
variables <- c("Copper", "Iron", "Lead", "Zinc", "CaCO")
measure_names <- c(
  "FirstPt", "LastPt", "First3", "First5", "First7",
  "First10", "Last3", "Last5", "Last7", "Last10"
)

# Each sample() call advances the RNG, so the columns are drawn in exactly
# this order; reordering measure_names would change the generated values.
measures <- lapply(
  measure_names,
  function(nm) sample(1:100, 50, replace = TRUE)
)
names(measures) <- measure_names

data <- data.frame(
  # every ID contributes one row per measured variable
  ID = rep(ids, each = length(variables)),
  Location = rep(c("Alpha", "Beta", "Gamma"), times = c(20, 20, 10)),
  # the five variables cycle once per ID
  Variable = rep(variables, times = length(ids)),
  measures
)
这可能是一个由两部分组成的问题,但我想编写一个函数,把 Variable 取值相同的观测分到同一组(例如,所有 Copper 的观测),并在所有数值列(FirstPt:Last10)的两两组合之间进行配对 t 检验。我希望它返回一个如下所示、包含 p 值的数据框:
Test P-Value
FirstPt.vs.LastPt …
FirstPt.vs.First3 …
etc... …
这可能是第二个函数,但我也想在观察结果按 Location
分组后执行此操作,以便输出数据框如下所示:
Test P-Value
FirstPt.vs.LastPt.InAlpha
FirstPt.vs.LastPt.InBeta
etc...
我想这就是你想要的。关键是使用 tidyverse 中的 group_by 对数据分组后再逐组检验。
# Paired t-test for every pair of measurement columns, computed separately
# within each Location. The result `df` has one row per (column pair, Location)
# with columns Location, pval and Test ("<col1>.vs.<col2>.In<Location>").
library(dplyr) # provides %>%, group_by(), summarise(), mutate(), bind_rows()

# Measurement columns start at column 4 (after ID, Location, Variable).
measure_cols <- colnames(data)[4:ncol(data)]

# combn() enumerates the pairs in the same order as a nested i < j loop.
col_pairs <- combn(measure_cols, 2, simplify = FALSE)

df <- lapply(col_pairs, function(pair) {
  data %>%
    group_by(Location) %>%
    summarise(
      # paired = TRUE: both columns are measured on the same rows, and a
      # paired test is what the question asks for.
      pval = t.test(
        .data[[pair[1]]], .data[[pair[2]]],
        paired = TRUE
      )$p.value
    ) %>%
    ungroup() %>%
    mutate(Test = paste0(pair[1], ".vs.", pair[2], ".In", Location))
}) %>%
  bind_rows() # bind once instead of growing the result with rbind() in a loop
也许,你可以使用下面的代码实现你想要的:
library(dplyr)
library(tidyr)
# For each Variable, run a paired t-test on every unordered pair of the
# measurement columns FirstPt:Last10.
data %>%
  pivot_longer(cols = FirstPt:Last10) %>%
  group_by(Variable) %>%
  summarise(
    # After pivot_longer() `name` repeats once per original row, so combn()
    # must be given unique(name); otherwise it pairs up individual rows
    # (including a column with itself) instead of pairing up columns.
    p_value = list(combn(unique(name), 2, function(x) {
      # Within a group the values of both columns are in original row order,
      # so they line up row by row and can be tested as pairs.
      t.test(value[name == x[1]], value[name == x[2]], paired = TRUE)$p.value
    })),
    test = list(combn(unique(name), 2, paste, collapse = "_"))
  ) %>%
  unnest(cols = c(test, p_value))
# Result: a tibble with columns Variable, p_value and test, one row per
# Variable and column pair: choose(10, 2) = 45 pairs x 5 variables = 225 rows,
# with test labels such as "FirstPt_LastPt".
要按 Location
分组,您可以将其添加到 group_by
命令中,并保持其余代码不变。
您可以使用一个函数完成这两项操作:
library(tidyverse)
#' Pairwise t-tests between integer columns, within groups
#'
#' Groups `.data` by `groups`, keeps the grouping columns plus every integer
#' column, and runs `t.test()` on each unordered pair of those integer columns
#' inside each group.
#'
#' @param .data A data frame.
#' @param groups Grouping columns wrapped in `vars()`, e.g.
#'   `vars(Variable, Location)`.
#' @param paired Passed on to `t.test()`. `FALSE` (the default) runs the
#'   two-sample Welch test; `TRUE` runs a paired test, pairing observations
#'   by row order within each group.
#' @return A tibble with columns `test`
#'   (`"<col1>.vs.<col2>.<group values>"`) and `p.value`, one row per group
#'   and column pair.
t.test.by.group.combos <- function(.data, groups, paired = FALSE) {
  grouped <- .data %>%
    group_by(!!!groups) %>%
    select_if(is.integer)

  # Ask dplyr for the grouping column names instead of parsing the
  # `vars(...)` call text.
  by <- group_vars(grouped)
  measure_cols <- setdiff(names(grouped), by)
  if (length(measure_cols) < 2) {
    # Fewer than two columns to compare: nothing to test.
    return(tibble(test = character(), p.value = double()))
  }

  # Enumerate each unordered pair exactly once. De-duplicating on p.value
  # instead would also drop distinct pairs that happen to share a p-value.
  col_pairs <- combn(measure_cols, 2, simplify = FALSE)

  # Run all pairwise tests for the rows of a single group.
  test_one_group <- function(group_df) {
    # Every row of a group has the same grouping values; take the first row.
    group_label <- vapply(
      by,
      function(g) as.character(group_df[[g]][1]),
      character(1)
    )
    tibble(
      test = map_chr(col_pairs, function(pair) {
        paste(c(pair[1], "vs", pair[2], group_label), collapse = ".")
      }),
      p.value = map_dbl(col_pairs, function(pair) {
        t.test(
          group_df[[pair[1]]], group_df[[pair[2]]],
          paired = paired
        )$p.value
      })
    )
  }

  grouped %>%
    group_split() %>%
    map(test_one_group) %>%
    bind_rows()
}
t.test.by.group.combos(data, vars(Variable))
#> # A tibble: 225 x 2
#>    test                    p.value
#>    <chr>                     <dbl>
#>  1 FirstPt.vs.LastPt.CaCO    0.511
#>  2 FirstPt.vs.First3.CaCO    0.184
#>  3 FirstPt.vs.First5.CaCO    0.494
#>  4 FirstPt.vs.First7.CaCO    0.354
#>  5 FirstPt.vs.First10.CaCO   0.893
#>  6 FirstPt.vs.Last3.CaCO     0.496
#>  7 FirstPt.vs.Last5.CaCO     0.909
#>  8 FirstPt.vs.Last7.CaCO     0.439
#>  9 FirstPt.vs.Last10.CaCO    0.146
#> 10 LastPt.vs.First3.CaCO     0.578
#> # … with 215 more rows
# 5 variables x 3 locations = 15 groups, each with 45 column pairs = 675 rows.
t.test.by.group.combos(data, vars(Variable, Location))
#> # A tibble: 675 x 2
#>    test                          p.value
#>    <chr>                           <dbl>
#>  1 FirstPt.vs.LastPt.CaCO.Alpha    0.850
#>  2 FirstPt.vs.First3.CaCO.Alpha    0.822
#>  3 FirstPt.vs.First5.CaCO.Alpha    0.895
#>  4 FirstPt.vs.First7.CaCO.Alpha    0.810
#>  5 FirstPt.vs.First10.CaCO.Alpha   0.645
#>  6 FirstPt.vs.Last3.CaCO.Alpha     0.870
#>  7 FirstPt.vs.Last5.CaCO.Alpha     0.465
#>  8 FirstPt.vs.Last7.CaCO.Alpha     0.115
#>  9 FirstPt.vs.Last10.CaCO.Alpha    0.474
#> 10 LastPt.vs.First3.CaCO.Alpha     0.991
#> # … with 665 more rows
这是一个冗长的函数,但大体思路是:先按 groups 参数分组,然后选出分组列和所有整数列,再按组拆分数据框。之后,我们遍历变量的所有组合,并对每个组合执行 t 检验。最后,我们把各组的结果重新合并为一个数据框。
我有一个数据框,与下面创建的 data 类似:
# Example data set: 10 IDs x 5 measured variables (50 rows), three locations,
# and ten integer measurement columns (FirstPt ... Last10) sampled from 1:100.
set.seed(1) # fix the RNG state so the sampled columns are reproducible

ids <- c(12, 122, 242, 329, 595, 130, 145, 245, 654, 878)
variables <- c("Copper", "Iron", "Lead", "Zinc", "CaCO")
measure_names <- c(
  "FirstPt", "LastPt", "First3", "First5", "First7",
  "First10", "Last3", "Last5", "Last7", "Last10"
)

# Each sample() call advances the RNG, so the columns are drawn in exactly
# this order; reordering measure_names would change the generated values.
measures <- lapply(
  measure_names,
  function(nm) sample(1:100, 50, replace = TRUE)
)
names(measures) <- measure_names

data <- data.frame(
  # every ID contributes one row per measured variable
  ID = rep(ids, each = length(variables)),
  Location = rep(c("Alpha", "Beta", "Gamma"), times = c(20, 20, 10)),
  # the five variables cycle once per ID
  Variable = rep(variables, times = length(ids)),
  measures
)
这可能是一个由两部分组成的问题,但我想编写一个函数,把 Variable 取值相同的观测分到同一组(例如,所有 Copper 的观测),并在所有数值列(FirstPt:Last10)的两两组合之间进行配对 t 检验。我希望它返回一个如下所示、包含 p 值的数据框:
Test P-Value
FirstPt.vs.LastPt …
FirstPt.vs.First3 …
etc... …
这可能是第二个函数,但我也想在观察结果按 Location
分组后执行此操作,以便输出数据框如下所示:
Test P-Value
FirstPt.vs.LastPt.InAlpha
FirstPt.vs.LastPt.InBeta
etc...
我想这就是你想要的。关键是使用 tidyverse 中的 group_by 对数据分组后再逐组检验。
# Paired t-test for every pair of measurement columns, computed separately
# within each Location. The result `df` has one row per (column pair, Location)
# with columns Location, pval and Test ("<col1>.vs.<col2>.In<Location>").
library(dplyr) # provides %>%, group_by(), summarise(), mutate(), bind_rows()

# Measurement columns start at column 4 (after ID, Location, Variable).
measure_cols <- colnames(data)[4:ncol(data)]

# combn() enumerates the pairs in the same order as a nested i < j loop.
col_pairs <- combn(measure_cols, 2, simplify = FALSE)

df <- lapply(col_pairs, function(pair) {
  data %>%
    group_by(Location) %>%
    summarise(
      # paired = TRUE: both columns are measured on the same rows, and a
      # paired test is what the question asks for.
      pval = t.test(
        .data[[pair[1]]], .data[[pair[2]]],
        paired = TRUE
      )$p.value
    ) %>%
    ungroup() %>%
    mutate(Test = paste0(pair[1], ".vs.", pair[2], ".In", Location))
}) %>%
  bind_rows() # bind once instead of growing the result with rbind() in a loop
也许,你可以使用下面的代码实现你想要的:
library(dplyr)
library(tidyr)
# For each Variable, run a paired t-test on every unordered pair of the
# measurement columns FirstPt:Last10.
data %>%
  pivot_longer(cols = FirstPt:Last10) %>%
  group_by(Variable) %>%
  summarise(
    # After pivot_longer() `name` repeats once per original row, so combn()
    # must be given unique(name); otherwise it pairs up individual rows
    # (including a column with itself) instead of pairing up columns.
    p_value = list(combn(unique(name), 2, function(x) {
      # Within a group the values of both columns are in original row order,
      # so they line up row by row and can be tested as pairs.
      t.test(value[name == x[1]], value[name == x[2]], paired = TRUE)$p.value
    })),
    test = list(combn(unique(name), 2, paste, collapse = "_"))
  ) %>%
  unnest(cols = c(test, p_value))
# Result: a tibble with columns Variable, p_value and test, one row per
# Variable and column pair: choose(10, 2) = 45 pairs x 5 variables = 225 rows,
# with test labels such as "FirstPt_LastPt".
要按 Location
分组,您可以将其添加到 group_by
命令中,并保持其余代码不变。
您可以使用一个函数完成这两项操作:
library(tidyverse)
#' Pairwise t-tests between integer columns, within groups
#'
#' Groups `.data` by `groups`, keeps the grouping columns plus every integer
#' column, and runs `t.test()` on each unordered pair of those integer columns
#' inside each group.
#'
#' @param .data A data frame.
#' @param groups Grouping columns wrapped in `vars()`, e.g.
#'   `vars(Variable, Location)`.
#' @param paired Passed on to `t.test()`. `FALSE` (the default) runs the
#'   two-sample Welch test; `TRUE` runs a paired test, pairing observations
#'   by row order within each group.
#' @return A tibble with columns `test`
#'   (`"<col1>.vs.<col2>.<group values>"`) and `p.value`, one row per group
#'   and column pair.
t.test.by.group.combos <- function(.data, groups, paired = FALSE) {
  grouped <- .data %>%
    group_by(!!!groups) %>%
    select_if(is.integer)

  # Ask dplyr for the grouping column names instead of parsing the
  # `vars(...)` call text.
  by <- group_vars(grouped)
  measure_cols <- setdiff(names(grouped), by)
  if (length(measure_cols) < 2) {
    # Fewer than two columns to compare: nothing to test.
    return(tibble(test = character(), p.value = double()))
  }

  # Enumerate each unordered pair exactly once. De-duplicating on p.value
  # instead would also drop distinct pairs that happen to share a p-value.
  col_pairs <- combn(measure_cols, 2, simplify = FALSE)

  # Run all pairwise tests for the rows of a single group.
  test_one_group <- function(group_df) {
    # Every row of a group has the same grouping values; take the first row.
    group_label <- vapply(
      by,
      function(g) as.character(group_df[[g]][1]),
      character(1)
    )
    tibble(
      test = map_chr(col_pairs, function(pair) {
        paste(c(pair[1], "vs", pair[2], group_label), collapse = ".")
      }),
      p.value = map_dbl(col_pairs, function(pair) {
        t.test(
          group_df[[pair[1]]], group_df[[pair[2]]],
          paired = paired
        )$p.value
      })
    )
  }

  grouped %>%
    group_split() %>%
    map(test_one_group) %>%
    bind_rows()
}
t.test.by.group.combos(data, vars(Variable))
#> # A tibble: 225 x 2
#>    test                    p.value
#>    <chr>                     <dbl>
#>  1 FirstPt.vs.LastPt.CaCO    0.511
#>  2 FirstPt.vs.First3.CaCO    0.184
#>  3 FirstPt.vs.First5.CaCO    0.494
#>  4 FirstPt.vs.First7.CaCO    0.354
#>  5 FirstPt.vs.First10.CaCO   0.893
#>  6 FirstPt.vs.Last3.CaCO     0.496
#>  7 FirstPt.vs.Last5.CaCO     0.909
#>  8 FirstPt.vs.Last7.CaCO     0.439
#>  9 FirstPt.vs.Last10.CaCO    0.146
#> 10 LastPt.vs.First3.CaCO     0.578
#> # … with 215 more rows
# 5 variables x 3 locations = 15 groups, each with 45 column pairs = 675 rows.
t.test.by.group.combos(data, vars(Variable, Location))
#> # A tibble: 675 x 2
#>    test                          p.value
#>    <chr>                           <dbl>
#>  1 FirstPt.vs.LastPt.CaCO.Alpha    0.850
#>  2 FirstPt.vs.First3.CaCO.Alpha    0.822
#>  3 FirstPt.vs.First5.CaCO.Alpha    0.895
#>  4 FirstPt.vs.First7.CaCO.Alpha    0.810
#>  5 FirstPt.vs.First10.CaCO.Alpha   0.645
#>  6 FirstPt.vs.Last3.CaCO.Alpha     0.870
#>  7 FirstPt.vs.Last5.CaCO.Alpha     0.465
#>  8 FirstPt.vs.Last7.CaCO.Alpha     0.115
#>  9 FirstPt.vs.Last10.CaCO.Alpha    0.474
#> 10 LastPt.vs.First3.CaCO.Alpha     0.991
#> # … with 665 more rows
这是一个冗长的函数,但大体思路是:先按 groups 参数分组,然后选出分组列和所有整数列,再按组拆分数据框。之后,我们遍历变量的所有组合,并对每个组合执行 t 检验。最后,我们把各组的结果重新合并为一个数据框。