替换循环以在 R 中进行 T 检验
Replacing loop to do T-test in R
我有一个具有以下结构的数据框:
# Build a reproducible toy survey data frame with 6000 rows:
#   gender, age  - shuffled two-level grouping columns (3000 of each level)
#   question     - Q1/Q2/Q3 cycling in row order
#   response     - Res1/Res2 cycling in row order
#   value        - shuffled 0/1 outcome (3000 of each)
# The three sample() calls run in argument order, so the seed fixes the result.
set.seed(1)
dat <- data.frame(
  gender   = sample(rep(c("Man", "Woman"), times = 3000)),
  age      = sample(rep(c("Young", "Old"), times = 3000)),
  question = rep(c("Q1", "Q2", "Q3"), times = 2000),
  response = rep(c("Res1", "Res2"), times = 3000),
  value    = sample(rep(c(0, 1), times = 3000))
)
# Preview the first six rows.
head(dat)
# gender age question response value
#1 Man Old Q1 Res1 0
#2 Man Young Q2 Res2 1
#3 Man Old Q3 Res1 0
#4 Woman Old Q1 Res2 1
#5 Man Old Q2 Res1 1
#6 Man Old Q3 Res2 1
我创建了一个循环来对每个问题的每个响应进行 t 检验,并将输出加入数据框。
library(tidyverse)
library(rstatix)
# Run one t-test of `value` per (question, grouping column) pair and stack
# the results into a single table, `final.output`.
# Expects `dat` to exist already; every column except the last three
# (question, response, value) is used in turn as the grouping variable.

# Distinct labels in table() order, computed once instead of on every pass.
question_levels <- names(table(dat$question))
response_levels <- names(table(dat$response))

data.list1 <- vector("list", length(question_levels))
for (i in seq_along(question_levels)) {
  # Rows belonging to the current question only.
  dat1 <- dat %>%
    filter(question == question_levels[[i]])
  n_group_cols <- ncol(dat1) - 3
  data.list2 <- vector("list", n_group_cols)
  # seq_len() skips the loop cleanly when there are no grouping columns,
  # where 1:0 would have iterated over c(1, 0).
  for (f in seq_len(n_group_cols)) {
    dat2 <- dat1 %>%
      t_test(reformulate(colnames(dat1)[f], "value"),
             detailed = TRUE) %>%
      mutate(question = question_levels[[i]],
             # NOTE(review): this label is picked by the position of the
             # grouping column, not by the data being tested, and it fails
             # if there are more grouping columns than response levels.
             # Kept as-is so the output below is unchanged - confirm intent.
             response = response_levels[[f]])
    data.list2[[f]] <- dat2
  }
  data.list1[[i]] <- bind_rows(data.list2)
}
# Keep only the identifying columns, the two group estimates and the p-value.
final.output <- bind_rows(data.list1) %>%
  select(question, response, group1, estimate1,
         group2, estimate2, p)
final.output
# question response group1 estimate1 group2 estimate2 p
# <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
#1 Q1 Res1 Man 0.492 Woman 0.494 0.932
#2 Q1 Res2 Old 0.484 Young 0.502 0.418
#3 Q2 Res1 Man 0.500 Woman 0.509 0.687
#4 Q2 Res2 Old 0.489 Young 0.518 0.198
#5 Q3 Res1 Man 0.495 Woman 0.510 0.504
#6 Q3 Res2 Old 0.511 Young 0.494 0.452
我的问题是:我实际使用的数据框比本例中的数据框大得多,并且包含更多变量,因此循环需要很长时间才能运行(超过 10 分钟)。有什么方法可以不使用循环而得到相同的输出吗?
我们可以使用 imap 遍历由列名组成的命名向量:先用 select 选出 'question'、当前遍历到的列以及 'value',然后按 'question' 分组,用 summarise 把 t_test 的输出放进一个 list,最后用 unnest(来自 tidyr)展开该 list 输出。
library(purrr)
library(dplyr)
library(rstatix)
library(tidyr)
# For each grouping column named in the vector, run a t-test of `value` by
# that column within every `question`. Results are row-bound, with the
# vector's names stored in the `variable` column via `.id`.
# Expects `dat` to exist already.
imap_dfr(c(gender = "gender", age = "age"), function(group_col, group_name) {
  # `group_name` is the index imap passes as second argument; it is unused
  # here because `.id` already records it.
  dat %>%
    # all_of() marks group_col as an external character vector of column
    # names, avoiding the ambiguous bare-variable selection.
    select(question, all_of(group_col), value) %>%
    group_by(question) %>%
    # One nested t_test() result per question; cur_data() is the current
    # group's rows. NOTE(review): newer dplyr releases deprecate cur_data()
    # in favour of pick(); switch once the minimum dplyr version allows it.
    summarise(out = list(t_test(data = cur_data(),
                                formula = reformulate(group_col, "value"),
                                detailed = TRUE)))
}, .id = "variable") %>%
  # Expand the list column into the t_test() output columns.
  unnest(c(out))
输出:
# A tibble: 6 x 17
variable question estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p df conf.low conf.high method alternative
<chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 gender Q1 -0.00192 0.492 0.494 value Man Woman 1006 994 -0.0857 0.932 1998. -0.0458 0.0420 T-test two.sided
2 gender Q2 -0.00901 0.500 0.509 value Man Woman 1001 999 -0.403 0.687 1998. -0.0529 0.0349 T-test two.sided
3 gender Q3 -0.0150 0.495 0.510 value Man Woman 993 1007 -0.669 0.504 1998. -0.0588 0.0289 T-test two.sided
4 age Q1 -0.0181 0.484 0.502 value Old Young 992 1008 -0.810 0.418 1998. -0.0620 0.0258 T-test two.sided
5 age Q2 -0.0288 0.489 0.518 value Old Young 977 1023 -1.29 0.198 1994. -0.0727 0.0150 T-test two.sided
6 age Q3 0.0168 0.511 0.494 value Old Young 1031 969 0.752 0.452 1990. -0.0271 0.0607 T-test two.sided
我有一个具有以下结构的数据框:
# Build a reproducible toy survey data frame with 6000 rows:
#   gender, age  - shuffled two-level grouping columns (3000 of each level)
#   question     - Q1/Q2/Q3 cycling in row order
#   response     - Res1/Res2 cycling in row order
#   value        - shuffled 0/1 outcome (3000 of each)
# The three sample() calls run in argument order, so the seed fixes the result.
set.seed(1)
dat <- data.frame(
  gender   = sample(rep(c("Man", "Woman"), times = 3000)),
  age      = sample(rep(c("Young", "Old"), times = 3000)),
  question = rep(c("Q1", "Q2", "Q3"), times = 2000),
  response = rep(c("Res1", "Res2"), times = 3000),
  value    = sample(rep(c(0, 1), times = 3000))
)
# Preview the first six rows.
head(dat)
# gender age question response value
#1 Man Old Q1 Res1 0
#2 Man Young Q2 Res2 1
#3 Man Old Q3 Res1 0
#4 Woman Old Q1 Res2 1
#5 Man Old Q2 Res1 1
#6 Man Old Q3 Res2 1
我创建了一个循环来对每个问题的每个响应进行 t 检验,并将输出加入数据框。
library(tidyverse)
library(rstatix)
# Run one t-test of `value` per (question, grouping column) pair and stack
# the results into a single table, `final.output`.
# Expects `dat` to exist already; every column except the last three
# (question, response, value) is used in turn as the grouping variable.

# Distinct labels in table() order, computed once instead of on every pass.
question_levels <- names(table(dat$question))
response_levels <- names(table(dat$response))

data.list1 <- vector("list", length(question_levels))
for (i in seq_along(question_levels)) {
  # Rows belonging to the current question only.
  dat1 <- dat %>%
    filter(question == question_levels[[i]])
  n_group_cols <- ncol(dat1) - 3
  data.list2 <- vector("list", n_group_cols)
  # seq_len() skips the loop cleanly when there are no grouping columns,
  # where 1:0 would have iterated over c(1, 0).
  for (f in seq_len(n_group_cols)) {
    dat2 <- dat1 %>%
      t_test(reformulate(colnames(dat1)[f], "value"),
             detailed = TRUE) %>%
      mutate(question = question_levels[[i]],
             # NOTE(review): this label is picked by the position of the
             # grouping column, not by the data being tested, and it fails
             # if there are more grouping columns than response levels.
             # Kept as-is so the output below is unchanged - confirm intent.
             response = response_levels[[f]])
    data.list2[[f]] <- dat2
  }
  data.list1[[i]] <- bind_rows(data.list2)
}
# Keep only the identifying columns, the two group estimates and the p-value.
final.output <- bind_rows(data.list1) %>%
  select(question, response, group1, estimate1,
         group2, estimate2, p)
final.output
# question response group1 estimate1 group2 estimate2 p
# <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
#1 Q1 Res1 Man 0.492 Woman 0.494 0.932
#2 Q1 Res2 Old 0.484 Young 0.502 0.418
#3 Q2 Res1 Man 0.500 Woman 0.509 0.687
#4 Q2 Res2 Old 0.489 Young 0.518 0.198
#5 Q3 Res1 Man 0.495 Woman 0.510 0.504
#6 Q3 Res2 Old 0.511 Young 0.494 0.452
我的问题是:我实际使用的数据框比本例中的数据框大得多,并且包含更多变量,因此循环需要很长时间才能运行(超过 10 分钟)。有什么方法可以不使用循环而得到相同的输出吗?
我们可以使用 imap 遍历由列名组成的命名向量:先用 select 选出 'question'、当前遍历到的列以及 'value',然后按 'question' 分组,用 summarise 把 t_test 的输出放进一个 list,最后用 unnest(来自 tidyr)展开该 list 输出。
library(purrr)
library(dplyr)
library(rstatix)
library(tidyr)
# For each grouping column named in the vector, run a t-test of `value` by
# that column within every `question`. Results are row-bound, with the
# vector's names stored in the `variable` column via `.id`.
# Expects `dat` to exist already.
imap_dfr(c(gender = "gender", age = "age"), function(group_col, group_name) {
  # `group_name` is the index imap passes as second argument; it is unused
  # here because `.id` already records it.
  dat %>%
    # all_of() marks group_col as an external character vector of column
    # names, avoiding the ambiguous bare-variable selection.
    select(question, all_of(group_col), value) %>%
    group_by(question) %>%
    # One nested t_test() result per question; cur_data() is the current
    # group's rows. NOTE(review): newer dplyr releases deprecate cur_data()
    # in favour of pick(); switch once the minimum dplyr version allows it.
    summarise(out = list(t_test(data = cur_data(),
                                formula = reformulate(group_col, "value"),
                                detailed = TRUE)))
}, .id = "variable") %>%
  # Expand the list column into the t_test() output columns.
  unnest(c(out))
输出:
# A tibble: 6 x 17
variable question estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p df conf.low conf.high method alternative
<chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 gender Q1 -0.00192 0.492 0.494 value Man Woman 1006 994 -0.0857 0.932 1998. -0.0458 0.0420 T-test two.sided
2 gender Q2 -0.00901 0.500 0.509 value Man Woman 1001 999 -0.403 0.687 1998. -0.0529 0.0349 T-test two.sided
3 gender Q3 -0.0150 0.495 0.510 value Man Woman 993 1007 -0.669 0.504 1998. -0.0588 0.0289 T-test two.sided
4 age Q1 -0.0181 0.484 0.502 value Old Young 992 1008 -0.810 0.418 1998. -0.0620 0.0258 T-test two.sided
5 age Q2 -0.0288 0.489 0.518 value Old Young 977 1023 -1.29 0.198 1994. -0.0727 0.0150 T-test two.sided
6 age Q3 0.0168 0.511 0.494 value Old Young 1031 969 0.752 0.452 1990. -0.0271 0.0607 T-test two.sided