如何用跨函数重写相同的代码
How to rewrite the same code with across function
我编写了以下代码
out %>% group_by(tests0, GROUP) %>%
summarise(
mean0 = mean(score0, na.rm = T),
stderr0 = std.error(score0, na.rm = T),
mean7 = mean(score7, na.rm = T),
stederr7 = std.error(score7, na.rm = T),
diff.std.mean = t.test(score0, score7, paired = T)$estimate,
p.value = t.test(score0, score7, paired = T)$p.value,
)
我得到了以下输出
tests0 GROUP mean0 stderr0 mean7 stederr7 diff.std.mean p.value
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ADAS_CogT0 CONTROL 12.6 0.525 13.6 0.662 -1.15 0.00182
2 ADAS_CogT0 TRAINING 14.0 0.613 12.6 0.570 1.40 0.00295
3 PVF_T0 CONTROL 32.1 1.22 31.3 1.45 0.498 0.636
4 PVF_T0 TRAINING 31.6 1.37 34.3 1.51 -2.48 0.0102
5 ROCF_CT0 CONTROL 29.6 0.893 30.3 0.821 -0.180 0.835
6 ROCF_CT0 TRAINING 30.1 0.906 29.5 0.929 0.489 0.615
7 ROCF_IT0 CONTROL 12.8 0.563 12.2 0.683 0.580 0.356
8 ROCF_IT0 TRAINING 10.9 0.735 12.3 0.768 -1.44 0.0238
9 ROCF_RT0 CONTROL 12.1 0.725 12.5 0.797 -0.370 0.598
10 ROCF_RT0 TRAINING 10.5 0.746 10.9 0.742 -0.534 0.370
11 SVF_T0 CONTROL 35.5 1.05 34 1.15 1.42 0.107
12 SVF_T0 TRAINING 34.1 1.04 32.9 1.16 0.962 0.231
如果我想通过 across function
做同样的事情,我应该怎么做才能达到上面代码中显示的相同结果?实际上我遇到了麻烦,因为我从这个问题 下发布的答案中抽取了一些例子,但我无法正确地适应它。
这里是数据集
> dput(head(out, 100))
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100), GROUP = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("CONTROL", "TRAINING"), class = "factor"),
Gender = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
2L), .Label = c("M", "F"), class = "factor"), Age = c(74,
76, 81, 74, 69, 72, 75, 83, 78, 72, 82, 68, 72, 72, 73, 80,
69, 72, 70, 80, 75, 80, 78, 74, 82, 74, 80, 82, 78, 81, 66,
71, 70, 79, 78, 73, 72, 77, 77, 71, 83, 74, 70, 71, 77, 69,
67, 64, 79, 71, 77, 77, 73, 67, 68, 79, 81, 67, 84, 75, 80,
73, 68, 74, 77, 79, 79, 72, 73, 78, 76, 78, 77, 74, 78, 77,
77, 82, 77, 70, 77, 81, 79, 75, 74, 78, 69, 77, 73, 77, 70,
79, 70, 72, 77, 72, 71, 71, 73, 81), Education = c(18, 4,
8, 5, 8, 11, 5, 5, 4, 8, 8, 12, 5, 18, 13, 5, 13, 13, 5,
5, 13, 5, 3, 8, 17, 5, 8, 5, 5, 8, 17, 8, 18, 18, 13, 13,
13, 13, 15, 17, 8, 5, 5, 13, 8, 5, 11, 13, 8, 8, 8, 5, 13,
8, 5, 17, 8, 12, 13, 5, 8, 8, 8, 5, 3, 8, 18, 5, 8, 13, 8,
5, 17, 8, 5, 17, 5, 8, 11, 8, 8, 5, 12, 3, 8, 8, 8, 13, 5,
5, 8, 8, 13, 5, 5, 8, 13, 5, 8, 12), tests0 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ADAS_CogT0",
"PVF_T0", "ROCF_CT0", "ROCF_IT0", "ROCF_RT0", "SVF_T0"), class = "factor"),
score0 = c(14.66, 15.33, 17.33, 19, 7.66, 12.6, 18.67, 14.99,
17.99, 17.33, 13.66, 16.99, 10.66, 9.66, 14.99, 15.66, 13.33,
4.33, 14.33, 15.99, 16.33, 10.66, 14.66, 10.66, 19.33, 17.66,
15.99, 20.66, 20.6, 17, 10.33, 6.33, 6.66, 19.99, 13.33,
24.33, 12.33, 10.33, 12.33, 9.66, 10.99, 13.99, 23, 6.32,
11.32, 13.99, 14.66, 8.99, 14.33, 9.99, 7.33, 15.66, 14,
7.99, 23.32, 14.66, 9.99, 5.66, 6.99, 11.66, 10.33, 6.99,
19.32, NA, 10, 17.66, 13.66, 10.32, NA, NA, 8.66, 9, 6.99,
14.99, 9.66, 13.66, 15.32, 12, 14, 13.66, 11.99, 15.66, 16,
15, 16.99, 20, 11, 7.99, 8.33, 8.32, 14.99, 18.66, 10.33,
11.99, 9.32, 17, 14.33, 14.66, 16.6, 9.99), tests7 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ADAS_CogT7",
"PVF_T7", "ROCF_CT7", "ROCF_IT7", "ROCF_RT7", "SVF_T7"), class = "factor"),
score7 = c(16, 9.32, 21.33, 17, 8.32, 11, 14.99, 10.99, 17,
18.33, 13.32, 14.34, 8.99, 7, 11.99, 15.33, 6.99, 5.33, 12.32,
13, 21.32, 7.99, 13.33, 11.99, 17.32, 16.32, 16.33, 14.66,
18.99, 17.33, 7.99, 9.33, 10.99, NA, 12.99, 16.33, 21.66,
9, 9.34, 8.66, 8.33, 13.66, 15.66, 6.66, 10.99, 13.33, 13.33,
7.99, 11.99, 11.32, 7.33, 9.66, 6.99, NA, 15.99, 15.66, 14.66,
6.32, 7, 11, 14, 10.33, 24.66, NA, 14.99, NA, 15.99, 9.32,
NA, NA, 9.99, 9.33, 7.66, 17.33, 10.32, 16, 17, 12.99, 15,
14.33, 10, 14.99, 19, 13.99, 19.33, NA, 10, 6.99, 11.66,
6.66, 14.33, 16, 8.66, 10, NA, 20, 14.99, 19.66, 26.66, 8.99
)), row.names = c(NA, -100L), class = c("tbl_df", "tbl",
"data.frame"))
>
下面你可以找到我想获得的方法。它是一种需要.x 操作的方法。
out %>%
group_by(across(all_of(tests0, GROUP))) %>% summarise(across(starts_with('score'),
list(mean = ~ mean(.x,na.rm = T),
stderr = ~ std.error(.x, na.rm = TRUE),
diff.std.mean = ~ t.test(.x, na.rm = T)))$estimate,
p.value = ~ t.test(.x, na.rm = T)))$p.value)),.groups = "drop")
您可以在 across()
:
中使用参数 .names
library(dplyr)
out %>%
group_by(tests0, GROUP) %>%
summarize(across(c(score0, score7), sd, na.rm = TRUE, .names = "sd_{.col}"),
across(c(score0, score7), mean, na.rm = TRUE, .names = "mean_{.col}"),
diff.std.mean = t.test(score0, score7, paired = T)$estimate,
p.value = t.test(score0, score7, paired = T)$p.value) %>%
ungroup()
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
#> # A tibble: 2 x 8
#> tests0 GROUP sd_score0 sd_score7 mean_score0 mean_score7 diff.std.mean p.value
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ADAS_~ CONT~ 3.72 4.81 12.5 13.5 -1.24 0.00471
#> 2 ADAS_~ TRAI~ 4.55 4.15 14.0 12.6 1.40 0.00295
由 reprex package (v2.0.1)
于 2021-11-26 创建
编辑
如果您更喜欢列表,那么确定单独的部分然后将它们绑定在一起会更容易:
library(data.table)
by <- c("tests0", "GROUP")
out_dt <- data.table::data.table(out)
means <- out_dt[, sapply(.SD, function(x) list(mean = mean(x, na.rm = TRUE))),
by = by, .SDcols = patterns("^score")]
sds <- out_dt[, sapply(.SD, function(x) list(sd = sd(x, na.rm = TRUE))),
by = by, .SDcols = patterns("^score")]
t_est <- out_dt[, .(diff.std.mean = t.test(score0, score7, paired = T)$estimate), by = by]
tpvalue <- out_dt[, .(p.value = t.test(score0, score7, paired = T)$p.value), by = by]
list(means = means, sds = sds, diff.std.mean = t_est, p.value = tpvalue)
这是您可能要考虑的另一种方法。首先,我把你的代码剪切并粘贴到一个函数中。提取列名称并删除对用于计算标准误差的 plotrix 包的依赖是唯一的更改。
g <- function (df)
{
nms <- c(names(df)[1:2],
paste0('mean', sub(".*[a-z]","",names(df)[3])),
paste0('stderr', sub(".*[a-z]","",names(df)[3])),
paste0('mean', sub(".*[a-z]","",names(df)[4])),
paste0('stderr', sub(".*[a-z]","",names(df)[4])),
'diff.std.mean', 'p.value')
z <- df %>% group_by(df[,1:2]) %>%
summarize(
x1 = mean(pull(df[,3]), na.rm = T),
x2 = sd(pull(df[,3]), na.rm=T) / sqrt(sum(!is.na(pull(df[,3])))),
x3 = mean(pull(df[,4]), na.rm = T),
x4 = sd(pull(df[,4]), na.rm=T) / sqrt(sum(!is.na(pull(df[,4])))),
x5 = t.test(pull(df[,3]), pull(df[,4]), paired = T)$estimate,
x6 = t.test(pull(df[,3]), pull(df[,4]), paired = T)$p.value)
colnames(z) <- nms
return(z)
}
然后,因为测试数据只有一个水平的因子,而且你使用的 plotrix::std.error 函数的样本量不足,所以我在 'test0' 因子中引入了变化,将样本大小,并删除未使用的级别,因为它们会导致空帧上的迭代。此外,我添加了一个 score8 来展示您如何 运行 其他变量。
s <- t %>% mutate(tests0 = case_when(Education <= 8 ~ 'ADAS_CogTO', T ~ 'PVF_T0'),
score8 = score0 + score7)
q <- rbind(s, s)
fct_drop(q$tests0)
然后我按因子级别拆分框架,将函数应用于每个拆分,然后将数据重新合并到一个允许您操作分数和组变量的函数中。我假设每个 2 个,这对分数变量是安全的,因为你正在进行配对 t 检验,并且它很容易用组变量扩展(如果你只是将分数变量移动到位置 1 和 2,并使用所有剩余的变量作为组变量传递给函数)。
h <- function(df, group_vars, score_vars)
{
z <- df %>% select(group_vars, score_vars)
z <- z %>% group_by(z[,1:2]) %>%
group_map( ~ g(.x), .keep = T) %>%
bind_rows()
}
请注意,如果您希望将此应用到其他数据,您只需更改传递给组和评分变量的列。如果你也想改变它应该相当容易,只是认为这是一个很好的框架,可以用来做你想做的事情。考虑一下您如何处理 test0 为空且 test7 为非空(或反之亦然)的情况,因为这些观察结果包含在您的汇总统计数据中,但必然被排除在 t 检验之外。祝你好运。
x <- h(q, c("tests0", "GROUP"), c("score0", "score7")) %>%
group_by(tests0) %>%
pivot_wider(id_cols = tests0,
names_from = GROUP,
values_from = c("mean0","stderr0","mean7","stderr7",
'diff.std.mean', 'p.value'))
我没有名为 std.error
的函数,所以我使用了 sd
,当然你可以更改它。
library(dplyr)
library(readr)
out %>%
group_by(tests0, GROUP) %>%
summarise(
across(c(score0, score7), list(mean = mean, stderr = sd), na.rm = TRUE,
.names = '{.fn}{parse_number(.col)}'),
with(t.test(score0, score7, paired = T),
tibble(diff.std.mean = estimate,
p.value)))
# # A tibble: 2 × 8
# tests0 GROUP mean0 stderr0 mean7 stderr7 diff.std.mean p.value
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ADAS_CogT0 CONTROL 12.5 3.72 13.5 4.81 -1.24 0.00471
# 2 ADAS_CogT0 TRAINING 14.0 4.55 12.6 4.15 1.40 0.00295
实际上我只是将上面的代码放在一个函数中,该函数接受一个 x
和 y
参数,然后是 运行 fun(df, x = score0, y = score7)
。但是,只是为了好玩,如果你必须使用 .x
和 .y
,这是一种方法(尽管我觉得这样做有点傻)
df %>%
group_by(tests0, GROUP) %>%
select(starts_with('score')) %>%
summarise(
across(everything(), list(mean = mean, stderr = sd), na.rm = TRUE,
.names = '{.fn}{parse_number(.col)}'),
across(everything(), list(list)) %>%
pmap_dfr(~ t.test(.x, .y, paired = TRUE)[c('estimate', 'p.value')]) %>%
transmute(diff.std.mean = estimate, p.value))
# # A tibble: 2 × 8
# # Groups: tests0 [1]
# tests0 GROUP mean0 stderr0 mean7 stderr7 diff.std.mean p.value
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ADAS_CogT0 CONTROL 12.5 3.72 13.5 4.81 -1.24 0.00471
# 2 ADAS_CogT0 TRAINING 14.0 4.55 12.6 4.15 1.40 0.00295
我想到了一种可能的解决方法(可能有帮助也可能没有帮助),方法是“手动”使用 across()
,而不是一次一列地应用函数。生成的输出是一个 data.frame
,其中的列表列深度嵌套,因此 unnest()
会派上用场。我还使用 possibly()
来解决不存在两列的情况,请记住 across()
可以匹配任意数量的列,而 t.test()
需要 x
和 y
参数。
代码:
library(tidyverse)
data <-
df %>%
group_by(tests0, GROUP) %>%
summarize(
all = list(across(starts_with("score")) %>%
{
tibble(
ttest = data.frame(possibly(~ reduce(., ~ t.test(.x, .y, paired = TRUE))[c("estimate", 'p.value')], NA)(.)),
means = data.frame(map(., ~ mean(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\D+", "mean"))),
stderrs = data.frame(map(., ~ sd(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\D+", "stederr")))
)
})
)
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
data %>%
unnest(all) %>%
unnest(-c("tests0", "GROUP"))
#> # A tibble: 2 × 8
#> # Groups: tests0 [1]
#> tests0 GROUP estimate p.value mean0 mean7 stederr0 stederr7
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ADAS_CogT0 CONTROL -1.24 0.00471 12.5 13.5 3.72 4.81
#> 2 ADAS_CogT0 TRAINING 1.40 0.00295 14.0 12.6 4.55 4.15
由 reprex package (v2.0.1)
于 2021-11-29 创建
我编写了以下代码
out %>% group_by(tests0, GROUP) %>%
summarise(
mean0 = mean(score0, na.rm = T),
stderr0 = std.error(score0, na.rm = T),
mean7 = mean(score7, na.rm = T),
stederr7 = std.error(score7, na.rm = T),
diff.std.mean = t.test(score0, score7, paired = T)$estimate,
p.value = t.test(score0, score7, paired = T)$p.value,
)
我得到了以下输出
tests0 GROUP mean0 stderr0 mean7 stederr7 diff.std.mean p.value
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ADAS_CogT0 CONTROL 12.6 0.525 13.6 0.662 -1.15 0.00182
2 ADAS_CogT0 TRAINING 14.0 0.613 12.6 0.570 1.40 0.00295
3 PVF_T0 CONTROL 32.1 1.22 31.3 1.45 0.498 0.636
4 PVF_T0 TRAINING 31.6 1.37 34.3 1.51 -2.48 0.0102
5 ROCF_CT0 CONTROL 29.6 0.893 30.3 0.821 -0.180 0.835
6 ROCF_CT0 TRAINING 30.1 0.906 29.5 0.929 0.489 0.615
7 ROCF_IT0 CONTROL 12.8 0.563 12.2 0.683 0.580 0.356
8 ROCF_IT0 TRAINING 10.9 0.735 12.3 0.768 -1.44 0.0238
9 ROCF_RT0 CONTROL 12.1 0.725 12.5 0.797 -0.370 0.598
10 ROCF_RT0 TRAINING 10.5 0.746 10.9 0.742 -0.534 0.370
11 SVF_T0 CONTROL 35.5 1.05 34 1.15 1.42 0.107
12 SVF_T0 TRAINING 34.1 1.04 32.9 1.16 0.962 0.231
如果我想通过 across function
做同样的事情,我应该怎么做才能达到上面代码中显示的相同结果?实际上我遇到了麻烦,因为我从这个问题
这里是数据集
> dput(head(out, 100))
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100), GROUP = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("CONTROL", "TRAINING"), class = "factor"),
Gender = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
2L), .Label = c("M", "F"), class = "factor"), Age = c(74,
76, 81, 74, 69, 72, 75, 83, 78, 72, 82, 68, 72, 72, 73, 80,
69, 72, 70, 80, 75, 80, 78, 74, 82, 74, 80, 82, 78, 81, 66,
71, 70, 79, 78, 73, 72, 77, 77, 71, 83, 74, 70, 71, 77, 69,
67, 64, 79, 71, 77, 77, 73, 67, 68, 79, 81, 67, 84, 75, 80,
73, 68, 74, 77, 79, 79, 72, 73, 78, 76, 78, 77, 74, 78, 77,
77, 82, 77, 70, 77, 81, 79, 75, 74, 78, 69, 77, 73, 77, 70,
79, 70, 72, 77, 72, 71, 71, 73, 81), Education = c(18, 4,
8, 5, 8, 11, 5, 5, 4, 8, 8, 12, 5, 18, 13, 5, 13, 13, 5,
5, 13, 5, 3, 8, 17, 5, 8, 5, 5, 8, 17, 8, 18, 18, 13, 13,
13, 13, 15, 17, 8, 5, 5, 13, 8, 5, 11, 13, 8, 8, 8, 5, 13,
8, 5, 17, 8, 12, 13, 5, 8, 8, 8, 5, 3, 8, 18, 5, 8, 13, 8,
5, 17, 8, 5, 17, 5, 8, 11, 8, 8, 5, 12, 3, 8, 8, 8, 13, 5,
5, 8, 8, 13, 5, 5, 8, 13, 5, 8, 12), tests0 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ADAS_CogT0",
"PVF_T0", "ROCF_CT0", "ROCF_IT0", "ROCF_RT0", "SVF_T0"), class = "factor"),
score0 = c(14.66, 15.33, 17.33, 19, 7.66, 12.6, 18.67, 14.99,
17.99, 17.33, 13.66, 16.99, 10.66, 9.66, 14.99, 15.66, 13.33,
4.33, 14.33, 15.99, 16.33, 10.66, 14.66, 10.66, 19.33, 17.66,
15.99, 20.66, 20.6, 17, 10.33, 6.33, 6.66, 19.99, 13.33,
24.33, 12.33, 10.33, 12.33, 9.66, 10.99, 13.99, 23, 6.32,
11.32, 13.99, 14.66, 8.99, 14.33, 9.99, 7.33, 15.66, 14,
7.99, 23.32, 14.66, 9.99, 5.66, 6.99, 11.66, 10.33, 6.99,
19.32, NA, 10, 17.66, 13.66, 10.32, NA, NA, 8.66, 9, 6.99,
14.99, 9.66, 13.66, 15.32, 12, 14, 13.66, 11.99, 15.66, 16,
15, 16.99, 20, 11, 7.99, 8.33, 8.32, 14.99, 18.66, 10.33,
11.99, 9.32, 17, 14.33, 14.66, 16.6, 9.99), tests7 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ADAS_CogT7",
"PVF_T7", "ROCF_CT7", "ROCF_IT7", "ROCF_RT7", "SVF_T7"), class = "factor"),
score7 = c(16, 9.32, 21.33, 17, 8.32, 11, 14.99, 10.99, 17,
18.33, 13.32, 14.34, 8.99, 7, 11.99, 15.33, 6.99, 5.33, 12.32,
13, 21.32, 7.99, 13.33, 11.99, 17.32, 16.32, 16.33, 14.66,
18.99, 17.33, 7.99, 9.33, 10.99, NA, 12.99, 16.33, 21.66,
9, 9.34, 8.66, 8.33, 13.66, 15.66, 6.66, 10.99, 13.33, 13.33,
7.99, 11.99, 11.32, 7.33, 9.66, 6.99, NA, 15.99, 15.66, 14.66,
6.32, 7, 11, 14, 10.33, 24.66, NA, 14.99, NA, 15.99, 9.32,
NA, NA, 9.99, 9.33, 7.66, 17.33, 10.32, 16, 17, 12.99, 15,
14.33, 10, 14.99, 19, 13.99, 19.33, NA, 10, 6.99, 11.66,
6.66, 14.33, 16, 8.66, 10, NA, 20, 14.99, 19.66, 26.66, 8.99
)), row.names = c(NA, -100L), class = c("tbl_df", "tbl",
"data.frame"))
>
下面你可以找到我想获得的方法。它是一种需要.x 操作的方法。
out %>%
group_by(across(all_of(tests0, GROUP))) %>% summarise(across(starts_with('score'),
list(mean = ~ mean(.x,na.rm = T),
stderr = ~ std.error(.x, na.rm = TRUE),
diff.std.mean = ~ t.test(.x, na.rm = T)))$estimate,
p.value = ~ t.test(.x, na.rm = T)))$p.value)),.groups = "drop")
您可以在 across()
:
.names
library(dplyr)
out %>%
group_by(tests0, GROUP) %>%
summarize(across(c(score0, score7), sd, na.rm = TRUE, .names = "sd_{.col}"),
across(c(score0, score7), mean, na.rm = TRUE, .names = "mean_{.col}"),
diff.std.mean = t.test(score0, score7, paired = T)$estimate,
p.value = t.test(score0, score7, paired = T)$p.value) %>%
ungroup()
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
#> # A tibble: 2 x 8
#> tests0 GROUP sd_score0 sd_score7 mean_score0 mean_score7 diff.std.mean p.value
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ADAS_~ CONT~ 3.72 4.81 12.5 13.5 -1.24 0.00471
#> 2 ADAS_~ TRAI~ 4.55 4.15 14.0 12.6 1.40 0.00295
由 reprex package (v2.0.1)
于 2021-11-26 创建编辑
如果您更喜欢列表,那么确定单独的部分然后将它们绑定在一起会更容易:
library(data.table)
by <- c("tests0", "GROUP")
out_dt <- data.table::data.table(out)
means <- out_dt[, sapply(.SD, function(x) list(mean = mean(x, na.rm = TRUE))),
by = by, .SDcols = patterns("^score")]
sds <- out_dt[, sapply(.SD, function(x) list(sd = sd(x, na.rm = TRUE))),
by = by, .SDcols = patterns("^score")]
t_est <- out_dt[, .(diff.std.mean = t.test(score0, score7, paired = T)$estimate), by = by]
tpvalue <- out_dt[, .(p.value = t.test(score0, score7, paired = T)$p.value), by = by]
list(means = means, sds = sds, diff.std.mean = t_est, p.value = tpvalue)
这是您可能要考虑的另一种方法。首先,我把你的代码剪切并粘贴到一个函数中。提取列名称并删除对用于计算标准误差的 plotrix 包的依赖是唯一的更改。
g <- function (df)
{
nms <- c(names(df)[1:2],
paste0('mean', sub(".*[a-z]","",names(df)[3])),
paste0('stderr', sub(".*[a-z]","",names(df)[3])),
paste0('mean', sub(".*[a-z]","",names(df)[4])),
paste0('stderr', sub(".*[a-z]","",names(df)[4])),
'diff.std.mean', 'p.value')
z <- df %>% group_by(df[,1:2]) %>%
summarize(
x1 = mean(pull(df[,3]), na.rm = T),
x2 = sd(pull(df[,3]), na.rm=T) / sqrt(sum(!is.na(pull(df[,3])))),
x3 = mean(pull(df[,4]), na.rm = T),
x4 = sd(pull(df[,4]), na.rm=T) / sqrt(sum(!is.na(pull(df[,4])))),
x5 = t.test(pull(df[,3]), pull(df[,4]), paired = T)$estimate,
x6 = t.test(pull(df[,3]), pull(df[,4]), paired = T)$p.value)
colnames(z) <- nms
return(z)
}
然后,因为测试数据只有一个水平的因子,而且你使用的 plotrix::std.error 函数的样本量不足,所以我在 'test0' 因子中引入了变化,将样本大小,并删除未使用的级别,因为它们会导致空帧上的迭代。此外,我添加了一个 score8 来展示您如何 运行 其他变量。
s <- t %>% mutate(tests0 = case_when(Education <= 8 ~ 'ADAS_CogTO', T ~ 'PVF_T0'),
score8 = score0 + score7)
q <- rbind(s, s)
fct_drop(q$tests0)
然后我按因子级别拆分框架,将函数应用于每个拆分,然后将数据重新合并到一个允许您操作分数和组变量的函数中。我假设每个 2 个,这对分数变量是安全的,因为你正在进行配对 t 检验,并且它很容易用组变量扩展(如果你只是将分数变量移动到位置 1 和 2,并使用所有剩余的变量作为组变量传递给函数)。
h <- function(df, group_vars, score_vars)
{
z <- df %>% select(group_vars, score_vars)
z <- z %>% group_by(z[,1:2]) %>%
group_map( ~ g(.x), .keep = T) %>%
bind_rows()
}
请注意,如果您希望将此应用到其他数据,您只需更改传递给组和评分变量的列。如果你也想改变它应该相当容易,只是认为这是一个很好的框架,可以用来做你想做的事情。考虑一下您如何处理 test0 为空且 test7 为非空(或反之亦然)的情况,因为这些观察结果包含在您的汇总统计数据中,但必然被排除在 t 检验之外。祝你好运。
x <- h(q, c("tests0", "GROUP"), c("score0", "score7")) %>%
group_by(tests0) %>%
pivot_wider(id_cols = tests0,
names_from = GROUP,
values_from = c("mean0","stderr0","mean7","stderr7",
'diff.std.mean', 'p.value'))
我没有名为 std.error
的函数,所以我使用了 sd
,当然你可以更改它。
library(dplyr)
library(readr)
out %>%
group_by(tests0, GROUP) %>%
summarise(
across(c(score0, score7), list(mean = mean, stderr = sd), na.rm = TRUE,
.names = '{.fn}{parse_number(.col)}'),
with(t.test(score0, score7, paired = T),
tibble(diff.std.mean = estimate,
p.value)))
# # A tibble: 2 × 8
# tests0 GROUP mean0 stderr0 mean7 stderr7 diff.std.mean p.value
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ADAS_CogT0 CONTROL 12.5 3.72 13.5 4.81 -1.24 0.00471
# 2 ADAS_CogT0 TRAINING 14.0 4.55 12.6 4.15 1.40 0.00295
实际上我只是将上面的代码放在一个函数中,该函数接受一个 x
和 y
参数,然后是 运行 fun(df, x = score0, y = score7)
。但是,只是为了好玩,如果你必须使用 .x
和 .y
,这是一种方法(尽管我觉得这样做有点傻)
df %>%
group_by(tests0, GROUP) %>%
select(starts_with('score')) %>%
summarise(
across(everything(), list(mean = mean, stderr = sd), na.rm = TRUE,
.names = '{.fn}{parse_number(.col)}'),
across(everything(), list(list)) %>%
pmap_dfr(~ t.test(.x, .y, paired = TRUE)[c('estimate', 'p.value')]) %>%
transmute(diff.std.mean = estimate, p.value))
# # A tibble: 2 × 8
# # Groups: tests0 [1]
# tests0 GROUP mean0 stderr0 mean7 stderr7 diff.std.mean p.value
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ADAS_CogT0 CONTROL 12.5 3.72 13.5 4.81 -1.24 0.00471
# 2 ADAS_CogT0 TRAINING 14.0 4.55 12.6 4.15 1.40 0.00295
我想到了一种可能的解决方法(可能有帮助也可能没有帮助),方法是“手动”使用 across()
,而不是一次一列地应用函数。生成的输出是一个 data.frame
,其中的列表列深度嵌套,因此 unnest()
会派上用场。我还使用 possibly()
来解决不存在两列的情况,请记住 across()
可以匹配任意数量的列,而 t.test()
需要 x
和 y
参数。
代码:
library(tidyverse)
data <-
df %>%
group_by(tests0, GROUP) %>%
summarize(
all = list(across(starts_with("score")) %>%
{
tibble(
ttest = data.frame(possibly(~ reduce(., ~ t.test(.x, .y, paired = TRUE))[c("estimate", 'p.value')], NA)(.)),
means = data.frame(map(., ~ mean(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\D+", "mean"))),
stderrs = data.frame(map(., ~ sd(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\D+", "stederr")))
)
})
)
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
data %>%
unnest(all) %>%
unnest(-c("tests0", "GROUP"))
#> # A tibble: 2 × 8
#> # Groups: tests0 [1]
#> tests0 GROUP estimate p.value mean0 mean7 stederr0 stederr7
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ADAS_CogT0 CONTROL -1.24 0.00471 12.5 13.5 3.72 4.81
#> 2 ADAS_CogT0 TRAINING 1.40 0.00295 14.0 12.6 4.55 4.15
由 reprex package (v2.0.1)
于 2021-11-29 创建