如何用 across() 函数重写相同的代码

How to rewrite the same code with across function

我编写了以下代码

# For every tests0/GROUP combination: mean and standard error of score0 and
# score7, plus the estimate and p-value of the paired t-test between them.
# std.error() is not base R (plotrix provides one); it must be attached first.
out %>%
  group_by(tests0, GROUP) %>%
  summarise(
    mean0 = mean(score0, na.rm = TRUE),
    stderr0 = std.error(score0, na.rm = TRUE),
    mean7 = mean(score7, na.rm = TRUE),
    # Column name kept exactly as spelled so it matches the printed output.
    stederr7 = std.error(score7, na.rm = TRUE),
    diff.std.mean = t.test(score0, score7, paired = TRUE)$estimate,
    p.value = t.test(score0, score7, paired = TRUE)$p.value
  )

我得到了以下输出

 tests0     GROUP    mean0 stderr0 mean7 stederr7 diff.std.mean p.value
   <fct>      <fct>    <dbl>   <dbl> <dbl>    <dbl>         <dbl>   <dbl>
 1 ADAS_CogT0 CONTROL   12.6   0.525  13.6    0.662        -1.15  0.00182
 2 ADAS_CogT0 TRAINING  14.0   0.613  12.6    0.570         1.40  0.00295
 3 PVF_T0     CONTROL   32.1   1.22   31.3    1.45          0.498 0.636  
 4 PVF_T0     TRAINING  31.6   1.37   34.3    1.51         -2.48  0.0102 
 5 ROCF_CT0   CONTROL   29.6   0.893  30.3    0.821        -0.180 0.835  
 6 ROCF_CT0   TRAINING  30.1   0.906  29.5    0.929         0.489 0.615  
 7 ROCF_IT0   CONTROL   12.8   0.563  12.2    0.683         0.580 0.356  
 8 ROCF_IT0   TRAINING  10.9   0.735  12.3    0.768        -1.44  0.0238 
 9 ROCF_RT0   CONTROL   12.1   0.725  12.5    0.797        -0.370 0.598  
10 ROCF_RT0   TRAINING  10.5   0.746  10.9    0.742        -0.534 0.370  
11 SVF_T0     CONTROL   35.5   1.05   34      1.15          1.42  0.107  
12 SVF_T0     TRAINING  34.1   1.04   32.9    1.16          0.962 0.231

如果我想用 across() 函数做同样的事情,应该怎么写才能得到与上面代码相同的结果?我遇到了困难:我参考了这个问题下已发布答案中的一些例子,但无法把它们正确地套用到我的情况。

这里是数据集

> dput(head(out, 100))
structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 
93, 94, 95, 96, 97, 98, 99, 100), GROUP = structure(c(2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = c("CONTROL", "TRAINING"), class = "factor"), 
    Gender = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 
    2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
    2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 
    2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 
    1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
    2L), .Label = c("M", "F"), class = "factor"), Age = c(74, 
    76, 81, 74, 69, 72, 75, 83, 78, 72, 82, 68, 72, 72, 73, 80, 
    69, 72, 70, 80, 75, 80, 78, 74, 82, 74, 80, 82, 78, 81, 66, 
    71, 70, 79, 78, 73, 72, 77, 77, 71, 83, 74, 70, 71, 77, 69, 
    67, 64, 79, 71, 77, 77, 73, 67, 68, 79, 81, 67, 84, 75, 80, 
    73, 68, 74, 77, 79, 79, 72, 73, 78, 76, 78, 77, 74, 78, 77, 
    77, 82, 77, 70, 77, 81, 79, 75, 74, 78, 69, 77, 73, 77, 70, 
    79, 70, 72, 77, 72, 71, 71, 73, 81), Education = c(18, 4, 
    8, 5, 8, 11, 5, 5, 4, 8, 8, 12, 5, 18, 13, 5, 13, 13, 5, 
    5, 13, 5, 3, 8, 17, 5, 8, 5, 5, 8, 17, 8, 18, 18, 13, 13, 
    13, 13, 15, 17, 8, 5, 5, 13, 8, 5, 11, 13, 8, 8, 8, 5, 13, 
    8, 5, 17, 8, 12, 13, 5, 8, 8, 8, 5, 3, 8, 18, 5, 8, 13, 8, 
    5, 17, 8, 5, 17, 5, 8, 11, 8, 8, 5, 12, 3, 8, 8, 8, 13, 5, 
    5, 8, 8, 13, 5, 5, 8, 13, 5, 8, 12), tests0 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ADAS_CogT0", 
    "PVF_T0", "ROCF_CT0", "ROCF_IT0", "ROCF_RT0", "SVF_T0"), class = "factor"), 
    score0 = c(14.66, 15.33, 17.33, 19, 7.66, 12.6, 18.67, 14.99, 
    17.99, 17.33, 13.66, 16.99, 10.66, 9.66, 14.99, 15.66, 13.33, 
    4.33, 14.33, 15.99, 16.33, 10.66, 14.66, 10.66, 19.33, 17.66, 
    15.99, 20.66, 20.6, 17, 10.33, 6.33, 6.66, 19.99, 13.33, 
    24.33, 12.33, 10.33, 12.33, 9.66, 10.99, 13.99, 23, 6.32, 
    11.32, 13.99, 14.66, 8.99, 14.33, 9.99, 7.33, 15.66, 14, 
    7.99, 23.32, 14.66, 9.99, 5.66, 6.99, 11.66, 10.33, 6.99, 
    19.32, NA, 10, 17.66, 13.66, 10.32, NA, NA, 8.66, 9, 6.99, 
    14.99, 9.66, 13.66, 15.32, 12, 14, 13.66, 11.99, 15.66, 16, 
    15, 16.99, 20, 11, 7.99, 8.33, 8.32, 14.99, 18.66, 10.33, 
    11.99, 9.32, 17, 14.33, 14.66, 16.6, 9.99), tests7 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("ADAS_CogT7", 
    "PVF_T7", "ROCF_CT7", "ROCF_IT7", "ROCF_RT7", "SVF_T7"), class = "factor"), 
    score7 = c(16, 9.32, 21.33, 17, 8.32, 11, 14.99, 10.99, 17, 
    18.33, 13.32, 14.34, 8.99, 7, 11.99, 15.33, 6.99, 5.33, 12.32, 
    13, 21.32, 7.99, 13.33, 11.99, 17.32, 16.32, 16.33, 14.66, 
    18.99, 17.33, 7.99, 9.33, 10.99, NA, 12.99, 16.33, 21.66, 
    9, 9.34, 8.66, 8.33, 13.66, 15.66, 6.66, 10.99, 13.33, 13.33, 
    7.99, 11.99, 11.32, 7.33, 9.66, 6.99, NA, 15.99, 15.66, 14.66, 
    6.32, 7, 11, 14, 10.33, 24.66, NA, 14.99, NA, 15.99, 9.32, 
    NA, NA, 9.99, 9.33, 7.66, 17.33, 10.32, 16, 17, 12.99, 15, 
    14.33, 10, 14.99, 19, 13.99, 19.33, NA, 10, 6.99, 11.66, 
    6.66, 14.33, 16, 8.66, 10, NA, 20, 14.99, 19.66, 26.66, 8.99
    )), row.names = c(NA, -100L), class = c("tbl_df", "tbl", 
"data.frame"))
> 

下面你可以找到我想获得的方法。它是一种需要.x 操作的方法。

# Asker's attempted across() rewrite, kept verbatim. It does not parse as
# written: the parentheses are unbalanced (the `)))` after each t.test() call
# closes more calls than were opened at that point).
# NOTE(review): across() hands each lambda one column at a time as .x, so a
# paired t.test() between score0 and score7 cannot be expressed this way.
# NOTE(review): all_of() presumably expects a character vector of names,
# e.g. all_of(c("tests0", "GROUP")) - confirm against the tidyselect docs.
out %>%    
group_by(across(all_of(tests0, GROUP))) %>%    summarise(across(starts_with('score'),                         
list(mean = ~ mean(.x,na.rm = T),            
stderr = ~ std.error(.x, na.rm = TRUE),            
diff.std.mean = ~ t.test(.x, na.rm = T)))$estimate,              
p.value = ~ t.test(.x, na.rm = T)))$p.value)),.groups = "drop")

您可以在 across() 中使用参数 .names:

library(dplyr)
# Summarise every tests0/GROUP combination. `.names` controls how across()
# labels its output columns ("{.col}" expands to the input column name).
out %>%
  group_by(tests0, GROUP) %>%
  summarize(
    # Lambdas instead of `sd, na.rm = TRUE`: forwarding extra arguments
    # through across()'s `...` is deprecated since dplyr 1.1.0.
    across(c(score0, score7), ~ sd(.x, na.rm = TRUE), .names = "sd_{.col}"),
    across(c(score0, score7), ~ mean(.x, na.rm = TRUE), .names = "mean_{.col}"),
    # The paired test needs both columns at once, so it stays outside across().
    diff.std.mean = t.test(score0, score7, paired = TRUE)$estimate,
    p.value = t.test(score0, score7, paired = TRUE)$p.value
  ) %>%
  ungroup()
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.
#> # A tibble: 2 x 8
#>   tests0 GROUP sd_score0 sd_score7 mean_score0 mean_score7 diff.std.mean p.value
#>   <fct>  <fct>     <dbl>     <dbl>       <dbl>       <dbl>         <dbl>   <dbl>
#> 1 ADAS_~ CONT~      3.72      4.81        12.5        13.5         -1.24 0.00471
#> 2 ADAS_~ TRAI~      4.55      4.15        14.0        12.6          1.40 0.00295

reprex package (v2.0.1)

于 2021-11-26 创建

编辑

如果您更喜欢列表,那么确定单独的部分然后将它们绑定在一起会更容易:


library(data.table)
# Compute each summary separately with data.table, then collect the pieces
# in a named list.
by <- c("tests0", "GROUP")
out_dt <- data.table::data.table(out)

# NOTE(review): sapply() is kept on purpose - it appears to flatten the
# one-element lists so the result columns are named like "score0.mean";
# switching to lapply() would presumably change those names. Confirm.
means <- out_dt[, sapply(.SD, function(x) list(mean = mean(x, na.rm = TRUE))),
                by = by, .SDcols = patterns("^score")]
sds <- out_dt[, sapply(.SD, function(x) list(sd = sd(x, na.rm = TRUE))),
              by = by, .SDcols = patterns("^score")]

# Paired t-test per group; estimate and p-value are kept as separate tables.
t_est <- out_dt[
  , .(diff.std.mean = t.test(score0, score7, paired = TRUE)$estimate),
  by = by
]
tpvalue <- out_dt[
  , .(p.value = t.test(score0, score7, paired = TRUE)$p.value),
  by = by
]
list(means = means, sds = sds, diff.std.mean = t_est, p.value = tpvalue)

这是您可能要考虑的另一种方法。首先,我把你的代码剪切并粘贴到一个函数中。提取列名称并删除对用于计算标准误差的 plotrix 包的依赖是唯一的更改。

# Summarise a data frame laid out as two grouping columns followed by two
# score columns (columns are addressed by position).
#
# Returns one row per group with the mean and standard error of each score
# column, the estimate of the paired t-test between them and its p-value.
# Output names are built from whatever follows the last lower-case letter of
# the score column names, e.g. "score0" -> "mean0", "stderr0".
g <- function(df) {
  cols <- names(df)
  group_cols <- cols[1:2]
  first <- cols[3]
  second <- cols[4]

  # Strip everything up to the last lower-case letter: "score0" -> "0".
  suffix <- sub(".*[a-z]", "", c(first, second))
  nms <- c(
    group_cols,
    paste0("mean", suffix[1]), paste0("stderr", suffix[1]),
    paste0("mean", suffix[2]), paste0("stderr", suffix[2]),
    "diff.std.mean", "p.value"
  )

  # Standard error of the mean over the non-missing values.
  std_err <- function(v) sd(v, na.rm = TRUE) / sqrt(sum(!is.na(v)))

  z <- df %>%
    group_by(across(all_of(group_cols))) %>%
    summarize(
      # .data[[...]] reads the current group's values. Indexing `df` itself
      # here (e.g. pull(df[, 3])) would read the whole input regardless of
      # the group, giving every group the same statistics.
      x1 = mean(.data[[first]], na.rm = TRUE),
      x2 = std_err(.data[[first]]),
      x3 = mean(.data[[second]], na.rm = TRUE),
      x4 = std_err(.data[[second]]),
      x5 = t.test(.data[[first]], .data[[second]], paired = TRUE)$estimate,
      x6 = t.test(.data[[first]], .data[[second]], paired = TRUE)$p.value
    )
  colnames(z) <- nms
  z
}

然后,因为测试数据中的因子只有一个水平,而且对于你使用的 plotrix::std.error 函数来说样本量不足,所以我在 'tests0' 因子中引入了变化,将样本量加倍,并删除未使用的因子水平,因为它们会导致对空数据框进行迭代。此外,我添加了一个 score8,以展示如何对其他变量运行同样的分析。

# NOTE(review): `t` is assumed to hold the question's data (called `out`
# there); it must be assigned beforehand, otherwise `t` is base::t().
s <- t %>%
  mutate(
    # Relabel tests0 from Education so the factor has two levels to group on.
    tests0 = case_when(Education <= 8 ~ "ADAS_CogTO", TRUE ~ "PVF_T0"),
    # Extra score column to demonstrate running other variable pairs.
    score8 = score0 + score7
  )
# Stack the data on itself to double the sample size.
q <- rbind(s, s)
# Store the result: fct_drop() returns a new factor and does not modify q,
# so calling it without assignment has no effect on the data.
q$tests0 <- fct_drop(q$tests0)

然后我按因子水平拆分数据框,将函数应用于每个拆分,再把结果重新合并;这些步骤封装在一个允许您指定分数变量和分组变量的函数中。我假设两类变量各有 2 个:对分数变量来说这是安全的,因为您做的是配对 t 检验;而分组变量也很容易扩展(只需把分数变量移到第 1、2 列,并把其余所有变量作为分组变量传给函数)。

# Apply g() to every combination of the grouping variables and stack the
# per-group results into one data frame.
#
# df:         data frame holding all the columns named below.
# group_vars: character vector naming the two grouping columns.
# score_vars: character vector naming the two score columns.
#
# g() addresses columns by position, so the selected frame is laid out as
# (group, group, score, score) before it is split.
h <- function(df, group_vars, score_vars) {
  # all_of() states explicitly that these are external character vectors;
  # passing them bare to select() is deprecated in tidyselect.
  z <- df %>% select(all_of(c(group_vars, score_vars)))
  # The pipeline is the last expression, so the result is returned visibly.
  z %>%
    group_by(across(1:2)) %>%
    # .keep = TRUE leaves the grouping columns in each piece; g() expects
    # them in positions 1 and 2.
    group_map(~ g(.x), .keep = TRUE) %>%
    bind_rows()
}

请注意,如果您希望将此应用到其他数据,您只需更改传递给组和评分变量的列。如果你也想改变它应该相当容易,只是认为这是一个很好的框架,可以用来做你想做的事情。考虑一下您如何处理 test0 为空且 test7 为非空(或反之亦然)的情况,因为这些观察结果包含在您的汇总统计数据中,但必然被排除在 t 检验之外。祝你好运。

# Summarise q per tests0/GROUP with h(), then spread the GROUP levels into
# columns so that each test occupies a single row.
x <- h(
  q,
  group_vars = c("tests0", "GROUP"),
  score_vars = c("score0", "score7")
) %>%
  group_by(tests0) %>%
  pivot_wider(
    id_cols = tests0,
    names_from = GROUP,
    values_from = c(
      "mean0", "stderr0", "mean7", "stderr7",
      "diff.std.mean", "p.value"
    )
  )

我没有名为 std.error 的函数,所以我使用了 sd,当然你可以更改它。

library(dplyr)
library(readr)

out %>%
  group_by(tests0, GROUP) %>%
  summarise(
    across(
      c(score0, score7),
      # Lambdas instead of a trailing `na.rm = TRUE`: forwarding arguments
      # through across()'s `...` is deprecated since dplyr 1.1.0.
      list(mean = ~ mean(.x, na.rm = TRUE), stderr = ~ sd(.x, na.rm = TRUE)),
      # parse_number() (readr) extracts the number from the column name, so
      # "score0" with function "mean" becomes "mean0".
      .names = "{.fn}{parse_number(.col)}"
    ),
    # An unnamed tibble inside summarise() contributes its columns directly;
    # with() exposes the t-test's `estimate` and `p.value` by name.
    with(
      t.test(score0, score7, paired = TRUE),
      tibble(diff.std.mean = estimate, p.value)
    )
  )
# # A tibble: 2 × 8
#   tests0     GROUP    mean0 stderr0 mean7 stderr7 diff.std.mean p.value
#   <fct>      <fct>    <dbl>   <dbl> <dbl>   <dbl>         <dbl>   <dbl>
# 1 ADAS_CogT0 CONTROL   12.5    3.72  13.5    4.81         -1.24 0.00471
# 2 ADAS_CogT0 TRAINING  14.0    4.55  12.6    4.15          1.40 0.00295

实际上我会直接把上面的代码放进一个接受 x 和 y 参数的函数里,然后运行 fun(df, x = score0, y = score7)。不过,只是为了好玩,如果你一定要使用 .x 和 .y,下面是一种做法(尽管我觉得这样做有点傻):

df %>%
  group_by(tests0, GROUP) %>%
  # Keep only the score columns (select() retains the grouping columns).
  select(starts_with("score")) %>%
  summarise(
    across(
      everything(),
      # Lambdas instead of a trailing `na.rm = TRUE`: forwarding arguments
      # through across()'s `...` is deprecated since dplyr 1.1.0.
      list(mean = ~ mean(.x, na.rm = TRUE), stderr = ~ sd(.x, na.rm = TRUE)),
      .names = "{.fn}{parse_number(.col)}"
    ),
    # Wrap each score column in a list so that pmap_dfr() receives the two
    # whole columns as .x and .y for the paired test.
    across(everything(), list(list)) %>%
      pmap_dfr(~ t.test(.x, .y, paired = TRUE)[c("estimate", "p.value")]) %>%
      transmute(diff.std.mean = estimate, p.value)
  )

# # A tibble: 2 × 8
# # Groups:   tests0 [1]
#   tests0     GROUP    mean0 stderr0 mean7 stderr7 diff.std.mean p.value
#   <fct>      <fct>    <dbl>   <dbl> <dbl>   <dbl>         <dbl>   <dbl>
# 1 ADAS_CogT0 CONTROL   12.5    3.72  13.5    4.81         -1.24 0.00471
# 2 ADAS_CogT0 TRAINING  14.0    4.55  12.6    4.15          1.40 0.00295

我想到了一种可能的变通方法(可能有帮助,也可能没有):“手动”使用 across(),而不是让它一次对一列应用函数。生成的输出是一个 data.frame,其中的列表列嵌套很深,因此 unnest() 会派上用场。我还使用了 possibly() 来处理不是恰好两列的情况;请记住,across() 可以匹配任意数量的列,而 t.test() 需要 x 和 y 两个参数。

代码:

library(tidyverse)

# Per tests0/GROUP, build a one-row nested tibble holding the paired t-test
# result, the means and the standard deviations of all score columns.
data <-
  df %>%
  group_by(tests0, GROUP) %>%
  summarize(
    # across() without a function returns the selected columns of the current
    # group as a tibble, which is then processed "by hand" inside the braces.
    all = list(across(starts_with("score")) %>%
      {
        tibble(
          # possibly() yields NA instead of an error when the t-test fails.
          ttest   = data.frame(possibly(~ reduce(., ~ t.test(.x, .y, paired = TRUE))[c("estimate", 'p.value')], NA)(.)),
          # "\\D+" (backslash escaped) matches the non-digit prefix of the
          # column name; a single-backslash "\D" is an invalid escape in an
          # R string literal and fails to parse.
          means   = data.frame(map(., ~ mean(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\\D+", "mean"))),
          stderrs = data.frame(map(., ~ sd(.x, na.rm = TRUE)) %>% set_names(., str_replace(names(.), "\\D+", "stederr")))
        )
      })
  )
#> `summarise()` has grouped output by 'tests0'. You can override using the `.groups` argument.


# Flatten in two steps: first the `all` list column, then every remaining
# nested data-frame column except the grouping keys.
data %>%
  unnest(cols = all) %>%
  unnest(cols = -c("tests0", "GROUP"))
#> # A tibble: 2 × 8
#> # Groups:   tests0 [1]
#>   tests0     GROUP    estimate p.value mean0 mean7 stederr0 stederr7
#>   <fct>      <fct>       <dbl>   <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 ADAS_CogT0 CONTROL     -1.24 0.00471  12.5  13.5     3.72     4.81
#> 2 ADAS_CogT0 TRAINING     1.40 0.00295  14.0  12.6     4.55     4.15

reprex package (v2.0.1)

于 2021-11-29 创建