通过排列进行多组测试

Multiple groups tests via permutation

我有一个 df,其中包含与两个实验相关的两组值(value_1 和 value_2)。

一个实验包含两组(0 和 1),另一个包含三个组(0,1,2)。

test    group   Value_1    Value_2
AA      0           15.1    11.2
AA      0           12.4    8.6
AA      1           9.6     22.5
AA      1           10.2    22
BB      0           12.11   11
BB      0           14      1.2
BB      1           11      13.2
BB      1           12.3    9
BB      2           14.2    12
BB      2           15      13

df <- structure(list(test = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("AA", "BB"), class = "factor"), group = c(0L, 
0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L), Value_1 = c(15.1, 12.4, 
9.6, 10.2, 12.11, 14, 11, 12.3, 14.2, 15), Value_2 = c(11.2, 
8.6, 22.5, 22, 11, 1.2, 13.2, 9, 12, 13)), .Names = c("test", 
"group", "Value_1", "Value_2"), class = "data.frame", row.names = c(NA, 
-10L))

我想对 value_1 和 value_2 应用置换测试 - 按测试,按组 - 覆盖:

到目前为止我所做的 - 分成几块是:

  1. 当组只有两个时,我简单地应用oneway.test():

    df %>%
      filter(test %in% 'AA') -> df_test_aa
    
    df_test_aa_value_1 <- oneway.test(df_test_aa$Value_1~df_test_aa$group)
    df_test_aa_value_1$p.value
    [1] 0.2011234
    
    
    df_test_aa_value_2 <- oneway.test(df_test_aa$Value_2~df_test_aa$group)
    df_test_aa_value_2$p.value
    [1] 0.05854026
    
  2. 每当组数超过 2 个时,我会测试所有可能的排列:

    • 前 0 对 1:

      df %>% filter(test %in% 'BB' & group %in% c(0,1)) -> df_test_bb_01
      
      df_test_bb_01_value_1 <- oneway.test(df_test_bb_01$Value_1~df_test_bb_01$group)
      df_test_bb_01_value_1$p.value
      [1] 0.3585415
      
      df_test_bb_01_value_2 <-    oneway.test(df_test_bb_01$Value_2~df_test_bb_01$group)
      df_test_bb_01_value_2$p.value
      [1] 0.4848446
      
    • 然后 0 对 2:

       df %>%
       filter(test %in% 'BB' & group %in% c(0,2)) -> df_test_bb_02
      
       df_test_bb_02_value_1 <-       oneway.test(df_test_bb_02$Value_1~df_test_bb_02$group)
       df_test_bb_02_value_1$p.value
       [1] 0.3246012
      
       df_test_bb_02_value_2 <- oneway.test(df_test_bb_02$Value_2~df_test_bb_02$group)
       df_test_bb_02_value_2$p.value
       [1] 0.4142838
      
    • 然后 1 对 2:

       df %>%
        filter(test %in% 'BB' & group %in% c(1,2)) -> df_test_bb_12
      
        df_test_bb_12_value_1 <- oneway.test(df_test_bb_12$Value_1~df_test_bb_12$group)
        df_test_bb_12_value_1$p.value
        [1] 0.08105404
      
      
        df_test_bb_12_value_2 <- oneway.test(df_test_bb_12$Value_2~df_test_bb_12$group)
        df_test_bb_12_value_2$p.value
        [1] 0.6245713
      

因此,我希望获得如下所示的 df:

test value  p_value_2sided  hypothesis
AA  Value_1   0.201         0,1
AA  Value_2   0.059         0,1
BB  Value_1   0.359         0,1
BB  Value_1   0.325         0,2
BB  Value_1   0.081         1,2
BB  Value_2   0.485         0,1
BB  Value_2   0.414         0,2
BB  Value_2   0.625         1,2

感谢您的提示!

嗯,这不是很漂亮,但是......

df <- structure(list(test = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("AA", "BB"), class = "factor"), group = c(0L, 
0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L), Value_1 = c(15.1, 12.4, 
9.6, 10.2, 12.11, 14, 11, 12.3, 14.2, 15), Value_2 = c(11.2, 
8.6, 22.5, 22, 11, 1.2, 13.2, 9, 12, 13)), .Names = c("test", 
"group", "Value_1", "Value_2"), class = "data.frame", row.names = c(NA, 
-10L))

as.data.frame(
  do.call(rbind, by(df, factor(df$test), function(x) {
  h <- t(combn(unique(x$group), 2))
  p <- apply(h, 1, function(y) {
    with(x[x$group %in% y, ], {
      c(oneway.test(Value_1 ~ group)$p.value,
        oneway.test(Value_2 ~ group)$p.value)
      })
    })

  h <- rep(paste(h[, 1], h[, 2], sep = ","), each = 2)

  cbind(test = as.character(x$test[1]), value = c("Value_1", "Value_2"), p_value = as.vector(p), hypothesis = h)
  }))
)


  test   value            p_value hypothesis
1   AA Value_1  0.201123366107666        0,1
2   AA Value_2 0.0585402590546805        0,1
3   BB Value_1  0.358541470571387        0,1
4   BB Value_2  0.484844587956832        0,1
5   BB Value_1  0.324601180998953        0,2
6   BB Value_2  0.414283756097153        0,2
7   BB Value_1 0.0810540380817137        1,2
8   BB Value_2  0.624571310834221        1,2
df <- structure(list(test = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), .Label = c("AA", "BB"), class = "factor"), group = c(0L, 
0L, 1L, 1L, 0L, 0L, 1L, 1L, 2L, 2L), Value_1 = c(15.1, 12.4, 
9.6, 10.2, 12.11, 14, 11, 12.3, 14.2, 15), Value_2 = c(11.2, 
8.6, 22.5, 22, 11, 1.2, 13.2, 9, 12, 13)), .Names = c("test", 
"group", "Value_1", "Value_2"), class = "data.frame", row.names = c(NA, -10L))

library(tidyverse)

# reshape dataset
df2 = df %>% gather(value, v, -test, -group)

# function to compute p value
# vectorized version
f = function(t,val,x1,x2) {
  (df2 %>% 
     filter(test == t & value == val & group %in% c(x1,x2)) %>% 
     oneway.test(v~group, data = .))$p.value }
f = Vectorize(f)

df2 %>% 
  distinct(test, group, value) %>%       # get unique combinations
  group_by(test, value) %>%              # for each test and value
  nest() %>%                             # nest rest of columns
  mutate(d = map(data, ~data.frame(t(combn(.$group, 2)))),
         hypothesis = map(d, ~paste0(.$X1, ",", .$X2))) %>%  # get pairs/combinations of values
  unnest(d, hypothesis) %>%              # unnest data
  mutate(pval = f(test, value, X1, X2))  # apply vectorised function to get p value

# # A tibble: 8 x 6
#   test   value   hypothesis    X1    X2   pval
#   <fctr> <chr>   <chr>      <int> <int>  <dbl>
# 1 AA     Value_1 0,1            0     1 0.201 
# 2 BB     Value_1 0,1            0     1 0.359 
# 3 BB     Value_1 0,2            0     2 0.325 
# 4 BB     Value_1 1,2            1     2 0.0811
# 5 AA     Value_2 0,1            0     1 0.0585
# 6 BB     Value_2 0,1            0     1 0.485 
# 7 BB     Value_2 0,2            0     2 0.414 
# 8 BB     Value_2 1,2            1     2 0.625

如果确实不需要X1X2,可以删除它们。 但是,通过这种方式,您(也)可以将它们作为单独的数字变量,以防您想在分析的后期阶段在另一个过程中使用它们(例如,对特定组进行过滤)。