计算每组的 p 值

Question

我有一个像这样的长数据框：

# Example long-format data: 10 measurements for each of two tumor types,
# cycling through genes A, B, C. The factor deliberately keeps two unused
# levels ("healthy", "sarcoma"). The tibble classes are set by hand so the
# object matches the original dput() output.
df <- structure(
  list(
    Tumor_type = factor(
      rep(c("epithelial", "lymphoma"), each = 10L),
      levels = c("epithelial", "lymphoma", "healthy", "sarcoma")
    ),
    Gene = rep(rep_len(c("A", "B", "C"), 10L), times = 2L),
    value = c(
      # epithelial
      55.9228661170814, 63.4145784380524, 207.26299945198,
      14.3567918830159, 55.3521592504006, 185.331104118272,
      27.6007163612577, 239.913726358976, 153.875266787649,
      13.5284557778013,
      # lymphoma
      22.5884252969717, 81.9416366341296, 197.881154317385,
      13.3558001634159, 167.58802932121, 79.1822202852964,
      37.9496510246124, 85.245769235316, 116.807115796469,
      21.8110255970795
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)

>df
Tumor_type  Gene    value
<fct>   <chr>   <dbl>
epithelial  A   55.92287
epithelial  B   63.41458
epithelial  C   207.26300
epithelial  A   14.35679
epithelial  B   55.35216
epithelial  C   185.33110
epithelial  A   27.60072
epithelial  B   239.91373
epithelial  C   153.87527
epithelial  A   13.52846
lymphoma    A   22.58843
lymphoma    B   81.94164
lymphoma    C   197.88115
lymphoma    A   13.35580
lymphoma    B   167.58803
lymphoma    C   79.18222
lymphoma    A   37.94965
lymphoma    B   85.24577
lymphoma    C   116.80712
lymphoma    A   21.81103

我想用 Log2FC 和来自 t 检验的相应 p 值创建一个数据框。现在我只能使用 Log2FC 值创建数据框，如下所示：

# Per-gene log2 fold change of the lymphoma mean over the epithelial mean.
# The result keeps one row per gene, labelled with Tumor_type "epithelial".
LFC <- df %>%
  group_by(Tumor_type, Gene) %>%
  # .groups = "drop" returns an ungrouped tibble without the regrouping message
  summarise(mean_value = mean(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Look the two means up by tumor-type label rather than by row position, so
  # the result does not depend on factor level order or on other tumor types
  # being present. match() yields NA when a type is missing for a gene.
  mutate(
    lfc = log2(
      mean_value[match("lymphoma", Tumor_type)] /
        mean_value[match("epithelial", Tumor_type)]
    )
  ) %>%
  ungroup() %>%
  filter(Tumor_type == "epithelial") %>%
  select(-mean_value)

>LFC
Tumor_type  Gene    lfc
<fct>   <chr>   <dbl>
epithelial  A   -0.2191989
epithelial  B   -0.0995055
epithelial  C   -0.4724193

我想再添加一列相应的 p 值。单个 p 值我可以像下面这样手动计算：

# Welch two-sample t-test for gene A: epithelial vs. lymphoma values.
with(
  df,
  t.test(
    value[Tumor_type == "epithelial" & Gene == "A"],
    value[Tumor_type == "lymphoma" & Gene == "A"]
  )$p.value
)

0.74

所以问题是我怎样才能用这样的 p 值列创建最终的 table？

Tumor_type  Gene    lfc p-value
<fct>   <chr>   <dbl>   <dbl>
epithelial  A   -0.2191989  0.7404867
epithelial  B   -0.0995055  0.9125193
epithelial  C   -0.4724193  0.2834448

Answer 1

一个更简单的选择是先使用 pivot_wider 重塑为 'wide'（宽）格式，然后按 Gene 分组并用 summarise 进行汇总。

library(dplyr)
library(tidyr)
library(data.table)
df %>%
  # Number the replicates within each Tumor_type x Gene combination, so the
  # reshape does not rely on both tumor types listing genes in the same order.
  mutate(rn = rowid(Tumor_type, Gene)) %>%
  # Reshape to wide: one column of values per tumor type.
  pivot_wider(names_from = Tumor_type, values_from = value) %>%
  # One summary row per gene.
  group_by(Gene) %>%
  summarise(
    # log2 ratio of the lymphoma mean to the epithelial mean. na.rm = TRUE
    # skips the NA padding that appears when replicate counts differ.
    lfc = log2(
      mean(lymphoma, na.rm = TRUE) / mean(epithelial, na.rm = TRUE)
    ),
    # Welch t-test between the two columns; t.test() drops NAs itself.
    p_value = t.test(epithelial, lymphoma)$p.value
  ) %>%
  mutate(Tumor_type = "epithelial", .before = 1)
# A tibble: 3 × 4
  Tumor_type Gene      lfc p_value
  <chr>      <chr>   <dbl>   <dbl>
1 epithelial A     -0.219    0.740
2 epithelial B     -0.0995   0.913
3 epithelial C     -0.472    0.283

Answer 2

此外，另一种可能的解决方案是将所有值堆叠为一个列表，以便调用 t.test。这有点像使用 pivot_wider 但代码更少。

df %>%
  group_by(Tumor_type, Gene) %>%
  # Collect each group's values into a list-column; drop grouping explicitly
  # to avoid the summarise() regrouping message.
  summarise(values = list(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Select each vector by tumor-type label rather than by position, so the
  # result does not depend on factor level order. A missing tumor type still
  # raises a subscript error instead of silently producing a result.
  summarise(
    lfc = log2(
      mean(values[Tumor_type == "lymphoma"][[1]]) /
        mean(values[Tumor_type == "epithelial"][[1]])
    ),
    p_value = t.test(
      values[Tumor_type == "epithelial"][[1]],
      values[Tumor_type == "lymphoma"][[1]]
    )$p.value
  )

输出：

# A tibble: 3 x 3
  Gene      lfc  p_value
  <chr>   <dbl> <dbl>
1 A     -0.219  0.740
2 B     -0.0995 0.913
3 C     -0.472  0.283

计算每组的 p 值

Calculate p-values for each group

r

dplyr