从 R 中的数据框列值均值和标准差创建 table

Create a table from dataframe column values mean and standard deviation in R

我是 R 的新手,所以对代码一无所知。我有两个数据框。一个数据框看起来像这样。

df
ID Disease
GSM239170 Control
GSM239323 Control
GSM239324 Control
GSM239326 Control
GSM239328 AML
GSM239329 AML
GSM239331 AML
GSM239332 Control
GSM239333 Control

另一个数据框如下所示:

df1
GSM239170 GSM239323 GSM239324 GSM239326 GSM239328 GSM239329 GSM239331 GSM239332 GSM239333
3.016704177 3.285669072 2.929482692 2.922820483 3.15950317 3.163327169 2.985901308 3.122708843 3.070948463
7.977735461 6.532514237 6.388007183 6.466679556 6.432795021 6.407321524 6.426470803 6.376394357 6.469070308
4.207280707 4.994965767 4.40159671 4.747114589 4.830045513 4.213762092 4.884418365 4.4318876 4.849665444
7.25609471 7.420807337 6.999340125 7.094488581 7.024332721 7.17928981 7.159898654 7.009977785 6.830979234
2.204955099 2.331625217 2.133305231 2.18332885 2.12778313 2.269697813 2.264705552 2.253940441 2.287924323
7.28437278 6.983593721 6.86337111 6.865970678 7.219840938 7.181113053 7.392230178 7.484052914 7.52498281
4.265792764 4.970684112 4.595545125 4.575545289 4.547957809 4.68215122 4.674495889 4.675841709 4.643311767
2.6943516 2.916324936 2.578130269 2.659717988 2.567436676 2.8095128 2.790110381 2.795882913 2.884588792
3.646303109 8.817891552 11.4248793 10.74738082 9.296043108 9.53150669 8.285160496 9.769919327 9.774610531
3.040292001 3.38486713 2.958851115 3.047880699 2.878562717 3.209319974 3.20260379 3.195993624 3.3004227
2.357625231 2.444753172 2.340767158 2.32143889 2.282608342 2.401218719 2.385568421 2.375334953 2.432634747
5.378494673 6.065038394 5.134842087 5.367342376 5.682051149 5.712072512 5.57179966 5.72082395 5.656674512
2.833814735 3.038434511 2.837711812 2.859800224 2.866040813 2.969167906 2.929449968 2.963530689 2.931065261
6.192932281 6.478439634 6.180169144 6.151689376 6.238949956 6.708196123 6.441437631 6.448280595 6.413562269
4.543042482 4.786227217 4.445131477 4.51471011 4.491645167 4.460114204 4.602482637 4.587221948 4.623125028
6.069437462 6.232738284 6.74644117 7.04995802 6.938928532 6.348253102 6.080950712 6.324619355 6.472893789

我想制作一个 table 以包含 mean_AML、sd_AML(标准差)、min_AML、max_AML、mean_Control , sd_Control, min_Control, max_Control, 和 Fold_change(即 mean_AML – mean_Control)每个基因。使用内置函数就可以了。

不知道我该怎么做。请帮忙。

谢谢。

---更新---

提示:将数据集拆分为AML数据集和正常数据集,然后对每个gene/probeset,分别计算其跨样本的均值、标准差、最小和最大表达值(使用内置函数), 并进一步将每个基因的这些统计值合并为一个table。应用 data.frame() 并为创建的 table 赋予与基因表达数据相同的行名称 table.

我们可以将 pivot_longerright_join 组合,然后在组上使用 summarise

library(dplyr)
library(tidyr)
df1 %>% 
  pivot_longer(
    everything(),
    names_to = "ID", 
    values_to = "value"
  ) %>% 
  right_join(df, by="ID") %>% 
  group_by(Disease) %>% 
  summarise(Min = min(value), Mean = mean(value), Max = max(value), Sd = sd(value)) %>%
  ungroup()
  Disease   Min  Mean   Max    Sd
  <chr>   <dbl> <dbl> <dbl> <dbl>
1 AML      2.13  4.91  9.53  2.04
2 Control  2.13  4.92 11.4   2.12

使用 pivot_longer():

将第二个数据帧放入长格式会有所帮助
library(tidyr)
newdf <- pivot_longer(df1, 
    cols = everything(),
    names_to = 'ID',
    values_to = 'value')

然后merge()将两个数据帧合二为一:

df.all <- merge(df, newdf, by = 'ID', all = T)

然后您可以 group_by(gene)summarise(),添加您需要的任何值:

library(dplyr)
df.all %>% group_by(gene) %>% summarise(sd = sd(value), min = min(value), max = max(value), mean = mean(value))

具有旧功能的另一个选项 tidyr::gather 有一个列 df:

library(tidyverse)

df2_spread <- df1 %>% 
  tidyr::gather(ID, val) %>% 
  left_join(df, by = 'ID')
  
result_1 <- df2_spread %>% 
  group_by(Disease, gene = ID) %>% 
  summarise(n = n(),
            mean = mean(val),
            sd = sd(val),
            min = min(val),
            max = max(val), .groups = "drop")
 A tibble: 9 × 7
  Disease gene          n  mean    sd   min   max
  <chr>   <chr>     <int> <dbl> <dbl> <dbl> <dbl>
1 AML     GSM239328    16  4.91  2.15  2.13  9.30
2 AML     GSM239329    16  4.95  2.13  2.27  9.53
3 AML     GSM239331    16  4.88  1.96  2.26  8.29
4 Control GSM239170    16  4.56  1.91  2.20  7.98
5 Control GSM239323    16  5.04  1.98  2.33  8.82
6 Control GSM239324    16  4.93  2.45  2.13 11.4 
7 Control GSM239326    16  4.97  2.34  2.18 10.7 
8 Control GSM239332    16  4.97  2.16  2.25  9.77
9 Control GSM239333    16  5.01  2.14  2.29  9.77

无论如何,我无法找到一种方法来计算每个基因的 Fold_change,因为这里的基因似乎只有一种疾病。

这是数据


df <- tibble::tribble(
          ~ID,  ~Disease,
  "GSM239170", "Control",
  "GSM239323", "Control",
  "GSM239324", "Control",
  "GSM239326", "Control",
  "GSM239328",     "AML",
  "GSM239329",     "AML",
  "GSM239331",     "AML",
  "GSM239332", "Control",
  "GSM239333", "Control"
  )



df1 <- tibble::tribble(
   ~GSM239170,  ~GSM239323,  ~GSM239324,  ~GSM239326,  ~GSM239328,  ~GSM239329,  ~GSM239331,  ~GSM239332,  ~GSM239333,
  3.016704177, 3.285669072, 2.929482692, 2.922820483,  3.15950317, 3.163327169, 2.985901308, 3.122708843, 3.070948463,
  7.977735461, 6.532514237, 6.388007183, 6.466679556, 6.432795021, 6.407321524, 6.426470803, 6.376394357, 6.469070308,
  4.207280707, 4.994965767,  4.40159671, 4.747114589, 4.830045513, 4.213762092, 4.884418365,   4.4318876, 4.849665444,
   7.25609471, 7.420807337, 6.999340125, 7.094488581, 7.024332721,  7.17928981, 7.159898654, 7.009977785, 6.830979234,
  2.204955099, 2.331625217, 2.133305231,  2.18332885,  2.12778313, 2.269697813, 2.264705552, 2.253940441, 2.287924323,
   7.28437278, 6.983593721,  6.86337111, 6.865970678, 7.219840938, 7.181113053, 7.392230178, 7.484052914,  7.52498281,
  4.265792764, 4.970684112, 4.595545125, 4.575545289, 4.547957809,  4.68215122, 4.674495889, 4.675841709, 4.643311767,
    2.6943516, 2.916324936, 2.578130269, 2.659717988, 2.567436676,   2.8095128, 2.790110381, 2.795882913, 2.884588792,
  3.646303109, 8.817891552,  11.4248793, 10.74738082, 9.296043108,  9.53150669, 8.285160496, 9.769919327, 9.774610531,
  3.040292001,  3.38486713, 2.958851115, 3.047880699, 2.878562717, 3.209319974,  3.20260379, 3.195993624,   3.3004227,
  2.357625231, 2.444753172, 2.340767158,  2.32143889, 2.282608342, 2.401218719, 2.385568421, 2.375334953, 2.432634747,
  5.378494673, 6.065038394, 5.134842087, 5.367342376, 5.682051149, 5.712072512,  5.57179966,  5.72082395, 5.656674512,
  2.833814735, 3.038434511, 2.837711812, 2.859800224, 2.866040813, 2.969167906, 2.929449968, 2.963530689, 2.931065261,
  6.192932281, 6.478439634, 6.180169144, 6.151689376, 6.238949956, 6.708196123, 6.441437631, 6.448280595, 6.413562269,
  4.543042482, 4.786227217, 4.445131477,  4.51471011, 4.491645167, 4.460114204, 4.602482637, 4.587221948, 4.623125028,
  6.069437462, 6.232738284,  6.74644117,  7.04995802, 6.938928532, 6.348253102, 6.080950712, 6.324619355, 6.472893789
  )

A 基础 R 方法。

df_new <- data.frame(t(df1), ID=colnames(df1))
df_new <- merge(df, df_new, by = 'ID')
out <- apply(df_new[, grep('^X', names(df_new))], 1, function(x) {
    data.frame(min=min(x), IQR_low=quantile(x, .25),
          mean=mean(x), median=median(x),IQR_high=quantile(x, .75),
          max=max(x), sd=sd(x))
})
out <- round(do.call(rbind, out), 2L)
out <- cbind(Disease=df_new$Disease, out)
rownames(out) <- df_new$ID

输出

> out
          Disease  min IQR_low mean median IQR_high   max   sd
GSM239170 Control 2.20    2.97 4.56   4.24     6.10  7.98 1.91
GSM239323 Control 2.33    3.22 5.04   4.98     6.49  8.82 1.98
GSM239324 Control 2.13    2.91 4.93   4.52     6.48 11.42 2.45
GSM239326 Control 2.18    2.91 4.97   4.66     6.57 10.75 2.34
GSM239328     AML 2.13    2.88 4.91   4.69     6.56  9.30 2.15
GSM239329     AML 2.27    3.11 4.95   4.57     6.48  9.53 2.13
GSM239331     AML 2.26    2.97 4.88   4.78     6.43  8.29 1.96
GSM239332 Control 2.25    3.08 4.97   4.63     6.39  9.77 2.16
GSM239333 Control 2.29    3.04 5.01   4.75     6.47  9.77 2.14

现在我们需要找到您定义为 mean_AML - mean_Control 的所有 18 倍变化。这可以通过以下函数完成,fold_change_fun(),其中 returns 折叠根据 meanmedian.

变化
fold_change_fun <- function(x, statistic = c('mean', 'median')) {
  AML_genes <- rownames(x[x$Disease %in% 'AML', ])
  Control_genes <- rownames(x[x$Disease %in% 'Control', ])
  AML <- x[x$Disease %in% 'AML', statistic]
  Control <- x[x$Disease %in% 'Control', statistic]
  nrow_AML <- nrow(out[out$Disease %in% 'AML', ])
  nrow_Control <- nrow(out[out$Disease %in% 'Control', ])
  out <- matrix(c(prod(nrow_AML, nrow_Control)),
                nrow = nrow_AML, ncol = nrow_Control)
  for(i in seq_len(nrow_AML)) {
    for(j in seq_len(nrow_Control)) {
      out[i, j] <- AML[i] - Control[j]
    }
  }
  out <- t(out)
  colnames(out) <- paste(AML_genes, 'AML')
  rownames(out) <- paste(Control_genes, 'Control')
  return(out)
}

输出

> fold_change_fun(x = out, statistic = 'mean')
                  GSM239328 AML GSM239329 AML GSM239331 AML
GSM239170 Control          0.35          0.39          0.32
GSM239323 Control         -0.13         -0.09         -0.16
GSM239324 Control         -0.02          0.02         -0.05
GSM239326 Control         -0.06         -0.02         -0.09
GSM239332 Control         -0.06         -0.02         -0.09
GSM239333 Control         -0.10         -0.06         -0.13

> fold_change_fun(x = out, statistic = 'median')
                  GSM239328 AML GSM239329 AML GSM239331 AML
GSM239170 Control          0.45          0.33          0.54
GSM239323 Control         -0.29         -0.41         -0.20
GSM239324 Control          0.17          0.05          0.26
GSM239326 Control          0.03         -0.09          0.12
GSM239332 Control          0.06         -0.06          0.15
GSM239333 Control         -0.06         -0.18          0.03

请注意,这些只是点估计。我们不知道这些样本倍数变化作为总体倍数变化的估计值有多精确。

数据

df <- structure(list(ID = c("GSM239170", "GSM239323", "GSM239324", 
"GSM239326", "GSM239328", "GSM239329", "GSM239331", "GSM239332", 
"GSM239333"), Disease = c("Control", "Control", "Control", "Control", 
"AML", "AML", "AML", "Control", "Control")), row.names = c(NA, 
-9L), class = "data.frame")

df1 <- structure(list(GSM239170 = c(3.016704177, 7.977735461, 4.207280707, 
7.25609471, 2.204955099, 7.28437278, 4.265792764, 2.6943516, 
3.646303109, 3.040292001, 2.357625231, 5.378494673, 2.833814735, 
6.192932281, 4.543042482, 6.069437462), GSM239323 = c(3.285669072, 
6.532514237, 4.994965767, 7.420807337, 2.331625217, 6.983593721, 
4.970684112, 2.916324936, 8.817891552, 3.38486713, 2.444753172, 
6.065038394, 3.038434511, 6.478439634, 4.786227217, 6.232738284
), GSM239324 = c(2.929482692, 6.388007183, 4.40159671, 6.999340125, 
2.133305231, 6.86337111, 4.595545125, 2.578130269, 11.4248793, 
2.958851115, 2.340767158, 5.134842087, 2.837711812, 6.180169144, 
4.445131477, 6.74644117), GSM239326 = c(2.922820483, 6.466679556, 
4.747114589, 7.094488581, 2.18332885, 6.865970678, 4.575545289, 
2.659717988, 10.74738082, 3.047880699, 2.32143889, 5.367342376, 
2.859800224, 6.151689376, 4.51471011, 7.04995802), GSM239328 = c(3.15950317, 
6.432795021, 4.830045513, 7.024332721, 2.12778313, 7.219840938, 
4.547957809, 2.567436676, 9.296043108, 2.878562717, 2.282608342, 
5.682051149, 2.866040813, 6.238949956, 4.491645167, 6.938928532
), GSM239329 = c(3.163327169, 6.407321524, 4.213762092, 7.17928981, 
2.269697813, 7.181113053, 4.68215122, 2.8095128, 9.53150669, 
3.209319974, 2.401218719, 5.712072512, 2.969167906, 6.708196123, 
4.460114204, 6.348253102), GSM239331 = c(2.985901308, 6.426470803, 
4.884418365, 7.159898654, 2.264705552, 7.392230178, 4.674495889, 
2.790110381, 8.285160496, 3.20260379, 2.385568421, 5.57179966, 
2.929449968, 6.441437631, 4.602482637, 6.080950712), GSM239332 = c(3.122708843, 
6.376394357, 4.4318876, 7.009977785, 2.253940441, 7.484052914, 
4.675841709, 2.795882913, 9.769919327, 3.195993624, 2.375334953, 
5.72082395, 2.963530689, 6.448280595, 4.587221948, 6.324619355
), GSM239333 = c(3.070948463, 6.469070308, 4.849665444, 6.830979234, 
2.287924323, 7.52498281, 4.643311767, 2.884588792, 9.774610531, 
3.3004227, 2.432634747, 5.656674512, 2.931065261, 6.413562269, 
4.623125028, 6.472893789)), row.names = c(NA, -16L), class = "data.frame")