从 R 中的数据框列值均值和标准差创建 table
Create a table from dataframe column values mean and standard deviation in R
我是 R 的新手,所以对代码一无所知。我有两个数据框。一个数据框看起来像这样。
df
ID
Disease
GSM239170
Control
GSM239323
Control
GSM239324
Control
GSM239326
Control
GSM239328
AML
GSM239329
AML
GSM239331
AML
GSM239332
Control
GSM239333
Control
另一个数据框如下所示:
df1
GSM239170
GSM239323
GSM239324
GSM239326
GSM239328
GSM239329
GSM239331
GSM239332
GSM239333
3.016704177
3.285669072
2.929482692
2.922820483
3.15950317
3.163327169
2.985901308
3.122708843
3.070948463
7.977735461
6.532514237
6.388007183
6.466679556
6.432795021
6.407321524
6.426470803
6.376394357
6.469070308
4.207280707
4.994965767
4.40159671
4.747114589
4.830045513
4.213762092
4.884418365
4.4318876
4.849665444
7.25609471
7.420807337
6.999340125
7.094488581
7.024332721
7.17928981
7.159898654
7.009977785
6.830979234
2.204955099
2.331625217
2.133305231
2.18332885
2.12778313
2.269697813
2.264705552
2.253940441
2.287924323
7.28437278
6.983593721
6.86337111
6.865970678
7.219840938
7.181113053
7.392230178
7.484052914
7.52498281
4.265792764
4.970684112
4.595545125
4.575545289
4.547957809
4.68215122
4.674495889
4.675841709
4.643311767
2.6943516
2.916324936
2.578130269
2.659717988
2.567436676
2.8095128
2.790110381
2.795882913
2.884588792
3.646303109
8.817891552
11.4248793
10.74738082
9.296043108
9.53150669
8.285160496
9.769919327
9.774610531
3.040292001
3.38486713
2.958851115
3.047880699
2.878562717
3.209319974
3.20260379
3.195993624
3.3004227
2.357625231
2.444753172
2.340767158
2.32143889
2.282608342
2.401218719
2.385568421
2.375334953
2.432634747
5.378494673
6.065038394
5.134842087
5.367342376
5.682051149
5.712072512
5.57179966
5.72082395
5.656674512
2.833814735
3.038434511
2.837711812
2.859800224
2.866040813
2.969167906
2.929449968
2.963530689
2.931065261
6.192932281
6.478439634
6.180169144
6.151689376
6.238949956
6.708196123
6.441437631
6.448280595
6.413562269
4.543042482
4.786227217
4.445131477
4.51471011
4.491645167
4.460114204
4.602482637
4.587221948
4.623125028
6.069437462
6.232738284
6.74644117
7.04995802
6.938928532
6.348253102
6.080950712
6.324619355
6.472893789
我想制作一个 table 以包含 mean_AML、sd_AML(标准差)、min_AML、max_AML、mean_Control , sd_Control, min_Control, max_Control, 和 Fold_change(即 mean_AML – mean_Control)每个基因。使用内置函数就可以了。
不知道我该怎么做。请帮忙。
谢谢。
---更新---
提示:将数据集拆分为AML数据集和正常数据集,然后对每个gene/probeset,分别计算其跨样本的均值、标准差、最小和最大表达值(使用内置函数), 并进一步将每个基因的这些统计值合并为一个table。应用 data.frame() 并为创建的 table 赋予与基因表达数据相同的行名称 table.
我们可以将 pivot_longer
与 right_join
组合,然后在组上使用 summarise
:
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(
everything(),
names_to = "ID",
values_to = "value"
) %>%
right_join(df, by="ID") %>%
group_by(Disease) %>%
summarise(Min = min(value), Mean = mean(value), Max = max(value), Sd = sd(value)) %>%
ungroup()
Disease Min Mean Max Sd
<chr> <dbl> <dbl> <dbl> <dbl>
1 AML 2.13 4.91 9.53 2.04
2 Control 2.13 4.92 11.4 2.12
使用 pivot_longer()
:
将第二个数据帧放入长格式会有所帮助
library(tidyr)
newdf <- pivot_longer(df1,
cols = everything(),
names_to = 'ID',
values_to = 'value')
然后merge()
将两个数据帧合二为一:
df.all <- merge(df, newdf, by = 'ID', all = T)
然后您可以 group_by(gene)
和 summarise()
,添加您需要的任何值:
library(dplyr)
df.all %>% group_by(gene) %>% summarise(sd = sd(value), min = min(value), max = max(value), mean = mean(value))
具有旧功能的另一个选项 tidyr::gather
有一个列 df:
library(tidyverse)
df2_spread <- df1 %>%
tidyr::gather(ID, val) %>%
left_join(df, by = 'ID')
result_1 <- df2_spread %>%
group_by(Disease, gene = ID) %>%
summarise(n = n(),
mean = mean(val),
sd = sd(val),
min = min(val),
max = max(val), .groups = "drop")
A tibble: 9 × 7
Disease gene n mean sd min max
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 AML GSM239328 16 4.91 2.15 2.13 9.30
2 AML GSM239329 16 4.95 2.13 2.27 9.53
3 AML GSM239331 16 4.88 1.96 2.26 8.29
4 Control GSM239170 16 4.56 1.91 2.20 7.98
5 Control GSM239323 16 5.04 1.98 2.33 8.82
6 Control GSM239324 16 4.93 2.45 2.13 11.4
7 Control GSM239326 16 4.97 2.34 2.18 10.7
8 Control GSM239332 16 4.97 2.16 2.25 9.77
9 Control GSM239333 16 5.01 2.14 2.29 9.77
无论如何,我无法找到一种方法来计算每个基因的 Fold_change,因为这里的基因似乎只有一种疾病。
这是数据
df <- tibble::tribble(
~ID, ~Disease,
"GSM239170", "Control",
"GSM239323", "Control",
"GSM239324", "Control",
"GSM239326", "Control",
"GSM239328", "AML",
"GSM239329", "AML",
"GSM239331", "AML",
"GSM239332", "Control",
"GSM239333", "Control"
)
df1 <- tibble::tribble(
~GSM239170, ~GSM239323, ~GSM239324, ~GSM239326, ~GSM239328, ~GSM239329, ~GSM239331, ~GSM239332, ~GSM239333,
3.016704177, 3.285669072, 2.929482692, 2.922820483, 3.15950317, 3.163327169, 2.985901308, 3.122708843, 3.070948463,
7.977735461, 6.532514237, 6.388007183, 6.466679556, 6.432795021, 6.407321524, 6.426470803, 6.376394357, 6.469070308,
4.207280707, 4.994965767, 4.40159671, 4.747114589, 4.830045513, 4.213762092, 4.884418365, 4.4318876, 4.849665444,
7.25609471, 7.420807337, 6.999340125, 7.094488581, 7.024332721, 7.17928981, 7.159898654, 7.009977785, 6.830979234,
2.204955099, 2.331625217, 2.133305231, 2.18332885, 2.12778313, 2.269697813, 2.264705552, 2.253940441, 2.287924323,
7.28437278, 6.983593721, 6.86337111, 6.865970678, 7.219840938, 7.181113053, 7.392230178, 7.484052914, 7.52498281,
4.265792764, 4.970684112, 4.595545125, 4.575545289, 4.547957809, 4.68215122, 4.674495889, 4.675841709, 4.643311767,
2.6943516, 2.916324936, 2.578130269, 2.659717988, 2.567436676, 2.8095128, 2.790110381, 2.795882913, 2.884588792,
3.646303109, 8.817891552, 11.4248793, 10.74738082, 9.296043108, 9.53150669, 8.285160496, 9.769919327, 9.774610531,
3.040292001, 3.38486713, 2.958851115, 3.047880699, 2.878562717, 3.209319974, 3.20260379, 3.195993624, 3.3004227,
2.357625231, 2.444753172, 2.340767158, 2.32143889, 2.282608342, 2.401218719, 2.385568421, 2.375334953, 2.432634747,
5.378494673, 6.065038394, 5.134842087, 5.367342376, 5.682051149, 5.712072512, 5.57179966, 5.72082395, 5.656674512,
2.833814735, 3.038434511, 2.837711812, 2.859800224, 2.866040813, 2.969167906, 2.929449968, 2.963530689, 2.931065261,
6.192932281, 6.478439634, 6.180169144, 6.151689376, 6.238949956, 6.708196123, 6.441437631, 6.448280595, 6.413562269,
4.543042482, 4.786227217, 4.445131477, 4.51471011, 4.491645167, 4.460114204, 4.602482637, 4.587221948, 4.623125028,
6.069437462, 6.232738284, 6.74644117, 7.04995802, 6.938928532, 6.348253102, 6.080950712, 6.324619355, 6.472893789
)
A 基础 R 方法。
df_new <- data.frame(t(df1), ID=colnames(df1))
df_new <- merge(df, df_new, by = 'ID')
out <- apply(df_new[, grep('^X', names(df_new))], 1, function(x) {
data.frame(min=min(x), IQR_low=quantile(x, .25),
mean=mean(x), median=median(x),IQR_high=quantile(x, .75),
max=max(x), sd=sd(x))
})
out <- round(do.call(rbind, out), 2L)
out <- cbind(Disease=df_new$Disease, out)
rownames(out) <- df_new$ID
输出
> out
Disease min IQR_low mean median IQR_high max sd
GSM239170 Control 2.20 2.97 4.56 4.24 6.10 7.98 1.91
GSM239323 Control 2.33 3.22 5.04 4.98 6.49 8.82 1.98
GSM239324 Control 2.13 2.91 4.93 4.52 6.48 11.42 2.45
GSM239326 Control 2.18 2.91 4.97 4.66 6.57 10.75 2.34
GSM239328 AML 2.13 2.88 4.91 4.69 6.56 9.30 2.15
GSM239329 AML 2.27 3.11 4.95 4.57 6.48 9.53 2.13
GSM239331 AML 2.26 2.97 4.88 4.78 6.43 8.29 1.96
GSM239332 Control 2.25 3.08 4.97 4.63 6.39 9.77 2.16
GSM239333 Control 2.29 3.04 5.01 4.75 6.47 9.77 2.14
现在我们需要找到您定义为 mean_AML - mean_Control 的所有 18 倍变化。这可以通过以下函数完成,fold_change_fun()
,其中 returns 折叠根据 mean
或 median
.
变化
fold_change_fun <- function(x, statistic = c('mean', 'median')) {
AML_genes <- rownames(x[x$Disease %in% 'AML', ])
Control_genes <- rownames(x[x$Disease %in% 'Control', ])
AML <- x[x$Disease %in% 'AML', statistic]
Control <- x[x$Disease %in% 'Control', statistic]
nrow_AML <- nrow(out[out$Disease %in% 'AML', ])
nrow_Control <- nrow(out[out$Disease %in% 'Control', ])
out <- matrix(c(prod(nrow_AML, nrow_Control)),
nrow = nrow_AML, ncol = nrow_Control)
for(i in seq_len(nrow_AML)) {
for(j in seq_len(nrow_Control)) {
out[i, j] <- AML[i] - Control[j]
}
}
out <- t(out)
colnames(out) <- paste(AML_genes, 'AML')
rownames(out) <- paste(Control_genes, 'Control')
return(out)
}
输出
> fold_change_fun(x = out, statistic = 'mean')
GSM239328 AML GSM239329 AML GSM239331 AML
GSM239170 Control 0.35 0.39 0.32
GSM239323 Control -0.13 -0.09 -0.16
GSM239324 Control -0.02 0.02 -0.05
GSM239326 Control -0.06 -0.02 -0.09
GSM239332 Control -0.06 -0.02 -0.09
GSM239333 Control -0.10 -0.06 -0.13
> fold_change_fun(x = out, statistic = 'median')
GSM239328 AML GSM239329 AML GSM239331 AML
GSM239170 Control 0.45 0.33 0.54
GSM239323 Control -0.29 -0.41 -0.20
GSM239324 Control 0.17 0.05 0.26
GSM239326 Control 0.03 -0.09 0.12
GSM239332 Control 0.06 -0.06 0.15
GSM239333 Control -0.06 -0.18 0.03
请注意,这些只是点估计。我们不知道这些样本倍数变化作为总体倍数变化的估计值有多精确。
数据
df <- structure(list(ID = c("GSM239170", "GSM239323", "GSM239324",
"GSM239326", "GSM239328", "GSM239329", "GSM239331", "GSM239332",
"GSM239333"), Disease = c("Control", "Control", "Control", "Control",
"AML", "AML", "AML", "Control", "Control")), row.names = c(NA,
-9L), class = "data.frame")
df1 <- structure(list(GSM239170 = c(3.016704177, 7.977735461, 4.207280707,
7.25609471, 2.204955099, 7.28437278, 4.265792764, 2.6943516,
3.646303109, 3.040292001, 2.357625231, 5.378494673, 2.833814735,
6.192932281, 4.543042482, 6.069437462), GSM239323 = c(3.285669072,
6.532514237, 4.994965767, 7.420807337, 2.331625217, 6.983593721,
4.970684112, 2.916324936, 8.817891552, 3.38486713, 2.444753172,
6.065038394, 3.038434511, 6.478439634, 4.786227217, 6.232738284
), GSM239324 = c(2.929482692, 6.388007183, 4.40159671, 6.999340125,
2.133305231, 6.86337111, 4.595545125, 2.578130269, 11.4248793,
2.958851115, 2.340767158, 5.134842087, 2.837711812, 6.180169144,
4.445131477, 6.74644117), GSM239326 = c(2.922820483, 6.466679556,
4.747114589, 7.094488581, 2.18332885, 6.865970678, 4.575545289,
2.659717988, 10.74738082, 3.047880699, 2.32143889, 5.367342376,
2.859800224, 6.151689376, 4.51471011, 7.04995802), GSM239328 = c(3.15950317,
6.432795021, 4.830045513, 7.024332721, 2.12778313, 7.219840938,
4.547957809, 2.567436676, 9.296043108, 2.878562717, 2.282608342,
5.682051149, 2.866040813, 6.238949956, 4.491645167, 6.938928532
), GSM239329 = c(3.163327169, 6.407321524, 4.213762092, 7.17928981,
2.269697813, 7.181113053, 4.68215122, 2.8095128, 9.53150669,
3.209319974, 2.401218719, 5.712072512, 2.969167906, 6.708196123,
4.460114204, 6.348253102), GSM239331 = c(2.985901308, 6.426470803,
4.884418365, 7.159898654, 2.264705552, 7.392230178, 4.674495889,
2.790110381, 8.285160496, 3.20260379, 2.385568421, 5.57179966,
2.929449968, 6.441437631, 4.602482637, 6.080950712), GSM239332 = c(3.122708843,
6.376394357, 4.4318876, 7.009977785, 2.253940441, 7.484052914,
4.675841709, 2.795882913, 9.769919327, 3.195993624, 2.375334953,
5.72082395, 2.963530689, 6.448280595, 4.587221948, 6.324619355
), GSM239333 = c(3.070948463, 6.469070308, 4.849665444, 6.830979234,
2.287924323, 7.52498281, 4.643311767, 2.884588792, 9.774610531,
3.3004227, 2.432634747, 5.656674512, 2.931065261, 6.413562269,
4.623125028, 6.472893789)), row.names = c(NA, -16L), class = "data.frame")
我是 R 的新手,所以对代码一无所知。我有两个数据框。一个数据框看起来像这样。
df
ID | Disease |
---|---|
GSM239170 | Control |
GSM239323 | Control |
GSM239324 | Control |
GSM239326 | Control |
GSM239328 | AML |
GSM239329 | AML |
GSM239331 | AML |
GSM239332 | Control |
GSM239333 | Control |
另一个数据框如下所示:
df1
GSM239170 | GSM239323 | GSM239324 | GSM239326 | GSM239328 | GSM239329 | GSM239331 | GSM239332 | GSM239333 |
---|---|---|---|---|---|---|---|---|
3.016704177 | 3.285669072 | 2.929482692 | 2.922820483 | 3.15950317 | 3.163327169 | 2.985901308 | 3.122708843 | 3.070948463 |
7.977735461 | 6.532514237 | 6.388007183 | 6.466679556 | 6.432795021 | 6.407321524 | 6.426470803 | 6.376394357 | 6.469070308 |
4.207280707 | 4.994965767 | 4.40159671 | 4.747114589 | 4.830045513 | 4.213762092 | 4.884418365 | 4.4318876 | 4.849665444 |
7.25609471 | 7.420807337 | 6.999340125 | 7.094488581 | 7.024332721 | 7.17928981 | 7.159898654 | 7.009977785 | 6.830979234 |
2.204955099 | 2.331625217 | 2.133305231 | 2.18332885 | 2.12778313 | 2.269697813 | 2.264705552 | 2.253940441 | 2.287924323 |
7.28437278 | 6.983593721 | 6.86337111 | 6.865970678 | 7.219840938 | 7.181113053 | 7.392230178 | 7.484052914 | 7.52498281 |
4.265792764 | 4.970684112 | 4.595545125 | 4.575545289 | 4.547957809 | 4.68215122 | 4.674495889 | 4.675841709 | 4.643311767 |
2.6943516 | 2.916324936 | 2.578130269 | 2.659717988 | 2.567436676 | 2.8095128 | 2.790110381 | 2.795882913 | 2.884588792 |
3.646303109 | 8.817891552 | 11.4248793 | 10.74738082 | 9.296043108 | 9.53150669 | 8.285160496 | 9.769919327 | 9.774610531 |
3.040292001 | 3.38486713 | 2.958851115 | 3.047880699 | 2.878562717 | 3.209319974 | 3.20260379 | 3.195993624 | 3.3004227 |
2.357625231 | 2.444753172 | 2.340767158 | 2.32143889 | 2.282608342 | 2.401218719 | 2.385568421 | 2.375334953 | 2.432634747 |
5.378494673 | 6.065038394 | 5.134842087 | 5.367342376 | 5.682051149 | 5.712072512 | 5.57179966 | 5.72082395 | 5.656674512 |
2.833814735 | 3.038434511 | 2.837711812 | 2.859800224 | 2.866040813 | 2.969167906 | 2.929449968 | 2.963530689 | 2.931065261 |
6.192932281 | 6.478439634 | 6.180169144 | 6.151689376 | 6.238949956 | 6.708196123 | 6.441437631 | 6.448280595 | 6.413562269 |
4.543042482 | 4.786227217 | 4.445131477 | 4.51471011 | 4.491645167 | 4.460114204 | 4.602482637 | 4.587221948 | 4.623125028 |
6.069437462 | 6.232738284 | 6.74644117 | 7.04995802 | 6.938928532 | 6.348253102 | 6.080950712 | 6.324619355 | 6.472893789 |
我想制作一个 table 以包含 mean_AML、sd_AML(标准差)、min_AML、max_AML、mean_Control , sd_Control, min_Control, max_Control, 和 Fold_change(即 mean_AML – mean_Control)每个基因。使用内置函数就可以了。
不知道我该怎么做。请帮忙。
谢谢。
---更新---
提示:将数据集拆分为AML数据集和正常数据集,然后对每个gene/probeset,分别计算其跨样本的均值、标准差、最小和最大表达值(使用内置函数), 并进一步将每个基因的这些统计值合并为一个table。应用 data.frame() 并为创建的 table 赋予与基因表达数据相同的行名称 table.
我们可以将 pivot_longer
与 right_join
组合,然后在组上使用 summarise
:
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(
everything(),
names_to = "ID",
values_to = "value"
) %>%
right_join(df, by="ID") %>%
group_by(Disease) %>%
summarise(Min = min(value), Mean = mean(value), Max = max(value), Sd = sd(value)) %>%
ungroup()
Disease Min Mean Max Sd
<chr> <dbl> <dbl> <dbl> <dbl>
1 AML 2.13 4.91 9.53 2.04
2 Control 2.13 4.92 11.4 2.12
使用 pivot_longer()
:
library(tidyr)
newdf <- pivot_longer(df1,
cols = everything(),
names_to = 'ID',
values_to = 'value')
然后merge()
将两个数据帧合二为一:
df.all <- merge(df, newdf, by = 'ID', all = T)
然后您可以 group_by(gene)
和 summarise()
,添加您需要的任何值:
library(dplyr)
df.all %>% group_by(gene) %>% summarise(sd = sd(value), min = min(value), max = max(value), mean = mean(value))
具有旧功能的另一个选项 tidyr::gather
有一个列 df:
library(tidyverse)
df2_spread <- df1 %>%
tidyr::gather(ID, val) %>%
left_join(df, by = 'ID')
result_1 <- df2_spread %>%
group_by(Disease, gene = ID) %>%
summarise(n = n(),
mean = mean(val),
sd = sd(val),
min = min(val),
max = max(val), .groups = "drop")
A tibble: 9 × 7
Disease gene n mean sd min max
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 AML GSM239328 16 4.91 2.15 2.13 9.30
2 AML GSM239329 16 4.95 2.13 2.27 9.53
3 AML GSM239331 16 4.88 1.96 2.26 8.29
4 Control GSM239170 16 4.56 1.91 2.20 7.98
5 Control GSM239323 16 5.04 1.98 2.33 8.82
6 Control GSM239324 16 4.93 2.45 2.13 11.4
7 Control GSM239326 16 4.97 2.34 2.18 10.7
8 Control GSM239332 16 4.97 2.16 2.25 9.77
9 Control GSM239333 16 5.01 2.14 2.29 9.77
无论如何,我无法找到一种方法来计算每个基因的 Fold_change,因为这里的基因似乎只有一种疾病。
这是数据
df <- tibble::tribble(
~ID, ~Disease,
"GSM239170", "Control",
"GSM239323", "Control",
"GSM239324", "Control",
"GSM239326", "Control",
"GSM239328", "AML",
"GSM239329", "AML",
"GSM239331", "AML",
"GSM239332", "Control",
"GSM239333", "Control"
)
df1 <- tibble::tribble(
~GSM239170, ~GSM239323, ~GSM239324, ~GSM239326, ~GSM239328, ~GSM239329, ~GSM239331, ~GSM239332, ~GSM239333,
3.016704177, 3.285669072, 2.929482692, 2.922820483, 3.15950317, 3.163327169, 2.985901308, 3.122708843, 3.070948463,
7.977735461, 6.532514237, 6.388007183, 6.466679556, 6.432795021, 6.407321524, 6.426470803, 6.376394357, 6.469070308,
4.207280707, 4.994965767, 4.40159671, 4.747114589, 4.830045513, 4.213762092, 4.884418365, 4.4318876, 4.849665444,
7.25609471, 7.420807337, 6.999340125, 7.094488581, 7.024332721, 7.17928981, 7.159898654, 7.009977785, 6.830979234,
2.204955099, 2.331625217, 2.133305231, 2.18332885, 2.12778313, 2.269697813, 2.264705552, 2.253940441, 2.287924323,
7.28437278, 6.983593721, 6.86337111, 6.865970678, 7.219840938, 7.181113053, 7.392230178, 7.484052914, 7.52498281,
4.265792764, 4.970684112, 4.595545125, 4.575545289, 4.547957809, 4.68215122, 4.674495889, 4.675841709, 4.643311767,
2.6943516, 2.916324936, 2.578130269, 2.659717988, 2.567436676, 2.8095128, 2.790110381, 2.795882913, 2.884588792,
3.646303109, 8.817891552, 11.4248793, 10.74738082, 9.296043108, 9.53150669, 8.285160496, 9.769919327, 9.774610531,
3.040292001, 3.38486713, 2.958851115, 3.047880699, 2.878562717, 3.209319974, 3.20260379, 3.195993624, 3.3004227,
2.357625231, 2.444753172, 2.340767158, 2.32143889, 2.282608342, 2.401218719, 2.385568421, 2.375334953, 2.432634747,
5.378494673, 6.065038394, 5.134842087, 5.367342376, 5.682051149, 5.712072512, 5.57179966, 5.72082395, 5.656674512,
2.833814735, 3.038434511, 2.837711812, 2.859800224, 2.866040813, 2.969167906, 2.929449968, 2.963530689, 2.931065261,
6.192932281, 6.478439634, 6.180169144, 6.151689376, 6.238949956, 6.708196123, 6.441437631, 6.448280595, 6.413562269,
4.543042482, 4.786227217, 4.445131477, 4.51471011, 4.491645167, 4.460114204, 4.602482637, 4.587221948, 4.623125028,
6.069437462, 6.232738284, 6.74644117, 7.04995802, 6.938928532, 6.348253102, 6.080950712, 6.324619355, 6.472893789
)
A 基础 R 方法。
df_new <- data.frame(t(df1), ID=colnames(df1))
df_new <- merge(df, df_new, by = 'ID')
out <- apply(df_new[, grep('^X', names(df_new))], 1, function(x) {
data.frame(min=min(x), IQR_low=quantile(x, .25),
mean=mean(x), median=median(x),IQR_high=quantile(x, .75),
max=max(x), sd=sd(x))
})
out <- round(do.call(rbind, out), 2L)
out <- cbind(Disease=df_new$Disease, out)
rownames(out) <- df_new$ID
输出
> out
Disease min IQR_low mean median IQR_high max sd
GSM239170 Control 2.20 2.97 4.56 4.24 6.10 7.98 1.91
GSM239323 Control 2.33 3.22 5.04 4.98 6.49 8.82 1.98
GSM239324 Control 2.13 2.91 4.93 4.52 6.48 11.42 2.45
GSM239326 Control 2.18 2.91 4.97 4.66 6.57 10.75 2.34
GSM239328 AML 2.13 2.88 4.91 4.69 6.56 9.30 2.15
GSM239329 AML 2.27 3.11 4.95 4.57 6.48 9.53 2.13
GSM239331 AML 2.26 2.97 4.88 4.78 6.43 8.29 1.96
GSM239332 Control 2.25 3.08 4.97 4.63 6.39 9.77 2.16
GSM239333 Control 2.29 3.04 5.01 4.75 6.47 9.77 2.14
现在我们需要找到您定义为 mean_AML - mean_Control 的所有 18 倍变化。这可以通过以下函数完成,fold_change_fun()
,其中 returns 折叠根据 mean
或 median
.
fold_change_fun <- function(x, statistic = c('mean', 'median')) {
AML_genes <- rownames(x[x$Disease %in% 'AML', ])
Control_genes <- rownames(x[x$Disease %in% 'Control', ])
AML <- x[x$Disease %in% 'AML', statistic]
Control <- x[x$Disease %in% 'Control', statistic]
nrow_AML <- nrow(out[out$Disease %in% 'AML', ])
nrow_Control <- nrow(out[out$Disease %in% 'Control', ])
out <- matrix(c(prod(nrow_AML, nrow_Control)),
nrow = nrow_AML, ncol = nrow_Control)
for(i in seq_len(nrow_AML)) {
for(j in seq_len(nrow_Control)) {
out[i, j] <- AML[i] - Control[j]
}
}
out <- t(out)
colnames(out) <- paste(AML_genes, 'AML')
rownames(out) <- paste(Control_genes, 'Control')
return(out)
}
输出
> fold_change_fun(x = out, statistic = 'mean')
GSM239328 AML GSM239329 AML GSM239331 AML
GSM239170 Control 0.35 0.39 0.32
GSM239323 Control -0.13 -0.09 -0.16
GSM239324 Control -0.02 0.02 -0.05
GSM239326 Control -0.06 -0.02 -0.09
GSM239332 Control -0.06 -0.02 -0.09
GSM239333 Control -0.10 -0.06 -0.13
> fold_change_fun(x = out, statistic = 'median')
GSM239328 AML GSM239329 AML GSM239331 AML
GSM239170 Control 0.45 0.33 0.54
GSM239323 Control -0.29 -0.41 -0.20
GSM239324 Control 0.17 0.05 0.26
GSM239326 Control 0.03 -0.09 0.12
GSM239332 Control 0.06 -0.06 0.15
GSM239333 Control -0.06 -0.18 0.03
请注意,这些只是点估计。我们不知道这些样本倍数变化作为总体倍数变化的估计值有多精确。
数据
df <- structure(list(ID = c("GSM239170", "GSM239323", "GSM239324",
"GSM239326", "GSM239328", "GSM239329", "GSM239331", "GSM239332",
"GSM239333"), Disease = c("Control", "Control", "Control", "Control",
"AML", "AML", "AML", "Control", "Control")), row.names = c(NA,
-9L), class = "data.frame")
df1 <- structure(list(GSM239170 = c(3.016704177, 7.977735461, 4.207280707,
7.25609471, 2.204955099, 7.28437278, 4.265792764, 2.6943516,
3.646303109, 3.040292001, 2.357625231, 5.378494673, 2.833814735,
6.192932281, 4.543042482, 6.069437462), GSM239323 = c(3.285669072,
6.532514237, 4.994965767, 7.420807337, 2.331625217, 6.983593721,
4.970684112, 2.916324936, 8.817891552, 3.38486713, 2.444753172,
6.065038394, 3.038434511, 6.478439634, 4.786227217, 6.232738284
), GSM239324 = c(2.929482692, 6.388007183, 4.40159671, 6.999340125,
2.133305231, 6.86337111, 4.595545125, 2.578130269, 11.4248793,
2.958851115, 2.340767158, 5.134842087, 2.837711812, 6.180169144,
4.445131477, 6.74644117), GSM239326 = c(2.922820483, 6.466679556,
4.747114589, 7.094488581, 2.18332885, 6.865970678, 4.575545289,
2.659717988, 10.74738082, 3.047880699, 2.32143889, 5.367342376,
2.859800224, 6.151689376, 4.51471011, 7.04995802), GSM239328 = c(3.15950317,
6.432795021, 4.830045513, 7.024332721, 2.12778313, 7.219840938,
4.547957809, 2.567436676, 9.296043108, 2.878562717, 2.282608342,
5.682051149, 2.866040813, 6.238949956, 4.491645167, 6.938928532
), GSM239329 = c(3.163327169, 6.407321524, 4.213762092, 7.17928981,
2.269697813, 7.181113053, 4.68215122, 2.8095128, 9.53150669,
3.209319974, 2.401218719, 5.712072512, 2.969167906, 6.708196123,
4.460114204, 6.348253102), GSM239331 = c(2.985901308, 6.426470803,
4.884418365, 7.159898654, 2.264705552, 7.392230178, 4.674495889,
2.790110381, 8.285160496, 3.20260379, 2.385568421, 5.57179966,
2.929449968, 6.441437631, 4.602482637, 6.080950712), GSM239332 = c(3.122708843,
6.376394357, 4.4318876, 7.009977785, 2.253940441, 7.484052914,
4.675841709, 2.795882913, 9.769919327, 3.195993624, 2.375334953,
5.72082395, 2.963530689, 6.448280595, 4.587221948, 6.324619355
), GSM239333 = c(3.070948463, 6.469070308, 4.849665444, 6.830979234,
2.287924323, 7.52498281, 4.643311767, 2.884588792, 9.774610531,
3.3004227, 2.432634747, 5.656674512, 2.931065261, 6.413562269,
4.623125028, 6.472893789)), row.names = c(NA, -16L), class = "data.frame")