如何使用 tidyr 包对不同时间点的生物学重复(replicate)数据求平均值,并分别绘图?
How to use tidyr package to average replicate data for different time points and plot each separately?
我有一个超过 100 个不同样本的数据集。样品来自不同的基因型(例如 X、Y、Z)和 4 个不同的时间点(T0、1、2、3),具有 3 个生物学重复(R1、2、3)。我正在测量 50 个不同基因的值(按行)。
# Example data as dput() output: a data.frame with a Gene factor column
# (levels A, B; one row per gene) followed by 36 numeric columns named
# <Genotype>_<Time>_<Replicate> (X/Y/Z x T0-T3 x R1-R3).
# Note: the expression is not assigned to a name here.
structure(list(Gene = structure(1:2, .Label = c("A", "B"), class = "factor"),
X_T0_R1 = c(1.46559502, 0.220140568), X_T0_R2 = c(1.087642983,
0.237500819), X_T0_R3 = c(1.424945196, 0.21066267), X_T1_R1 = c(1.289943948,
0.207778662), X_T1_R2 = c(1.376535013, 0.488774258), X_T1_R3 = c(1.833390311,
0.182798731), X_T2_R1 = c(1.450753714, 0.247576125), X_T2_R2 = c(1.3094609,
0.390028842), X_T2_R3 = c(0.5953716, 1.007079177), X_T3_R1 = c(0.7906009,
0.730242116), X_T3_R2 = c(1.215333041, 1.012914813), X_T3_R3 = c(1.069312467,
0.780421013), Y_T0_R1 = c(0.053317766, 3.316414959), Y_T0_R2 = c(0.506623748,
3.599442788), Y_T0_R3 = c(0.713670106, 2.516735845), Y_T1_R1 = c(0.740998252,
1.444496448), Y_T1_R2 = c(0.648231834, 0.097957459), Y_T1_R3 = c(0.780499252,
0.187840968), Y_T2_R1 = c(0.35344654, 1.190274584), Y_T2_R2 = c(0.220223951,
1.367784148), Y_T2_R3 = c(0.432856978, 1.403057729), Y_T3_R1 = c(0.234963735,
1.232129062), Y_T3_R2 = c(0.353770497, 0.885122768), Y_T3_R3 = c(0.396091395,
1.333921747), Z_T0_R1 = c(0.398000559, 1.286528398), Z_T0_R2 = c(0.384759325,
1.122251177), Z_T0_R3 = c(1.582230097, 0.697419716), Z_T1_R1 = c(1.136843842,
0.804552001), Z_T1_R2 = c(1.275683837, 1.227821594), Z_T1_R3 = c(0.963349308,
0.968589683), Z_T2_R1 = c(3.765036263, 0.477443352), Z_T2_R2 = c(1.901023385,
0.832736132), Z_T2_R3 = c(1.407713024, 0.911920317), Z_T3_R1 = c(0.988333629,
1.095130142), Z_T3_R2 = c(0.618606729, 0.497458337), Z_T3_R3 = c(0.429823986,
0.471389536)), .Names = c("Gene", "X_T0_R1", "X_T0_R2", "X_T0_R3",
"X_T1_R1", "X_T1_R2", "X_T1_R3", "X_T2_R1", "X_T2_R2", "X_T2_R3",
"X_T3_R1", "X_T3_R2", "X_T3_R3", "Y_T0_R1", "Y_T0_R2", "Y_T0_R3",
"Y_T1_R1", "Y_T1_R2", "Y_T1_R3", "Y_T2_R1", "Y_T2_R2", "Y_T2_R3",
"Y_T3_R1", "Y_T3_R2", "Y_T3_R3", "Z_T0_R1", "Z_T0_R2", "Z_T0_R3",
"Z_T1_R1", "Z_T1_R2", "Z_T1_R3", "Z_T2_R1", "Z_T2_R2", "Z_T2_R3",
"Z_T3_R1", "Z_T3_R2", "Z_T3_R3"), class = "data.frame", row.names =
c(NA, -2L))
对于每个基因(即每一行),我想绘制一个图表,其中包含每个基因型各时间点的重复平均值 + SE(标准误差)
expected final graph look
我想通过创建一个新的数据框来做到这一点;包含每组重复的平均值和标准误差。
如何使用 tidyr 包实现这一点?如何同时得到标准误差?
我怎样才能改进下面这段代码?
# Average each group of three replicate columns (R1-R3), row by row.
# Column 1 holds the gene identifier, so the replicate groups start at
# column 2; each mean column keeps the name of the first replicate in its
# group (e.g. "X_T0_R1"). The result has one row per row of `data`, so no
# row count needs to be hard-coded.
stopifnot(ncol(data) > 1, (ncol(data) - 1) %% 3 == 0)
group_starts <- seq(2, ncol(data), by = 3)
names(group_starts) <- colnames(data)[group_starts]
data.mean <- data.frame(
  data[1],
  lapply(group_starts, function(col) {
    # rowMeans() replaces apply(..., 1, mean); building all columns in one
    # data.frame() call avoids growing data.mean with cbind() in a loop.
    rowMeans(data[, seq(col, length.out = 3), drop = FALSE], na.rm = TRUE)
  })
)
首先我们需要将数据转换为合理的格式。这完成了工作:
# Reshape to long format: one row per Gene x sample column, then split
# sample names such as "X_T0_R1" into Genotype / Time / Replicate.
# separate() splits on non-alphanumeric characters by default.
# (gather() is superseded by tidyr::pivot_longer(), which would return the
# rows in a different order.)
(longdata <- separate(
  gather(dataset, key = "key", value = "value", -Gene),
  col = key,
  into = c("Genotype", "Time", "Replicate")
))
#> # A tibble: 72 x 5
#> Gene Genotype Time Replicate value
#> <fct> <chr> <chr> <chr> <dbl>
#> 1 A X T0 R1 1.47
#> 2 B X T0 R1 0.220
#> 3 A X T0 R2 1.09
#> 4 B X T0 R2 0.238
#> 5 A X T0 R3 1.42
#> 6 B X T0 R3 0.211
#> 7 A X T1 R1 1.29
#> 8 B X T1 R1 0.208
#> 9 A X T1 R2 1.38
#> 10 B X T1 R2 0.489
#> # ... with 62 more rows
事实证明,预先计算我们想要绘制的度量值会更容易:
# Collapse the replicates of every Gene x Genotype x Time combination into
# their mean (Ave) and standard error of the mean (SE = sd / sqrt(n)),
# drop the remaining grouping, and store Time and Genotype as factors.
(longdata <- mutate(
  ungroup(
    summarize(
      group_by(longdata, Gene, Genotype, Time),
      Ave = mean(value),
      SE = sd(value) / sqrt(n())
    )
  ),
  Time = factor(Time),
  Genotype = factor(Genotype)
))
#> # A tibble: 24 x 5
#> Gene Genotype Time Ave SE
#> <fct> <fct> <fct> <dbl> <dbl>
#> 1 A X T0 1.33 0.120
#> 2 A X T1 1.50 0.169
#> 3 A X T2 1.12 0.265
#> 4 A X T3 1.03 0.125
#> 5 A Y T0 0.425 0.195
#> 6 A Y T1 0.723 0.0392
#> 7 A Y T2 0.336 0.0620
#> 8 A Y T3 0.328 0.0482
#> 9 A Z T0 0.788 0.397
#> 10 A Z T1 1.13 0.0903
#> # ... with 14 more rows
我将一些变量更改为因子,因为绘图需要这样。
现在是绘图,在这种情况下 "errorbars" 定义为 +/- 一个标准误差,因为您没有指定,但它可以更改。
# Dodged bars of the replicate mean per time point, filled by genotype,
# with one facet per gene.
ggplot(longdata, aes(x = Time, y = Ave, fill = Genotype)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  # Error bars span the mean +/- one standard error.
  geom_errorbar(
    aes(ymin = Ave - SE, ymax = Ave + SE),
    position = position_dodge(0.9),
    width = 0.1
  ) +
  facet_wrap(~ Gene)
我有一个超过 100 个不同样本的数据集。样品来自不同的基因型(例如 X、Y、Z)和 4 个不同的时间点(T0、1、2、3),具有 3 个生物学重复(R1、2、3)。我正在测量 50 个不同基因的值(按行)。
# Example data as dput() output: a data.frame with a Gene factor column
# (levels A, B; one row per gene) followed by 36 numeric columns named
# <Genotype>_<Time>_<Replicate> (X/Y/Z x T0-T3 x R1-R3).
# Note: the expression is not assigned to a name here.
structure(list(Gene = structure(1:2, .Label = c("A", "B"), class = "factor"),
X_T0_R1 = c(1.46559502, 0.220140568), X_T0_R2 = c(1.087642983,
0.237500819), X_T0_R3 = c(1.424945196, 0.21066267), X_T1_R1 = c(1.289943948,
0.207778662), X_T1_R2 = c(1.376535013, 0.488774258), X_T1_R3 = c(1.833390311,
0.182798731), X_T2_R1 = c(1.450753714, 0.247576125), X_T2_R2 = c(1.3094609,
0.390028842), X_T2_R3 = c(0.5953716, 1.007079177), X_T3_R1 = c(0.7906009,
0.730242116), X_T3_R2 = c(1.215333041, 1.012914813), X_T3_R3 = c(1.069312467,
0.780421013), Y_T0_R1 = c(0.053317766, 3.316414959), Y_T0_R2 = c(0.506623748,
3.599442788), Y_T0_R3 = c(0.713670106, 2.516735845), Y_T1_R1 = c(0.740998252,
1.444496448), Y_T1_R2 = c(0.648231834, 0.097957459), Y_T1_R3 = c(0.780499252,
0.187840968), Y_T2_R1 = c(0.35344654, 1.190274584), Y_T2_R2 = c(0.220223951,
1.367784148), Y_T2_R3 = c(0.432856978, 1.403057729), Y_T3_R1 = c(0.234963735,
1.232129062), Y_T3_R2 = c(0.353770497, 0.885122768), Y_T3_R3 = c(0.396091395,
1.333921747), Z_T0_R1 = c(0.398000559, 1.286528398), Z_T0_R2 = c(0.384759325,
1.122251177), Z_T0_R3 = c(1.582230097, 0.697419716), Z_T1_R1 = c(1.136843842,
0.804552001), Z_T1_R2 = c(1.275683837, 1.227821594), Z_T1_R3 = c(0.963349308,
0.968589683), Z_T2_R1 = c(3.765036263, 0.477443352), Z_T2_R2 = c(1.901023385,
0.832736132), Z_T2_R3 = c(1.407713024, 0.911920317), Z_T3_R1 = c(0.988333629,
1.095130142), Z_T3_R2 = c(0.618606729, 0.497458337), Z_T3_R3 = c(0.429823986,
0.471389536)), .Names = c("Gene", "X_T0_R1", "X_T0_R2", "X_T0_R3",
"X_T1_R1", "X_T1_R2", "X_T1_R3", "X_T2_R1", "X_T2_R2", "X_T2_R3",
"X_T3_R1", "X_T3_R2", "X_T3_R3", "Y_T0_R1", "Y_T0_R2", "Y_T0_R3",
"Y_T1_R1", "Y_T1_R2", "Y_T1_R3", "Y_T2_R1", "Y_T2_R2", "Y_T2_R3",
"Y_T3_R1", "Y_T3_R2", "Y_T3_R3", "Z_T0_R1", "Z_T0_R2", "Z_T0_R3",
"Z_T1_R1", "Z_T1_R2", "Z_T1_R3", "Z_T2_R1", "Z_T2_R2", "Z_T2_R3",
"Z_T3_R1", "Z_T3_R2", "Z_T3_R3"), class = "data.frame", row.names =
c(NA, -2L))
对于每个基因(即每一行),我想绘制一个图表,其中包含每个基因型各时间点的重复平均值 + SE(标准误差)
expected final graph look
我想通过创建一个新的数据框来做到这一点;包含每组重复的平均值和标准误差。 如何使用 tidyr 包实现这一点?如何同时得到标准误差? 我怎样才能改进下面这段代码?
# Average each group of three replicate columns (R1-R3), row by row.
# Column 1 holds the gene identifier, so the replicate groups start at
# column 2; each mean column keeps the name of the first replicate in its
# group (e.g. "X_T0_R1"). The result has one row per row of `data`, so no
# row count needs to be hard-coded.
stopifnot(ncol(data) > 1, (ncol(data) - 1) %% 3 == 0)
group_starts <- seq(2, ncol(data), by = 3)
names(group_starts) <- colnames(data)[group_starts]
data.mean <- data.frame(
  data[1],
  lapply(group_starts, function(col) {
    # rowMeans() replaces apply(..., 1, mean); building all columns in one
    # data.frame() call avoids growing data.mean with cbind() in a loop.
    rowMeans(data[, seq(col, length.out = 3), drop = FALSE], na.rm = TRUE)
  })
)
首先我们需要将数据转换为合理的格式。这完成了工作:
# Reshape to long format: one row per Gene x sample column, then split
# sample names such as "X_T0_R1" into Genotype / Time / Replicate.
# separate() splits on non-alphanumeric characters by default.
# (gather() is superseded by tidyr::pivot_longer(), which would return the
# rows in a different order.)
(longdata <- separate(
  gather(dataset, key = "key", value = "value", -Gene),
  col = key,
  into = c("Genotype", "Time", "Replicate")
))
#> # A tibble: 72 x 5
#> Gene Genotype Time Replicate value
#> <fct> <chr> <chr> <chr> <dbl>
#> 1 A X T0 R1 1.47
#> 2 B X T0 R1 0.220
#> 3 A X T0 R2 1.09
#> 4 B X T0 R2 0.238
#> 5 A X T0 R3 1.42
#> 6 B X T0 R3 0.211
#> 7 A X T1 R1 1.29
#> 8 B X T1 R1 0.208
#> 9 A X T1 R2 1.38
#> 10 B X T1 R2 0.489
#> # ... with 62 more rows
事实证明,预先计算我们想要绘制的度量值会更容易:
# Collapse the replicates of every Gene x Genotype x Time combination into
# their mean (Ave) and standard error of the mean (SE = sd / sqrt(n)),
# drop the remaining grouping, and store Time and Genotype as factors.
(longdata <- mutate(
  ungroup(
    summarize(
      group_by(longdata, Gene, Genotype, Time),
      Ave = mean(value),
      SE = sd(value) / sqrt(n())
    )
  ),
  Time = factor(Time),
  Genotype = factor(Genotype)
))
#> # A tibble: 24 x 5
#> Gene Genotype Time Ave SE
#> <fct> <fct> <fct> <dbl> <dbl>
#> 1 A X T0 1.33 0.120
#> 2 A X T1 1.50 0.169
#> 3 A X T2 1.12 0.265
#> 4 A X T3 1.03 0.125
#> 5 A Y T0 0.425 0.195
#> 6 A Y T1 0.723 0.0392
#> 7 A Y T2 0.336 0.0620
#> 8 A Y T3 0.328 0.0482
#> 9 A Z T0 0.788 0.397
#> 10 A Z T1 1.13 0.0903
#> # ... with 14 more rows
我将一些变量更改为因子,因为绘图需要这样。
现在是绘图,在这种情况下 "errorbars" 定义为 +/- 一个标准误差,因为您没有指定,但它可以更改。
# Dodged bars of the replicate mean per time point, filled by genotype,
# with one facet per gene.
ggplot(longdata, aes(x = Time, y = Ave, fill = Genotype)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  # Error bars span the mean +/- one standard error.
  geom_errorbar(
    aes(ymin = Ave - SE, ymax = Ave + SE),
    position = position_dodge(0.9),
    width = 0.1
  ) +
  facet_wrap(~ Gene)