ggplot2 R 上比例堆积面积图的不均匀间距和重叠线

Question

我想创建一个比例堆积面积图，就像用模拟数据得到的那样（图 1）。但当我用真实数据（16S 数据）执行同样的操作时，结果却如图 2 所示。

转换为百分比后，模拟（mock）数据和 16S 数据各列的类型（class）完全相同，如下：Timepoint - integer，Taxa - character，n - integer，percentage - numeric。

我希望 16S 数据的 x 轴能像模拟数据那样，分别按分类（离散）和按数值（连续）两种方式处理（即两张单独的图），并消除重叠的线条（也就是说，16S 的图在外观上应与模拟数据的图一致）。

# Reproducible sample: dput() of the first 40 rows of S1_RA1 (console output follows).
dput(S1_RA1[1:40,])
 structure(list(Timepoint = c(-10L, -10L, -10L, -10L, -10L, -10L, 
-10L, -10L, -10L, -3L, -3L, -3L, -3L, -3L, -3L, -3L, -3L, -3L, 
-3L, -3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), Taxa = c(" Anaerococcus", " Bacteroides", 
" Bifidobacterium", " Bilophila", " Collinsella", " Lachnoclostridium", 
" Streptococcus", " Veillonella", "Enterobacter", " Acinetobacter", 
" Anaerococcus", " Bacteroides", " Bifidobacterium", " Escherichia-Shigella", 
" Flavobacterium", " Lachnoclostridium", " Parabacteroides", 
" Peptoniphilus", " Veillonella", "Enterobacter", " Acinetobacter", 
" Bacteroides", " Bifidobacterium", " Bilophila", " Collinsella", 
" Desemzia", " Escherichia-Shigella", " Lachnoclostridium", " Parabacteroides", 
" Streptococcus", " Veillonella", " Bacteroides", " Bifidobacterium", 
" Bilophila", " Desemzia", " Escherichia-Shigella", " Lachnoclostridium", 
" Parabacteroides", " Streptococcus", " Veillonella"), n = c(40L, 
2188L, 665L, 84L, 55L, 131L, 153L, 11325L, 185L, 127L, 62L, 1123L, 
172L, 63L, 2L, 118L, 100L, 9L, 23123L, 109L, 253L, 2658L, 348L, 
163L, 204L, 27L, 163L, 245L, 290L, 41L, 17497L, 2325L, 50L, 197L, 
13L, 255L, 152L, 478L, 92L, 19692L), percentage = c(0.00269796303790638, 
0.147578578173479, 0.0448536355051936, 0.0056657223796034, 0.00370969917712127, 
0.0088358289491434, 0.0103197086199919, 0.763860785107244, 0.012478079050317, 
0.00507837492002559, 0.00247920665387076, 0.0449056301983365, 
0.00687779910428663, 0.00251919385796545, 7.99744081893794e-05, 
0.00471849008317338, 0.00399872040946897, 0.000359884836852207, 
0.92462412028151, 0.00435860524632118, 0.0115583169628581, 0.121430855680936, 
0.0158983964548403, 0.0074466627072959, 0.00931974964594088, 
0.00123349627666865, 0.0074466627072959, 0.0111928365845859, 
0.0132486637123669, 0.00187308693864498, 0.799351272328567, 0.0978823727529154, 
0.00210499726350356, 0.00829368921820402, 0.000547299288510925, 
0.0107354860438681, 0.00639919168105081, 0.020123773839094, 0.00387319496484655, 
0.829032122258241)), row.names = c(NA, -40L), groups = structure(list(
    Timepoint = c(-10L, -3L, 0L, 1L), .rows = structure(list(
        1:9, 10:20, 21:31, 32:40), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

我试过以下方法：

将 scale_x_discrete 改为 scale_x_continuous
改用 aes(x = as.factor(Timepoint), ...)
更改 scale_x_discrete 代码中的 limits/expand 参数
删除负时间点
更改 S1_RA2 文件中的数字列，使其与 Table 1 中的编号方式一致

我的 16S 代码如下，除颜色外与模拟几乎相同：

library(ggplot2)
library(dplyr)
# Load the raw relative-abundance table.
RA1 <- read.csv("RA1.csv", header = TRUE)

# Transform relative abundance from RA1.csv to percentages:
# sum the abundance of each taxon at each timepoint, then divide by the
# timepoint total so the values within one timepoint add up to 1.
S1_RA1 <- RA1 %>%
  group_by(Timepoint, Taxa) %>%
  # "drop_last" leaves the result grouped by Timepoint only, so sum(n) below
  # is the per-timepoint total (this is the default, made explicit here).
  summarise(n = sum(Relative.Abundance), .groups = "drop_last") %>%
  mutate(percentage = n / sum(n))

# Inspect the table that was just built (the previous call inspected an
# object that is not defined at this point in the script).
head(S1_RA1)

# Build a 16-colour fill palette by interpolating the 9 colours of the
# ColorBrewer "Set1" palette. Despite its name, getPalette is the resulting
# character vector of colours, not a function.
nb.cols <- 16
# brewer.pal() is namespace-qualified because RColorBrewer is not attached
# by any library() call in this script.
getPalette <- colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))(nb.cols)



# Revised code - The code below works courtesy of Gregor's comment
library(tidyr)

# Add an explicit zero row for every Timepoint/Taxa combination that is
# missing, so that each taxon has a value at every timepoint. The table is
# ungrouped first so complete() expands across all timepoints at once.
S1_RA1 <- S1_RA1 %>%
  ungroup() %>%
  complete(Timepoint, Taxa, fill = list(n = 0, percentage = 0))

# Proportional stacked area chart with a categorical x axis.
# group = Taxa tells ggplot which points to connect on the discrete scale.
ggplot(
  S1_RA1,
  aes(x = factor(Timepoint), y = percentage, fill = Taxa, group = Taxa)
) +
  geom_area(position = "fill", colour = "black", size = .5, alpha = .7) +
  scale_y_continuous(name = "Relative Abundance", expand = c(0, 0)) +
  scale_x_discrete(name = "Timepoint (d)", expand = c(0, 0)) +
  scale_fill_manual(values = getPalette) +
  theme(legend.position = "bottom")

Answer 1

我修复了三件事：

您希望把 x 轴作为分类变量处理，因此需要使用 factor(Timepoint)。（这样默认的刻度设置就够用了，所以我们删除了您手动指定的 limits。）
使用离散的 x 轴时，必须明确告诉 ggplot 要把哪些点连接起来；为此我们添加 group = Taxa 这一美学映射（aesthetic）。
奇怪的线穿过其他多边形的中间是因为你没有在每个时间点对每个类群进行观察，所以当点连接时它们可能会穿过中间时间点。使用 tidyr::complete 用 0 填充缺失的观察值。

library(tidyr)

# Pad the table with zero rows for every Timepoint/Taxa pair that has no
# observation, after removing the existing grouping.
S1_RA1 <- complete(
  ungroup(S1_RA1),
  Timepoint, Taxa,
  fill = list(n = 0, percentage = 0)
)

# Proportional stacked area chart: Timepoint is treated as a category and
# the areas are connected per taxon via the group aesthetic.
ggplot(
  data = S1_RA1,
  mapping = aes(
    x = factor(Timepoint),
    y = percentage,
    fill = Taxa,
    group = Taxa
  )
) +
  geom_area(position = "fill", colour = "black", size = .5, alpha = .7) +
  scale_fill_manual(values = getPalette) +
  scale_x_discrete(name = "Timepoint (d)", expand = c(0, 0)) +
  scale_y_continuous(name = "Relative Abundance", expand = c(0, 0)) +
  theme(legend.position = "bottom")

ggplot2 R 上比例堆积面积图的不均匀间距和重叠线

Unevenly spaced and overlapping lines for proportional stacked area graph on ggplot2 R

r

area

stacked

ggplot2