在 R 中使用 ggplot2 在分组箱线图上叠加分组散点图

Grouped scatterplot over grouped boxplot in R using ggplot2

我正在使用 ggplot2 创建一个带有散点图叠加的分组箱线图。我想将每个散点图数据点与其对应的分组箱线图分组。

不过,我也希望散点图的点是不同的符号。我似乎能够让我的散点图点与我的分组箱线图分组,或者让我的散点图点成为不同的符号......但不能同时。下面是一些示例代码来说明正在发生的事情:

library(scales)
library(ggplot2) 

# Build the example data frame to plot: 5 genes x 4 time points x 6 clones
# (120 rows), with one simulated value per row.
Gene <- c(rep("GeneA", 24), rep("GeneB", 24), rep("GeneC", 24),
          rep("GeneD", 24), rep("GeneE", 24))
Clone <- rep(c("D1", "D2", "D3", "D4", "D5", "D6"), 20)
variable <- rep(c(rep("Day10", 6), rep("Day20", 6),
                  rep("Day30", 6), rep("Day40", 6)), 5)

# One block of 24 draws per gene, each gene with its own mean and sd.
# No seed is set, so the simulated values differ from run to run.
value <- c(rnorm(24, mean = 0.5, sd = 0.5),
           rnorm(24, mean = 10, sd = 8),
           rnorm(24, mean = 1000, sd = 900),
           rnorm(24, mean = 25000, sd = 9000),
           rnorm(24, mean = 8000, sd = 3000))

# Make every value non-negative (abs() replaces the former sqrt(value * value));
# presumably needed because the plots use a log10 y axis.
value <- abs(value)

# Assemble the columns in one step instead of going through a character
# matrix with cbind() and then binding the numeric column on afterwards.
Tdata <- data.frame(Gene, Clone, variable, value)

# Figure 1: all data in one panel.
# The points line up with their dodged boxplots exactly as wanted, but every
# point is drawn with the same symbol rather than one symbol per clone.
ln_clr <- "black"
bk_clr <- "white"                       # not referenced by the plots in view
point_shapes <- c(0, 15, 1, 16, 2, 17)  # not referenced by the plots in view
blue_cols <- c("#EFF2FB", "#81BEF7", "#0174DF", "#0000FF", "#0404B4")

lp1 <- ggplot(data = Tdata, mapping = aes(x = variable, y = value, fill = Gene)) +
  # Whisker caps drawn as error bars, dodged by the same width as the boxes.
  stat_boxplot(
    geom = "errorbar",
    coef = 4,
    size = 0.7,
    width = 0.25,
    position = position_dodge(width = 0.83)
  ) +
  # Boxes; outlier.shape = NA hides the outlier markers (the point layer
  # below already draws every observation).
  geom_boxplot(
    colour = ln_clr,
    alpha = 1,
    lwd = 0.3,
    coef = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.83)
  ) +
  # Raw observations, jittered within each dodged box; pch = 15 fixes a
  # single symbol for all points.
  geom_point(
    pch = 15,
    alpha = 0.7,
    size = 1.8,
    position = position_jitterdodge(dodge.width = 0.83)
  )

# Display the plot with its fill palette and a log10 y axis whose breaks are
# labelled as powers of ten.
lp1 +
  scale_fill_manual(values = blue_cols) +
  labs(y = "Fold Change") +
  expand_limits(y = c(0.01, 1e5)) +
  scale_y_log10(
    breaks = c(0.01, 1, 100, 1e4, 1e5),
    labels = trans_format("log10", math_format(10^.x)),
    expand = c(0, 0)
  )

# Called without a plot argument, so ggsave() writes the last plot.
ggsave("Scatter Grouped-Wrong Symbols.png")

#*************************************************************************************************************************************
# Figure 2: same layers as the first plot, but the point symbol is mapped to
# Clone. Each clone now gets its own symbol, but the points no longer sit
# with the boxplot of their gene.
# Relies on ln_clr and blue_cols being defined beforehand.
lp2 <- ggplot(data = Tdata, mapping = aes(x = variable, y = value, fill = Gene)) +
  # Whisker caps drawn as error bars, dodged by the same width as the boxes.
  stat_boxplot(
    geom = "errorbar",
    coef = 4,
    size = 0.7,
    width = 0.25,
    position = position_dodge(width = 0.83)
  ) +
  # Boxes; outlier.shape = NA hides the outlier markers.
  geom_boxplot(
    colour = ln_clr,
    alpha = 1,
    lwd = 0.3,
    coef = 1,
    outlier.shape = NA,
    position = position_dodge(width = 0.83)
  ) +
  # Raw observations with one symbol per clone.
  geom_point(
    mapping = aes(shape = Clone),
    alpha = 0.7,
    size = 1.8,
    position = position_jitterdodge(dodge.width = 0.83)
  )

# Display the plot with its fill palette and a log10 y axis whose breaks are
# labelled as powers of ten.
lp2 +
  scale_fill_manual(values = blue_cols) +
  labs(y = "Fold Change") +
  expand_limits(y = c(0.01, 1e5)) +
  scale_y_log10(
    breaks = c(0.01, 1, 100, 1e4, 1e5),
    labels = trans_format("log10", math_format(10^.x)),
    expand = c(0, 0)
  )

# Called without a plot argument, so ggsave() writes the last plot.
ggsave("Scatter Ungrouped-Right Symbols.png")

如果有人有任何建议,我将不胜感激。

谢谢 内森

要让箱线图正常显示,shape 美学映射(aesthetic)需要放在 geom_point 内部,而不是放在对 ggplot 的主调用中。原因是:当 shape 美学映射位于主 ggplot 调用中时,它会应用于所有几何对象,包括 geom_boxplot。而应用 shape=Clone 会导致 geom_boxplot 为 Clone 的每个水平单独创建一个箱线图。由于 variable 和 Clone 的每个组合只有一行数据,因此无法生成箱线图。

shape 美学映射会影响 geom_boxplot,这在我看来有些违反直觉,但也许有我不了解的原因。无论如何,把 shape 美学映射移到 geom_point 中,使其只作用于 geom_point,就可以解决这个问题。

然后,为了让点出现在正确的箱线图上,我们需要按 Gene 分组(group=Gene)。我还添加了 theme_classic,让图更容易看清(尽管它仍然很拥挤):

# Corrected plot: the symbol is mapped to Clone inside geom_point() only, and
# that layer is grouped by Gene so the points dodge with their boxplots.
# Relies on ln_clr and blue_cols being defined beforehand.
ggplot(data = Tdata, mapping = aes(x = variable, y = value, fill = Gene)) +
  # Whisker caps drawn as error bars, dodged by the same width as the boxes.
  stat_boxplot(
    geom = "errorbar",
    position = position_dodge(width = 0.85),
    coef = 4,
    size = 0.7,
    width = 0.25
  ) +
  # Boxes; outlier.shape = NA hides the outlier markers.
  geom_boxplot(
    position = position_dodge(width = 0.85),
    colour = ln_clr,
    alpha = 1,
    lwd = 0.3,
    coef = 1,
    outlier.shape = NA
  ) +
  # Raw observations: one symbol per clone, grouped by gene for dodging.
  geom_point(
    mapping = aes(shape = Clone, group = Gene),
    position = position_jitterdodge(dodge.width = 0.85),
    alpha = 0.7,
    size = 1.8
  ) +
  scale_fill_manual(values = blue_cols) +
  labs(y = "Fold Change") +
  expand_limits(y = c(0.01, 1e5)) +
  # log10 y axis with a break at every power of ten from 10^-2 to 10^5.
  scale_y_log10(
    breaks = 10^(-2:5),
    labels = trans_format("log10", math_format(10^.x)),
    expand = c(0, 0)
  ) +
  theme_classic()

我认为,如果用分面(facet)表示 Gene、用 x 轴表示 variable,这张图会更容易理解。把时间放在 x 轴上似乎更直观,而使用分面可以把颜色美学映射腾出来用于区分点。由于有六个不同的克隆,(至少对我而言)仍然很难区分各个点标记,但在我看来这张图比之前的版本更清晰。

library(dplyr)

# Faceted alternative: one panel per gene, day on the x axis, and the clone
# shown as the fill colour of each point.
# Relies on ln_clr being defined beforehand.
ggplot(
  # Insert a space into the gene labels ("GeneA" becomes "Gene A").
  data = Tdata %>% mutate(Gene = gsub("Gene", "Gene ", Gene)),
  # Strip the "Day" prefix so the axis shows only the day number.
  mapping = aes(x = gsub("Day", "", variable), y = value)
) +
  stat_boxplot(
    geom = "errorbar",
    coef = 4,
    size = 0.7,
    width = 0.25
  ) +
  geom_boxplot(
    colour = ln_clr,
    alpha = 1,
    lwd = 0.3,
    width = 0.5,
    coef = 1,
    outlier.shape = NA
  ) +
  # shape = 21 is a fillable circle, so the fill mapping colours the points.
  geom_point(
    mapping = aes(fill = Clone),
    shape = 21,
    alpha = 0.7,
    size = 1.5,
    position = position_jitter(width = 0.2)
  ) +
  theme_classic() +
  facet_grid(. ~ Gene) +
  labs(y = "Fold Change", x = "Day") +
  expand_limits(y = c(0.01, 1e5)) +
  # log10 y axis with a break at every power of ten from 10^-2 to 10^5.
  scale_y_log10(
    breaks = 10^(-2:5),
    labels = trans_format("log10", math_format(10^.x)),
    expand = c(0, 0)
  )

如果你确实需要保留这些点,也许通过手动错开(dodge)把箱线图和点分开会更好:

# Manual dodging: each box is drawn 2 units left of its day and the points
# 2 units right of it, so boxes and points do not overlap.
set.seed(10)  # seed the RNG so the jitter is reproducible
ggplot(
  data = Tdata %>%
    mutate(
      # Numeric day taken from characters 4-5 of "DayNN".
      Day = as.numeric(substr(variable, 4, 5)),
      # Insert a space into the gene labels ("GeneA" becomes "Gene A").
      Gene = gsub("Gene", "Gene ", Gene)
    ),
  # group = Day keeps one box per day now that x is continuous.
  mapping = aes(x = Day - 2, y = value, group = Day)
) +
  stat_boxplot(
    geom = "errorbar",
    coef = 4,
    size = 0.5,
    width = 0.5
  ) +
  geom_boxplot(
    alpha = 1,
    lwd = 0.3,
    width = 4,
    coef = 1,
    outlier.shape = NA
  ) +
  # Points override x to sit right of the box; jitter is horizontal only.
  geom_point(
    mapping = aes(x = Day + 2, fill = Clone),
    shape = 21,
    alpha = 0.7,
    size = 1.5,
    position = position_jitter(width = 1, height = 0)
  ) +
  theme_classic() +
  facet_grid(. ~ Gene) +
  labs(y = "Fold Change", x = "Day") +
  expand_limits(y = c(0.01, 1e5)) +
  # log10 y axis with a break at every power of ten from 10^-2 to 10^5.
  scale_y_log10(
    breaks = 10^(-2:5),
    labels = trans_format("log10", math_format(10^.x)),
    expand = c(0, 0)
  )

还有一点:为了将来参考,您可以简化数据创建代码:

# Compact construction of the example data: 5 genes x 4 days x 6 clones
# (120 rows), with one simulated value per row.
Gene <- rep(paste0("Gene", LETTERS[1:5]), each = 24)
Clone <- rep(paste0("D", 1:6), 20)
variable <- rep(rep(paste0("Day", seq(10, 40, 10)), each = 6), 5)

# rnorm() takes mean and sd element-wise, so each block of 24 draws uses the
# parameters of its own gene.
value <- rnorm(24 * 5,
               mean = rep(c(0.5, 10, 1000, 25000, 8000), each = 24),
               sd = rep(c(0.5, 8, 900, 9000, 3000), each = 24))

# Keep the values non-negative, as the longer data-creation code does;
# otherwise negative draws cannot be shown on the log10 y axis of the plots.
value <- abs(value)

Tdata <- data.frame(Gene, Clone, variable, value)