使用 ggplot2 在 R 中的分组箱线图上分组散点图
Grouped scatterplot over grouped boxplot in R using ggplot2
我正在使用 ggplot2 创建一个带有散点图叠加的分组箱线图。我想将每个散点图数据点与其对应的分组箱线图分组。
不过,我也希望散点图的点是不同的符号。我似乎能够让我的散点图点与我的分组箱线图分组,或者让我的散点图点成为不同的符号......但不能同时。下面是一些示例代码来说明正在发生的事情:
library(scales)
library(ggplot2)
# Generates Data frame to plot
Gene <- c(rep("GeneA",24),rep("GeneB",24),rep("GeneC",24),rep("GeneD",24),rep("GeneE",24))
Clone <- c(rep(c("D1","D2","D3","D4","D5","D6"),20))
variable <- c(rep(c(rep("Day10",6),rep("Day20",6),rep("Day30",6),rep("Day40",6)),5))
value <- c(rnorm(24, mean = 0.5, sd = 0.5),rnorm(24, mean = 10, sd = 8),rnorm(24, mean = 1000, sd = 900),
rnorm(24, mean = 25000, sd = 9000), rnorm(24, mean = 8000, sd = 3000))
value <- sqrt(value*value)
Tdata <- cbind(Gene, Clone, variable)
Tdata <- data.frame(Tdata)
Tdata <- cbind(Tdata,value)
# Creates the Plot of All Data
# The below code groups the data exactly how I'd like but the scatter plot points are all the same shape
# and I'd like them to each have different shapes.
ln_clr <- "black"
bk_clr <- "white"
point_shapes <- c(0,15,1,16,2,17)
blue_cols <- c("#EFF2FB","#81BEF7","#0174DF","#0000FF","#0404B4")
lp1 <- ggplot(Tdata, aes(x=variable, y=value, fill=Gene)) +
stat_boxplot(geom ='errorbar', position = position_dodge(width = .83), width = 0.25,
size = 0.7, coef = 4) +
geom_boxplot( coef=1, outlier.shape = NA, position = position_dodge(width = .83), lwd = 0.3,
alpha = 1, colour = ln_clr) +
geom_point(position = position_jitterdodge(dodge.width = 0.83), size = 1.8, alpha = 0.7,
pch=15)
lp1 + scale_fill_manual(values = blue_cols) + labs(y = "Fold Change") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand = c(0, 0), breaks = c(0.01,1,100,10000,100000),
labels = trans_format("log10", math_format(10^.x)))
ggsave("Scatter Grouped-Wrong Symbols.png")
#*************************************************************************************************************************************
# The below code doesn't group the scatterplot data how I'd like but the points each have different shapes
lp2 <- ggplot(Tdata, aes(x=variable, y=value, fill=Gene)) +
stat_boxplot(geom ='errorbar', position = position_dodge(width = .83), width = 0.25,
size = 0.7, coef = 4) +
geom_boxplot( coef=1, outlier.shape = NA, position = position_dodge(width = .83), lwd = 0.3,
alpha = 1, colour = ln_clr) +
geom_point(position = position_jitterdodge(dodge.width = 0.83), size = 1.8, alpha = 0.7,
aes(shape=Clone))
lp2 + scale_fill_manual(values = blue_cols) + labs(y = "Fold Change") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand = c(0, 0), breaks = c(0.01,1,100,10000,100000),
labels = trans_format("log10", math_format(10^.x)))
ggsave("Scatter Ungrouped-Right Symbols.png")
如果有人有任何建议,我将不胜感激。
谢谢
内森
要显示箱线图,shape
审美需要在 geom_point
内部,而不是在对 ggplot 的主要调用中。这样做的原因是,当 shape
审美在主 ggplot 调用中时,它适用于所有几何对象,包括 geom_boxplot
。但是,应用 shape=Clone
美学会导致 geom_boxplot
为 Clone
的每个级别创建一个单独的箱线图。由于 variable
和 Clone
的每个组合只有一行数据,因此不会生成箱线图。
shape
美学影响 geom_boxplot
对我来说似乎违反直觉,但也许有一个我不知道的原因。在任何情况下,将 shape
美学移至 geom_point
可以通过仅将 shape
美学应用于 geom_point
.
来解决问题。
然后,为了让点出现在正确的箱线图中,我们需要 group
到 Gene
。我还加了theme_classic
方便看剧情(虽然还是很忙):
ggplot(Tdata, aes(x=variable, y=value, fill=Gene)) +
stat_boxplot(geom ='errorbar', width=0.25, size=0.7, coef=4, position=position_dodge(0.85)) +
geom_boxplot(coef=1, outlier.shape=NA, lwd=0.3, alpha=1, colour=ln_clr, position=position_dodge(0.85)) +
geom_point(position=position_jitterdodge(dodge.width=0.85), size=1.8, alpha=0.7,
aes(shape=Clone, group=Gene)) +
scale_fill_manual(values=blue_cols) + labs(y="Fold Change") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand=c(0, 0), breaks=10^(-2:5),
labels=trans_format("log10", math_format(10^.x))) +
theme_classic()
我认为如果 Gene
使用分面,variable
使用 x 轴,情节会更容易理解。将时间放在 x 轴上似乎更直观,而使用分面可以释放点的颜色美感。有六个不同的克隆,仍然很难(至少对我而言)区分点标记,但对我来说这看起来比以前的版本更清晰。
library(dplyr)
ggplot(Tdata %>% mutate(Gene=gsub("Gene","Gene ", Gene)),
aes(x=gsub("Day","",variable), y=value)) +
stat_boxplot(geom='errorbar', width=0.25, size=0.7, coef=4) +
geom_boxplot(coef=1, outlier.shape=NA, lwd=0.3, alpha=1, colour=ln_clr, width=0.5) +
geom_point(aes(fill=Clone), position=position_jitter(0.2), size=1.5, alpha=0.7, shape=21) +
theme_classic() +
facet_grid(. ~ Gene) +
labs(y = "Fold Change", x="Day") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand=c(0, 0), breaks=10^(-2:5),
labels=trans_format("log10", math_format(10^.x)))
如果你真的需要保留点,也许通过一些手动躲避将箱线图和点分开会更好:
set.seed(10)
ggplot(Tdata %>% mutate(Day=as.numeric(substr(variable,4,5)),
Gene = gsub("Gene","Gene ", Gene)),
aes(x=Day - 2, y=value, group=Day)) +
stat_boxplot(geom ='errorbar', width=0.5, size=0.5, coef=4) +
geom_boxplot(coef=1, outlier.shape=NA, lwd=0.3, alpha=1, width=4) +
geom_point(aes(x=Day + 2, fill=Clone), size=1.5, alpha=0.7, shape=21,
position=position_jitter(width=1, height=0)) +
theme_classic() +
facet_grid(. ~ Gene) +
labs(y="Fold Change", x="Day") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand=c(0, 0), breaks=10^(-2:5),
labels=trans_format("log10", math_format(10^.x)))
还有一点:为了将来参考,您可以简化数据创建代码:
Gene = rep(paste0("Gene",LETTERS[1:5]), each=24)
Clone = rep(paste0("D",1:6), 20)
variable = rep(rep(paste0("Day", seq(10,40,10)), each=6), 5)
value = rnorm(24*5, mean=rep(c(0.5,10,1000,25000,8000), each=24),
sd=rep(c(0.5,8,900,9000,3000), each=24))
Tdata = data.frame(Gene, Clone, variable, value)
我正在使用 ggplot2 创建一个带有散点图叠加的分组箱线图。我想将每个散点图数据点与其对应的分组箱线图分组。
不过,我也希望散点图的点是不同的符号。我似乎能够让我的散点图点与我的分组箱线图分组,或者让我的散点图点成为不同的符号......但不能同时。下面是一些示例代码来说明正在发生的事情:
library(scales)
library(ggplot2)
# Generates Data frame to plot
Gene <- c(rep("GeneA",24),rep("GeneB",24),rep("GeneC",24),rep("GeneD",24),rep("GeneE",24))
Clone <- c(rep(c("D1","D2","D3","D4","D5","D6"),20))
variable <- c(rep(c(rep("Day10",6),rep("Day20",6),rep("Day30",6),rep("Day40",6)),5))
value <- c(rnorm(24, mean = 0.5, sd = 0.5),rnorm(24, mean = 10, sd = 8),rnorm(24, mean = 1000, sd = 900),
rnorm(24, mean = 25000, sd = 9000), rnorm(24, mean = 8000, sd = 3000))
value <- sqrt(value*value)
Tdata <- cbind(Gene, Clone, variable)
Tdata <- data.frame(Tdata)
Tdata <- cbind(Tdata,value)
# Creates the Plot of All Data
# The below code groups the data exactly how I'd like but the scatter plot points are all the same shape
# and I'd like them to each have different shapes.
ln_clr <- "black"
bk_clr <- "white"
point_shapes <- c(0,15,1,16,2,17)
blue_cols <- c("#EFF2FB","#81BEF7","#0174DF","#0000FF","#0404B4")
lp1 <- ggplot(Tdata, aes(x=variable, y=value, fill=Gene)) +
stat_boxplot(geom ='errorbar', position = position_dodge(width = .83), width = 0.25,
size = 0.7, coef = 4) +
geom_boxplot( coef=1, outlier.shape = NA, position = position_dodge(width = .83), lwd = 0.3,
alpha = 1, colour = ln_clr) +
geom_point(position = position_jitterdodge(dodge.width = 0.83), size = 1.8, alpha = 0.7,
pch=15)
lp1 + scale_fill_manual(values = blue_cols) + labs(y = "Fold Change") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand = c(0, 0), breaks = c(0.01,1,100,10000,100000),
labels = trans_format("log10", math_format(10^.x)))
ggsave("Scatter Grouped-Wrong Symbols.png")
#*************************************************************************************************************************************
# The below code doesn't group the scatterplot data how I'd like but the points each have different shapes
lp2 <- ggplot(Tdata, aes(x=variable, y=value, fill=Gene)) +
stat_boxplot(geom ='errorbar', position = position_dodge(width = .83), width = 0.25,
size = 0.7, coef = 4) +
geom_boxplot( coef=1, outlier.shape = NA, position = position_dodge(width = .83), lwd = 0.3,
alpha = 1, colour = ln_clr) +
geom_point(position = position_jitterdodge(dodge.width = 0.83), size = 1.8, alpha = 0.7,
aes(shape=Clone))
lp2 + scale_fill_manual(values = blue_cols) + labs(y = "Fold Change") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand = c(0, 0), breaks = c(0.01,1,100,10000,100000),
labels = trans_format("log10", math_format(10^.x)))
ggsave("Scatter Ungrouped-Right Symbols.png")
如果有人有任何建议,我将不胜感激。
谢谢 内森
要显示箱线图,shape
审美需要在 geom_point
内部,而不是在对 ggplot 的主要调用中。这样做的原因是,当 shape
审美在主 ggplot 调用中时,它适用于所有几何对象,包括 geom_boxplot
。但是,应用 shape=Clone
美学会导致 geom_boxplot
为 Clone
的每个级别创建一个单独的箱线图。由于 variable
和 Clone
的每个组合只有一行数据,因此不会生成箱线图。
shape
美学影响 geom_boxplot
对我来说似乎违反直觉,但也许有一个我不知道的原因。在任何情况下,将 shape
美学移至 geom_point
可以通过仅将 shape
美学应用于 geom_point
.
然后,为了让点出现在正确的箱线图中,我们需要 group
到 Gene
。我还加了theme_classic
方便看剧情(虽然还是很忙):
ggplot(Tdata, aes(x=variable, y=value, fill=Gene)) +
stat_boxplot(geom ='errorbar', width=0.25, size=0.7, coef=4, position=position_dodge(0.85)) +
geom_boxplot(coef=1, outlier.shape=NA, lwd=0.3, alpha=1, colour=ln_clr, position=position_dodge(0.85)) +
geom_point(position=position_jitterdodge(dodge.width=0.85), size=1.8, alpha=0.7,
aes(shape=Clone, group=Gene)) +
scale_fill_manual(values=blue_cols) + labs(y="Fold Change") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand=c(0, 0), breaks=10^(-2:5),
labels=trans_format("log10", math_format(10^.x))) +
theme_classic()
我认为如果 Gene
使用分面,variable
使用 x 轴,情节会更容易理解。将时间放在 x 轴上似乎更直观,而使用分面可以释放点的颜色美感。有六个不同的克隆,仍然很难(至少对我而言)区分点标记,但对我来说这看起来比以前的版本更清晰。
library(dplyr)
ggplot(Tdata %>% mutate(Gene=gsub("Gene","Gene ", Gene)),
aes(x=gsub("Day","",variable), y=value)) +
stat_boxplot(geom='errorbar', width=0.25, size=0.7, coef=4) +
geom_boxplot(coef=1, outlier.shape=NA, lwd=0.3, alpha=1, colour=ln_clr, width=0.5) +
geom_point(aes(fill=Clone), position=position_jitter(0.2), size=1.5, alpha=0.7, shape=21) +
theme_classic() +
facet_grid(. ~ Gene) +
labs(y = "Fold Change", x="Day") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand=c(0, 0), breaks=10^(-2:5),
labels=trans_format("log10", math_format(10^.x)))
如果你真的需要保留点,也许通过一些手动躲避将箱线图和点分开会更好:
set.seed(10)
ggplot(Tdata %>% mutate(Day=as.numeric(substr(variable,4,5)),
Gene = gsub("Gene","Gene ", Gene)),
aes(x=Day - 2, y=value, group=Day)) +
stat_boxplot(geom ='errorbar', width=0.5, size=0.5, coef=4) +
geom_boxplot(coef=1, outlier.shape=NA, lwd=0.3, alpha=1, width=4) +
geom_point(aes(x=Day + 2, fill=Clone), size=1.5, alpha=0.7, shape=21,
position=position_jitter(width=1, height=0)) +
theme_classic() +
facet_grid(. ~ Gene) +
labs(y="Fold Change", x="Day") +
expand_limits(y=c(0.01,10^5)) +
scale_y_log10(expand=c(0, 0), breaks=10^(-2:5),
labels=trans_format("log10", math_format(10^.x)))
还有一点:为了将来参考,您可以简化数据创建代码:
Gene = rep(paste0("Gene",LETTERS[1:5]), each=24)
Clone = rep(paste0("D",1:6), 20)
variable = rep(rep(paste0("Day", seq(10,40,10)), each=6), 5)
value = rnorm(24*5, mean=rep(c(0.5,10,1000,25000,8000), each=24),
sd=rep(c(0.5,8,900,9000,3000), each=24))
Tdata = data.frame(Gene, Clone, variable, value)