在 R 中使用 ggplot 的 facet_wrap 为各分面叠加不同的垂直线(vline)
Overlaying different vlines in R with ggplot facet_wrap
我正在尝试生成一组密度图,用来展示两组基因的表达水平分布在四种细胞类型中的差异。除了密度图之外,我还想把两组基因各自的表达水平中位数叠加到每个图上。参考了几个类似问题的回答之后,我要么能得到正确的密度图,要么能得到正确的中位数,但无法两者兼得。我已经无计可施,希望有人能指点一下。谢谢!
此处提供示例数据:https://github.com/adadiehl/sample_data/blob/master/sample.data
第一次尝试:密度图正确,但四个分面上绘制的都是相同的中位数:
# Attempt 1: the density curves are faceted by `source`, but the two median
# lines come out identical in every panel.
# NOTE(review): the code below expects columns named FPKM, class and source
# right after read.table() -- confirm the file actually supplies those names.
dat = read.table("sample.data")
# Only rows with FPKM > 0 feed the density layer.
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
# Each median below is taken over all rows of one class in the whole of `dat`,
# with no split by `source`. It is therefore a single number, and the same
# vertical line is repeated in every facet.
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "Other") ]) ), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "a_MCKG") ]) ), colour="tomato1", linetype="longdash")
# One panel per `source`, two columns, each panel with its own axis ranges.
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
第二次尝试:密度图正确,但所有中位数都被画到了每一个分面上:
# Attempt 2: per-source medians are precomputed, but every median line ends up
# in every panel.
dat = read.table("sample.data")
# One row per level of `source`:
#   z  - the source level
#   vl - median FPKM (> 0) of the rows whose class is not "a_MCKG"
#   vm - median FPKM (> 0) of the rows whose class is "a_MCKG"
# NOTE(review): levels() only returns values when `source` is a factor --
# confirm how read.table() typed that column.
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + facet_wrap(~source, ncol=2, scales="free")
# vline.dat stores the grouping in `z`, not `source`. The facets are defined on
# `source`, so these layers carry no faceting variable and all of their rows
# are drawn in every panel.
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
# Repeats the facet specification that was already added above.
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
第三次尝试:各分面中的密度图完全相同,但中位数是正确的。
# Attempt 3: the median lines land in the right panels, but every panel shows
# the same density curves.
dat = read.table("sample.data")
# One row per level of `source`:
#   z  - the source level
#   vl - median FPKM (> 0) of the rows whose class is not "a_MCKG"
#   vm - median FPKM (> 0) of the rows whose class is "a_MCKG"
# NOTE(review): levels() only returns values when `source` is a factor --
# confirm how read.table() typed that column.
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
# This facet specification is superseded by the facet_wrap(~z) call below.
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
# Faceting on `z` matches vline.dat, so each median goes to its own panel.
# The density data has no `z` column, so the full set of curves is repeated in
# every panel instead of being split by source.
g = g + facet_wrap(~z, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
传递预汇总数据是可行的方法:
# Summarise first, then plot: compute one median per (source, class) pair and
# give that small table to geom_vline(). Because the table carries `source`,
# each facet only receives its own lines.
library(plyr)

colnames(dat) <- c("FPKM", "class", "source")

# Keep the expressed rows only (FPKM > 0).
dat2 <- dat[which(dat$FPKM > 0), ]

# One row per source/class combination holding the median FPKM.
med_by_group <- ddply(
  dat2,
  c("source", "class"),
  summarise,
  median_fpkm = median(FPKM)
)

ggplot(dat2, aes(x = FPKM)) +
  geom_density(
    aes(y = ..density.., group = class, colour = class, fill = class),
    alpha = 0.2
  ) +
  # Mapping colour to `class` makes each line match its density curve.
  geom_vline(
    data = med_by_group,
    aes(xintercept = median_fpkm, colour = class)
  ) +
  facet_wrap(~ source, ncol = 2, scales = "free") +
  ggtitle("Distribution of FPKM, MCKG vs. Other") +
  xlab("FPKM > 0")
或者,您可以使用 base R 实现相同的效果:
# Base-R variant: aggregate() builds the per-source, per-class median table.
# The median ends up in a column that is still called FPKM.
dat3 <- aggregate(
  FPKM ~ source + class,
  data = dat2,
  FUN = median
)

ggplot(dat2, aes(x = FPKM)) +
  geom_density(
    aes(y = ..density.., group = class, colour = class, fill = class),
    alpha = 0.2
  ) +
  # dat3 keeps the `source` column, so every line stays in its own facet.
  geom_vline(
    data = dat3,
    aes(xintercept = FPKM, colour = class)
  ) +
  facet_wrap(~ source, ncol = 2, scales = "free") +
  labs(
    title = "Distribution of FPKM, MCKG vs. Other",
    x = "FPKM > 0"
  )
注意:您可能希望避免使用诸如 `source` 和 `class` 之类的列名,因为它们会与 R 的内置函数发生冲突。
我正在尝试生成一组密度图,用来展示两组基因的表达水平分布在四种细胞类型中的差异。除了密度图之外,我还想把两组基因各自的表达水平中位数叠加到每个图上。参考了几个类似问题的回答之后,我要么能得到正确的密度图,要么能得到正确的中位数,但无法两者兼得。我已经无计可施,希望有人能指点一下。谢谢!
此处提供示例数据:https://github.com/adadiehl/sample_data/blob/master/sample.data
第一次尝试:密度图正确,但四个分面上绘制的都是相同的中位数:
# Attempt 1: the density curves are faceted by `source`, but the two median
# lines come out identical in every panel.
# NOTE(review): the code below expects columns named FPKM, class and source
# right after read.table() -- confirm the file actually supplies those names.
dat = read.table("sample.data")
# Only rows with FPKM > 0 feed the density layer.
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
# Each median below is taken over all rows of one class in the whole of `dat`,
# with no split by `source`. It is therefore a single number, and the same
# vertical line is repeated in every facet.
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "Other") ]) ), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "a_MCKG") ]) ), colour="tomato1", linetype="longdash")
# One panel per `source`, two columns, each panel with its own axis ranges.
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
第二次尝试:密度图正确,但所有中位数都被画到了每一个分面上:
# Attempt 2: per-source medians are precomputed, but every median line ends up
# in every panel.
dat = read.table("sample.data")
# One row per level of `source`:
#   z  - the source level
#   vl - median FPKM (> 0) of the rows whose class is not "a_MCKG"
#   vm - median FPKM (> 0) of the rows whose class is "a_MCKG"
# NOTE(review): levels() only returns values when `source` is a factor --
# confirm how read.table() typed that column.
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + facet_wrap(~source, ncol=2, scales="free")
# vline.dat stores the grouping in `z`, not `source`. The facets are defined on
# `source`, so these layers carry no faceting variable and all of their rows
# are drawn in every panel.
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
# Repeats the facet specification that was already added above.
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
第三次尝试:各分面中的密度图完全相同,但中位数是正确的。
# Attempt 3: the median lines land in the right panels, but every panel shows
# the same density curves.
dat = read.table("sample.data")
# One row per level of `source`:
#   z  - the source level
#   vl - median FPKM (> 0) of the rows whose class is not "a_MCKG"
#   vm - median FPKM (> 0) of the rows whose class is "a_MCKG"
# NOTE(review): levels() only returns values when `source` is a factor --
# confirm how read.table() typed that column.
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
# This facet specification is superseded by the facet_wrap(~z) call below.
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
# Faceting on `z` matches vline.dat, so each median goes to its own panel.
# The density data has no `z` column, so the full set of curves is repeated in
# every panel instead of being split by source.
g = g + facet_wrap(~z, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
传递预汇总数据是可行的方法:
# Summarise first, then plot: compute one median per (source, class) pair and
# give that small table to geom_vline(). Because the table carries `source`,
# each facet only receives its own lines.
library(plyr)

colnames(dat) <- c("FPKM", "class", "source")

# Keep the expressed rows only (FPKM > 0).
dat2 <- dat[which(dat$FPKM > 0), ]

# One row per source/class combination holding the median FPKM.
med_by_group <- ddply(
  dat2,
  c("source", "class"),
  summarise,
  median_fpkm = median(FPKM)
)

ggplot(dat2, aes(x = FPKM)) +
  geom_density(
    aes(y = ..density.., group = class, colour = class, fill = class),
    alpha = 0.2
  ) +
  # Mapping colour to `class` makes each line match its density curve.
  geom_vline(
    data = med_by_group,
    aes(xintercept = median_fpkm, colour = class)
  ) +
  facet_wrap(~ source, ncol = 2, scales = "free") +
  ggtitle("Distribution of FPKM, MCKG vs. Other") +
  xlab("FPKM > 0")
或者,您可以使用 base R 实现相同的效果:
# Base-R variant: aggregate() builds the per-source, per-class median table.
# The median ends up in a column that is still called FPKM.
dat3 <- aggregate(
  FPKM ~ source + class,
  data = dat2,
  FUN = median
)

ggplot(dat2, aes(x = FPKM)) +
  geom_density(
    aes(y = ..density.., group = class, colour = class, fill = class),
    alpha = 0.2
  ) +
  # dat3 keeps the `source` column, so every line stays in its own facet.
  geom_vline(
    data = dat3,
    aes(xintercept = FPKM, colour = class)
  ) +
  facet_wrap(~ source, ncol = 2, scales = "free") +
  labs(
    title = "Distribution of FPKM, MCKG vs. Other",
    x = "FPKM > 0"
  )
注意:您可能希望避免使用诸如 `source` 和 `class` 之类的列名,因为它们会与 R 的内置函数发生冲突。