R:如何在不包含缺失数据的情况下将数据聚合为百分比,用于 ggplot2 中的堆积条形图?
R: How to aggregate data into percentages without missing data for stacked-bar plot in ggplot2?
我想按位置(Location)和底物(Substrate)将我的 "karyotype"(核型)分子数据汇总为百分比(示例数据见下文),以便在 ggplot2 中创建堆积条形图。
我已经弄清楚如何使用 'dcast' 得到每种核型的计数,但无法弄清楚如何得到三种核型(即 'BB'、'BD'、'DD')各自的百分比。
数据的格式应能在 'ggplot2' 中制作堆积条形图。
示例数据:
library(reshape2)
# Example data (dput output): a 25-row data frame with three factor columns,
# Location, Substrate and Karyotype.
#   * Location and Substrate share the levels "Kampinge", "Kaseberga", "Molle"
#     and "Steninge"; only the first two Location levels occur in this sample.
#   * Karyotype has the levels "", "BB", "BD" and "DD". The empty-string level
#     (integer code 1L) occurs in three rows; "BB" (code 2L) does not occur.
#     NOTE(review): "" presumably marks a missing karyotype call -- confirm.
#   * The non-consecutive row names (135, 136, ...) suggest the rows were taken
#     from a larger table -- assumption, not shown here.
Karotype.Data <- structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("Kampinge", "Kaseberga", "Molle", "Steninge"
), class = "factor"), Substrate = structure(c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 2L, 2L, 2L, 2L), .Label = c("Kampinge", "Kaseberga", "Molle",
"Steninge"), class = "factor"), Karyotype = structure(c(1L, 3L,
4L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 1L, 4L, 3L, 4L, 4L, 3L, 1L, 4L,
3L, 3L, 4L, 3L, 4L, 3L, 3L), .Label = c("", "BB", "BD", "DD"), class = "factor")), .Names = c("Location",
"Substrate", "Karyotype"), row.names = c(135L, 136L, 137L, 138L,
139L, 165L, 166L, 167L, 168L, 169L, 236L, 237L, 238L, 239L, 240L,
326L, 327L, 328L, 329L, 330L, 426L, 427L, 428L, 429L, 430L), class = "data.frame")
# Count the rows for each karyotype level within every Location/Substrate
# pair; the result is wide, with one count column per karyotype level.
Karyotype.Summary <- dcast(
  Karotype.Data,
  Location + Substrate ~ Karyotype,
  fun.aggregate = length,
  value.var = "Karyotype"
)
您可以使用 dplyr 包:
library(dplyr)
# Count the rows for every Location/Substrate/Karyotype combination.
# summarize() peels off only the innermost grouping level, so z.counts is
# still grouped by Location and Substrate.
z.counts <- Karotype.Data %>%
  group_by(Location, Substrate, Karyotype) %>%
  summarize(freq = n())

# Convert the counts to percentages (0-100) within each Location/Substrate
# pair. The empty-string karyotype level is still included in the totals here.
# ungroup() at the end so that later verbs applied to z.freq are not silently
# evaluated per group.
z.freq <- z.counts %>%
  group_by(Location, Substrate) %>%
  mutate(freq = freq / sum(freq) * 100) %>%
  ungroup()
此处,数据保留为长格式,因此可以直接使用 ggplot 构建条形图:
library(ggplot2)
# Bar chart of freq per karyotype, drawn as a grid of panels with
# Location on the rows and Substrate on the columns.
ggplot(z.freq, aes(x = Karyotype, y = freq)) +
  geom_bar(stat = "identity") +
  facet_grid(Location ~ Substrate)
在 'Marat Talipov' 的帮助以及 Stack Overflow 上许多其他问答的帮助下,我发现必须在 'dplyr' 之前加载 'plyr',并且要使用 'summarise' 而不是 'summarize'。
最后一步是使用 'filter' 删除缺失数据:
library(dplyr)
# Count the rows for every Location/Substrate/Karyotype combination.
# summarise() peels off only the innermost grouping level, so z.counts is
# still grouped by Location and Substrate.
z.counts <- Karotype.Data %>%
  group_by(Location, Substrate, Karyotype) %>%
  summarise(freq = n())

# Drop the rows whose karyotype is the empty string before normalising, so
# that freq is each remaining karyotype's share (a proportion in 0-1, not a
# percentage) within its Location/Substrate pair.
# ungroup() at the end so that later verbs applied to z.freq are not silently
# evaluated per group.
z.freq <- z.counts %>%
  filter(Karyotype != "") %>%
  group_by(Location, Substrate) %>%
  mutate(freq = freq / sum(freq)) %>%
  ungroup()

# Print the result for inspection.
z.freq
library (ggplot2)
# Stacked bars: one bar per Substrate, filled by Karyotype, with a separate
# panel for each Location.
ggplot(z.freq) +
  aes(x = Substrate, y = freq, fill = Karyotype) +
  facet_wrap(~ Location) +
  geom_bar(stat = "identity")
现在我已经得到了我想要的图:
我想按位置(Location)和底物(Substrate)将我的 "karyotype"(核型)分子数据汇总为百分比(示例数据见下文),以便在 ggplot2 中创建堆积条形图。
我已经弄清楚如何使用 'dcast' 得到每种核型的计数,但无法弄清楚如何得到三种核型(即 'BB'、'BD'、'DD')各自的百分比。
数据的格式应能在 'ggplot2' 中制作堆积条形图。
示例数据:
library(reshape2)
# Example data (dput output): a 25-row data frame with three factor columns,
# Location, Substrate and Karyotype.
#   * Location and Substrate share the levels "Kampinge", "Kaseberga", "Molle"
#     and "Steninge"; only the first two Location levels occur in this sample.
#   * Karyotype has the levels "", "BB", "BD" and "DD". The empty-string level
#     (integer code 1L) occurs in three rows; "BB" (code 2L) does not occur.
#     NOTE(review): "" presumably marks a missing karyotype call -- confirm.
#   * The non-consecutive row names (135, 136, ...) suggest the rows were taken
#     from a larger table -- assumption, not shown here.
Karotype.Data <- structure(list(Location = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("Kampinge", "Kaseberga", "Molle", "Steninge"
), class = "factor"), Substrate = structure(c(1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 2L, 2L, 2L, 2L), .Label = c("Kampinge", "Kaseberga", "Molle",
"Steninge"), class = "factor"), Karyotype = structure(c(1L, 3L,
4L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 1L, 4L, 3L, 4L, 4L, 3L, 1L, 4L,
3L, 3L, 4L, 3L, 4L, 3L, 3L), .Label = c("", "BB", "BD", "DD"), class = "factor")), .Names = c("Location",
"Substrate", "Karyotype"), row.names = c(135L, 136L, 137L, 138L,
139L, 165L, 166L, 167L, 168L, 169L, 236L, 237L, 238L, 239L, 240L,
326L, 327L, 328L, 329L, 330L, 426L, 427L, 428L, 429L, 430L), class = "data.frame")
# Count the rows for each karyotype level within every Location/Substrate
# pair; the result is wide, with one count column per karyotype level.
Karyotype.Summary <- dcast(
  Karotype.Data,
  Location + Substrate ~ Karyotype,
  fun.aggregate = length,
  value.var = "Karyotype"
)
您可以使用 dplyr 包:
library(dplyr)
# Count the rows for every Location/Substrate/Karyotype combination.
# summarize() peels off only the innermost grouping level, so z.counts is
# still grouped by Location and Substrate.
z.counts <- Karotype.Data %>%
  group_by(Location, Substrate, Karyotype) %>%
  summarize(freq = n())

# Convert the counts to percentages (0-100) within each Location/Substrate
# pair. The empty-string karyotype level is still included in the totals here.
# ungroup() at the end so that later verbs applied to z.freq are not silently
# evaluated per group.
z.freq <- z.counts %>%
  group_by(Location, Substrate) %>%
  mutate(freq = freq / sum(freq) * 100) %>%
  ungroup()
此处,数据保留为长格式,因此可以直接使用 ggplot 构建条形图:
library(ggplot2)
# Bar chart of freq per karyotype, drawn as a grid of panels with
# Location on the rows and Substrate on the columns.
ggplot(z.freq, aes(x = Karyotype, y = freq)) +
  geom_bar(stat = "identity") +
  facet_grid(Location ~ Substrate)
在 'Marat Talipov' 的帮助以及 Stack Overflow 上许多其他问答的帮助下,我发现必须在 'dplyr' 之前加载 'plyr',并且要使用 'summarise' 而不是 'summarize'。
最后一步是使用 'filter' 删除缺失数据:
library(dplyr)
# Count the rows for every Location/Substrate/Karyotype combination.
# summarise() peels off only the innermost grouping level, so z.counts is
# still grouped by Location and Substrate.
z.counts <- Karotype.Data %>%
  group_by(Location, Substrate, Karyotype) %>%
  summarise(freq = n())

# Drop the rows whose karyotype is the empty string before normalising, so
# that freq is each remaining karyotype's share (a proportion in 0-1, not a
# percentage) within its Location/Substrate pair.
# ungroup() at the end so that later verbs applied to z.freq are not silently
# evaluated per group.
z.freq <- z.counts %>%
  filter(Karyotype != "") %>%
  group_by(Location, Substrate) %>%
  mutate(freq = freq / sum(freq)) %>%
  ungroup()

# Print the result for inspection.
z.freq
library (ggplot2)
# Stacked bars: one bar per Substrate, filled by Karyotype, with a separate
# panel for each Location.
ggplot(z.freq) +
  aes(x = Substrate, y = freq, fill = Karyotype) +
  facet_wrap(~ Location) +
  geom_bar(stat = "identity")
现在我已经得到了我想要的图: