如何在使用分面网格(facet grid)且组数未知的 ggplot 函数中避免重复调用 geom?
Avoid repeated geom calls in ggplot function with facet grid and unknown number of groups?
我正在尝试编写一个绘制密度图的函数,并为每个分面(组)叠加一条正态曲线作为参考。下面,为了简化核心问题,我没有直接定义函数。
尝试
# Initial setup
library(dplyr)
data <- mtcars
# Column names are kept as strings so they can later become function arguments.
group = "cyl"
variable = "mpg"
# Facet formula of the form `<group> ~ .` (here: cyl ~ .).
gform <- reformulate(".", response=group)
# The grouping column is converted to a factor.
data[[group]] <- as.factor(data[[group]])
# Make data for normal curves
# NOTE: `.` below is the magrittr placeholder, i.e. the whole data frame that
# was piped into summarise(), not the rows of the current group. min(), max(),
# mean() and sd() are therefore computed on all rows, so every group receives
# the same 100-point curve. This is the behaviour the question asks about.
dat_norm <- data %>% group_by(.data[[group]]) %>%
summarise(mpg=seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
density=dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))
# Make plot
# Density of `variable` filled by `group`, one facet per group, with the
# curve stored in dat_norm drawn on top as a line.
library(ggplot2)
ggplot(data, aes_string(x=variable, fill=group)) +
geom_density() +
geom_line(data=dat_norm, aes_string(x=variable, y="density", group=group), size=1.2) +
facet_grid(gform)
你可以看到,这里的问题是 ggplot 似乎对所有分面使用了相同的数据,而没有按组区分。我们可以手动完成,但问题是这种方法不允许最终的函数处理未知数量的组。
期望的结果
# As explained above, the previous figure has the same line for each facet.
# I would like to have the following instead:
# Each norm.N block below does the same thing for the N-th level of `group`:
#   1. keep only the rows belonging to that level;
#   2. build a 100-point grid `x` between that subset's min and max of
#      `variable`, and `y` = normal density at the subset's mean and sd
#      (`.` is the filtered data frame piped into with());
#   3. add a factor column named `cyl` (hard-coded) holding that level, with
#      the full set of levels, which is the column facet_grid(gform) and the
#      inherited `fill` aesthetic refer to.
# NOTE(review): mutate_() is deprecated in current dplyr releases.
# Reference curve for the first level.
norm.1 <- data %>%
filter(.[[group]]==levels(.[[group]])[1]) %>%
with(data.frame(x = seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
y = dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))) %>%
mutate_(cyl = factor(levels(data[[group]])[1],levels = levels(data[[group]])))
# Reference curve for the second level.
norm.2 <- data %>%
filter(.[[group]]==levels(.[[group]])[2]) %>%
with(data.frame(x = seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
y = dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))) %>%
mutate_(cyl = factor(levels(data[[group]])[2],levels = levels(data[[group]])))
# Reference curve for the third level.
norm.3 <- data %>%
filter(.[[group]]==levels(.[[group]])[3]) %>%
with(data.frame(x = seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
y = dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))) %>%
mutate_(cyl = factor(levels(data[[group]])[3],levels = levels(data[[group]])))
# Make plot
# One geom_line() layer per group: this is the repetition the question wants
# to avoid, because the number of groups is not known in advance.
ggplot(data, aes_string(x=variable, fill=group)) +
geom_density() +
facet_grid(gform) +
geom_line(data = norm.1, aes(x = x, y = y), size=1.2) +
geom_line(data = norm.2, aes(x = x, y = y), size=1.2) +
geom_line(data = norm.3, aes(x = x, y = y), size=1.2)
问题
如前所述,后一种方法迫使我重复调用 geom_line(),调用次数与组数一样多。但是,在函数内部,我们无法提前知道组数。解决方案是什么?
注意:这是我之前一个问题的后续问题(原文此处的链接已丢失)。
ggplot 运行正常。您创建的数据框 (dat_norm) 只是把整体分布重复了 3 次。对您的 summarise() 调用做一个小改动,使其按分组计算:
# Initial setup
library(dplyr)
data <- mtcars
# Column names are kept as strings so the script can be turned into a
# function taking `group` and `variable` as arguments.
group <- "cyl"
variable <- "mpg"
# Facet formula of the form `<group> ~ .` (here: cyl ~ .).
gform <- reformulate(".", response = group)
data[[group]] <- as.factor(data[[group]])

# Build the reference normal curve for ONE group's rows.
#
# df:       data frame holding the rows of a single group.
# variable: name (string) of the numeric column to model.
# n:        number of grid points between the column's min and max.
#
# Returns a data frame with n rows and two columns: one named after
# `variable` (the grid) and `density` (normal density at the group's mean/sd).
# The grid column is named dynamically so the plot below can map
# x = variable for any variable, not only "mpg".
normal_curve <- function(df, variable, n = 100) {
  values <- df[[variable]]
  grid <- seq(min(values), max(values), length.out = n)
  out <- data.frame(grid, dnorm(grid, mean(values), sd(values)))
  names(out) <- c(variable, "density")
  out
}

# Make data for normal curves
# HERE IS THE CHANGE: the per-group computation is wrapped in do(), where `.`
# is the data frame of the current group only (outside do(), `.` is the
# magrittr placeholder for the whole piped data, which made every group get
# the same curve). do() prepends the grouping column to each group's result.
# NOTE: with dplyr >= 1.1, reframe() is the recommended successor of do().
dat_norm <- data %>%
  group_by(.data[[group]]) %>%
  do(normal_curve(., variable)) %>%
  ungroup()

# Make plot
# A single geom_line() layer is enough: dat_norm carries the grouping column,
# so facet_grid() sends each curve to its own panel regardless of how many
# groups there are.
# NOTE: aes_string() and `size` for lines are deprecated in recent ggplot2
# releases (aes() with .data[[...]] and `linewidth` replace them); they are
# kept here for compatibility with the ggplot2 version used in the question.
library(ggplot2)
ggplot(data, aes_string(x = variable, fill = group)) +
  geom_density() +
  geom_line(
    data = dat_norm,
    aes_string(x = variable, y = "density", group = group),
    size = 1.2
  ) +
  facet_grid(gform)
我正在尝试编写一个绘制密度图的函数,并为每个分面(组)叠加一条正态曲线作为参考。下面,为了简化核心问题,我没有直接定义函数。
尝试
# Initial setup
library(dplyr)
data <- mtcars
# Column names are kept as strings so they can later become function arguments.
group = "cyl"
variable = "mpg"
# Facet formula of the form `<group> ~ .` (here: cyl ~ .).
gform <- reformulate(".", response=group)
# The grouping column is converted to a factor.
data[[group]] <- as.factor(data[[group]])
# Make data for normal curves
# NOTE: `.` below is the magrittr placeholder, i.e. the whole data frame that
# was piped into summarise(), not the rows of the current group. min(), max(),
# mean() and sd() are therefore computed on all rows, so every group receives
# the same 100-point curve. This is the behaviour the question asks about.
dat_norm <- data %>% group_by(.data[[group]]) %>%
summarise(mpg=seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
density=dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))
# Make plot
# Density of `variable` filled by `group`, one facet per group, with the
# curve stored in dat_norm drawn on top as a line.
library(ggplot2)
ggplot(data, aes_string(x=variable, fill=group)) +
geom_density() +
geom_line(data=dat_norm, aes_string(x=variable, y="density", group=group), size=1.2) +
facet_grid(gform)
你可以看到,这里的问题是 ggplot 似乎对所有分面使用了相同的数据,而没有按组区分。我们可以手动完成,但问题是这种方法不允许最终的函数处理未知数量的组。
期望的结果
# As explained above, the previous figure has the same line for each facet.
# I would like to have the following instead:
# Each norm.N block below does the same thing for the N-th level of `group`:
#   1. keep only the rows belonging to that level;
#   2. build a 100-point grid `x` between that subset's min and max of
#      `variable`, and `y` = normal density at the subset's mean and sd
#      (`.` is the filtered data frame piped into with());
#   3. add a factor column named `cyl` (hard-coded) holding that level, with
#      the full set of levels, which is the column facet_grid(gform) and the
#      inherited `fill` aesthetic refer to.
# NOTE(review): mutate_() is deprecated in current dplyr releases.
# Reference curve for the first level.
norm.1 <- data %>%
filter(.[[group]]==levels(.[[group]])[1]) %>%
with(data.frame(x = seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
y = dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))) %>%
mutate_(cyl = factor(levels(data[[group]])[1],levels = levels(data[[group]])))
# Reference curve for the second level.
norm.2 <- data %>%
filter(.[[group]]==levels(.[[group]])[2]) %>%
with(data.frame(x = seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
y = dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))) %>%
mutate_(cyl = factor(levels(data[[group]])[2],levels = levels(data[[group]])))
# Reference curve for the third level.
norm.3 <- data %>%
filter(.[[group]]==levels(.[[group]])[3]) %>%
with(data.frame(x = seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
y = dnorm(seq(min(.[[variable]]),
max(.[[variable]]),
length.out=100),
mean(.[[variable]]),
sd(.[[variable]])))) %>%
mutate_(cyl = factor(levels(data[[group]])[3],levels = levels(data[[group]])))
# Make plot
# One geom_line() layer per group: this is the repetition the question wants
# to avoid, because the number of groups is not known in advance.
ggplot(data, aes_string(x=variable, fill=group)) +
geom_density() +
facet_grid(gform) +
geom_line(data = norm.1, aes(x = x, y = y), size=1.2) +
geom_line(data = norm.2, aes(x = x, y = y), size=1.2) +
geom_line(data = norm.3, aes(x = x, y = y), size=1.2)
问题
如前所述,后一种方法迫使我重复调用 geom_line(),调用次数与组数一样多。但是,在函数内部,我们无法提前知道组数。解决方案是什么?
注意:这是我之前一个问题的后续问题(原文此处的链接已丢失)。
ggplot 运行正常。您创建的数据框 (dat_norm) 只是把整体分布重复了 3 次。对您的 summarise() 调用做一个小改动,使其按分组计算:
# Initial setup
library(dplyr)
data <- mtcars
# Column names are kept as strings so the script can be turned into a
# function taking `group` and `variable` as arguments.
group <- "cyl"
variable <- "mpg"
# Facet formula of the form `<group> ~ .` (here: cyl ~ .).
gform <- reformulate(".", response = group)
data[[group]] <- as.factor(data[[group]])

# Build the reference normal curve for ONE group's rows.
#
# df:       data frame holding the rows of a single group.
# variable: name (string) of the numeric column to model.
# n:        number of grid points between the column's min and max.
#
# Returns a data frame with n rows and two columns: one named after
# `variable` (the grid) and `density` (normal density at the group's mean/sd).
# The grid column is named dynamically so the plot below can map
# x = variable for any variable, not only "mpg".
normal_curve <- function(df, variable, n = 100) {
  values <- df[[variable]]
  grid <- seq(min(values), max(values), length.out = n)
  out <- data.frame(grid, dnorm(grid, mean(values), sd(values)))
  names(out) <- c(variable, "density")
  out
}

# Make data for normal curves
# HERE IS THE CHANGE: the per-group computation is wrapped in do(), where `.`
# is the data frame of the current group only (outside do(), `.` is the
# magrittr placeholder for the whole piped data, which made every group get
# the same curve). do() prepends the grouping column to each group's result.
# NOTE: with dplyr >= 1.1, reframe() is the recommended successor of do().
dat_norm <- data %>%
  group_by(.data[[group]]) %>%
  do(normal_curve(., variable)) %>%
  ungroup()

# Make plot
# A single geom_line() layer is enough: dat_norm carries the grouping column,
# so facet_grid() sends each curve to its own panel regardless of how many
# groups there are.
# NOTE: aes_string() and `size` for lines are deprecated in recent ggplot2
# releases (aes() with .data[[...]] and `linewidth` replace them); they are
# kept here for compatibility with the ggplot2 version used in the question.
library(ggplot2)
ggplot(data, aes_string(x = variable, fill = group)) +
  geom_density() +
  geom_line(
    data = dat_norm,
    aes_string(x = variable, y = "density", group = group),
    size = 1.2
  ) +
  facet_grid(gform)