geom_histogram如何添加密度曲线和均值线?

How to add a density curve and mean line to geom_histogram?

我正在写论文,有一个问题通过网络搜索一直没能解决。我有一些数据集,需要比较前后的结果,我想通过比较两组直方图(每组包含 5 个图)来将其可视化。我想这对你们来说是一个非常简单的问题,但我仍然需要一些帮助。

我尝试了一些方法,但最终还是把 ggplot 搞得一团糟。我知道可能只需要再添加两小行代码,但我实在找不到该怎么写。

我得到了以下暂时可用的代码。

# Cluster 1: take the first five columns of clust1_mat; gather() reshapes them
# into the `key` / `value` pairs that the plot facets on and bins.
df <- as.data.frame(clust1_mat[, 1:5])

p1 <- ggplot(data = gather(df), mapping = aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~key, nrow = 1, scales = "free_x") +
  xlab("Average results students in CLuster 1")

# Displayed on its own with a density layer. The result is not stored, so p1
# is unchanged when it is handed to grid.arrange() below.
p1 + geom_density(fill = "lightblue")

# Same plot for the first five columns of cijfers_list (df is overwritten).
df <- as.data.frame(cijfers_list[, 1:5])

p2 <- ggplot(data = gather(df), mapping = aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~key, nrow = 1, scales = "free_x") +
  xlab("Average results students before clustering")

# As with p1, this only displays the density version; p2 is not modified.
p2 + geom_density(fill = "lightblue")

# Stack the two rows of histograms in a single figure.
grid.arrange(p1, p2, nrow = 2)

我想为每个直方图添加带阴影的密度曲线,并在平均值处添加一条红色垂直线。

clust1_mat数据:


structure(list(`BSTAT-TH` = c(6.9, 7, 8.1, 7.1, 6.2, 7, 6.2, 
7.7, 9.3, 6.3, 6.7, 6.9, 6.6, 5.3, 6.5, 6.3, 6.8, 7.3, 7.1, 6.9, 
7, 7, 6.5, 5.8, 6.2, 6.4, 7, 6.6, 9.5, 8, 6.5, 9, 7.3, 6.5, 7.4, 
6.9, 7.3, 6.2, 7.6, 7.1, 7.7, 5.2, 7, 6.5, 7.5, 6.9, 6.8, 7.4, 
9.2, 6.2, 9.2, 7.4, 9, 7.1, 5.7, 7.1, 8.4, 7.2, 8.8, 8.9, 5.7, 
7.1), `GRAAF-TH` = c(9.1, 6.5, 5.9, 7.3, 6.9, 7, 8.6, 8.4, 7.7, 
7, 7.2, 7.7, 8.3, 6.5, 7.7, 8.6, 8.5, 7.5, 7, 7.1, 5.9, 6.3, 
7.8, 8.3, 7.9, 8.1, 7.7, 7.5, 7.2, 9.2, 7.5, 9.4, 8.4, 5.8, 7.9, 
7.2, 7.6, 7.8, 8.7, 7.9, 7, 8.1, 7.3, 7.8, 7.7, 6.3, 6.2, 7.6, 
9.1, 7, 9.4, 9.2, 9.3, 7.4, 8.3, 7.2, 5.7, 8.7, 5.4, 7.7, 6.7, 
6.6), `BWISK-TH` = c(5.5, 6.1, 7.7, 5.2, 5.4, 6.3, 6.3, 3.8, 
5.4, 5.7, 4.7, 6.6, 6.9, 5.8, 4.8, 6.3, 6, 6.1, 7.1, 6.2, 6.3, 
6.1, 4.7, 5.9, 6.2, 4.9, 3.4, 5.5, 5.3, 4.2, 5.3, 5.2, 6, 5.9, 
5.9, 5.4, 6.2, 6.2, 5.7, 3.3, 6.5, 5.3, 6.3, 6.2, 6.5, 6.1, 5.8, 
4, 5.2, 6.4, 5.8, 3.8, 5.1, 5.8, 6, 6.1, 4.2, 5.4, 4.3, 5.4, 
4.7, 6.4), `CALEID-TH` = c(7.1, 6, 5.1, 6.6, 6.3, 4.9, 6.9, 4.7, 
6.4, 5.8, 5.7, 7.2, 5.8, 5.8, 5.5, 6.4, 5.8, 4.7, 5.7, 4.9, 5.1, 
5.8, 6, 6.9, 6.2, 5, 4.3, 5.5, 5.9, 4.4, 6.2, 6.2, 5.6, 6, 6.5, 
7.5, 4.3, 6.2, 6, 4.7, 6.3, 6.6, 4.4, 6.6, 6.1, 6.2, 5.3, 5.8, 
6.5, 6.1, 6.1, 4.8, 6, 5, 6.3, 7.4, 6.2, 6.2, 5.9, 6.2, 4.3, 
7.1), `COVA1-PR` = c(7.5, 8, 7.5, 7.5, 6, 7, 6.5, 7.5, 6.5, 6, 
7.5, 6, 7.5, 6.5, 7.5, 7, 8.5, 8, 7, 8, 6.5, 7, 7.5, 7.5, 8, 
7.7, 7.5, 6, 6, 6.5, 5.5, 6, 8, 8.5, 8, 7, 7.5, 8.5, 8.5, 7.5, 
6, 7, 8, 7, 8, 8, 6.5, 7.5, 6, 6.5, 6.5, 6.5, 6, 6.5, 7, 6, 6.5, 
8, 6, 6, 6.5, 6), cluster = c(`4` = 1L, `8` = 1L, `9` = 1L, `10` = 1L, 
`11` = 1L, `13` = 1L, `16` = 1L, `20` = 1L, `25` = 1L, `28` = 1L, 
`31` = 1L, `32` = 1L, `34` = 1L, `35` = 1L, `36` = 1L, `39` = 1L, 
`40` = 1L, `41` = 1L, `43` = 1L, `44` = 1L, `45` = 1L, `47` = 1L, 
`49` = 1L, `51` = 1L, `52` = 1L, `53` = 1L, `57` = 1L, `63` = 1L, 
`66` = 1L, `68` = 1L, `70` = 1L, `71` = 1L, `73` = 1L, `74` = 1L, 
`76` = 1L, `77` = 1L, `78` = 1L, `79` = 1L, `81` = 1L, `82` = 1L, 
`86` = 1L, `89` = 1L, `90` = 1L, `92` = 1L, `93` = 1L, `96` = 1L, 
`97` = 1L, `99` = 1L, `101` = 1L, `106` = 1L, `107` = 1L, `108` = 1L, 
`109` = 1L, `111` = 1L, `115` = 1L, `116` = 1L, `118` = 1L, `120` = 1L, 
`124` = 1L, `125` = 1L, `126` = 1L, `127` = 1L)), row.names = c(4L, 
8L, 9L, 10L, 11L, 13L, 16L, 20L, 25L, 28L, 31L, 32L, 34L, 35L, 
36L, 39L, 40L, 41L, 43L, 44L, 45L, 47L, 49L, 51L, 52L, 53L, 57L, 
63L, 66L, 68L, 70L, 71L, 73L, 74L, 76L, 77L, 78L, 79L, 81L, 82L, 
86L, 89L, 90L, 92L, 93L, 96L, 97L, 99L, 101L, 106L, 107L, 108L, 
109L, 111L, 115L, 116L, 118L, 120L, 124L, 125L, 126L, 127L), class = "data.frame")

谢谢!

已编辑以添加提供的数据。

为直方图添加与之匹配的密度曲线可能比较棘手——关键是把密度曲线的 y 映射为 ..count..(新版 ggplot2 中写作 after_stat(count)),并确保将其乘以直方图所用的箱宽(binwidth),这样曲线才会与直方图的计数处于同一尺度。

这里有一些虚拟数据和几个例子:

library(tidyverse)

# Dummy data: four log-normal samples of 1000 draws each, reshaped to one row
# per observation (columns `key` and `value`), with the per-key mean repeated
# in `mean` so the plots can mark it.
set.seed(2019) # fixed seed so the example gives the same plots on every run

df <-
  tibble(
    a = rlnorm(1000, meanlog = 2, sdlog = .4),
    b = rlnorm(1000, meanlog = 2.2, sdlog = .4),
    c = rlnorm(1000, meanlog = 1.9, sdlog = .4),
    d = rlnorm(1000, meanlog = 2.1, sdlog = .4)
  ) %>%
  # pivot_longer() replaces the superseded gather(); the names_to / values_to
  # arguments keep gather()'s default column names
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  group_by(key) %>%
  mutate(mean = mean(value)) %>% # per-key mean, used for the vertical line
  ungroup()

# Width of each histogram bin in data units (a bin width, not a bin count).
# The density layer is scaled by the same value.
bin <- 1

df %>%
  ggplot(aes(value)) +
  # count-scaled density times the bin width gives the expected count per
  # bin, which puts the curve on the histogram's y scale
  geom_density(aes(y = after_stat(count * bin)),
               fill = "blue", alpha = .3, col = NA) +
  geom_histogram(binwidth = bin, alpha = .5) + # same bin width as above
  # `mean` is repeated on every row; collapse to one row per key so a single
  # line is drawn per facet
  geom_vline(data = function(d) distinct(d, key, mean),
             aes(xintercept = mean), col = "red") +
  theme_minimal() +
  labs(y = "count") +
  facet_wrap(~ key, ncol = 2)

让我们换一个不同的箱宽再试一次:

# Wider bins: bin width in data units, shared by the histogram and the
# density scaling below.
bin <- 2.5

df %>%
  ggplot(aes(value)) +
  # count-scaled density times the bin width matches the histogram's counts
  geom_density(aes(y = after_stat(count * bin)),
               fill = "blue", alpha = .3, col = NA) +
  geom_histogram(binwidth = bin, alpha = .5) +
  # one mean line per facet rather than one per data row
  geom_vline(data = function(d) distinct(d, key, mean),
             aes(xintercept = mean), col = "red") +
  theme_minimal() +
  labs(y = "count") +
  facet_wrap(~ key, ncol = 2)

希望这就是您要找的!

要让图形完美可能还需要再做一些调整,但下面是针对您提供的数据的初步尝试:

library(tidyverse)

# Reshape the first five columns of `your_data` to long form (`key` = column
# name, `value` = observation) and attach each column's mean as `mean`.
df <- your_data %>%
  select(1:5) %>%
  # pivot_longer() replaces the superseded gather(); the names_to / values_to
  # arguments keep gather()'s default column names
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  group_by(key) %>%
  mutate(mean = mean(value)) %>%
  ungroup()

# Bin width in data units, shared by the histogram and the density scaling.
bin <- 1

df %>%
  ggplot(aes(value)) +
  # count-scaled density times the bin width matches the histogram's counts
  geom_density(aes(y = after_stat(count * bin)),
               fill = "blue", alpha = .3, col = NA) +
  geom_histogram(binwidth = bin, alpha = .5) +
  # one mean line per facet rather than one per data row
  geom_vline(data = function(d) distinct(d, key, mean),
             aes(xintercept = mean), col = "red") +
  theme_minimal() +
  labs(y = "count") +
  facet_wrap(~ key, ncol = 1) + # one panel per row
  coord_fixed(ratio = .04) +
  scale_x_continuous(limits = c(1, 10), breaks = 1:10, minor_breaks = NULL)

由 reprex 包 (v0.3.0) 于 2019-10-25 创建