使用 ggplot 在 R 中将直方图转换为小提琴图

transform histogram to violin plot in r with ggplot

我目前正在借助 Hadley Wickham 的大量资源(《R for Data Science》、《ggplot2: Elegant Graphics for Data Analysis》)学习 R。到目前为止,我都能在其中找到所有问题的答案(非常感谢你,Hadley!),但这次不行。

目前,我正在处理来自通过粒子散射光估计粒子大小的仪器(DLS、Zetasizer Nano、Malvern Instruments)的数据。从该设备中提取的数据是一些汇总统计数据(例如平均粒径)和直方图数据:x = 大小(分箱),y = 强度 [%]。 这是我的一项测量结果的一小部分:

   # A tibble: 70 x 3
   sample_name        intensities      bins
   <chr>                    <dbl>     <dbl>
 1 core formulation 1         0       0.4  
 2 core formulation 1         0       0.463
 3 core formulation 1         0       0.536
 4 core formulation 1         0       0.621
 5 core formulation 1         0       0.720
 6 core formulation 1         0       0.833
 7 core formulation 1         0       0.965
 8 core formulation 1         0       1.12 
 9 core formulation 1         0       1.29 
10 core formulation 1         0       1.50 
11 core formulation 1         0       1.74 
12 core formulation 1         0       2.01 
13 core formulation 1         0       2.33 
14 core formulation 1         0       2.70 
15 core formulation 1         0       3.12 
16 core formulation 1         0       3.62 
17 core formulation 1         0       4.19 
18 core formulation 1         0       4.85 
19 core formulation 1         0       5.62 
20 core formulation 1         0       6.50 
21 core formulation 1         0       7.53 
22 core formulation 1         0       8.72 
23 core formulation 1         0      10.1  
24 core formulation 1         0      11.7  
25 core formulation 1         0      13.5  
26 core formulation 1         0      15.7  
27 core formulation 1         0      18.2  
28 core formulation 1         0      21.0  
29 core formulation 1         0      24.4  
30 core formulation 1         0      28.2  
31 core formulation 1         0      32.7  
32 core formulation 1         0      37.8  
33 core formulation 1         0      43.8  
34 core formulation 1         0.2    50.8  
35 core formulation 1         1.4    58.8  
36 core formulation 1         3.7    68.1  
37 core formulation 1         6.9    78.8  
38 core formulation 1        10.2    91.3  
39 core formulation 1        12.9   106.   
40 core formulation 1        14.4   122.   
41 core formulation 1        14.4   142.   
42 core formulation 1        13     164.   
43 core formulation 1        10.3   190.   
44 core formulation 1         7.1   220.   
45 core formulation 1         3.9   255    
46 core formulation 1         1.5   295.   
47 core formulation 1         0.2   342    
48 core formulation 1         0     396.   
49 core formulation 1         0     459.   
50 core formulation 1         0     531.   
51 core formulation 1         0     615.   
52 core formulation 1         0     712.   
53 core formulation 1         0     825    
54 core formulation 1         0     955.   
55 core formulation 1         0    1106    
56 core formulation 1         0    1281    
57 core formulation 1         0    1484    
58 core formulation 1         0    1718    
59 core formulation 1         0    1990    
60 core formulation 1         0    2305    
61 core formulation 1         0    2669    
62 core formulation 1         0    3091    
63 core formulation 1         0    3580    
64 core formulation 1         0    4145    
65 core formulation 1         0    4801    
66 core formulation 1         0    5560    
67 core formulation 1         0    6439    
68 core formulation 1         0    7456    
69 core formulation 1         0    8635    
70 core formulation 1         0   10000    

这是使用 dput() 命令生成的数据:

# dput() output for one measurement: a 70-row tibble (class "tbl_df") with
# three columns -- sample_name (character), intensities (numeric) and
# bins (numeric).
# NOTE(review): the plotting snippets in this post read a data frame named
# `DLS_intensities_core`; presumably this expression must be assigned to that
# name before running them -- confirm.
structure(list(sample_name = c("core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1", "core formulation 1", 
"core formulation 1", "core formulation 1"), intensities = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 1.4, 3.7, 6.9, 10.2, 12.9, 
14.4, 14.4, 13, 10.3, 7.1, 3.9, 1.5, 0.2, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bins = c(0.4, 
0.4632, 0.5365, 0.6213, 0.7195, 0.8332, 0.9649, 1.117, 1.294, 
1.499, 1.736, 2.01, 2.328, 2.696, 3.122, 3.615, 4.187, 4.849, 
5.615, 6.503, 7.531, 8.721, 10.1, 11.7, 13.54, 15.69, 18.17, 
21.04, 24.36, 28.21, 32.67, 37.84, 43.82, 50.75, 58.77, 68.06, 
78.82, 91.28, 105.7, 122.4, 141.8, 164.2, 190.1, 220.2, 255, 
295.3, 342, 396.1, 458.7, 531.2, 615.1, 712.4, 825, 955.4, 1106, 
1281, 1484, 1718, 1990, 2305, 2669, 3091, 3580, 4145, 4801, 5560, 
6439, 7456, 8635, 10000)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -70L))

我可以根据这些数据毫无问题地生成直方图:

library(tidyverse)

# Draw the binned intensities as a line over bin size; the bins are spaced
# roughly geometrically, so the x axis is put on a log10 scale.
DLS_intensities_core %>%
  ggplot(mapping = aes(x = bins, y = intensities)) +
  geom_line() +
  scale_x_continuous(trans = 'log10')

为了显示我的粒径的总体分布,我想将此数据转换为小提琴图,并在我的绘图的第二层中使用设备提供的汇总统计数据。

因此,我想转换此数据以便能够从中创建小提琴图。

我已经尝试将这些数据传给小提琴图的 stat_density() 参数,但到目前为止没有成功。

你知道如何根据这些数据创建小提琴图吗?

非常感谢!

祝好,

多米尼克

我会在您回复第二条评论后更新此内容(如果需要)。您可以得到 bins 和 intensities 的小提琴图:

library(hrbrthemes)

# Reshape to long form so `bins` and `intensities` each become one level of
# `measure` (with the number in `value`); every measure then gets its own
# facet. pivot_longer() replaces the superseded gather().
DLS_intensities_core %>%
  pivot_longer(-sample_name, names_to = "measure", values_to = "value") %>%
  ggplot(aes(measure, value)) +
  geom_violin(scale = "count") +
  scale_y_comma() +
  # The two measures live on very different ranges, hence free scales.
  facet_wrap(~measure, scales = "free") +
  labs(
    x = NULL, y = "A better label than this",
    title = "A better title than this",
    caption = "NOTE: Free Y scales"
  ) +
  theme_ipsum_rc(grid = "Y") +
  # The facet strip already names the measure, so hide the x-axis text.
  theme(axis.text.x = element_blank())

我通常也喜欢分层:

# Same faceted violin plot, with the individual values layered on top as
# quasirandom-jittered points.
# pivot_longer() replaces the superseded gather(); the result has one row per
# (original row, measure) pair with the number in `value`.
DLS_intensities_core %>%
  pivot_longer(-sample_name, names_to = "measure", values_to = "value") %>%
  ggplot(aes(measure, value)) +
  geom_violin(scale = "count") +
  ggbeeswarm::geom_quasirandom() +
  scale_y_comma() +
  # The two measures live on very different ranges, hence free scales.
  facet_wrap(~measure, scales = "free") +
  labs(
    x = NULL, y = "A better label than this",
    title = "A better title than this",
    caption = "NOTE: Free Y scales"
  ) +
  theme_ipsum_rc(grid = "Y") +
  # The facet strip already names the measure, so hide the x-axis text.
  theme(axis.text.x = element_blank())

根据您的评论,这可能是展示 bins 的分布及其与 intensities 之间关系的更好方法:

library(hrbrthemes)
library(tidyverse)

# One violin over all bin values (log10 y axis); each bin is also drawn as a
# filled circle whose size and fill colour both encode its intensity.
DLS_intensities_core %>%
  ggplot(mapping = aes(x = "", y = bins)) +
  geom_violin(scale = "count") +
  ggbeeswarm::geom_quasirandom(
    mapping = aes(size = intensities, fill = intensities),
    shape = 21
  ) +
  scale_y_comma(trans = "log10") +
  # log1p keeps the many zero-intensity bins defined on both scales.
  viridis::scale_fill_viridis(direction = -1, trans = "log1p") +
  scale_size_continuous(trans = "log1p", range = c(2, 10)) +
  # Use a discrete-style legend for fill instead of a colour bar.
  guides(fill = guide_legend()) +
  labs(
    x = NULL,
    y = "A better label than this",
    title = "A better title than this"
  ) +
  theme_ipsum_rc(grid = "Y")

你必须做一些其他的自定义转换来尝试让小提琴形状随强度变化(并且它不会真正反映那一点的分布)。

我找到了我的问题的解决方案,它可能不是很优雅:

library(tidyverse)

# geom_violin() estimates a density from individual observations, but the
# data here is a histogram (one intensity per size bin). Convert it by
# repeating each bin value `counts` times.
#
# Intensities in this data carry one decimal place, so multiplying by 10
# gives whole-number repeat counts. round() guards against floating-point
# products that land just below an integer, which rep.int() would silently
# truncate.
DLS_intensities_core <- DLS_intensities_core %>%
  mutate(counts = round(intensities * 10))

# uncount() duplicates each row `counts` times and drops the `counts` column.
# sample_name is carried over from the data rather than hard-coded, so the
# same code also works when several samples are present.
violin_DLSdata <- DLS_intensities_core %>%
  filter(counts > 0) %>%
  uncount(counts) %>%
  select(value = bins, sample_name)

ggplot(violin_DLSdata, aes(sample_name, value)) +
  geom_violin() +
  labs(
    x = NULL, y = "size"
  ) +
  # Values outside `limits` are dropped from the plot.
  scale_y_continuous(trans = "log10", limits = c(1, 1000))

对于我的整个数据集,它看起来像这样(我额外添加了汇总统计数据:带有误差线的红点):

你怎么看?