在 ggplot 中为含分类数据和分组的散点图/条形图添加轮廓线(折线图)

outline scatterplot/barplot with line graph with categorical data and groups in ggplot

我有一个数据集,包含约 150 个国家/地区、一个分组变量,以及每个国家/地区在每个组中的取值 (0-6)。我想说明,GDP 较高的国家在其中一组的取值高于另一组。我制作了一个散点图,按组显示每个国家/地区的值(这些国家/地区按 GDP 排序)。我想在这些点周围画一条轮廓线,以便更清楚地看出哪个组在哪个 GDP 范围内取值更高。然而,我不知道该如何实现。

# Scatter plot of `count` for every country, coloured by `group`.
# Countries keep the order in which they first appear in `data`, and that
# order is then reversed along the x axis.
ggplot(
  data,
  aes(
    x = fct_rev(fct_inorder(country)),
    y = count,
    colour = group
  )
) +
  geom_point()

显然,我还需要整理一下这张图,但有人知道该怎么做吗?由于 x 轴是因子(factor),密度图行不通;折线也行不通,因为它会连接每一个点。基于我当前的数据集模拟连续数据也可以(我只是不知道该怎么做)。我只想突出显示按组和 GDP 划分的取值范围。也许换一种图表类型会有帮助,但该用哪一种呢?欢迎任何意见!下面是我的数据,已经按 GDP 排序了。

# Example data set (dput() output): one row per country and group, with
# columns `count`, `country` (three-letter code) and `group` (1 or 2).
# Per the question, rows are already sorted by GDP.
# The literal is bound to `data` because every snippet below reads `data`;
# without the assignment `data` would resolve to the function utils::data().
data <- structure(list(count = c(1, 0, 1, 3, 0, 1, 0, 1, 0, 1, 0, 2, 
2, 0, 1, 0, 0, 2, 3, 0, 0, 1, 0, 2, 2, 0, 1, 0, 1, 0, 0, 1, 1, 
0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0, 
0, 1, 2, 0, 0, 6, 1, 1, 1, 0, 2, 1, 1, 0, 4, 2, 1, 0, 2, 4, 0, 
1, 1, 0, 2, 1, 2, 1, 0, 2, 0, 1, 2, 1, 0, 2, 3, 2, 1, 0, 1, 1, 
2, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 4, 0, 2, 0, 
2, 0, 2, 0, 1, 1, 1, 1, 1, 5, 0, 0, 3, 3, 0, 0, 2, 1, 2, 0, 2, 
1, 0, 0, 2, 1, 0, 1, 2, 3, 0, 0, 1, 4, 0, 0, 3, 0, 1, 0, 2, 4, 
0, 1, 0, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 1, 
0, 1, 1, 2, 0, 1, 1, 1, 0, 2, 0, 2, 0, 3, 1, 0, 4, 0, 2, 0, 0, 
1, 1, 0, 0, 1, 0, 2, 0, 2, 0, 1, 0, 1, 0, 1, 0, 2, 2, 0), country = c("CHE", 
"CHE", "NOR", "NOR", "IRL", "IRL", "SGP", "SGP", "USA", "USA", 
"AUS", "AUS", "DNK", "DNK", "SWE", "SWE", "NLD", "NLD", "GBR", 
"GBR", "SCT", "SCT", "FIN", "FIN", "CAN", "CAN", "DEU", "DEU", 
"BEL", "BEL", "ARE", "ARE", "JPN", "JPN", "ITA", "ITA", "KOR", 
"KOR", "BRN", "BRN", "SVN", "SVN", "BHR", "BHR", "PRT", "PRT", 
"SAU", "SAU", "EST", "EST", "GRC", "GRC", "LTU", "LTU", "SYC", 
"SYC", "LVA", "LVA", "CHL", "CHL", "HRV", "HRV", "CRI", "CRI", 
"TUR", "TUR", "MYS", "MYS", "KAZ", "KAZ", "LCA", "LCA", "ROU", 
"ROU", "MUS", "MUS", "GRD", "GRD", "MEX", "MEX", "RUS", "RUS", 
"CHN", "CHN", "SUR", "SUR", "BRA", "BRA", "DOM", "DOM", "BGR", 
"BGR", "MNE", "MNE", "THA", "THA", "COL", "COL", "SRB", "SRB", 
"ZAF", "ZAF", "GUY", "GUY", "FJI", "FJI", "LBY", "LBY", "BIH", 
"BIH", "AZE", "AZE", "MKD", "MKD", "JAM", "JAM", "IRQ", "IRQ", 
"NAM", "NAM", "GEO", "GEO", "ALB", "ALB", "XKX", "XKX", "WSM", 
"WSM", "LKA", "LKA", "JOR", "JOR", "ARM", "ARM", "EGY", "EGY", 
"SWZ", "SWZ", "IDN", "IDN", "PSE", "PSE", "CPV", "CPV", "MDA", 
"MDA", "MAR", "MAR", "VUT", "VUT", "PNG", "PNG", "NGA", "NGA", 
"LAO", "LAO", "VNM", "VNM", "SLB", "SLB", "GHA", "GHA", "MRT", 
"MRT", "KEN", "KEN", "PAK", "PAK", "BGD", "BGD", "HTI", "HTI", 
"ZMB", "ZMB", "SEN", "SEN", "YEM", "YEM", "SDN", "SDN", "TLS", 
"TLS", "KGZ", "KGZ", "BEN", "BEN", "TJK", "TJK", "TZA", "TZA", 
"EAZ", "EAZ", "NPL", "NPL", "GIN", "GIN", "UGA", "UGA", "MLI", 
"MLI", "ETH", "ETH", "BFA", "BFA", "GMB", "GMB", "LBR", "LBR", 
"GNB", "GNB", "SLE", "SLE", "MOZ", "MOZ", "AFG", "AFG", "COD", 
"COD", "MWI", "MWI"), group = c(1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 
2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 
2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 
1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 
2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 
1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 
2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 
1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 
2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 
1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 
1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1
)), row.names = c(33L, 34L, 151L, 152L, 89L, 90L, 175L, 176L, 
205L, 206L, 9L, 10L, 49L, 50L, 187L, 188L, 149L, 150L, 65L, 66L,
169L, 170L, 61L, 62L, 31L, 32L, 47L, 48L, 13L, 14L, 5L, 6L, 99L, 
100L, 93L, 94L, 107L, 108L, 29L, 30L, 185L, 186L, 23L, 24L, 159L, 
160L, 167L, 168L, 57L, 58L, 77L, 78L, 119L, 120L, 191L, 192L, 
121L, 122L, 35L, 36L, 83L, 84L, 45L, 46L, 199L, 200L, 143L, 144L, 
101L, 102L, 115L, 116L, 163L, 164L, 139L, 140L, 79L, 80L, 127L, 
128L, 165L, 166L, 37L, 38L, 183L, 184L, 27L, 28L, 51L, 52L, 21L, 
22L, 133L, 134L, 193L, 194L, 41L, 42L, 181L, 182L, 217L, 218L, 
81L, 82L, 63L, 64L, 113L, 114L, 25L, 26L, 11L, 12L, 129L, 130L, 
95L, 96L, 91L, 92L, 145L, 146L, 67L, 68L, 3L, 4L, 213L, 214L, 
211L, 212L, 117L, 118L, 97L, 98L, 7L, 8L, 55L, 56L, 189L, 190L, 
87L, 88L, 161L, 162L, 43L, 44L, 125L, 126L, 123L, 124L, 209L, 
210L, 157L, 158L, 147L, 148L, 109L, 110L, 207L, 208L, 177L, 178L, 
69L, 70L, 137L, 138L, 103L, 104L, 155L, 156L, 19L, 20L, 85L, 
86L, 219L, 220L, 173L, 174L, 215L, 216L, 171L, 172L, 197L, 198L, 
105L, 106L, 15L, 16L, 195L, 196L, 201L, 202L, 53L, 54L, 153L, 
154L, 71L, 72L, 203L, 204L, 131L, 132L, 59L, 60L, 17L, 18L, 73L, 
74L, 111L, 112L, 75L, 76L, 179L, 180L, 135L, 136L, 1L, 2L, 39L, 
40L, 141L, 142L), class = "data.frame")

这里有一个想法可以帮助形象化您试图展示的差异。首先,无论您如何标记,x 轴上的国家/地区名称都可能难以辨认。因此,x 轴上的国家 排名 可能会更好。

围绕这些点绘制一个多边形或许能让这一结论在视觉上更直观,但在统计上没有多大意义。这里更好的做法可能是为每个组分别绘制一条回归线。由于我们处理的是计数数据,可以使用泊松回归;又因为 x 轴上是数值型的排名,所以可以让回归线贯穿整个图来展示回归结果。

library(ggplot2)
library(dplyr)
library(forcats)

# Jittered scatter of count against GDP rank, with a separate Poisson
# regression line (and confidence ribbon) for each group.
# `rank` is the integer code of the reversed first-appearance factor, so the
# first country listed in `data` receives the highest rank.
data %>%
  mutate(
    group = factor(group),
    country = fct_rev(fct_inorder(country)),
    rank = as.numeric(country)
  ) %>%
  ggplot(mapping = aes(x = rank, y = count, colour = group)) +
  geom_point(
    alpha = 0.5,
    position = position_jitter(width = 0.1, height = 0.05)
  ) +
  geom_smooth(
    # Reuse each group's line colour for its confidence ribbon.
    mapping = aes(fill = after_scale(colour)),
    method = glm,
    method.args = list(family = poisson),
    formula = y ~ x,
    alpha = 0.2
  ) +
  scale_colour_manual(values = c("orange", "deepskyblue4")) +
  labs(
    title = "Counts per group according to per capita GDP",
    x = "Country rank by GDP per capita",
    y = "Count (jittered for clarity)"
  ) +
  theme_bw()

我认为这张图很好地说明了这一点。为清楚起见,我可能会为几个国家/地区添加标签,以帮助读者理解 x 轴的刻度。

如果你真的想要围绕点绘制多边形,那么最好在每个集合周围绘制凸包,你可以这样做:

# Overwrite `data` with group as a factor, country as a factor in reversed
# first-appearance order, and a numeric `rank` taken from that factor's codes.
data <- mutate(
  data,
  group = factor(group),
  country = fct_rev(fct_inorder(country)),
  rank = as.numeric(country)
)

# Rows lying on the convex hull of (rank, count), computed separately for each
# group. chull() returns the indices of the hull vertices in order around the
# hull, and slice() picks those rows in that order so they can be drawn as a
# polygon. The grouping is dropped afterwards so `hull` is a plain table.
hull <- data %>%
  group_by(group) %>%
  slice(chull(rank, count)) %>%
  ungroup()

# Shade the convex hull of each group's points and overlay the raw points.
# The points are drawn at their exact positions (no jitter is applied in this
# plot), so the y-axis label is simply "Count".
ggplot(data, aes(rank, count)) +
  geom_polygon(aes(colour = group, fill = after_scale(colour)),
               alpha = 0.3, data = hull) +
  geom_point(aes(colour = group)) +
  theme_bw() +
  scale_colour_manual(values = c("orange", "deepskyblue4")) +
  ggtitle("Counts per group according to per capita GDP") +
  labs(x = "Country rank by GDP per capita",
       y = "Count")

尽管正如我所说,这没有太大的统计意义。

另一种想法是,您可以把数据框转换为宽格式(pivot),计算第 1 组和第 2 组之间的差值,然后针对按排名排列的国家/地区绘制趋势线:

library(tidyr)

# Per-country difference between the counts of group 1 and group 2, plotted
# against rank with a single linear trend line.
# NOTE: the country factor is not reversed here, so rank 1 is the first
# country listed in `data` (the opposite direction to the snippets that use
# fct_rev()).
data %>%
  mutate(
    group = factor(group),
    country = fct_inorder(country),
    rank = as.numeric(country)
  ) %>%
  # Spread the two groups' counts into columns named `1` and `2`.
  pivot_wider(names_from = group, values_from = count) %>%
  mutate(difference = `1` - `2`) %>%
  ggplot(mapping = aes(x = rank, y = difference)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_point() +
  geom_smooth(
    mapping = aes(group = 1),
    method = lm,
    formula = y ~ x,
    colour = "red",
    fill = "orange",
    alpha = 0.2
  ) +
  labs(
    title = "Counts per group according to per capita GDP",
    x = "Country rank by GDP per capita",
    y = "Difference in counts between group 1 and group 2"
  ) +
  theme_bw()

这样做的好处是您可以更轻松地标记各个国家/地区,并且非常清楚地显示您想要展示的关系。

创建于 2022-03-05,由 reprex 包 (v2.0.1) 生成