在 ggplot 中用折线勾勒含分类数据和分组的散点图/条形图(scatterplot/barplot)的轮廓
outline scatterplot/barplot with line graph with categorical data and groups in ggplot
我有一个包含约 150 个国家/地区的数据集、一个分组变量,以及每个国家/地区在每个组中的取值(0-6)。我想表明,GDP 较高的国家在其中一组中的数值高于另一组。我制作了一个散点图,按组显示每个国家/地区的值(这些国家/地区按 GDP 排序)。我想在这些点周围画一条轮廓线,这样就能更清楚地看出哪个组在哪个 GDP 范围内具有更高的值。
然而,我很茫然。
# Scatter plot of `count` per country, coloured by `group`.
# fct_inorder() orders the country levels by first appearance in `data`,
# and fct_rev() then reverses that order along the x axis.
ggplot(
  data,
  aes(x = fct_rev(fct_inorder(country)), y = count, color = group)
) +
  geom_point()
显然,我需要把图整理一下,但是有人知道该怎么做吗?由于 x 轴是因子(factor),密度图不起作用;折线也不行,因为它会连接每一个点。
基于我当前的数据集模拟连续数据也可以(我只是不知道该怎么做)。
我只想强调基于组和 GDP 的值范围。
也许换一种类型的图会有所帮助,但是该用哪一种呢?欢迎任何意见!
下面是我的数据,已经按GDP排序了
structure(list(count = c(1, 0, 1, 3, 0, 1, 0, 1, 0, 1, 0, 2,
2, 0, 1, 0, 0, 2, 3, 0, 0, 1, 0, 2, 2, 0, 1, 0, 1, 0, 0, 1, 1,
0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0,
0, 1, 2, 0, 0, 6, 1, 1, 1, 0, 2, 1, 1, 0, 4, 2, 1, 0, 2, 4, 0,
1, 1, 0, 2, 1, 2, 1, 0, 2, 0, 1, 2, 1, 0, 2, 3, 2, 1, 0, 1, 1,
2, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 4, 0, 2, 0,
2, 0, 2, 0, 1, 1, 1, 1, 1, 5, 0, 0, 3, 3, 0, 0, 2, 1, 2, 0, 2,
1, 0, 0, 2, 1, 0, 1, 2, 3, 0, 0, 1, 4, 0, 0, 3, 0, 1, 0, 2, 4,
0, 1, 0, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 1,
0, 1, 1, 2, 0, 1, 1, 1, 0, 2, 0, 2, 0, 3, 1, 0, 4, 0, 2, 0, 0,
1, 1, 0, 0, 1, 0, 2, 0, 2, 0, 1, 0, 1, 0, 1, 0, 2, 2, 0), country = c("CHE",
"CHE", "NOR", "NOR", "IRL", "IRL", "SGP", "SGP", "USA", "USA",
"AUS", "AUS", "DNK", "DNK", "SWE", "SWE", "NLD", "NLD", "GBR",
"GBR", "SCT", "SCT", "FIN", "FIN", "CAN", "CAN", "DEU", "DEU",
"BEL", "BEL", "ARE", "ARE", "JPN", "JPN", "ITA", "ITA", "KOR",
"KOR", "BRN", "BRN", "SVN", "SVN", "BHR", "BHR", "PRT", "PRT",
"SAU", "SAU", "EST", "EST", "GRC", "GRC", "LTU", "LTU", "SYC",
"SYC", "LVA", "LVA", "CHL", "CHL", "HRV", "HRV", "CRI", "CRI",
"TUR", "TUR", "MYS", "MYS", "KAZ", "KAZ", "LCA", "LCA", "ROU",
"ROU", "MUS", "MUS", "GRD", "GRD", "MEX", "MEX", "RUS", "RUS",
"CHN", "CHN", "SUR", "SUR", "BRA", "BRA", "DOM", "DOM", "BGR",
"BGR", "MNE", "MNE", "THA", "THA", "COL", "COL", "SRB", "SRB",
"ZAF", "ZAF", "GUY", "GUY", "FJI", "FJI", "LBY", "LBY", "BIH",
"BIH", "AZE", "AZE", "MKD", "MKD", "JAM", "JAM", "IRQ", "IRQ",
"NAM", "NAM", "GEO", "GEO", "ALB", "ALB", "XKX", "XKX", "WSM",
"WSM", "LKA", "LKA", "JOR", "JOR", "ARM", "ARM", "EGY", "EGY",
"SWZ", "SWZ", "IDN", "IDN", "PSE", "PSE", "CPV", "CPV", "MDA",
"MDA", "MAR", "MAR", "VUT", "VUT", "PNG", "PNG", "NGA", "NGA",
"LAO", "LAO", "VNM", "VNM", "SLB", "SLB", "GHA", "GHA", "MRT",
"MRT", "KEN", "KEN", "PAK", "PAK", "BGD", "BGD", "HTI", "HTI",
"ZMB", "ZMB", "SEN", "SEN", "YEM", "YEM", "SDN", "SDN", "TLS",
"TLS", "KGZ", "KGZ", "BEN", "BEN", "TJK", "TJK", "TZA", "TZA",
"EAZ", "EAZ", "NPL", "NPL", "GIN", "GIN", "UGA", "UGA", "MLI",
"MLI", "ETH", "ETH", "BFA", "BFA", "GMB", "GMB", "LBR", "LBR",
"GNB", "GNB", "SLE", "SLE", "MOZ", "MOZ", "AFG", "AFG", "COD",
"COD", "MWI", "MWI"), group = c(1, 2, 2, 1, 2, 1, 1, 2, 1, 2,
2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1,
1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1,
2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2,
1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2,
1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1,
2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2,
1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1
)), row.names = c(33L, 34L, 151L, 152L, 89L, 90L, 175L, 176L,
205L, 206L, 9L, 10L, 49L, 50L, 187L, 188L, 149L, 150L, 65L, 66L,
169L, 170L, 61L, 62L, 31L, 32L, 47L, 48L, 13L, 14L, 5L, 6L, 99L,
100L, 93L, 94L, 107L, 108L, 29L, 30L, 185L, 186L, 23L, 24L, 159L,
160L, 167L, 168L, 57L, 58L, 77L, 78L, 119L, 120L, 191L, 192L,
121L, 122L, 35L, 36L, 83L, 84L, 45L, 46L, 199L, 200L, 143L, 144L,
101L, 102L, 115L, 116L, 163L, 164L, 139L, 140L, 79L, 80L, 127L,
128L, 165L, 166L, 37L, 38L, 183L, 184L, 27L, 28L, 51L, 52L, 21L,
22L, 133L, 134L, 193L, 194L, 41L, 42L, 181L, 182L, 217L, 218L,
81L, 82L, 63L, 64L, 113L, 114L, 25L, 26L, 11L, 12L, 129L, 130L,
95L, 96L, 91L, 92L, 145L, 146L, 67L, 68L, 3L, 4L, 213L, 214L,
211L, 212L, 117L, 118L, 97L, 98L, 7L, 8L, 55L, 56L, 189L, 190L,
87L, 88L, 161L, 162L, 43L, 44L, 125L, 126L, 123L, 124L, 209L,
210L, 157L, 158L, 147L, 148L, 109L, 110L, 207L, 208L, 177L, 178L,
69L, 70L, 137L, 138L, 103L, 104L, 155L, 156L, 19L, 20L, 85L,
86L, 219L, 220L, 173L, 174L, 215L, 216L, 171L, 172L, 197L, 198L,
105L, 106L, 15L, 16L, 195L, 196L, 201L, 202L, 53L, 54L, 153L,
154L, 71L, 72L, 203L, 204L, 131L, 132L, 59L, 60L, 17L, 18L, 73L,
74L, 111L, 112L, 75L, 76L, 179L, 180L, 135L, 136L, 1L, 2L, 39L,
40L, 141L, 142L), class = "data.frame")
这里有一个想法可以帮助形象化您试图展示的差异。首先,无论您如何标记,x 轴上的国家/地区名称都可能难以辨认。因此,x 轴上的国家 排名 可能会更好。
围绕这些点绘制一个多边形也许能在视觉上更直观地表达这个论点,但在统计上没有多大意义。这里更好的做法可能是为每个组分别绘制一条回归线。由于我们处理的是计数数据,可以使用泊松回归;并且由于 x 轴现在是数值型的排名,回归线可以贯穿整个图来展示趋势。
library(ggplot2)
library(dplyr)
library(forcats)
# Per-group Poisson regression of count against country rank.
# `rank` is the integer code of the reversed first-appearance factor, so the
# first country in `data` receives the highest rank.
data %>%
  mutate(
    group = factor(group),
    country = fct_rev(fct_inorder(country))
  ) %>%
  mutate(rank = as.numeric(country)) %>%
  ggplot(aes(x = rank, y = count, colour = group)) +
  # Small jitter separates overlapping integer counts.
  geom_jitter(width = 0.1, height = 0.05, alpha = 0.5) +
  # One GLM fit per colour group; the ribbon fill reuses the line colour.
  geom_smooth(
    aes(fill = after_scale(colour)),
    method = "glm",
    formula = y ~ x,
    method.args = list(family = poisson),
    alpha = 0.2
  ) +
  scale_colour_manual(values = c("orange", "deepskyblue4")) +
  labs(
    title = "Counts per group according to per capita GDP",
    x = "Country rank by GDP per capita",
    y = "Count (jittered for clarity)"
  ) +
  theme_bw()
我认为这张图很好地说明了这一点。为清楚起见,我可能会为几个国家/地区添加标签,以帮助读者理解 x 轴的刻度。
如果你真的想要围绕点绘制多边形,那么最好在每个集合周围绘制凸包,你可以这样做:
# Add the plotting columns to `data`: `group` as a factor, and `rank` as the
# integer code of the reversed first-appearance country factor (so the first
# country in `data` receives the highest rank).
data <- data %>%
  mutate(group = factor(group),
         country = fct_rev(fct_inorder(country)),
         rank = as.numeric(country))

# Rows lying on the convex hull of each group's (rank, count) points.
# chull() returns the hull vertices in order, so the sliced rows can be
# passed straight to geom_polygon(). ungroup() avoids leaving a grouped
# data frame behind.
hull <- data %>%
  group_by(group) %>%
  slice(chull(rank, count)) %>%
  ungroup()

ggplot(data, aes(rank, count)) +
  geom_polygon(aes(colour = group, fill = after_scale(colour)),
               alpha = 0.3, data = hull) +
  # Points are drawn at their exact positions (no jitter) so that they stay
  # inside their hull; the y-axis label therefore must not claim jitter.
  geom_point(aes(colour = group)) +
  theme_bw() +
  scale_colour_manual(values = c("orange", "deepskyblue4")) +
  ggtitle("Counts per group according to per capita GDP") +
  labs(x = "Country rank by GDP per capita",
       y = "Count")
尽管正如我所说,这没有太大的统计意义。
另一种想法是,您可以把数据框转换为宽格式(pivot),计算第 1 组和第 2 组之间的差值,然后针对按排名排列的国家/地区显示趋势线:
library(tidyr)
# Difference in counts (group 1 minus group 2) per country, with a linear
# trend across country rank. Here rank 1 is the first country in `data`.
data %>%
  mutate(
    group = factor(group),
    country = fct_inorder(country)
  ) %>%
  mutate(rank = as.numeric(country)) %>%
  # One row per country, with the two groups' counts in columns `1` and `2`.
  pivot_wider(names_from = group, values_from = count) %>%
  mutate(difference = `1` - `2`) %>%
  ggplot(aes(x = rank, y = difference)) +
  # Dashed reference line at "no difference between groups".
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point() +
  geom_smooth(
    aes(group = 1),
    method = "lm",
    formula = y ~ x,
    colour = "red",
    fill = "orange",
    alpha = 0.2
  ) +
  labs(
    title = "Counts per group according to per capita GDP",
    x = "Country rank by GDP per capita",
    y = "Difference in counts between group 1 and group 2"
  ) +
  theme_bw()
这样做的好处是您可以更轻松地标记各个国家/地区,并且非常清楚地显示您想要展示的关系。
由 reprex package (v2.0.1)
创建于 2022-03-05
我有一个包含约 150 个国家/地区的数据集、一个分组变量,以及每个国家/地区在每个组中的取值(0-6)。我想表明,GDP 较高的国家在其中一组中的数值高于另一组。我制作了一个散点图,按组显示每个国家/地区的值(这些国家/地区按 GDP 排序)。我想在这些点周围画一条轮廓线,这样就能更清楚地看出哪个组在哪个 GDP 范围内具有更高的值。 然而,我很茫然。
# Scatter plot of `count` per country, coloured by `group`.
# fct_inorder() orders the country levels by first appearance in `data`,
# and fct_rev() then reverses that order along the x axis.
ggplot(
  data,
  aes(x = fct_rev(fct_inorder(country)), y = count, color = group)
) +
  geom_point()
显然,我需要把图整理一下,但是有人知道该怎么做吗?由于 x 轴是因子(factor),密度图不起作用;折线也不行,因为它会连接每一个点。 基于我当前的数据集模拟连续数据也可以(我只是不知道该怎么做)。 我只想突出基于组和 GDP 的数值范围。 也许换一种类型的图会有所帮助,但是该用哪一种呢?欢迎任何意见! 下面是我的数据,已经按 GDP 排序了
structure(list(count = c(1, 0, 1, 3, 0, 1, 0, 1, 0, 1, 0, 2,
2, 0, 1, 0, 0, 2, 3, 0, 0, 1, 0, 2, 2, 0, 1, 0, 1, 0, 0, 1, 1,
0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 2, 0,
0, 1, 2, 0, 0, 6, 1, 1, 1, 0, 2, 1, 1, 0, 4, 2, 1, 0, 2, 4, 0,
1, 1, 0, 2, 1, 2, 1, 0, 2, 0, 1, 2, 1, 0, 2, 3, 2, 1, 0, 1, 1,
2, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 4, 0, 2, 0,
2, 0, 2, 0, 1, 1, 1, 1, 1, 5, 0, 0, 3, 3, 0, 0, 2, 1, 2, 0, 2,
1, 0, 0, 2, 1, 0, 1, 2, 3, 0, 0, 1, 4, 0, 0, 3, 0, 1, 0, 2, 4,
0, 1, 0, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 1,
0, 1, 1, 2, 0, 1, 1, 1, 0, 2, 0, 2, 0, 3, 1, 0, 4, 0, 2, 0, 0,
1, 1, 0, 0, 1, 0, 2, 0, 2, 0, 1, 0, 1, 0, 1, 0, 2, 2, 0), country = c("CHE",
"CHE", "NOR", "NOR", "IRL", "IRL", "SGP", "SGP", "USA", "USA",
"AUS", "AUS", "DNK", "DNK", "SWE", "SWE", "NLD", "NLD", "GBR",
"GBR", "SCT", "SCT", "FIN", "FIN", "CAN", "CAN", "DEU", "DEU",
"BEL", "BEL", "ARE", "ARE", "JPN", "JPN", "ITA", "ITA", "KOR",
"KOR", "BRN", "BRN", "SVN", "SVN", "BHR", "BHR", "PRT", "PRT",
"SAU", "SAU", "EST", "EST", "GRC", "GRC", "LTU", "LTU", "SYC",
"SYC", "LVA", "LVA", "CHL", "CHL", "HRV", "HRV", "CRI", "CRI",
"TUR", "TUR", "MYS", "MYS", "KAZ", "KAZ", "LCA", "LCA", "ROU",
"ROU", "MUS", "MUS", "GRD", "GRD", "MEX", "MEX", "RUS", "RUS",
"CHN", "CHN", "SUR", "SUR", "BRA", "BRA", "DOM", "DOM", "BGR",
"BGR", "MNE", "MNE", "THA", "THA", "COL", "COL", "SRB", "SRB",
"ZAF", "ZAF", "GUY", "GUY", "FJI", "FJI", "LBY", "LBY", "BIH",
"BIH", "AZE", "AZE", "MKD", "MKD", "JAM", "JAM", "IRQ", "IRQ",
"NAM", "NAM", "GEO", "GEO", "ALB", "ALB", "XKX", "XKX", "WSM",
"WSM", "LKA", "LKA", "JOR", "JOR", "ARM", "ARM", "EGY", "EGY",
"SWZ", "SWZ", "IDN", "IDN", "PSE", "PSE", "CPV", "CPV", "MDA",
"MDA", "MAR", "MAR", "VUT", "VUT", "PNG", "PNG", "NGA", "NGA",
"LAO", "LAO", "VNM", "VNM", "SLB", "SLB", "GHA", "GHA", "MRT",
"MRT", "KEN", "KEN", "PAK", "PAK", "BGD", "BGD", "HTI", "HTI",
"ZMB", "ZMB", "SEN", "SEN", "YEM", "YEM", "SDN", "SDN", "TLS",
"TLS", "KGZ", "KGZ", "BEN", "BEN", "TJK", "TJK", "TZA", "TZA",
"EAZ", "EAZ", "NPL", "NPL", "GIN", "GIN", "UGA", "UGA", "MLI",
"MLI", "ETH", "ETH", "BFA", "BFA", "GMB", "GMB", "LBR", "LBR",
"GNB", "GNB", "SLE", "SLE", "MOZ", "MOZ", "AFG", "AFG", "COD",
"COD", "MWI", "MWI"), group = c(1, 2, 2, 1, 2, 1, 1, 2, 1, 2,
2, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1,
1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1,
2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2,
1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2,
1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1,
2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2,
1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1
)), row.names = c(33L, 34L, 151L, 152L, 89L, 90L, 175L, 176L,
205L, 206L, 9L, 10L, 49L, 50L, 187L, 188L, 149L, 150L, 65L, 66L,
169L, 170L, 61L, 62L, 31L, 32L, 47L, 48L, 13L, 14L, 5L, 6L, 99L,
100L, 93L, 94L, 107L, 108L, 29L, 30L, 185L, 186L, 23L, 24L, 159L,
160L, 167L, 168L, 57L, 58L, 77L, 78L, 119L, 120L, 191L, 192L,
121L, 122L, 35L, 36L, 83L, 84L, 45L, 46L, 199L, 200L, 143L, 144L,
101L, 102L, 115L, 116L, 163L, 164L, 139L, 140L, 79L, 80L, 127L,
128L, 165L, 166L, 37L, 38L, 183L, 184L, 27L, 28L, 51L, 52L, 21L,
22L, 133L, 134L, 193L, 194L, 41L, 42L, 181L, 182L, 217L, 218L,
81L, 82L, 63L, 64L, 113L, 114L, 25L, 26L, 11L, 12L, 129L, 130L,
95L, 96L, 91L, 92L, 145L, 146L, 67L, 68L, 3L, 4L, 213L, 214L,
211L, 212L, 117L, 118L, 97L, 98L, 7L, 8L, 55L, 56L, 189L, 190L,
87L, 88L, 161L, 162L, 43L, 44L, 125L, 126L, 123L, 124L, 209L,
210L, 157L, 158L, 147L, 148L, 109L, 110L, 207L, 208L, 177L, 178L,
69L, 70L, 137L, 138L, 103L, 104L, 155L, 156L, 19L, 20L, 85L,
86L, 219L, 220L, 173L, 174L, 215L, 216L, 171L, 172L, 197L, 198L,
105L, 106L, 15L, 16L, 195L, 196L, 201L, 202L, 53L, 54L, 153L,
154L, 71L, 72L, 203L, 204L, 131L, 132L, 59L, 60L, 17L, 18L, 73L,
74L, 111L, 112L, 75L, 76L, 179L, 180L, 135L, 136L, 1L, 2L, 39L,
40L, 141L, 142L), class = "data.frame")
这里有一个想法可以帮助形象化您试图展示的差异。首先,无论您如何标记,x 轴上的国家/地区名称都可能难以辨认。因此,x 轴上的国家 排名 可能会更好。
围绕这些点绘制一个多边形也许能在视觉上更直观地表达这个论点,但在统计上没有多大意义。这里更好的做法可能是为每个组分别绘制一条回归线。由于我们处理的是计数数据,可以使用泊松回归;并且由于 x 轴现在是数值型的排名,回归线可以贯穿整个图来展示趋势。
library(ggplot2)
library(dplyr)
library(forcats)
# Per-group Poisson regression of count against country rank.
# `rank` is the integer code of the reversed first-appearance factor, so the
# first country in `data` receives the highest rank.
data %>%
  mutate(
    group = factor(group),
    country = fct_rev(fct_inorder(country))
  ) %>%
  mutate(rank = as.numeric(country)) %>%
  ggplot(aes(x = rank, y = count, colour = group)) +
  # Small jitter separates overlapping integer counts.
  geom_jitter(width = 0.1, height = 0.05, alpha = 0.5) +
  # One GLM fit per colour group; the ribbon fill reuses the line colour.
  geom_smooth(
    aes(fill = after_scale(colour)),
    method = "glm",
    formula = y ~ x,
    method.args = list(family = poisson),
    alpha = 0.2
  ) +
  scale_colour_manual(values = c("orange", "deepskyblue4")) +
  labs(
    title = "Counts per group according to per capita GDP",
    x = "Country rank by GDP per capita",
    y = "Count (jittered for clarity)"
  ) +
  theme_bw()
我认为这张图很好地说明了这一点。为清楚起见,我可能会为几个国家/地区添加标签,以帮助读者理解 x 轴的刻度。
如果你真的想要围绕点绘制多边形,那么最好在每个集合周围绘制凸包,你可以这样做:
# Add the plotting columns to `data`: `group` as a factor, and `rank` as the
# integer code of the reversed first-appearance country factor (so the first
# country in `data` receives the highest rank).
data <- data %>%
  mutate(group = factor(group),
         country = fct_rev(fct_inorder(country)),
         rank = as.numeric(country))

# Rows lying on the convex hull of each group's (rank, count) points.
# chull() returns the hull vertices in order, so the sliced rows can be
# passed straight to geom_polygon(). ungroup() avoids leaving a grouped
# data frame behind.
hull <- data %>%
  group_by(group) %>%
  slice(chull(rank, count)) %>%
  ungroup()

ggplot(data, aes(rank, count)) +
  geom_polygon(aes(colour = group, fill = after_scale(colour)),
               alpha = 0.3, data = hull) +
  # Points are drawn at their exact positions (no jitter) so that they stay
  # inside their hull; the y-axis label therefore must not claim jitter.
  geom_point(aes(colour = group)) +
  theme_bw() +
  scale_colour_manual(values = c("orange", "deepskyblue4")) +
  ggtitle("Counts per group according to per capita GDP") +
  labs(x = "Country rank by GDP per capita",
       y = "Count")
尽管正如我所说,这没有太大的统计意义。
另一种想法是,您可以把数据框转换为宽格式(pivot),计算第 1 组和第 2 组之间的差值,然后针对按排名排列的国家/地区显示趋势线:
library(tidyr)
# Difference in counts (group 1 minus group 2) per country, with a linear
# trend across country rank. Here rank 1 is the first country in `data`.
data %>%
  mutate(
    group = factor(group),
    country = fct_inorder(country)
  ) %>%
  mutate(rank = as.numeric(country)) %>%
  # One row per country, with the two groups' counts in columns `1` and `2`.
  pivot_wider(names_from = group, values_from = count) %>%
  mutate(difference = `1` - `2`) %>%
  ggplot(aes(x = rank, y = difference)) +
  # Dashed reference line at "no difference between groups".
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point() +
  geom_smooth(
    aes(group = 1),
    method = "lm",
    formula = y ~ x,
    colour = "red",
    fill = "orange",
    alpha = 0.2
  ) +
  labs(
    title = "Counts per group according to per capita GDP",
    x = "Country rank by GDP per capita",
    y = "Difference in counts between group 1 and group 2"
  ) +
  theme_bw()
这样做的好处是您可以更轻松地标记各个国家/地区,并且非常清楚地显示您想要展示的关系。
由 reprex package (v2.0.1)
创建于 2022-03-05