为什么来自 forcats 的 fct_lump 无法处理我的数据？

Question

我已经有几个月没用 R 了，所以下面的问题可能只是因为我手生。

我在互联网上找到了这个数据集。我对它做了一些处理，所以在这里用 dput() 把处理后的数据贴出来；原始数据来自 https://ourworldindata.org/terrorism 。

> dput(ter)
structure(list(region = c("Afghanistan", "Albania", "Algeria", 
"Angola", "Argentina", "Australasia & Oceania", "Australia", 
"Austria", "Azerbaijan", "Bahrain", "Bangladesh", "Belgium", 
"Brazil", "Burkina Faso", "Burundi", "Cameroon", "Canada", "Central African Republic", 
"Central America & Caribbean", "Central Asia", "Chad", "Chile", 
"China", "Colombia", "Cote d'Ivoire", "Czech Republic", "Democratic Republic of the Congo", 
"Djibouti", "Dominican Republic", "East Asia", "Eastern Europe", 
"Ecuador", "Egypt", "Ethiopia", "Finland", "France", "Gabon", 
"Georgia", "Germany", "Greece", "Honduras", "India", "Indonesia", 
"Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Jordan", 
"Kenya", "Kosovo", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", 
"Liberia", "Libya", "Malawi", "Malaysia", "Maldives", "Mali", 
"Malta", "Mexico", "Middle East & North Africa", "Mozambique", 
"Myanmar", "Nepal", "Netherlands", "Niger", "Nigeria", "North America", 
"Macedonia", "Norway", "Pakistan", "Palestine", "Papua New Guinea", 
"Paraguay", "Peru", "Philippines", "Poland", "Russia", "Rwanda", 
"Saudi Arabia", "Serbia", "Sierra Leone", "Somalia", "South Africa", 
"South America", "South Asia", "South Sudan", "Southeast Asia", 
"Spain", "Sri Lanka", "Sub-Saharan Africa", "Sudan", "Sweden", 
"Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Tunisia", 
"Turkey", "Uganda", "Ukraine", "UK", "USA", "Venezuela", "Vietnam", 
"Western Europe", "World", "Yemen", "Zambia", "Zimbabwe"), Code = c("AFG", 
"ALB", "DZA", "AGO", "ARG", NA, "AUS", "AUT", "AZE", "BHR", "BGD", 
"BEL", "BRA", "BFA", "BDI", "CMR", "CAN", "CAF", NA, NA, "TCD", 
"CHL", "CHN", "COL", "CIV", "CZE", "COD", "DJI", "DOM", NA, NA, 
"ECU", "EGY", "ETH", "FIN", "FRA", "GAB", "GEO", "DEU", "GRC", 
"HND", "IND", "IDN", "IRN", "IRQ", "IRL", "ISR", "ITA", "JAM", 
"JOR", "KEN", "OWID_KOS", "KGZ", "LAO", "LVA", "LBN", "LBR", 
"LBY", "MWI", "MYS", "MDV", "MLI", "MLT", "MEX", NA, "MOZ", "MMR", 
"NPL", "NLD", "NER", "NGA", NA, "MKD", "NOR", "PAK", "PSE", "PNG", 
"PRY", "PER", "PHL", "POL", "RUS", "RWA", "SAU", "SRB", "SLE", 
"SOM", "ZAF", NA, NA, "SSD", NA, "ESP", "LKA", NA, "SDN", "SWE", 
"SYR", "TWN", "TJK", "TZA", "THA", "TUN", "TUR", "UGA", "UKR", 
"GBR", "USA", "VEN", "VNM", NA, "OWID_WRL", "YEM", "ZMB", "ZWE"
), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017), `Terrorism fatalities (GTD, 2018)` = c(6092, 
0, 12, 7, 0, 4, 4, 2, 5, 6, 25, 2, 0, 53, 20, 228, 6, 601, 4, 
6, 62, 0, 16, 84, 3, 0, 596, 0, 2, 16, 101, 0, 877, 67, 2, 7, 
0, 0, 1, 0, 2, 465, 20, 39, 6476, 0, 3, 0, 0, 4, 126, 0, 0, 1, 
0, 17, 0, 289, 0, 4, 1, 361, 1, 23, 10819, 22, 218, 4, 0, 148, 
1805, 124, 0, 0, 1076, 50, 0, 4, 8, 496, 0, 61, 2, 31, 0, 0, 
1912, 21, 101, 7664, 581, 811, 21, 1, 6712, 82, 5, 2026, 0, 1, 
8, 72, 5, 222, 7, 40, 42, 95, 5, 0, 83, 26445, 762, 0, 0)), class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -115L), spec = structure(list(
    cols = list(Entity = structure(list(), class = c("collector_character", 
    "collector")), Code = structure(list(), class = c("collector_character", 
    "collector")), Year = structure(list(), class = c("collector_double", 
    "collector")), `Terrorism fatalities (GTD, 2018)` = structure(list(), class = c("collector_double", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), skip = 1), class = "col_spec"))

我尝试了很多办法，并不断把代码精简到最关键的部分。最后，我使用了 vignette("forcats") 中的示例代码（我已经加载了 tidyverse），并在不同的数据集上进行了尝试：

# Reproduction: run the same "lump, then count" pipeline on three data sets.
# Assumes the tidyverse (dplyr + forcats) is already attached, as stated in
# the question. Per the forcats documentation, fct_lump(f, n = 5) keeps the
# most frequent levels of f and collapses the remaining ones into "Other".

# starwars: lump skin_color, then tally rows per resulting level, sorted by
# count (sort = TRUE).
starwars %>%
  mutate(skin_color = fct_lump(skin_color, n = 5)) %>%
  count(skin_color, sort = TRUE)
  
# ter (the asker's data): same pipeline applied to region. Note that the new
# column is named `hair` even though it is derived from region.
ter %>%
  mutate(hair = fct_lump(region, n = 5)) %>%
  count(hair, sort = TRUE)

# gss_cat: same pipeline applied to relig.
gss_cat %>%
  mutate(relig = fct_lump(relig, n = 5)) %>%
  count(relig, sort = TRUE)

它在 starwars 和 gss_cat 上都按预期工作，但在 ter 上却不行（我的数据）：

> ter %>%
+   mutate(hair = fct_lump(region, n = 5)) %>%
+   count(hair, sort = TRUE)
# A tibble: 115 x 2
   hair                      n
   <fct>                 <int>
 1 Afghanistan               1
 2 Albania                   1
 3 Algeria                   1
 4 Angola                    1
 5 Argentina                 1
 6 Australasia & Oceania     1
 7 Australia                 1
 8 Austria                   1
 9 Azerbaijan                1
10 Bahrain                   1
# … with 105 more rows

为什么会这样？为什么 fct_lump() 在这里不起作用？

Answer 1

fct_lump() 是按各水平出现的频次来合并的；而在您的数据中每个地区只出现一次，所有水平的频次都相同，因此没有任何水平会被合并。您似乎希望将死亡人数少于 5 人的地区归为“其他”类别，这在 base R 中很简单：

# Collapse every region whose fatality count is below a cut-off into a single
# "Other" category, using base R only.

# Cut-off below which a region is relabelled "Other"; change this one value
# (e.g. to 500) to get a coarser grouping.
fatality_threshold <- 5

# Work on plain character values so that "Other" can be assigned even if
# region is already a factor (e.g. when this snippet is re-run).
ter$region <- as.character(ter$region)

# which() drops NA comparisons, so a row with a missing count keeps its name.
is_low <- ter$`Terrorism fatalities (GTD, 2018)` < fatality_threshold
ter$region[which(is_low)] <- "Other"

# Convert to a factor so forcats helpers can be applied afterwards.
ter$region <- factor(ter$region)

如果愿意，您还可以使用 forcats 按死亡人数对因子水平重新排序：

# Reorder the levels of region by the fatality counts, relying on
# fct_reorder()'s default summary function, so the bars are drawn in order
# of fatalities rather than alphabetically.
ter[["region"]] <- fct_reorder(
  .f = ter[["region"]],
  .x = ter[["Terrorism fatalities (GTD, 2018)"]]
)

# Bar chart of fatalities per region. The x-axis labels are rotated 90
# degrees and right-justified so the long region names stay legible.
ggplot(
  data = ter,
  mapping = aes(x = region, y = `Terrorism fatalities (GTD, 2018)`)
) +
  geom_col() +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))

或者，如果您运行以上代码，但把阈值改为 500，即将所有死亡人数低于 500 的地区合并在一起，您会得到：

为什么来自 forcats 的 fct_lump 无法处理我的数据？

Why is fct_lump from forcats not working on my data?

r

forcats