在 R 中绘制疾病发生率

Question

我需要绘制路易斯安那州疾病发病率的基本地图。

我有一个包含比率和教区信息的数据集。这是输出信息：

structure(list(Jurisdiction = c("Acadia", "Allen", "Ascension", 
"Assumption", "Avoyelles", "Beauregard", "Bienville", "Bossier", 
"Caddo", "Calcasieu", "Caldwell", "Catahoula", "Claiborne", "Concordia", 
"De Soto", "East Baton Rouge", "East Feliciana", "Evangeline", 
"Franklin", "Grant", "Iberia", "Iberville", "Jefferson", "Jefferson Davis", 
"Lafayette", "Lafourche", "Lincoln", "Livingston", "Madison", 
"Morehouse", "Natchitoches", "Orleans", "Ouachita", "Pointe Coupee", 
"Rapides", "Red River", "Richland", "Sabine", "Saint Bernard", 
"Saint Charles", "Saint Helena", "Saint James", "Saint John the Baptist", 
"Saint Landry", "Saint Martin", "Saint Tammany", "Tangipahoa", 
"Terrebonne", "Union", "Vermilion", "Vernon", "Washington", "Webster", 
"West Baton Rouge", "West Carroll", "West Feliciana", "Winn"), 
    Years = c("2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021", "2002-2021", "2002-2021", 
    "2002-2021", "2002-2021", "2002-2021"), rate100000 = c(0.163400216570434, 
    0.773806761122469, 1.65227799921151, 0.867207167609851, 0.956441805703603, 
    0.858602956670678, 0.704622322435175, 2.1463852687784, 3.03975879085029, 
    0.992596177497003, 2.00031262926478, 2.31216939229976, 0.601442519925039, 
    0.974473826664487, 1.12761018589066, 1.78582784205704, 1.69595218218634, 
    0.721123176565945, 1.16899967441146, 1.60279309368021, 0.810239968126418, 
    1.40034303718522, 0.731834312215526, 0.782371614520909, 0.484822337840045, 
    0.572633660703915, 0.881070491209755, 2.56515209988791, 1.27978003550004, 
    1.52480076954115, 0.641417097809484, 0.787839051984175, 2.30239642601005, 
    1.70341830192293, 2.90244465147414, 0.549510935267612, 1.16919425332635, 
    0.416023139389566, 0.192913247026751, 0.190299725272222, 
    2.88451607972779, 0.93669884525235, 0.336595944311048, 0.353212942215535, 
    0.374148069226462, 1.81203420478071, 2.3269113699404, 0.229665795513672, 
    0.889087973266384, 0.177886189636677, 0.297148057013978, 
    3.00397880577133, 1.08076419655843, 1.60933044309869, 2.01777233984116, 
    0.639093235215252, 0.875034298442136)), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -57L))

我使用以下代码为路易斯安那州创建了一个地图数据集：LAmap <- map_data("state", region = "Louisiana")。这是数据集的样子：

我想合并这两个数据集，但是 LAmap 一个给了我纬度和经度，但没有关于它对应的教区（县）的信息。我错过了什么吗？

谢谢！

Answer 1

使用 map_data，您需要使用 county 而不是 state 来获得正确的子区域。然后，我们可以使用 left_join 按县将它们合并在一起（即 subregion 和 Jurisdiction）。存在字母大小写差异，因此我首先将 Jurisdiction 转换为小写以匹配来自 map_data 的数据。注：df是来自dput.

的OP数据

library(tidyverse)

results <- map_data("county", region = "Louisiana") %>%
  left_join(.,
            df %>% mutate(Jurisdiction = tolower(Jurisdiction)),
            by = c("subregion" = "Jurisdiction"))

输出

head(results)

       long      lat group order    region subregion     Years rate100000
1 -92.61863 30.48136     1     1 louisiana    acadia 2002-2021  0.1634002
2 -92.48685 30.48709     1     2 louisiana    acadia 2002-2021  0.1634002
3 -92.48685 30.48709     1     3 louisiana    acadia 2002-2021  0.1634002
4 -92.24048 30.48709     1     4 louisiana    acadia 2002-2021  0.1634002
5 -92.24048 30.44698     1     5 louisiana    acadia 2002-2021  0.1634002
6 -92.17745 30.44698     1     6 louisiana    acadia 2002-2021  0.1634002

或者我们可以使用fuzzy_join，在这里我们可以在函数中使用忽略字母大小写。

library(fuzzyjoin)

regex_inner_join(
  map_data("county", region = "Louisiana"),
  df,
  by = c("subregion" = "Jurisdiction"),
  ignore_case = TRUE
)

或者 merge 可以用基数 R:

df$Jurisdiction <- tolower(df$Jurisdiction)

results_baseR <- merge(map_data("county", region = "Louisiana"), df, by.x = "subregion", by.y = "Jurisdiction", ignore.case=TRUE)

Answer 2

让我们尝试使用 OpenStreetMap 数据：

library(sf)
library(tidyverse)
library(osmdata)

你的结构：

rt <- structure(list(Jurisdiction = c("Acadia", "Allen", "Ascension", 
[...] class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -57L))

让我们找到路易斯安那州的边界框并获取一些边界

lbb <- getbb("Louisiana, US", format_out = "matrix")
addM <- matrix(data = c(-0.01, -0.01, 0.01, 0.01), nrow = 2, ncol = 2)
lbb <- lbb + addM

boundaries <- opq(lbb, timeout = 120) |> 
  add_osm_feature(key = "boundary") |>
  osmdata_sf()

作为 Overpass returns bbox（带邻域）的完整数据，我们必须找到洛杉矶州的多边形。它将用于仅过滤掉 LA'a 县：

LA <- boundaries$osm_multipolygons |>
  filter(name == "Louisiana")

最后，我们将过滤掉县（教区），更改名称以匹配您的数据，将 St. 更改为 Saint 和 left_join() 您的数据并绘制它：

a <- boundaries$osm_multipolygons |>
  filter(border_type == "county") |>
  select(osm_id, name, geometry) |>
  mutate(name = str_replace(name, " Parish", "")) |>
  mutate(name = str_replace(name, "St.", "Saint")) |>
  filter(st_within(geometry, LA$geometry, sparse = FALSE)) |>
  left_join(rt, by = c("name" = "Jurisdiction"))

plot(a["rate100000"])

在 R 中绘制疾病发生率

Mapping disease rates in R

mapping

r