R - ggmap - 通过地理编码计算城市之间的最短距离

R - ggmap - calculate shortest distance between cities via geocoding

我有一个城市列表和我放在数据框中的相关信息,如下所示:

library(plyr)
library(dplyr)
library(ggmap)
library(Imap)

cities <- c("washington, dc", "wilmington, de", "amarillo, tx", 
            "denver, co", "needham, ma", "philadelphia, pa", 
            "doylestown, pa", "galveston, tx", "tuscaloosa, al", 
            "hollywood, fl"
            )

id <- c(156952, 154222, 785695, 154423, 971453, 149888, 1356987,
        178946, 169944, 136421)

month <- c(201811, 201811, 201912, 201912, 202005, 202005, 
           202005, 202106, 202106, 202106 )

category<- c("home", "work", "home", "home", "home", "work",
             "cell", "home", "work", "cell")

places <- data.frame(cities, id, category, month)

使用 Imapggmap 包,我可以检索每个城市的经度和纬度:

lat <- geocode(location = places$cities, source = "google")$lat
lon <- geocode(location = places$cities, source = "google")$lon

places <- cbind(places, lat, lon)

我想做的是:

  1. 按月份和类别计算每个城市之间的距离
  2. return第二短的距离和对应的城市和id分列在places

我写了一个for循环来计算距离:

for (i in 1:nrow(places)) {




dist_list[[i]] <- gdist(lon.1 = places$lon[i], 
                          lat.1 = places$lat[i], 
                          lon.2 = places$lon, 
                          lat.2 = places$lat,
                          units="miles")
  
}

产生以下数据:

dput(dist_list)
list(c(0, 98.3464717885451, 1386.25425677199, 1489.87718040776, 
383.083760289456, 123.232894969413, 140.284537078237, 1209.23510542932, 
706.670452283757, 906.79542720295), c(98.4762434610638, 0, 1472.06660056474, 
1560.93398322985, 285.23618862797, 24.9195071209828, 44.8853561530985, 
1308.60741637919, 805.755084908157, 983.102810248198), c(1389.07354011351, 
1472.06660056474, 0, 356.573530670257, 1712.29111612461, 1493.39302974566, 
1497.2125164277, 579.329313217289, 827.577713357261, 1434.82691622332
), c(1492.80130415651, 1560.93398322985, 356.573530670257, 0, 
1761.3773163288, 1578.71125031146, 1576.80713231756, 923.725006795209, 
1067.04809350934, 1717.32991551111), c(383.551997010915, 285.23618862797, 
1712.29111612461, 1761.3773163288, 0, 260.382178510916, 243.947043197789, 
1588.85470703957, 1088.38640303169, 1230.47219244291), c(123.395655314093, 
24.9195071209827, 1493.39302974566, 1578.71125031146, 260.382178510916, 
0, 24.7382114555287, 1333.29925285915, 830.581742827321, 1002.94777739349
), c(140.431447025301, 44.8853561530986, 1497.2125164277, 1576.80713231756, 
243.947043197789, 24.7382114555285, 0, 1346.44527983873, 844.827513981938, 
1026.98263808807), c(1211.16392416136, 1308.60741637919, 579.329313217289, 
923.725006795209, 1588.85470703957, 1333.29925285915, 1346.44527983873, 
0, 505.292529136012, 925.512554201542), c(707.73957320737, 805.755084908157, 
827.577713357261, 1067.04809350934, 1088.38640303169, 830.581742827321, 
844.827513981938, 505.292529136012, 0, 666.837848781548), c(906.880841903584, 
983.102810248198, 1434.82691622332, 1717.32991551111, 1230.47219244291, 
1002.94777739349, 1026.98263808807, 925.512554201542, 666.837848781548, 
0))

所需的结果如下所示(第一行):

cities          id         category  month      lat        lon   min.dist  closest city  closest city id  
washington, dc  156952     home      201811 38.90719  -77.03687   98.34647  wilmington, de  154222 

然后通过Rfast中的nth函数我可以得到第二小的距离

nth(dist_list[[1]], 2)

我遇到的问题是我不知道如何将列表中的信息连接到 df places。任何帮助或建议将不胜感激。

# get min distance:
min_d <- sapply(dist_list, function(x) sort(x)[2])
places$min_dist <- min_d
# index:
i <- sapply(dist_list, function(x) which(sort(x)[2] == x))
# add name:
places$min_name <- places$cities[i]

分组:

# prepare dist matrix outside loop
m <- t(as.data.frame(dist_list))
row.names(m) <- NULL
diag(m) <- NA

# create grouping variable:
gv <- as.integer(factor(places$month)) # or:
# gv <- as.integer(factor(paste(places$month, places$category)))

# set distance to NA if not in relevant group:
i <- sapply(gv, function(x) gv == x)
m[!i] <- NA

l <- sapply(as.data.frame(t(m)), function(x) {
  if (all(is.na(x))) return(list(NA, NA))
  mv <- min(x, na.rm = T)
  i <- which(x == mv)
  list(mv, i)
})
l
places <- cbind(places, min_dist = unlist(l[1, ]), min_nr = unlist(l[2, ]))

places$min_name <- places$cities[places$min_nr] # add name
places$min_id <- places$id[places$min_nr] # add id
places

结果:

              cities      id category  month  min_dist min_nr         min_name  min_id
V1    washington, dc  156952     home 201811  98.34647      2   wilmington, de  154222
V2    wilmington, de  154222     work 201811  98.47624      1   washington, dc  156952
V3      amarillo, tx  785695     home 201912 356.57353      4       denver, co  154423
V4        denver, co  154423     home 201912 356.57353      3     amarillo, tx  785695
V5       needham, ma  971453     home 202005 243.94704      7   doylestown, pa 1356987
V6  philadelphia, pa  149888     work 202005  24.73821      7   doylestown, pa 1356987
V7    doylestown, pa 1356987     cell 202005  24.73821      6 philadelphia, pa  149888
V8     galveston, tx  178946     home 202106 505.29253      9   tuscaloosa, al  169944
V9    tuscaloosa, al  169944     work 202106 505.29253      8    galveston, tx  178946
V10    hollywood, fl  136421     cell 202106 666.83785      9   tuscaloosa, al  169944

更新

假设我们只按month分组,我们可以试试下面的代码

f <- function(df) {
    r <- list()
    for (i in 1:nrow(df)) {
        x <- c()
        for (j in 1:nrow(df)) {
            x <- append(
                x,
                with(df, gdist(lat[i], lon[i], lat[j], lon[j], units = "miles"))
            )
        }
        x <- replace(x, x == 0, Inf)
        idx <- which.min(x)
        r[[i]] <- data.frame(
            min.dist = min(x),
            closest_city = df$cities[idx],
            closest_city_id = df$id[idx]
        )
    }
    do.call(rbind, r)
}

places %>%
    group_by(month) %>%
    do(cbind(., f(.))) %>%
    ungroup()

这给出了

# A tibble: 10 x 9
   cities               id category  month   lat    lon min.dist closest_city   
   <chr>             <int> <chr>     <int> <dbl>  <dbl>    <dbl> <chr>
 1 washington, dc   156952 home     201811  38.9  -77.0   104.   wilmington, de
 2 wilmington, de   154222 work     201811  39.7  -75.5   104.   washington, dc
 3 amarillo, tx     785695 home     201912  35.2 -102.    232.   denver, co
 4 denver, co       154423 home     201912  39.8 -105.    232.   amarillo, tx
 5 needham, ma      971453 home     202005  42.3  -71.2   273.   doylestown, pa
 6 philadelphia, ~  149888 work     202005  40.0  -75.2     6.81 doylestown, pa
 7 doylestown, pa  1356987 cell     202005  40.3  -75.1     6.81 philadelphia, ~
 8 galveston, tx    178946 home     202106  29.2  -94.9 11405.   hollywood, fl
 9 tuscaloosa, al   169944 work     202106  33.2  -87.6   517.   hollywood, fl
10 hollywood, fl    136421 cell     202106  26.0  -80.1   517.   tuscaloosa, al
# ... with 1 more variable: closest_city_id <int>

根据你得到的dist_list,我们可以试试下面的代码

closest <- do.call(
    rbind,
    lapply(
        dist_list,
        function(x) {
            x <- replace(x, x == 0, Inf)
            idx <- which.min(x)
            with(
                places,
                data.frame(
                    min.dist = min(x),
                    closest_city = cities[idx],
                    closest_city_id = id[idx]
                )
            )
        }
    )
)

这给出了

    min.dist     closest_city closest_city_id
1   98.34647   wilmington, de          154222
2   24.91951 philadelphia, pa          149888
3  356.57353       denver, co          154423
4  356.57353     amarillo, tx          785695
5  243.94704   doylestown, pa         1356987
6   24.73821   doylestown, pa         1356987
7   24.73821 philadelphia, pa          149888
8  505.29253   tuscaloosa, al          169944
9  505.29253    galveston, tx          178946
10 666.83785   tuscaloosa, al          169944

此外,如果您想将上述数据框附加到现有的 places,您可以使用

places <- cbind(places, closest)

使用sf::st_distance()

鉴于您正在处理空间数据,我建议使用一种基于空间库的方法,例如 {sf}

library(tidyverse)
library(tidygeocoder)
library(sf)

# clean location, geocode, and convert to sf object
places <- places %>% 
  separate(cities, into = c("city", "state"), sep = ", ") %>% 
  geocode(city = city, state = state) %>% 
  st_as_sf(coords = c("long", "lat"), crs = 4269)

# sanity check
mapview::mapview(places)

# calculate distances between point pairs with st_distance()
compute_close_city <- function(i){
  # compute distances btwn a point and its neighbors (excluding itself)
  distances = st_distance(places[i, ], places[-i, ])
  # index of the nearest neighbor
  j = which.min(distances)
  
  # organize and return the result
  result <- tibble(
    close_city   = places$city[-i][j],  # closest city
    close_state  = places$state[-i][j], # closest state
    close_dist_m = distances[j]         # distance in meters
  )
  
  return(result)
}

# calculate close cities and distances, bind results into dataframe
close_df <- map_df(1:nrow(places), ~compute_close_city(.x))

# bind the result to the places data frame
places <- bind_cols(places, close_df)

# view the result and verify it works
select(places, city, close_city, close_dist_m)

Returns:

Simple feature collection with 10 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -104.9849 ymin: 26.0112 xmax: -75.13046 ymax: 40.31004
Geodetic CRS:  NAD83
# A tibble: 10 x 4
   city         close_city   close_dist_m             geometry
   <chr>        <chr>                 [m]          <POINT [°]>
 1 washington   wilmington      159476.85 (-77.03656 38.89499)
 2 wilmington   philadelphia     40022.81 (-75.54659 39.74595)
 3 amarillo     denver          574956.04 (-101.8338 35.20722)
 4 denver       amarillo        574956.04 (-104.9849 39.73924)
 5 needham      tuscaloosa      153463.74 (-88.33309 31.98683)
 6 philadelphia doylestown       39775.87 (-75.16353 39.95272)
 7 doylestown   philadelphia     39775.87 (-75.13046 40.31004)
 8 galveston    needham         687140.47 (-94.79459 29.29933)
 9 tuscaloosa   needham         153463.74 (-87.56753 33.20956)
10 hollywood    needham        1035934.60  (-80.14949 26.0112)