R - ggmap - 通过地理编码计算城市之间的最短距离
R - ggmap - calculate shortest distance between cities via geocoding
我有一个城市列表和我放在数据框中的相关信息,如下所示:
library(plyr)
library(dplyr)
library(ggmap)
library(Imap)
cities <- c("washington, dc", "wilmington, de", "amarillo, tx",
"denver, co", "needham, ma", "philadelphia, pa",
"doylestown, pa", "galveston, tx", "tuscaloosa, al",
"hollywood, fl"
)
id <- c(156952, 154222, 785695, 154423, 971453, 149888, 1356987,
178946, 169944, 136421)
month <- c(201811, 201811, 201912, 201912, 202005, 202005,
202005, 202106, 202106, 202106 )
category<- c("home", "work", "home", "home", "home", "work",
"cell", "home", "work", "cell")
places <- data.frame(cities, id, category, month)
使用 Imap
和 ggmap
包,我可以检索每个城市的经度和纬度:
lat <- geocode(location = places$cities, source = "google")$lat
lon <- geocode(location = places$cities, source = "google")$lon
places <- cbind(places, lat, lon)
我想做的是:
- 按月份和类别计算每个城市之间的距离
- return第二短的距离和对应的城市和id分列在
places
我写了一个for
循环来计算距离:
for (i in 1:nrow(places)) {
dist_list[[i]] <- gdist(lon.1 = places$lon[i],
lat.1 = places$lat[i],
lon.2 = places$lon,
lat.2 = places$lat,
units="miles")
}
产生以下数据:
dput(dist_list)
list(c(0, 98.3464717885451, 1386.25425677199, 1489.87718040776,
383.083760289456, 123.232894969413, 140.284537078237, 1209.23510542932,
706.670452283757, 906.79542720295), c(98.4762434610638, 0, 1472.06660056474,
1560.93398322985, 285.23618862797, 24.9195071209828, 44.8853561530985,
1308.60741637919, 805.755084908157, 983.102810248198), c(1389.07354011351,
1472.06660056474, 0, 356.573530670257, 1712.29111612461, 1493.39302974566,
1497.2125164277, 579.329313217289, 827.577713357261, 1434.82691622332
), c(1492.80130415651, 1560.93398322985, 356.573530670257, 0,
1761.3773163288, 1578.71125031146, 1576.80713231756, 923.725006795209,
1067.04809350934, 1717.32991551111), c(383.551997010915, 285.23618862797,
1712.29111612461, 1761.3773163288, 0, 260.382178510916, 243.947043197789,
1588.85470703957, 1088.38640303169, 1230.47219244291), c(123.395655314093,
24.9195071209827, 1493.39302974566, 1578.71125031146, 260.382178510916,
0, 24.7382114555287, 1333.29925285915, 830.581742827321, 1002.94777739349
), c(140.431447025301, 44.8853561530986, 1497.2125164277, 1576.80713231756,
243.947043197789, 24.7382114555285, 0, 1346.44527983873, 844.827513981938,
1026.98263808807), c(1211.16392416136, 1308.60741637919, 579.329313217289,
923.725006795209, 1588.85470703957, 1333.29925285915, 1346.44527983873,
0, 505.292529136012, 925.512554201542), c(707.73957320737, 805.755084908157,
827.577713357261, 1067.04809350934, 1088.38640303169, 830.581742827321,
844.827513981938, 505.292529136012, 0, 666.837848781548), c(906.880841903584,
983.102810248198, 1434.82691622332, 1717.32991551111, 1230.47219244291,
1002.94777739349, 1026.98263808807, 925.512554201542, 666.837848781548,
0))
所需的结果如下所示(第一行):
cities id category month lat lon min.dist closest city closest city id
washington, dc 156952 home 201811 38.90719 -77.03687 98.34647 wilmington, de 154222
然后通过Rfast
中的nth
函数我可以得到第二小的距离
nth(dist_list[[1]], 2)
我遇到的问题是我不知道如何将列表中的信息连接到 df places
。任何帮助或建议将不胜感激。
# get min distance:
min_d <- sapply(dist_list, function(x) sort(x)[2])
places$min_dist <- min_d
# index:
i <- sapply(dist_list, function(x) which(sort(x)[2] == x))
# add name:
places$min_name <- places$cities[i]
分组:
# prepare dist matrix outside loop
m <- t(as.data.frame(dist_list))
row.names(m) <- NULL
diag(m) <- NA
# create grouping variable:
gv <- as.integer(factor(places$month)) # or:
# gv <- as.integer(factor(paste(places$month, places$category)))
# set distance to NA if not in relevant group:
i <- sapply(gv, function(x) gv == x)
m[!i] <- NA
l <- sapply(as.data.frame(t(m)), function(x) {
if (all(is.na(x))) return(list(NA, NA))
mv <- min(x, na.rm = T)
i <- which(x == mv)
list(mv, i)
})
l
places <- cbind(places, min_dist = unlist(l[1, ]), min_nr = unlist(l[2, ]))
places$min_name <- places$cities[places$min_nr] # add name
places$min_id <- places$id[places$min_nr] # add id
places
结果:
cities id category month min_dist min_nr min_name min_id
V1 washington, dc 156952 home 201811 98.34647 2 wilmington, de 154222
V2 wilmington, de 154222 work 201811 98.47624 1 washington, dc 156952
V3 amarillo, tx 785695 home 201912 356.57353 4 denver, co 154423
V4 denver, co 154423 home 201912 356.57353 3 amarillo, tx 785695
V5 needham, ma 971453 home 202005 243.94704 7 doylestown, pa 1356987
V6 philadelphia, pa 149888 work 202005 24.73821 7 doylestown, pa 1356987
V7 doylestown, pa 1356987 cell 202005 24.73821 6 philadelphia, pa 149888
V8 galveston, tx 178946 home 202106 505.29253 9 tuscaloosa, al 169944
V9 tuscaloosa, al 169944 work 202106 505.29253 8 galveston, tx 178946
V10 hollywood, fl 136421 cell 202106 666.83785 9 tuscaloosa, al 169944
更新
假设我们只按month
分组,我们可以试试下面的代码
f <- function(df) {
r <- list()
for (i in 1:nrow(df)) {
x <- c()
for (j in 1:nrow(df)) {
x <- append(
x,
with(df, gdist(lat[i], lon[i], lat[j], lon[j], units = "miles"))
)
}
x <- replace(x, x == 0, Inf)
idx <- which.min(x)
r[[i]] <- data.frame(
min.dist = min(x),
closest_city = df$cities[idx],
closest_city_id = df$id[idx]
)
}
do.call(rbind, r)
}
places %>%
group_by(month) %>%
do(cbind(., f(.))) %>%
ungroup()
这给出了
# A tibble: 10 x 9
cities id category month lat lon min.dist closest_city
<chr> <int> <chr> <int> <dbl> <dbl> <dbl> <chr>
1 washington, dc 156952 home 201811 38.9 -77.0 104. wilmington, de
2 wilmington, de 154222 work 201811 39.7 -75.5 104. washington, dc
3 amarillo, tx 785695 home 201912 35.2 -102. 232. denver, co
4 denver, co 154423 home 201912 39.8 -105. 232. amarillo, tx
5 needham, ma 971453 home 202005 42.3 -71.2 273. doylestown, pa
6 philadelphia, ~ 149888 work 202005 40.0 -75.2 6.81 doylestown, pa
7 doylestown, pa 1356987 cell 202005 40.3 -75.1 6.81 philadelphia, ~
8 galveston, tx 178946 home 202106 29.2 -94.9 11405. hollywood, fl
9 tuscaloosa, al 169944 work 202106 33.2 -87.6 517. hollywood, fl
10 hollywood, fl 136421 cell 202106 26.0 -80.1 517. tuscaloosa, al
# ... with 1 more variable: closest_city_id <int>
根据你得到的dist_list
,我们可以试试下面的代码
closest <- do.call(
rbind,
lapply(
dist_list,
function(x) {
x <- replace(x, x == 0, Inf)
idx <- which.min(x)
with(
places,
data.frame(
min.dist = min(x),
closest_city = cities[idx],
closest_city_id = id[idx]
)
)
}
)
)
这给出了
min.dist closest_city closest_city_id
1 98.34647 wilmington, de 154222
2 24.91951 philadelphia, pa 149888
3 356.57353 denver, co 154423
4 356.57353 amarillo, tx 785695
5 243.94704 doylestown, pa 1356987
6 24.73821 doylestown, pa 1356987
7 24.73821 philadelphia, pa 149888
8 505.29253 tuscaloosa, al 169944
9 505.29253 galveston, tx 178946
10 666.83785 tuscaloosa, al 169944
此外,如果您想将上述数据框附加到现有的 places
,您可以使用
places <- cbind(places, closest)
使用sf::st_distance()
鉴于您正在处理空间数据,我建议使用一种基于空间库的方法,例如 {sf}
。
library(tidyverse)
library(tidygeocoder)
library(sf)
# clean location, geocode, and convert to sf object
places <- places %>%
separate(cities, into = c("city", "state"), sep = ", ") %>%
geocode(city = city, state = state) %>%
st_as_sf(coords = c("long", "lat"), crs = 4269)
# sanity check
mapview::mapview(places)
# calculate distances between point pairs with st_distance()
compute_close_city <- function(i){
# compute distances btwn a point and its neighbors (excluding itself)
distances = st_distance(places[i, ], places[-i, ])
# index of the nearest neighbor
j = which.min(distances)
# organize and return the result
result <- tibble(
close_city = places$city[-i][j], # closest city
close_state = places$state[-i][j], # closest state
close_dist_m = distances[j] # distance in meters
)
return(result)
}
# calculate close cities and distances, bind results into dataframe
close_df <- map_df(1:nrow(places), ~compute_close_city(.x))
# bind the result to the places data frame
places <- bind_cols(places, close_df)
# view the result and verify it works
select(places, city, close_city, close_dist_m)
Returns:
Simple feature collection with 10 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -104.9849 ymin: 26.0112 xmax: -75.13046 ymax: 40.31004
Geodetic CRS: NAD83
# A tibble: 10 x 4
city close_city close_dist_m geometry
<chr> <chr> [m] <POINT [°]>
1 washington wilmington 159476.85 (-77.03656 38.89499)
2 wilmington philadelphia 40022.81 (-75.54659 39.74595)
3 amarillo denver 574956.04 (-101.8338 35.20722)
4 denver amarillo 574956.04 (-104.9849 39.73924)
5 needham tuscaloosa 153463.74 (-88.33309 31.98683)
6 philadelphia doylestown 39775.87 (-75.16353 39.95272)
7 doylestown philadelphia 39775.87 (-75.13046 40.31004)
8 galveston needham 687140.47 (-94.79459 29.29933)
9 tuscaloosa needham 153463.74 (-87.56753 33.20956)
10 hollywood needham 1035934.60 (-80.14949 26.0112)
我有一个城市列表和我放在数据框中的相关信息,如下所示:
library(plyr)
library(dplyr)
library(ggmap)
library(Imap)
cities <- c("washington, dc", "wilmington, de", "amarillo, tx",
"denver, co", "needham, ma", "philadelphia, pa",
"doylestown, pa", "galveston, tx", "tuscaloosa, al",
"hollywood, fl"
)
id <- c(156952, 154222, 785695, 154423, 971453, 149888, 1356987,
178946, 169944, 136421)
month <- c(201811, 201811, 201912, 201912, 202005, 202005,
202005, 202106, 202106, 202106 )
category<- c("home", "work", "home", "home", "home", "work",
"cell", "home", "work", "cell")
places <- data.frame(cities, id, category, month)
使用 Imap
和 ggmap
包,我可以检索每个城市的经度和纬度:
lat <- geocode(location = places$cities, source = "google")$lat
lon <- geocode(location = places$cities, source = "google")$lon
places <- cbind(places, lat, lon)
我想做的是:
- 按月份和类别计算每个城市之间的距离
- return第二短的距离和对应的城市和id分列在
places
我写了一个for
循环来计算距离:
for (i in 1:nrow(places)) {
dist_list[[i]] <- gdist(lon.1 = places$lon[i],
lat.1 = places$lat[i],
lon.2 = places$lon,
lat.2 = places$lat,
units="miles")
}
产生以下数据:
dput(dist_list)
list(c(0, 98.3464717885451, 1386.25425677199, 1489.87718040776,
383.083760289456, 123.232894969413, 140.284537078237, 1209.23510542932,
706.670452283757, 906.79542720295), c(98.4762434610638, 0, 1472.06660056474,
1560.93398322985, 285.23618862797, 24.9195071209828, 44.8853561530985,
1308.60741637919, 805.755084908157, 983.102810248198), c(1389.07354011351,
1472.06660056474, 0, 356.573530670257, 1712.29111612461, 1493.39302974566,
1497.2125164277, 579.329313217289, 827.577713357261, 1434.82691622332
), c(1492.80130415651, 1560.93398322985, 356.573530670257, 0,
1761.3773163288, 1578.71125031146, 1576.80713231756, 923.725006795209,
1067.04809350934, 1717.32991551111), c(383.551997010915, 285.23618862797,
1712.29111612461, 1761.3773163288, 0, 260.382178510916, 243.947043197789,
1588.85470703957, 1088.38640303169, 1230.47219244291), c(123.395655314093,
24.9195071209827, 1493.39302974566, 1578.71125031146, 260.382178510916,
0, 24.7382114555287, 1333.29925285915, 830.581742827321, 1002.94777739349
), c(140.431447025301, 44.8853561530986, 1497.2125164277, 1576.80713231756,
243.947043197789, 24.7382114555285, 0, 1346.44527983873, 844.827513981938,
1026.98263808807), c(1211.16392416136, 1308.60741637919, 579.329313217289,
923.725006795209, 1588.85470703957, 1333.29925285915, 1346.44527983873,
0, 505.292529136012, 925.512554201542), c(707.73957320737, 805.755084908157,
827.577713357261, 1067.04809350934, 1088.38640303169, 830.581742827321,
844.827513981938, 505.292529136012, 0, 666.837848781548), c(906.880841903584,
983.102810248198, 1434.82691622332, 1717.32991551111, 1230.47219244291,
1002.94777739349, 1026.98263808807, 925.512554201542, 666.837848781548,
0))
所需的结果如下所示(第一行):
cities id category month lat lon min.dist closest city closest city id
washington, dc 156952 home 201811 38.90719 -77.03687 98.34647 wilmington, de 154222
然后通过Rfast
中的nth
函数我可以得到第二小的距离
nth(dist_list[[1]], 2)
我遇到的问题是我不知道如何将列表中的信息连接到 df places
。任何帮助或建议将不胜感激。
# get min distance:
min_d <- sapply(dist_list, function(x) sort(x)[2])
places$min_dist <- min_d
# index:
i <- sapply(dist_list, function(x) which(sort(x)[2] == x))
# add name:
places$min_name <- places$cities[i]
分组:
# prepare dist matrix outside loop
m <- t(as.data.frame(dist_list))
row.names(m) <- NULL
diag(m) <- NA
# create grouping variable:
gv <- as.integer(factor(places$month)) # or:
# gv <- as.integer(factor(paste(places$month, places$category)))
# set distance to NA if not in relevant group:
i <- sapply(gv, function(x) gv == x)
m[!i] <- NA
l <- sapply(as.data.frame(t(m)), function(x) {
if (all(is.na(x))) return(list(NA, NA))
mv <- min(x, na.rm = T)
i <- which(x == mv)
list(mv, i)
})
l
places <- cbind(places, min_dist = unlist(l[1, ]), min_nr = unlist(l[2, ]))
places$min_name <- places$cities[places$min_nr] # add name
places$min_id <- places$id[places$min_nr] # add id
places
结果:
cities id category month min_dist min_nr min_name min_id
V1 washington, dc 156952 home 201811 98.34647 2 wilmington, de 154222
V2 wilmington, de 154222 work 201811 98.47624 1 washington, dc 156952
V3 amarillo, tx 785695 home 201912 356.57353 4 denver, co 154423
V4 denver, co 154423 home 201912 356.57353 3 amarillo, tx 785695
V5 needham, ma 971453 home 202005 243.94704 7 doylestown, pa 1356987
V6 philadelphia, pa 149888 work 202005 24.73821 7 doylestown, pa 1356987
V7 doylestown, pa 1356987 cell 202005 24.73821 6 philadelphia, pa 149888
V8 galveston, tx 178946 home 202106 505.29253 9 tuscaloosa, al 169944
V9 tuscaloosa, al 169944 work 202106 505.29253 8 galveston, tx 178946
V10 hollywood, fl 136421 cell 202106 666.83785 9 tuscaloosa, al 169944
更新
假设我们只按month
分组,我们可以试试下面的代码
f <- function(df) {
r <- list()
for (i in 1:nrow(df)) {
x <- c()
for (j in 1:nrow(df)) {
x <- append(
x,
with(df, gdist(lat[i], lon[i], lat[j], lon[j], units = "miles"))
)
}
x <- replace(x, x == 0, Inf)
idx <- which.min(x)
r[[i]] <- data.frame(
min.dist = min(x),
closest_city = df$cities[idx],
closest_city_id = df$id[idx]
)
}
do.call(rbind, r)
}
places %>%
group_by(month) %>%
do(cbind(., f(.))) %>%
ungroup()
这给出了
# A tibble: 10 x 9
cities id category month lat lon min.dist closest_city
<chr> <int> <chr> <int> <dbl> <dbl> <dbl> <chr>
1 washington, dc 156952 home 201811 38.9 -77.0 104. wilmington, de
2 wilmington, de 154222 work 201811 39.7 -75.5 104. washington, dc
3 amarillo, tx 785695 home 201912 35.2 -102. 232. denver, co
4 denver, co 154423 home 201912 39.8 -105. 232. amarillo, tx
5 needham, ma 971453 home 202005 42.3 -71.2 273. doylestown, pa
6 philadelphia, ~ 149888 work 202005 40.0 -75.2 6.81 doylestown, pa
7 doylestown, pa 1356987 cell 202005 40.3 -75.1 6.81 philadelphia, ~
8 galveston, tx 178946 home 202106 29.2 -94.9 11405. hollywood, fl
9 tuscaloosa, al 169944 work 202106 33.2 -87.6 517. hollywood, fl
10 hollywood, fl 136421 cell 202106 26.0 -80.1 517. tuscaloosa, al
# ... with 1 more variable: closest_city_id <int>
根据你得到的dist_list
,我们可以试试下面的代码
closest <- do.call(
rbind,
lapply(
dist_list,
function(x) {
x <- replace(x, x == 0, Inf)
idx <- which.min(x)
with(
places,
data.frame(
min.dist = min(x),
closest_city = cities[idx],
closest_city_id = id[idx]
)
)
}
)
)
这给出了
min.dist closest_city closest_city_id
1 98.34647 wilmington, de 154222
2 24.91951 philadelphia, pa 149888
3 356.57353 denver, co 154423
4 356.57353 amarillo, tx 785695
5 243.94704 doylestown, pa 1356987
6 24.73821 doylestown, pa 1356987
7 24.73821 philadelphia, pa 149888
8 505.29253 tuscaloosa, al 169944
9 505.29253 galveston, tx 178946
10 666.83785 tuscaloosa, al 169944
此外,如果您想将上述数据框附加到现有的 places
,您可以使用
places <- cbind(places, closest)
使用sf::st_distance()
鉴于您正在处理空间数据,我建议使用一种基于空间库的方法,例如 {sf}
。
library(tidyverse)
library(tidygeocoder)
library(sf)
# clean location, geocode, and convert to sf object
places <- places %>%
separate(cities, into = c("city", "state"), sep = ", ") %>%
geocode(city = city, state = state) %>%
st_as_sf(coords = c("long", "lat"), crs = 4269)
# sanity check
mapview::mapview(places)
# calculate distances between point pairs with st_distance()
compute_close_city <- function(i){
# compute distances btwn a point and its neighbors (excluding itself)
distances = st_distance(places[i, ], places[-i, ])
# index of the nearest neighbor
j = which.min(distances)
# organize and return the result
result <- tibble(
close_city = places$city[-i][j], # closest city
close_state = places$state[-i][j], # closest state
close_dist_m = distances[j] # distance in meters
)
return(result)
}
# calculate close cities and distances, bind results into dataframe
close_df <- map_df(1:nrow(places), ~compute_close_city(.x))
# bind the result to the places data frame
places <- bind_cols(places, close_df)
# view the result and verify it works
select(places, city, close_city, close_dist_m)
Returns:
Simple feature collection with 10 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -104.9849 ymin: 26.0112 xmax: -75.13046 ymax: 40.31004
Geodetic CRS: NAD83
# A tibble: 10 x 4
city close_city close_dist_m geometry
<chr> <chr> [m] <POINT [°]>
1 washington wilmington 159476.85 (-77.03656 38.89499)
2 wilmington philadelphia 40022.81 (-75.54659 39.74595)
3 amarillo denver 574956.04 (-101.8338 35.20722)
4 denver amarillo 574956.04 (-104.9849 39.73924)
5 needham tuscaloosa 153463.74 (-88.33309 31.98683)
6 philadelphia doylestown 39775.87 (-75.16353 39.95272)
7 doylestown philadelphia 39775.87 (-75.13046 40.31004)
8 galveston needham 687140.47 (-94.79459 29.29933)
9 tuscaloosa needham 153463.74 (-87.56753 33.20956)
10 hollywood needham 1035934.60 (-80.14949 26.0112)