查找从数据框到特定位置最近的城市
Find nearest cities from the data frame to the specific location
下面的数据框包含纬度、经度、州和城市的信息。我想为数据框中的每个城市找出距离它最近的三个城市。例如,在下面的 dataframe 中,Oklahoma City 和 Colorado Springs(数据中拼写为 Colarado Springs)离 Albuquerque(阿尔伯克基)最近,因此这些离阿尔伯克基最近的城市应该保存在另一个名为 nearest_Al 的数据框中(我不知道如何得到这个结果,下面的数据框只是我为了说明想法而手动创建的)。
# Example input: one row per city. Longitude and latitude are stored as
# character strings here, exactly as in the question.
dataframe <- data.frame(
  long = c(
    "-106.61291", "-81.97224", "-84.42770",
    "-72.68604", "-97.60056", "-104.70261"
  ),
  lat = c(
    "35.04333", "33.37378", "33.64073",
    "41.93887", "35.39305", "38.80171"
  ),
  state = c("NM", "GA", "GA", "TX", "OK", "CO"),
  city = c(
    "Albuquerque", "Augusta", "Atlanta",
    "Windsor Locks", "Oklahoma City", "Colarado Springs"
  )
)

# Hand-written illustration of the desired result for Albuquerque: the rows of
# the cities that are nearest to it.
nearest_Al <- data.frame(
  long = c("-97.60056", "-104.70261"),
  lat = c("35.39305", "38.80171"),
  state = c("OK", "CO"),
  city = c("Oklahoma City", "Colarado Springs")
)
我必须对包含 500k 行和大约 100 个位置的数据框执行同样的操作。
提前致谢!
这是一个想法。dataframe2 是最终输出,其中 Near_City 列给出了 city 列中每个城市最近的三个城市。
library(dplyr)
library(sp)
library(rgdal)
library(sf)
# Create example data frame (coordinates are given as character strings)
dataframe <- data.frame(
  long = c(
    "-106.61291", "-81.97224", "-84.42770",
    "-72.68604", "-97.60056", "-104.70261"
  ),
  lat = c(
    "35.04333", "33.37378", "33.64073",
    "41.93887", "35.39305", "38.80171"
  ),
  state = c("NM", "GA", "GA", "TX", "OK", "CO"),
  city = c(
    "Albuquerque", "Augusta", "Atlanta",
    "Windsor Locks", "Oklahoma City", "Colarado Springs"
  ),
  stringsAsFactors = FALSE
)

# Create spatial point data frame object; the coordinates must be numeric
# before they can be declared as coordinates.
dataframe_sp <- dataframe %>%
  mutate(long = as.numeric(long), lat = as.numeric(lat))
coordinates(dataframe_sp) <- ~long + lat

# Convert to sf object
dataframe_sf <- st_as_sf(dataframe_sp)

# Declare the coordinate reference system as EPSG:4326 (longitude/latitude)
st_crs(dataframe_sf) <- 4326

# Calculate the pairwise distance matrix between all cities
dist_m <- st_distance(dataframe_sf, dataframe_sf)

# Select the closest three cities for every city.
# Never ask for more neighbours than there are other cities.
n_near <- max(0L, min(3L, nrow(dataframe) - 1L))

# Work on a plain numeric copy and put Inf on the diagonal so that a city can
# never be selected as its own neighbour. This is safer than dropping the
# first-ranked entry, which is only the city itself when no other row shares
# its coordinates.
dist_rank <- matrix(as.numeric(dist_m), nrow = nrow(dist_m))
diag(dist_rank) <- Inf

# One column per city; each column holds the row numbers of its nearest
# neighbours, closest first.
index <- vapply(
  seq_len(nrow(dist_rank)),
  function(i) order(dist_rank[i, ])[seq_len(n_near)],
  integer(n_near)
)

# Repeat each city once per neighbour
dataframe2 <- dataframe[rep(seq_len(nrow(dataframe)), each = n_near), ]

# Look up the neighbour names by index and store them in the Near_City column.
# as.vector() walks `index` column by column, which matches the row order of
# dataframe2 (all neighbours of city 1, then all neighbours of city 2, ...).
dataframe2$Near_City <- dataframe[as.vector(index), ]$city
更新
我们可以进一步创建 OP 想要的输出。
# Rows of the neighbouring cities, in the same order as the rows of dataframe2.
dataframe3 <- dataframe[as.vector(index), ]

# Record, for every neighbour row, the city it is a neighbour of.
dataframe3[["TargetCity"]] <- dataframe2[["city"]]

# One list element per target city, each holding that city's neighbour rows.
nearest_city_list <- split(dataframe3, f = dataframe3[["TargetCity"]])
现在每个目标城市(TargetCity)都是列表 nearest_city_list 中的一个元素。要访问数据,可以用目标城市的名称来提取对应的列表元素。下面是提取阿尔伯克基结果的示例:
nearest_city_list[["Albuquerque"]]
long lat state city TargetCity
6 -104.70261 38.80171 CO Colarado Springs Albuquerque
5 -97.60056 35.39305 OK Oklahoma City Albuquerque
3 -84.42770 33.64073 GA Atlanta Albuquerque
以下应该适合你
我编写了一个 dist 函数,它接受 xi(dataframe 中当前行的经度)、yi(dataframe 中当前行的纬度)以及 dataframe 本身,并返回距离最近的 2 个城市(不包括目标城市本身)。
# Return the rows of `z` that are closest to the point (`xi`, `yi`).
#
# The distance is the plain Euclidean distance computed directly on the
# longitude/latitude values (i.e. in degrees); it is not a geodesic distance.
#
# Args:
#   xi: longitude of the target point (numeric, character or factor).
#   yi: latitude of the target point (numeric, character or factor).
#   z:  data frame with `long` and `lat` columns (numeric, character or
#       factor).
#   n:  number of neighbours to return. Defaults to 2, the original behaviour.
#
# Returns:
#   `z` with an added `dist` column, ordered by increasing distance, with the
#   closest row dropped and at most the next `n` rows kept. Fewer rows are
#   returned when `z` does not contain enough rows.
#
# NOTE: while this function is defined it masks stats::dist().
dist <- function(xi, yi, z, n = 2) {
  # Values may arrive as character or factor, so go through character first.
  to_num <- function(v) as.double(as.character(v))

  z$dist <- sqrt(
    (to_num(z$long) - to_num(xi))^2 + (to_num(z$lat) - to_num(yi))^2
  )

  by_distance <- order(z$dist)

  # The closest row is assumed to be the target city itself (distance 0) and
  # is skipped; head() silently copes with `z` having fewer than n + 1 rows.
  keep <- utils::head(by_distance[-1], n)

  nearest <- z[keep, , drop = FALSE]
  rownames(nearest) <- NULL
  nearest
}
tidyverse 解决方案
library(tidyverse)
# For every city, attach a list-column `data` holding its nearest cities.
mod <- dataframe %>%
  # Copy longitude and latitude so the originals survive the nesting step.
  mutate(copylong = long, copylat = lat) %>%
  # Pack the copies into a list-column called `data`. The new column has to be
  # named explicitly; the bare-column form nest(copylong, copylat) is the
  # pre-1.0 tidyr interface.
  nest(data = c(copylong, copylat)) %>%
  # Replace each nested coordinate pair with the result of dist() for it.
  mutate(data = map(data, ~ dist(.x$copylong, .x$copylat, dataframe)))
仅将最近的城市保存为单独的数据框
# Stack the per-city neighbour tables stored in the `data` list-column into a
# single data frame. Binding the list directly also works when `mod` has no
# rows, where 1:nrow(mod) would iterate over c(1, 0).
desired <- bind_rows(mod$data)
输出
long lat state city dist
1 -104.70261 38.80171 CO Colarado Springs 4.216001
2 -97.60056 35.39305 OK Oklahoma City 9.019133
3 -84.42770 33.64073 GA Atlanta 2.469928
4 -72.68604 41.93887 TX Windsor Locks 12.633063
5 -81.97224 33.37378 GA Augusta 2.469928
6 -97.60056 35.39305 OK Oklahoma City 13.288900
# etc
额外
如果要同时保留原始数据框和最近的城市
# Keep the original city columns and place each city's nearest cities next to
# them, one output row per (city, neighbour) pair.
mod <- dataframe %>%
  # Copy longitude and latitude so the originals survive the nesting step.
  mutate(copylong = long, copylat = lat) %>%
  # Pack the copies into a list-column called `data`. The new column has to be
  # named explicitly; the bare-column form nest(copylong, copylat) is the
  # pre-1.0 tidyr interface.
  nest(data = c(copylong, copylat)) %>%
  # Replace each nested coordinate pair with the result of dist() for it.
  mutate(data = map(data, ~ dist(.x$copylong, .x$copylat, dataframe))) %>%
  # The nested tables repeat the outer column names (long, lat, state, city).
  # Suffix the repeated names with "1" (long1, lat1, state1, city1) instead of
  # letting unnest() reject the duplicates.
  unnest(
    cols = data,
    names_repair = function(nms) make.unique(nms, sep = "")
  )
额外输出
long lat state city long1 lat1 state1 city1 dist
1 -106.61291 35.04333 NM Albuquerque -104.70261 38.80171 CO Colarado Springs 4.216001
2 -106.61291 35.04333 NM Albuquerque -97.60056 35.39305 OK Oklahoma City 9.019133
3 -81.97224 33.37378 GA Augusta -84.42770 33.64073 GA Atlanta 2.469928
4 -81.97224 33.37378 GA Augusta -72.68604 41.93887 TX Windsor Locks 12.633063
拆分为命名列表
# Split into one element per city. split() names each element after its
# grouping value, so the names are always correct by construction. The factor
# levels are given in order of first appearance so the list follows the data
# order rather than alphabetical order. Overwriting the names afterwards with
# the data-order city vector would attach the wrong city name to elements,
# because split() on a plain character column sorts the groups.
L <- split(
  mod,
  factor(
    as.character(mod$city),
    levels = unique(as.character(mod$city))
  )
)
这对于您的所有数据来说可能有点慢,但它确实有效
# Example data with longitude and latitude converted to numeric up front.
dataframe <- data.frame(
  long = as.numeric(c(
    "-106.61291", "-81.97224", "-84.42770",
    "-72.68604", "-97.60056", "-104.70261"
  )),
  lat = as.numeric(c(
    "35.04333", "33.37378", "33.64073",
    "41.93887", "35.39305", "38.80171"
  )),
  state = c("NM", "GA", "GA", "TX", "OK", "CO"),
  city = c(
    "Albuquerque", "Augusta", "Atlanta",
    "Windsor Locks", "Oklahoma City", "Colarado Springs"
  )
)
library(sp)
library(rgeos)
# Turn the data frame into an sp spatial object, using long/lat as coordinates.
coordinates(dataframe) <- ~long + lat

# Pairwise distance matrix between all cities (one row and column per city).
# NOTE(review): the coordinates are raw longitude/latitude values, so these
# look like planar distances in degrees rather than metres - confirm that this
# is acceptable for the intended use.
dist_cities <- gDistance(dataframe, byid = TRUE)

# Rank of every distance within each city's row (smallest distance = lowest
# rank). Built in one step instead of growing a data frame with rbind() inside
# a loop; kept so the ranking can still be inspected.
dist_cities_rank <- as.data.frame(t(apply(dist_cities, 1, rank)))

# For each city, store the city itself (`test_city`) together with its three
# nearest other cities and the distances to them.
three_close_cities <- lapply(seq_len(nrow(dist_cities)), function(i) {
  # Order the other cities by distance and drop the city itself explicitly, so
  # tied or duplicated coordinates cannot let it through as its own neighbour.
  others <- setdiff(order(dist_cities[i, ]), i)

  # Keep at most three, listed in their original row order.
  nearest <- sort(utils::head(others, 3L))

  list(
    test_city = dataframe[i, ],
    cbind(dataframe[nearest, ], dist_cities[i, nearest])
  )
})
下面的数据框包含纬度、经度、州和城市的信息。我想为数据框中的每个城市找出距离它最近的三个城市。例如,在下面的 dataframe 中,Oklahoma City 和 Colorado Springs(数据中拼写为 Colarado Springs)离 Albuquerque(阿尔伯克基)最近,因此这些离阿尔伯克基最近的城市应该保存在另一个名为 nearest_Al 的数据框中(我不知道如何得到这个结果,下面的数据框只是我为了说明想法而手动创建的)。
# Example input: one row per city. Longitude and latitude are stored as
# character strings here, exactly as in the question.
dataframe <- data.frame(
  long = c(
    "-106.61291", "-81.97224", "-84.42770",
    "-72.68604", "-97.60056", "-104.70261"
  ),
  lat = c(
    "35.04333", "33.37378", "33.64073",
    "41.93887", "35.39305", "38.80171"
  ),
  state = c("NM", "GA", "GA", "TX", "OK", "CO"),
  city = c(
    "Albuquerque", "Augusta", "Atlanta",
    "Windsor Locks", "Oklahoma City", "Colarado Springs"
  )
)

# Hand-written illustration of the desired result for Albuquerque: the rows of
# the cities that are nearest to it.
nearest_Al <- data.frame(
  long = c("-97.60056", "-104.70261"),
  lat = c("35.39305", "38.80171"),
  state = c("OK", "CO"),
  city = c("Oklahoma City", "Colarado Springs")
)
我必须对包含 500k 行和大约 100 个位置的数据框执行同样的操作。
提前致谢!
这是一个想法。dataframe2 是最终输出,其中 Near_City 列给出了 city 列中每个城市最近的三个城市。
library(dplyr)
library(sp)
library(rgdal)
library(sf)
# Create example data frame (coordinates are given as character strings)
dataframe <- data.frame(
  long = c(
    "-106.61291", "-81.97224", "-84.42770",
    "-72.68604", "-97.60056", "-104.70261"
  ),
  lat = c(
    "35.04333", "33.37378", "33.64073",
    "41.93887", "35.39305", "38.80171"
  ),
  state = c("NM", "GA", "GA", "TX", "OK", "CO"),
  city = c(
    "Albuquerque", "Augusta", "Atlanta",
    "Windsor Locks", "Oklahoma City", "Colarado Springs"
  ),
  stringsAsFactors = FALSE
)

# Create spatial point data frame object; the coordinates must be numeric
# before they can be declared as coordinates.
dataframe_sp <- dataframe %>%
  mutate(long = as.numeric(long), lat = as.numeric(lat))
coordinates(dataframe_sp) <- ~long + lat

# Convert to sf object
dataframe_sf <- st_as_sf(dataframe_sp)

# Declare the coordinate reference system as EPSG:4326 (longitude/latitude)
st_crs(dataframe_sf) <- 4326

# Calculate the pairwise distance matrix between all cities
dist_m <- st_distance(dataframe_sf, dataframe_sf)

# Select the closest three cities for every city.
# Never ask for more neighbours than there are other cities.
n_near <- max(0L, min(3L, nrow(dataframe) - 1L))

# Work on a plain numeric copy and put Inf on the diagonal so that a city can
# never be selected as its own neighbour. This is safer than dropping the
# first-ranked entry, which is only the city itself when no other row shares
# its coordinates.
dist_rank <- matrix(as.numeric(dist_m), nrow = nrow(dist_m))
diag(dist_rank) <- Inf

# One column per city; each column holds the row numbers of its nearest
# neighbours, closest first.
index <- vapply(
  seq_len(nrow(dist_rank)),
  function(i) order(dist_rank[i, ])[seq_len(n_near)],
  integer(n_near)
)

# Repeat each city once per neighbour
dataframe2 <- dataframe[rep(seq_len(nrow(dataframe)), each = n_near), ]

# Look up the neighbour names by index and store them in the Near_City column.
# as.vector() walks `index` column by column, which matches the row order of
# dataframe2 (all neighbours of city 1, then all neighbours of city 2, ...).
dataframe2$Near_City <- dataframe[as.vector(index), ]$city
更新
我们可以进一步创建 OP 想要的输出。
# Rows of the neighbouring cities, in the same order as the rows of dataframe2.
dataframe3 <- dataframe[as.vector(index), ]

# Record, for every neighbour row, the city it is a neighbour of.
dataframe3[["TargetCity"]] <- dataframe2[["city"]]

# One list element per target city, each holding that city's neighbour rows.
nearest_city_list <- split(dataframe3, f = dataframe3[["TargetCity"]])
现在每个目标城市(TargetCity)都是列表 nearest_city_list 中的一个元素。要访问数据,可以用目标城市的名称来提取对应的列表元素。下面是提取阿尔伯克基结果的示例:
nearest_city_list[["Albuquerque"]]
long lat state city TargetCity
6 -104.70261 38.80171 CO Colarado Springs Albuquerque
5 -97.60056 35.39305 OK Oklahoma City Albuquerque
3 -84.42770 33.64073 GA Atlanta Albuquerque
以下应该适合你
我编写了一个 dist 函数,它接受 xi(dataframe 中当前行的经度)、yi(dataframe 中当前行的纬度)以及 dataframe 本身,并返回距离最近的 2 个城市(不包括目标城市本身)。
# Return the rows of `z` that are closest to the point (`xi`, `yi`).
#
# The distance is the plain Euclidean distance computed directly on the
# longitude/latitude values (i.e. in degrees); it is not a geodesic distance.
#
# Args:
#   xi: longitude of the target point (numeric, character or factor).
#   yi: latitude of the target point (numeric, character or factor).
#   z:  data frame with `long` and `lat` columns (numeric, character or
#       factor).
#   n:  number of neighbours to return. Defaults to 2, the original behaviour.
#
# Returns:
#   `z` with an added `dist` column, ordered by increasing distance, with the
#   closest row dropped and at most the next `n` rows kept. Fewer rows are
#   returned when `z` does not contain enough rows.
#
# NOTE: while this function is defined it masks stats::dist().
dist <- function(xi, yi, z, n = 2) {
  # Values may arrive as character or factor, so go through character first.
  to_num <- function(v) as.double(as.character(v))

  z$dist <- sqrt(
    (to_num(z$long) - to_num(xi))^2 + (to_num(z$lat) - to_num(yi))^2
  )

  by_distance <- order(z$dist)

  # The closest row is assumed to be the target city itself (distance 0) and
  # is skipped; head() silently copes with `z` having fewer than n + 1 rows.
  keep <- utils::head(by_distance[-1], n)

  nearest <- z[keep, , drop = FALSE]
  rownames(nearest) <- NULL
  nearest
}
tidyverse 解决方案
library(tidyverse)
# For every city, attach a list-column `data` holding its nearest cities.
mod <- dataframe %>%
  # Copy longitude and latitude so the originals survive the nesting step.
  mutate(copylong = long, copylat = lat) %>%
  # Pack the copies into a list-column called `data`. The new column has to be
  # named explicitly; the bare-column form nest(copylong, copylat) is the
  # pre-1.0 tidyr interface.
  nest(data = c(copylong, copylat)) %>%
  # Replace each nested coordinate pair with the result of dist() for it.
  mutate(data = map(data, ~ dist(.x$copylong, .x$copylat, dataframe)))
仅将最近的城市保存为单独的数据框
# Stack the per-city neighbour tables stored in the `data` list-column into a
# single data frame. Binding the list directly also works when `mod` has no
# rows, where 1:nrow(mod) would iterate over c(1, 0).
desired <- bind_rows(mod$data)
输出
long lat state city dist
1 -104.70261 38.80171 CO Colarado Springs 4.216001
2 -97.60056 35.39305 OK Oklahoma City 9.019133
3 -84.42770 33.64073 GA Atlanta 2.469928
4 -72.68604 41.93887 TX Windsor Locks 12.633063
5 -81.97224 33.37378 GA Augusta 2.469928
6 -97.60056 35.39305 OK Oklahoma City 13.288900
# etc
额外
如果要同时保留原始数据框和最近的城市
# Keep the original city columns and place each city's nearest cities next to
# them, one output row per (city, neighbour) pair.
mod <- dataframe %>%
  # Copy longitude and latitude so the originals survive the nesting step.
  mutate(copylong = long, copylat = lat) %>%
  # Pack the copies into a list-column called `data`. The new column has to be
  # named explicitly; the bare-column form nest(copylong, copylat) is the
  # pre-1.0 tidyr interface.
  nest(data = c(copylong, copylat)) %>%
  # Replace each nested coordinate pair with the result of dist() for it.
  mutate(data = map(data, ~ dist(.x$copylong, .x$copylat, dataframe))) %>%
  # The nested tables repeat the outer column names (long, lat, state, city).
  # Suffix the repeated names with "1" (long1, lat1, state1, city1) instead of
  # letting unnest() reject the duplicates.
  unnest(
    cols = data,
    names_repair = function(nms) make.unique(nms, sep = "")
  )
额外输出
long lat state city long1 lat1 state1 city1 dist
1 -106.61291 35.04333 NM Albuquerque -104.70261 38.80171 CO Colarado Springs 4.216001
2 -106.61291 35.04333 NM Albuquerque -97.60056 35.39305 OK Oklahoma City 9.019133
3 -81.97224 33.37378 GA Augusta -84.42770 33.64073 GA Atlanta 2.469928
4 -81.97224 33.37378 GA Augusta -72.68604 41.93887 TX Windsor Locks 12.633063
拆分为命名列表
# Split into one element per city. split() names each element after its
# grouping value, so the names are always correct by construction. The factor
# levels are given in order of first appearance so the list follows the data
# order rather than alphabetical order. Overwriting the names afterwards with
# the data-order city vector would attach the wrong city name to elements,
# because split() on a plain character column sorts the groups.
L <- split(
  mod,
  factor(
    as.character(mod$city),
    levels = unique(as.character(mod$city))
  )
)
这对于您的所有数据来说可能有点慢,但它确实有效
# Example data with longitude and latitude converted to numeric up front.
dataframe <- data.frame(
  long = as.numeric(c(
    "-106.61291", "-81.97224", "-84.42770",
    "-72.68604", "-97.60056", "-104.70261"
  )),
  lat = as.numeric(c(
    "35.04333", "33.37378", "33.64073",
    "41.93887", "35.39305", "38.80171"
  )),
  state = c("NM", "GA", "GA", "TX", "OK", "CO"),
  city = c(
    "Albuquerque", "Augusta", "Atlanta",
    "Windsor Locks", "Oklahoma City", "Colarado Springs"
  )
)
library(sp)
library(rgeos)
# Turn the data frame into an sp spatial object, using long/lat as coordinates.
coordinates(dataframe) <- ~long + lat

# Pairwise distance matrix between all cities (one row and column per city).
# NOTE(review): the coordinates are raw longitude/latitude values, so these
# look like planar distances in degrees rather than metres - confirm that this
# is acceptable for the intended use.
dist_cities <- gDistance(dataframe, byid = TRUE)

# Rank of every distance within each city's row (smallest distance = lowest
# rank). Built in one step instead of growing a data frame with rbind() inside
# a loop; kept so the ranking can still be inspected.
dist_cities_rank <- as.data.frame(t(apply(dist_cities, 1, rank)))

# For each city, store the city itself (`test_city`) together with its three
# nearest other cities and the distances to them.
three_close_cities <- lapply(seq_len(nrow(dist_cities)), function(i) {
  # Order the other cities by distance and drop the city itself explicitly, so
  # tied or duplicated coordinates cannot let it through as its own neighbour.
  others <- setdiff(order(dist_cities[i, ]), i)

  # Keep at most three, listed in their original row order.
  nearest <- sort(utils::head(others, 3L))

  list(
    test_city = dataframe[i, ],
    cbind(dataframe[nearest, ], dist_cities[i, nearest])
  )
})