基于最短地理距离匹配数据框
Matching data frames based on shortest geographic distance
我有两个数据框,都包含纬度和经度坐标。第一个数据框是对事件的观察,其中记录了位置和时间。第二个数据框是地理特征,其中记录了关于该特征的位置和信息。
# Example observations: one row per event, with its start latitude and
# longitude. Row names are set explicitly to the given integer values.
my_df_1 <- data.frame(
  START_LAT = c(
    -33.15, -35.6, -34.08333, -34.13333, -34.31667,
    -47.38333, -47.53333, -34.08333, -47.38333, -47.15
  ),
  START_LONG = c(
    163, 165.18333, 162.88333, 162.58333, 162.76667,
    148.98333, 148.66667, 162.9, 148.98333, 148.71667
  ),
  row.names = c(1175L, 528L, 1328L, 870L, 672L, 707L, 506L, 981L, 756L, 210L)
)
# Example geographic features: position plus an integer depth_top value.
# Row names are set explicitly to the given integer values.
my_df_2 <- data.frame(
  latitude = c(
    -42.7984, -34.195, -49.81, -35.417, -28.1487,
    -44.657, -42.7898, -36.245, -39.1335, -31.8482
  ),
  longitude = c(
    179.9874, 179.526, -176.68, 178.765, -168.0314,
    174.695, -179.9873, 177.7873, -170.0583, 173.2424
  ),
  depth_top = c(
    935L, 2204L, 869L, 1973L, 4750L,
    555L, 894L, 1500L, 4299L, 1303L
  ),
  row.names = c(580L, 1306L, 926L, 1102L, 60L, 1481L, 574L, 454L, 1168L, 144L)
)
我需要做的是:对于 df1 中的每个观察,找出 df2 中哪个特征在地理上与之最接近。理想情况下,我会在 df1 后附加一个新列,其中每一行都是 df2 中距离最近的特征。
我看过这个问题 How to assign several names to lat-lon observations,但无法弄清楚如何将其应用到我的数据上。
实际的数据框有 1000 行,因此我无法手动执行此操作。
使用 sf 包中的 st_distance 的解决方案。my_df_final 是最终输出。
# Load packages
library(tidyverse)
library(sp)
library(sf)
# Create ID for my_df_1 and my_df_2 based on row id
# This step is not required; it only makes each point easier to tell apart
my_df_1 <- my_df_1 %>% mutate(ID1 = row.names(.))
my_df_2 <- my_df_2 %>% mutate(ID2 = row.names(.))
# Create spatial point data frames (coordinates given as longitude, latitude)
my_df_1_sp <- my_df_1
coordinates(my_df_1_sp) <- ~START_LONG + START_LAT
my_df_2_sp <- my_df_2
coordinates(my_df_2_sp) <- ~longitude + latitude
# Convert to simple features
my_df_1_sf <- st_as_sf(my_df_1_sp)
my_df_2_sf <- st_as_sf(my_df_2_sp)
# Set the coordinate reference system from the EPSG code
st_crs(my_df_1_sf) <- 4326
st_crs(my_df_2_sf) <- 4326
# Pairwise distances: one row per point in my_df_1, one column per feature
# in my_df_2
m_dist <- st_distance(my_df_1_sf, my_df_2_sf)
# Column index of the nearest feature for every row of m_dist.
# which.min() finds the minimum directly instead of sorting the whole row,
# and apply() still returns a plain vector when my_df_2 has a single row
# (the previous `apply(m_dist, 1, order)[1, ]` failed in that case because
# the result was no longer a matrix).
near_index <- apply(m_dist, 1, which.min)
# Select the matching rows of my_df_2 and bind them next to my_df_1
my_df_final <- cbind(my_df_1, my_df_2[near_index, ])
基于此,你可以这样做:
library(geosphere)
# Pairwise ellipsoidal (Vincenty) distances; rows follow my_df_1, columns
# follow my_df_2. The coordinate columns are picked by name, longitude first
# and then latitude, so the call does not depend on column positions.
mat <- distm(
  my_df_1[c("START_LONG", "START_LAT")],
  my_df_2[c("longitude", "latitude")],
  fun = distVincentyEllipsoid
)
# Index of the closest feature in each row. which.min() is exact and
# deterministic, whereas max.col(-mat) breaks ties at random by default and
# treats values within a relative tolerance as tied.
cbind(my_df_1, my_df_2[apply(mat, 1, which.min), ])
给出:
# START_LAT START_LONG ID1 latitude longitude depth_top ID2
#10 -33.15000 163.0000 1175 -31.8482 173.2424 1303 144
#10.1 -35.60000 165.1833 528 -31.8482 173.2424 1303 144
#10.2 -34.08333 162.8833 1328 -31.8482 173.2424 1303 144
#10.3 -34.13333 162.5833 870 -31.8482 173.2424 1303 144
#10.4 -34.31667 162.7667 672 -31.8482 173.2424 1303 144
#6 -47.38333 148.9833 707 -44.6570 174.6950 555 1481
#6.1 -47.53333 148.6667 506 -44.6570 174.6950 555 1481
#10.5 -34.08333 162.9000 981 -31.8482 173.2424 1303 144
#6.2 -47.38333 148.9833 756 -44.6570 174.6950 555 1481
#6.3 -47.15000 148.7167 210 -44.6570 174.6950 555 1481
我有两个数据框,都包含纬度和经度坐标。第一个数据框是对事件的观察,其中记录了位置和时间。第二个数据框是地理特征,其中记录了关于该特征的位置和信息。
# Example observations: one row per event, with its start latitude and
# longitude. Row names are set explicitly to the given integer values.
my_df_1 <- data.frame(
  START_LAT = c(
    -33.15, -35.6, -34.08333, -34.13333, -34.31667,
    -47.38333, -47.53333, -34.08333, -47.38333, -47.15
  ),
  START_LONG = c(
    163, 165.18333, 162.88333, 162.58333, 162.76667,
    148.98333, 148.66667, 162.9, 148.98333, 148.71667
  ),
  row.names = c(1175L, 528L, 1328L, 870L, 672L, 707L, 506L, 981L, 756L, 210L)
)
# Example geographic features: position plus an integer depth_top value.
# Row names are set explicitly to the given integer values.
my_df_2 <- data.frame(
  latitude = c(
    -42.7984, -34.195, -49.81, -35.417, -28.1487,
    -44.657, -42.7898, -36.245, -39.1335, -31.8482
  ),
  longitude = c(
    179.9874, 179.526, -176.68, 178.765, -168.0314,
    174.695, -179.9873, 177.7873, -170.0583, 173.2424
  ),
  depth_top = c(
    935L, 2204L, 869L, 1973L, 4750L,
    555L, 894L, 1500L, 4299L, 1303L
  ),
  row.names = c(580L, 1306L, 926L, 1102L, 60L, 1481L, 574L, 454L, 1168L, 144L)
)
我需要做的是:对于 df1 中的每个观察,找出 df2 中哪个特征在地理上与之最接近。理想情况下,我会在 df1 后附加一个新列,其中每一行都是 df2 中距离最近的特征。
我看过这个问题 How to assign several names to lat-lon observations,但无法弄清楚如何将其应用到我的数据上。
实际的数据框有 1000 行,因此我无法手动执行此操作。
使用 sf 包中的 st_distance 的解决方案。my_df_final 是最终输出。
# Load packages
library(tidyverse)
library(sp)
library(sf)
# Create ID for my_df_1 and my_df_2 based on row id
# This step is not required; it only makes each point easier to tell apart
my_df_1 <- my_df_1 %>% mutate(ID1 = row.names(.))
my_df_2 <- my_df_2 %>% mutate(ID2 = row.names(.))
# Create spatial point data frames (coordinates given as longitude, latitude)
my_df_1_sp <- my_df_1
coordinates(my_df_1_sp) <- ~START_LONG + START_LAT
my_df_2_sp <- my_df_2
coordinates(my_df_2_sp) <- ~longitude + latitude
# Convert to simple features
my_df_1_sf <- st_as_sf(my_df_1_sp)
my_df_2_sf <- st_as_sf(my_df_2_sp)
# Set the coordinate reference system from the EPSG code
st_crs(my_df_1_sf) <- 4326
st_crs(my_df_2_sf) <- 4326
# Pairwise distances: one row per point in my_df_1, one column per feature
# in my_df_2
m_dist <- st_distance(my_df_1_sf, my_df_2_sf)
# Column index of the nearest feature for every row of m_dist.
# which.min() finds the minimum directly instead of sorting the whole row,
# and apply() still returns a plain vector when my_df_2 has a single row
# (the previous `apply(m_dist, 1, order)[1, ]` failed in that case because
# the result was no longer a matrix).
near_index <- apply(m_dist, 1, which.min)
# Select the matching rows of my_df_2 and bind them next to my_df_1
my_df_final <- cbind(my_df_1, my_df_2[near_index, ])
基于此,你可以这样做:
library(geosphere)
# Pairwise ellipsoidal (Vincenty) distances; rows follow my_df_1, columns
# follow my_df_2. The coordinate columns are picked by name, longitude first
# and then latitude, so the call does not depend on column positions.
mat <- distm(
  my_df_1[c("START_LONG", "START_LAT")],
  my_df_2[c("longitude", "latitude")],
  fun = distVincentyEllipsoid
)
# Index of the closest feature in each row. which.min() is exact and
# deterministic, whereas max.col(-mat) breaks ties at random by default and
# treats values within a relative tolerance as tied.
cbind(my_df_1, my_df_2[apply(mat, 1, which.min), ])
给出:
# START_LAT START_LONG ID1 latitude longitude depth_top ID2
#10 -33.15000 163.0000 1175 -31.8482 173.2424 1303 144
#10.1 -35.60000 165.1833 528 -31.8482 173.2424 1303 144
#10.2 -34.08333 162.8833 1328 -31.8482 173.2424 1303 144
#10.3 -34.13333 162.5833 870 -31.8482 173.2424 1303 144
#10.4 -34.31667 162.7667 672 -31.8482 173.2424 1303 144
#6 -47.38333 148.9833 707 -44.6570 174.6950 555 1481
#6.1 -47.53333 148.6667 506 -44.6570 174.6950 555 1481
#10.5 -34.08333 162.9000 981 -31.8482 173.2424 1303 144
#6.2 -47.38333 148.9833 756 -44.6570 174.6950 555 1481
#6.3 -47.15000 148.7167 210 -44.6570 174.6950 555 1481