在第二个数据集中为第一个数据集中的每个点找到最近的点
Find nearest point in second datset for every point in first dataset
我必须对点(lat+lon 对)的数据帧进行处理。对于第一个数据框中的每个点,我想在第二个数据框中找到最近的点。这个问题和答案让我快到了 ()。正如评论中所建议的,唯一的区别是 我想使用 geosphere::distGeo 或 raster::pointDistance 而不是答案中的内容。我试过这样做(见下面的代码),但出现错误。我该如何解决? Distance
变量的单位是什么?我希望它以英里为单位。
代码 - 直接从上述问题复制,除了(错误地)修改以尝试获得我想要的内容。
df1 <- structure(list(id = c(1L, 2L, 4L, 5L,
6L, 7L, 8L, 9, 10L, 3L),
HIGH_PRCN_LAT = c(52.881442267773, 57.8094538200198, 34.0233529,
63.8087900198, 53.6888144440184, 63.4462810678651, 21.6075544376207,
78.324442654172, 66.85532539759495, 51.623544596), HIGH_PRCN_LON = c(-2.87377812157822,
-2.23454414781635, -3.0984448341, -2.439163178635, -7.396111601421454,
-5.162345043546359, -8.63311254098095, 3.813289888829932,
-3.994325961186105, -8.9065532453272409), SRC_ID = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), distance = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 10L), class = "data.frame")
df2 <- structure(list(SRC_ID = c(55L, 54L, 23L, 11L, 44L, 21L, 76L,
5688L, 440L, 61114L), HIGH_PRCN_LAT = c(68.46506, 50.34127, 61.16432,
42.57807, 52.29879, 68.52132, 87.83912, 55.67825, 29.74444, 34.33228
), HIGH_PRCN_LON = c(-5.0584, -5.95506, -5.75546, -5.47801, -3.42062,
-6.99441, -2.63457, -2.63057, -7.52216, -1.65532)), row.names = c(NA,
10L), class = "data.frame")
library(data.table)
library(raster)
library(geosphere)
#Euclidean distance
mydist <- function(a, b, df1, x, y){
#dt <- data.table(sqrt((df1[[x]]-a)^2 + (df1[[y]]-b)^2))
#I couldn't figure out how to modify this correcly:
dt <- data.table(pointDistance(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
#dt <- data.table(distGeo(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
return(data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)]))
}
setDT(df1)[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON, setDT(df2),
"HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
这是我得到的错误:
Error in .pointsToMatrix(p2) : Wrong length for a vector, should be 2
c(df1[[x]],df1[[y]])
应更改为 df1[, c(y,x), with=FALSE]
因为 geosphere::distGeo
/ raster::pointDistance
需要一个 2 列点矩阵。此外,它希望点作为(经度,纬度)提供 - 因此(x,y)应该翻转为(y,x)。
使用raster::pointDistance
:
# Using raster::pointDistance
library(data.table)
library(raster)
library(geosphere)
setDT(df1)
setDT(df2)
mydist <- function(a, b, df1, x, y) {
dt <- data.table(pointDistance(c(b,a), df1[, c(y,x), with=FALSE], lonlat=TRUE))
return (
data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)])
)
}
df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
使用geosphere::distGeo
(与上面相同的结果):
# Using geosphere::distGeo
mydist <- function(a, b, df1, x, y) {
dt <- data.table(distGeo(c(b,a), df1[, c(y,x), with=FALSE]))
return (
data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)])
)
}
df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
我必须对点(lat+lon 对)的数据帧进行处理。对于第一个数据框中的每个点,我想在第二个数据框中找到最近的点。这个问题和答案让我快到了 (Distance
变量的单位是什么?我希望它以英里为单位。
代码 - 直接从上述问题复制,除了(错误地)修改以尝试获得我想要的内容。
df1 <- structure(list(id = c(1L, 2L, 4L, 5L,
6L, 7L, 8L, 9, 10L, 3L),
HIGH_PRCN_LAT = c(52.881442267773, 57.8094538200198, 34.0233529,
63.8087900198, 53.6888144440184, 63.4462810678651, 21.6075544376207,
78.324442654172, 66.85532539759495, 51.623544596), HIGH_PRCN_LON = c(-2.87377812157822,
-2.23454414781635, -3.0984448341, -2.439163178635, -7.396111601421454,
-5.162345043546359, -8.63311254098095, 3.813289888829932,
-3.994325961186105, -8.9065532453272409), SRC_ID = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), distance = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 10L), class = "data.frame")
df2 <- structure(list(SRC_ID = c(55L, 54L, 23L, 11L, 44L, 21L, 76L,
5688L, 440L, 61114L), HIGH_PRCN_LAT = c(68.46506, 50.34127, 61.16432,
42.57807, 52.29879, 68.52132, 87.83912, 55.67825, 29.74444, 34.33228
), HIGH_PRCN_LON = c(-5.0584, -5.95506, -5.75546, -5.47801, -3.42062,
-6.99441, -2.63457, -2.63057, -7.52216, -1.65532)), row.names = c(NA,
10L), class = "data.frame")
library(data.table)
library(raster)
library(geosphere)
#Euclidean distance
mydist <- function(a, b, df1, x, y){
#dt <- data.table(sqrt((df1[[x]]-a)^2 + (df1[[y]]-b)^2))
#I couldn't figure out how to modify this correcly:
dt <- data.table(pointDistance(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
#dt <- data.table(distGeo(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
return(data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)]))
}
setDT(df1)[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON, setDT(df2),
"HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
这是我得到的错误:
Error in .pointsToMatrix(p2) : Wrong length for a vector, should be 2
c(df1[[x]],df1[[y]])
应更改为 df1[, c(y,x), with=FALSE]
因为 geosphere::distGeo
/ raster::pointDistance
需要一个 2 列点矩阵。此外,它希望点作为(经度,纬度)提供 - 因此(x,y)应该翻转为(y,x)。
使用raster::pointDistance
:
# Using raster::pointDistance
library(data.table)
library(raster)
library(geosphere)
setDT(df1)
setDT(df2)
mydist <- function(a, b, df1, x, y) {
dt <- data.table(pointDistance(c(b,a), df1[, c(y,x), with=FALSE], lonlat=TRUE))
return (
data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)])
)
}
df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
使用geosphere::distGeo
(与上面相同的结果):
# Using geosphere::distGeo
mydist <- function(a, b, df1, x, y) {
dt <- data.table(distGeo(c(b,a), df1[, c(y,x), with=FALSE]))
return (
data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)])
)
}
df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]