在第二个数据集中为第一个数据集中的每个点找到最近的点

Find nearest point in second datset for every point in first dataset

我必须对点(lat+lon 对)的数据帧进行处理。对于第一个数据框中的每个点,我想在第二个数据框中找到最近的点。这个问题和答案让我快到了 ()。正如评论中所建议的,唯一的区别是 我想使用 geosphere::distGeo 或 raster::pointDistance 而不是答案中的内容。我试过这样做(见下面的代码),但出现错误。我该如何解决? Distance 变量的单位是什么?我希望它以英里为单位。

代码 - 直接从上述问题复制,除了(错误地)修改以尝试获得我想要的内容。

df1 <- structure(list(id = c(1L, 2L, 4L, 5L, 
                      6L, 7L, 8L, 9, 10L, 3L), 
               HIGH_PRCN_LAT = c(52.881442267773, 57.8094538200198, 34.0233529, 
                              63.8087900198, 53.6888144440184, 63.4462810678651, 21.6075544376207, 
                              78.324442654172, 66.85532539759495, 51.623544596), HIGH_PRCN_LON = c(-2.87377812157822, 
                                                                                                 -2.23454414781635, -3.0984448341, -2.439163178635, -7.396111601421454, 
                                                                                                 -5.162345043546359, -8.63311254098095, 3.813289888829932, 
                                                                                                 -3.994325961186105, -8.9065532453272409), SRC_ID = c(NA, NA, 
                                                                                                                                                     NA, NA, NA, NA, NA, NA, NA, NA), distance = c(NA, NA, 
                                                                                                                                                                                                  NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 10L), class = "data.frame")


df2 <- structure(list(SRC_ID = c(55L, 54L, 23L, 11L, 44L, 21L, 76L, 
                          5688L, 440L, 61114L), HIGH_PRCN_LAT = c(68.46506, 50.34127, 61.16432, 
                                                                42.57807, 52.29879, 68.52132, 87.83912, 55.67825, 29.74444, 34.33228
                          ), HIGH_PRCN_LON = c(-5.0584, -5.95506, -5.75546, -5.47801, -3.42062, 
                                             -6.99441, -2.63457, -2.63057, -7.52216, -1.65532)), row.names = c(NA, 
                                                                                                              10L), class = "data.frame")

library(data.table)
library(raster)
library(geosphere)
#Euclidean distance 
mydist <- function(a, b, df1, x, y){
    
    #dt <- data.table(sqrt((df1[[x]]-a)^2 + (df1[[y]]-b)^2))
    #I couldn't figure out how to modify this correcly:
    dt <- data.table(pointDistance(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
    #dt <- data.table(distGeo(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
    
    return(data.table(Closest.V1  = which.min(dt$V1),
                      Distance    = dt[which.min(dt$V1)]))
}

setDT(df1)[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON, setDT(df2), 
                        "HIGH_PRCN_LAT", "HIGH_PRCN_LON"), 
           by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]

这是我得到的错误:

Error in .pointsToMatrix(p2) : Wrong length for a vector, should be 2

c(df1[[x]],df1[[y]]) 应更改为 df1[, c(y,x), with=FALSE] 因为 geosphere::distGeo / raster::pointDistance 需要一个 2 列点矩阵。此外,它希望点作为(经度,纬度)提供 - 因此(x,y)应该翻转为(y,x)。

使用raster::pointDistance

# Using raster::pointDistance
library(data.table)
library(raster)
library(geosphere)

setDT(df1)
setDT(df2)

mydist <- function(a, b, df1, x, y) {
    dt <- data.table(pointDistance(c(b,a), df1[, c(y,x), with=FALSE], lonlat=TRUE))
    
    return (
        data.table(Closest.V1 = which.min(dt$V1),
        Distance = dt[which.min(dt$V1)])
    )
}

df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
    df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
    by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]

使用geosphere::distGeo(与上面相同的结果):

# Using geosphere::distGeo
mydist <- function(a, b, df1, x, y) {
    dt <- data.table(distGeo(c(b,a), df1[, c(y,x), with=FALSE]))
    
    return (
        data.table(Closest.V1 = which.min(dt$V1),
        Distance = dt[which.min(dt$V1)])
    )
}

df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
    df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
    by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]