分组值之间的最小距离

Question

我有按时间和 ID 分组的二维数据。对于每个唯一的时间戳，我想确定每个 ID 在该时间戳内相对于其他 ID 的最近邻（即先为时间戳 1 内的所有 ID 求最近邻，再为时间戳 2 内的所有 ID 求最近邻，依此类推）。我的坐标是笛卡尔坐标；我知道 distm() 会返回一个距离矩阵，但不知道如何对非经纬度（non-Lat/Long）坐标做同样的计算。欢迎使用 dplyr 或其他方案。

# Example data (dput output): eight observations at two UTC timestamps
# (five rows at the first, three at the second). Each row has a factor ID and
# numeric x / y coordinates; distances are computed on x and y.
dat<-structure(list(timestamp = structure(c(1585958400, 1585958400, 
1585958400, 1585958400, 1585958400, 1585962000, 1585962000, 1585962000
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), ID = structure(c(1L, 
12L, 25L, 47L, 51L, 12L, 47L, 50L), .Label = c("1", "10", "11", 
"12", "13", "14", "15", "16", "17", "18", "19", "2", "20", "21", 
"21b", "22", "23", "23b", "24", "25", "26", "27", "28", "29", 
"3", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", 
"4", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", 
"5", "50", "51", "6", "7", "8", "9"), class = "factor"), x = c(740.693095051881, 
743.998405321748, 739.480351500548, 744.040204814706, 743.357515557653, 
744.432012727096, 744.604552094105, 746.065894928645), y = c(2300.91570604786, 
2304.38234085183, 2304.9102593082, 2304.33896409476, 2302.08781536681, 
2304.18977683083, 2304.07139807708, 2301.47198606318)), row.names = c(NA, 
8L), class = "data.frame")

> dat
            timestamp ID        x        y
1 2020-04-04 00:00:00  1 740.6931 2300.916
2 2020-04-04 00:00:00  2 743.9984 2304.382
3 2020-04-04 00:00:00  3 739.4804 2304.910
4 2020-04-04 00:00:00  5 744.0402 2304.339
5 2020-04-04 00:00:00  7 743.3575 2302.088
6 2020-04-04 01:00:00  2 744.4320 2304.190
7 2020-04-04 01:00:00  5 744.6046 2304.071
8 2020-04-04 01:00:00  6 746.0659 2301.472

我需要的是该数据框中的第 5 列，其中包含到最近邻居的距离。我不需要知道哪个 ID 最接近，我只需要值。

编辑：

按照 @Dave2e 的建议，我尝试了下面的做法。这种写法可能很粗糙，因为我不知道如何让它更整洁、更精简。我希望我没有遗漏什么，思路上也没有严重的缺陷。我也试图想出一个 dplyr 解决方案，但不知道如何在 dplyr 调用中为每个分组临时创建距离矩阵。

# Option 1: loop over the subset for each unique timestamp (base R only).
#
# For every timestamp, build the pairwise Euclidean distance matrix of the
# (x, y) coordinates in that subset and store, per row, the distance to the
# nearest other row in a new column `min.d`. The result is `dat.2`: the rows
# of `dat` grouped by timestamp (in order of first appearance) plus `min.d`.

# Collect the per-timestamp pieces in a preallocated list and bind them once,
# rather than growing a data frame with rbind() on every iteration.
stamps <- unique(dat$timestamp)
pieces <- vector("list", length(stamps))

for (i in seq_along(stamps)) {
  dat.sub <- dat[dat$timestamp == stamps[i], ]
  d <- as.matrix(dist(dat.sub[, c("x", "y")]))

  # Exclude only the distance to self (the diagonal). Blanking every zero
  # would also hide coincident points, whose true nearest distance is 0.
  diag(d) <- NA

  # A timestamp with a single row has no neighbour: report NA instead of the
  # Inf (plus warning) that min() over an all-NA row would give.
  dat.sub$min.d <- if (nrow(d) > 1L) {
    apply(d, 1L, min, na.rm = TRUE)
  } else {
    NA_real_
  }

  pieces[[i]] <- dat.sub
}

# do.call(rbind, list()) is NULL, so keep an empty data frame for empty input.
dat.2 <- if (length(pieces) > 0L) do.call(rbind, pieces) else data.frame()

> dat.2
            timestamp ID        x        y      min.d
1 2020-04-04 00:00:00  1 740.6931 2300.916 2.91083783
2 2020-04-04 00:00:00  2 743.9984 2304.382 0.06023903
3 2020-04-04 00:00:00  3 739.4804 2304.910 4.17459012
4 2020-04-04 00:00:00  5 744.0402 2304.339 0.06023903
5 2020-04-04 00:00:00  7 743.3575 2302.088 2.35238926
6 2020-04-04 01:00:00  2 744.4320 2304.190 0.20924474
7 2020-04-04 01:00:00  5 744.6046 2304.071 0.20924474
8 2020-04-04 01:00:00  6 746.0659 2301.472 2.98202376


#Option 2: using dplyr - don't know how

dat.2<-as.data.frame(dat%>%
                       group_by(timestamp)%>%
                       ...dplyr magic ... 
                       summarize? mutate? 
                     )

Answer 1

这是您要找的吗？

# For each timestamp group, find the nearest other row by Euclidean distance
# on (x, y) and store that row's row name, converted to integer, in `closest`.
# Note that `closest` identifies WHICH row is nearest; it is not the distance.
groups <- split(dat, dat$timestamp)
answer <- lapply(groups, function(grp) {
  # Pairwise distance matrix for this group.
  dm <- as.matrix(dist(grp[, c("x", "y")]))

  # Mask the distance to self so which.min() can scan the full row. Dropping
  # column i with dm[i, -i] loses the names for 2-row groups and is empty for
  # 1-row groups, which made the original lookup fail for such groups.
  diag(dm) <- NA

  # Row name of the nearest neighbour, or NA when the group has no other row.
  labels <- row.names(grp)
  nearest <- vapply(seq_len(nrow(dm)), function(i) {
    j <- which.min(dm[i, ])
    if (length(j) == 0L) NA_character_ else labels[j]
  }, character(1))

  # Assumes row names are integer-like (as in the example data).
  grp$closest <- as.integer(nearest)
  grp
})
dplyr::bind_rows(answer)



timestamp ID        x        y closest
1 2020-04-04 00:00:00  1 740.6931 2300.916       5
2 2020-04-04 00:00:00  2 743.9984 2304.382       4
3 2020-04-04 00:00:00  3 739.4804 2304.910       1
4 2020-04-04 00:00:00  5 744.0402 2304.339       2
5 2020-04-04 00:00:00  7 743.3575 2302.088       7
6 2020-04-04 01:00:00  2 744.4320 2304.190       7
7 2020-04-04 01:00:00  5 744.6046 2304.071       6
8 2020-04-04 01:00:00  6 746.0659 2301.472       5

Answer 2

我会尝试这样的事情：

library(data.table)

# Distance from each point to its nearest other point.
#
# x, y: coordinate vectors of equal length (one element per point).
# Returns a numeric vector of length(x) holding, for every point, the
# Euclidean distance to the closest other point. Coincident points correctly
# get 0. When there is no other point (fewer than two points) the result is NA.
f <- function(x, y) {
  stopifnot(length(x) == length(y))

  n <- length(x)
  if (n < 2L) {
    # No neighbour exists; avoid min() over an empty set (Inf plus a warning).
    return(rep(NA_real_, n))
  }

  d <- as.matrix(dist(cbind(x, y)))
  # Exclude only the distance to self (the diagonal). Blanking every zero
  # would also discard genuine zero distances between coincident points.
  diag(d) <- NA
  apply(d, 1L, min, na.rm = TRUE)
}

# Convert dat to a data.table by reference and add a `mindist` column:
# f(x, y) is evaluated separately for the rows of each timestamp.
setDT(dat)[, mindist:=f(x,y), by=timestamp]

输出：

             timestamp     ID        x        y    mindist
                <POSc> <fctr>    <num>    <num>      <num>
1: 2020-04-04 00:00:00      1 740.6931 2300.916 2.91083783
2: 2020-04-04 00:00:00      2 743.9984 2304.382 0.06023903
3: 2020-04-04 00:00:00      3 739.4804 2304.910 4.17459012
4: 2020-04-04 00:00:00      5 744.0402 2304.339 0.06023903
5: 2020-04-04 00:00:00      7 743.3575 2302.088 2.35238926
6: 2020-04-04 01:00:00      2 744.4320 2304.190 0.20924474
7: 2020-04-04 01:00:00      5 744.6046 2304.071 0.20924474
8: 2020-04-04 01:00:00      6 746.0659 2301.472 2.98202376

分组值之间的最小距离

Minimum distances among grouped values

r

nearest-neighbor

dplyr