分组值之间的最小距离
Minimum distances among grouped values
我有按时间和 ID 分组的二维数据。对于每个唯一的时间戳,我想确定每个 ID 与同一时间戳内其他 ID 之间的最近邻(即先为时间戳 1 内的所有 ID 求最近邻,再为时间戳 2 内的所有 ID 求最近邻,依此类推)。我的坐标是笛卡尔坐标;我知道 distm() 会返回一个矩阵,但不知道如何对非经纬度(non-Lat/Long)坐标做同样的事情。
欢迎使用 dplyr 或其他方案。
# Example data: 8 rows with a POSIXct `timestamp` (UTC; two distinct values,
# covering 5 and 3 rows), a factor `ID`, and numeric `x` / `y` coordinates.
dat<-structure(list(timestamp = structure(c(1585958400, 1585958400,
1585958400, 1585958400, 1585958400, 1585962000, 1585962000, 1585962000
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), ID = structure(c(1L,
12L, 25L, 47L, 51L, 12L, 47L, 50L), .Label = c("1", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "2", "20", "21",
"21b", "22", "23", "23b", "24", "25", "26", "27", "28", "29",
"3", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
"4", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
"5", "50", "51", "6", "7", "8", "9"), class = "factor"), x = c(740.693095051881,
743.998405321748, 739.480351500548, 744.040204814706, 743.357515557653,
744.432012727096, 744.604552094105, 746.065894928645), y = c(2300.91570604786,
2304.38234085183, 2304.9102593082, 2304.33896409476, 2302.08781536681,
2304.18977683083, 2304.07139807708, 2301.47198606318)), row.names = c(NA,
8L), class = "data.frame")
> dat
timestamp ID x y
1 2020-04-04 00:00:00 1 740.6931 2300.916
2 2020-04-04 00:00:00 2 743.9984 2304.382
3 2020-04-04 00:00:00 3 739.4804 2304.910
4 2020-04-04 00:00:00 5 744.0402 2304.339
5 2020-04-04 00:00:00 7 743.3575 2302.088
6 2020-04-04 01:00:00 2 744.4320 2304.190
7 2020-04-04 01:00:00 5 744.6046 2304.071
8 2020-04-04 01:00:00 6 746.0659 2301.472
我需要的是该数据框中的第 5 列,其中包含到最近邻居的距离。我不需要知道哪个 ID 最接近,我只需要值。
编辑:
按照 @Dave2e 的建议,我试了一下。写法可能相当笨拙(hacky),因为我不知道怎样才能让它更整洁、更简练。希望我没有遗漏什么,思路上也没有严重的缺陷。
我试图想出一个 dplyr
解决方案,但我不知道如何在每个组的 dplyr 调用中临时创建距离矩阵。
# Option 1: loop over the subset belonging to each unique timestamp.
# Each subset is stored in a preallocated list and bound once at the end,
# instead of growing `dat.2` with rbind() on every iteration.
stamps <- unique(dat$timestamp)
pieces <- vector("list", length(stamps))
for (i in seq_along(stamps)) {
  dat.sub <- dat[dat$timestamp == stamps[i], ]
  # Full symmetric matrix of pairwise Euclidean distances within the subset.
  d <- as.matrix(dist(dat.sub[, c("x", "y")]))
  # Blank only the self-distances on the diagonal. Testing `d == 0` would
  # also discard a genuine zero distance to another ID at the same position.
  diag(d) <- NA
  if (nrow(d) > 1L) {
    dat.sub$min.d <- unname(apply(d, 1, min, na.rm = TRUE))
  } else {
    # A lone ID has no neighbour in its timestamp; min() over nothing would
    # return Inf with a warning, so report NA instead.
    dat.sub$min.d <- rep(NA_real_, nrow(d))
  }
  pieces[[i]] <- dat.sub
}
dat.2 <- if (length(pieces) > 0L) do.call(rbind, pieces) else data.frame()
> dat.2
timestamp ID x y min.d
1 2020-04-04 00:00:00 1 740.6931 2300.916 2.91083783
2 2020-04-04 00:00:00 2 743.9984 2304.382 0.06023903
3 2020-04-04 00:00:00 3 739.4804 2304.910 4.17459012
4 2020-04-04 00:00:00 5 744.0402 2304.339 0.06023903
5 2020-04-04 00:00:00 7 743.3575 2302.088 2.35238926
6 2020-04-04 01:00:00 2 744.4320 2304.190 0.20924474
7 2020-04-04 01:00:00 5 744.6046 2304.071 0.20924474
8 2020-04-04 01:00:00 6 746.0659 2301.472 2.98202376
#Option 2: using dplyr - don't know how
dat.2<-as.data.frame(dat%>%
group_by(timestamp)%>%
...dplyr magic ...
summarize? mutate?
)
这是您要找的吗?
# Split the rows by timestamp so neighbours are only searched within a group.
groups <- split(dat, dat$timestamp)
answer <- lapply(groups, function(grp) {
  n <- nrow(grp)
  if (n < 2L) {
    # A lone point has no neighbour within its timestamp.
    grp$closest <- rep(NA_integer_, n)
    return(grp)
  }
  # Full symmetric matrix of Euclidean distances between the group's points.
  dm <- as.matrix(dist(grp[, c("x", "y")]))
  # Mask the self-distances so a point is never its own nearest neighbour.
  # Working on whole rows (rather than dm[i, -i]) keeps the positions valid
  # for two-row groups, where a length-one extract would lose its name.
  diag(dm) <- NA
  nearest <- vapply(seq_len(n), function(i) {
    j <- which.min(dm[i, ])
    # which.min() returns integer(0) when every remaining distance is NA.
    if (length(j) == 0L) NA_integer_ else unname(j)
  }, integer(1))
  # Report the neighbour through the label dist() carried over: the row name,
  # or the position within the group when the row names are automatic.
  grp$closest <- as.integer(rownames(dm)[nearest])
  grp
})
dplyr::bind_rows(answer)
timestamp ID x y closest
1 2020-04-04 00:00:00 1 740.6931 2300.916 5
2 2020-04-04 00:00:00 2 743.9984 2304.382 4
3 2020-04-04 00:00:00 3 739.4804 2304.910 1
4 2020-04-04 00:00:00 5 744.0402 2304.339 2
5 2020-04-04 00:00:00 7 743.3575 2302.088 7
6 2020-04-04 01:00:00 2 744.4320 2304.190 7
7 2020-04-04 01:00:00 5 744.6046 2304.071 6
8 2020-04-04 01:00:00 6 746.0659 2301.472 5
我会尝试这样的事情:
library(data.table)
#' Distance from each point to its nearest other point.
#'
#' @param x Numeric vector of x coordinates.
#' @param y Numeric vector of y coordinates, paired element-wise with `x`.
#' @return Numeric vector with one Euclidean distance per point; `NA` when
#'   there is no other point to compare against.
f <- function(x, y) {
  # cbind() gives dist() a plain numeric matrix with one row per point.
  pts <- cbind(x, y)
  n <- nrow(pts)
  # With fewer than two points there is no neighbour; min() over an empty
  # set would otherwise yield Inf together with a warning.
  if (n < 2L) {
    return(rep(NA_real_, n))
  }
  d <- as.matrix(dist(pts))
  # Blank only the self-distances. A genuine zero distance to another point
  # sharing the same coordinates must remain a valid minimum.
  diag(d) <- NA
  apply(d, 1, min, na.rm = TRUE)
}
# Convert `dat` to a data.table by reference, then add the nearest-neighbour
# distance column, computed separately for each timestamp.
setDT(dat)
dat[, mindist := f(x, y), by = timestamp]
输出:
timestamp ID x y mindist
<POSc> <fctr> <num> <num> <num>
1: 2020-04-04 00:00:00 1 740.6931 2300.916 2.91083783
2: 2020-04-04 00:00:00 2 743.9984 2304.382 0.06023903
3: 2020-04-04 00:00:00 3 739.4804 2304.910 4.17459012
4: 2020-04-04 00:00:00 5 744.0402 2304.339 0.06023903
5: 2020-04-04 00:00:00 7 743.3575 2302.088 2.35238926
6: 2020-04-04 01:00:00 2 744.4320 2304.190 0.20924474
7: 2020-04-04 01:00:00 5 744.6046 2304.071 0.20924474
8: 2020-04-04 01:00:00 6 746.0659 2301.472 2.98202376
我有按时间和 ID 分组的二维数据。对于每个唯一的时间戳,我想确定每个 ID 与同一时间戳内其他 ID 之间的最近邻(即先为时间戳 1 内的所有 ID 求最近邻,再为时间戳 2 内的所有 ID 求最近邻,依此类推)。我的坐标是笛卡尔坐标;我知道 distm() 会返回一个矩阵,但不知道如何对非经纬度(non-Lat/Long)坐标做同样的事情。
欢迎使用 dplyr 或其他方案。
# Example data: 8 rows with a POSIXct `timestamp` (UTC; two distinct values,
# covering 5 and 3 rows), a factor `ID`, and numeric `x` / `y` coordinates.
dat<-structure(list(timestamp = structure(c(1585958400, 1585958400,
1585958400, 1585958400, 1585958400, 1585962000, 1585962000, 1585962000
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), ID = structure(c(1L,
12L, 25L, 47L, 51L, 12L, 47L, 50L), .Label = c("1", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "2", "20", "21",
"21b", "22", "23", "23b", "24", "25", "26", "27", "28", "29",
"3", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
"4", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
"5", "50", "51", "6", "7", "8", "9"), class = "factor"), x = c(740.693095051881,
743.998405321748, 739.480351500548, 744.040204814706, 743.357515557653,
744.432012727096, 744.604552094105, 746.065894928645), y = c(2300.91570604786,
2304.38234085183, 2304.9102593082, 2304.33896409476, 2302.08781536681,
2304.18977683083, 2304.07139807708, 2301.47198606318)), row.names = c(NA,
8L), class = "data.frame")
> dat
timestamp ID x y
1 2020-04-04 00:00:00 1 740.6931 2300.916
2 2020-04-04 00:00:00 2 743.9984 2304.382
3 2020-04-04 00:00:00 3 739.4804 2304.910
4 2020-04-04 00:00:00 5 744.0402 2304.339
5 2020-04-04 00:00:00 7 743.3575 2302.088
6 2020-04-04 01:00:00 2 744.4320 2304.190
7 2020-04-04 01:00:00 5 744.6046 2304.071
8 2020-04-04 01:00:00 6 746.0659 2301.472
我需要的是该数据框中的第 5 列,其中包含到最近邻居的距离。我不需要知道哪个 ID 最接近,我只需要值。
编辑:
按照 @Dave2e 的建议,我试了一下。写法可能相当笨拙(hacky),因为我不知道怎样才能让它更整洁、更简练。希望我没有遗漏什么,思路上也没有严重的缺陷。
我试图想出一个 dplyr
解决方案,但我不知道如何在每个组的 dplyr 调用中临时创建距离矩阵。
# Option 1: loop over the subset belonging to each unique timestamp.
# Each subset is stored in a preallocated list and bound once at the end,
# instead of growing `dat.2` with rbind() on every iteration.
stamps <- unique(dat$timestamp)
pieces <- vector("list", length(stamps))
for (i in seq_along(stamps)) {
  dat.sub <- dat[dat$timestamp == stamps[i], ]
  # Full symmetric matrix of pairwise Euclidean distances within the subset.
  d <- as.matrix(dist(dat.sub[, c("x", "y")]))
  # Blank only the self-distances on the diagonal. Testing `d == 0` would
  # also discard a genuine zero distance to another ID at the same position.
  diag(d) <- NA
  if (nrow(d) > 1L) {
    dat.sub$min.d <- unname(apply(d, 1, min, na.rm = TRUE))
  } else {
    # A lone ID has no neighbour in its timestamp; min() over nothing would
    # return Inf with a warning, so report NA instead.
    dat.sub$min.d <- rep(NA_real_, nrow(d))
  }
  pieces[[i]] <- dat.sub
}
dat.2 <- if (length(pieces) > 0L) do.call(rbind, pieces) else data.frame()
> dat.2
timestamp ID x y min.d
1 2020-04-04 00:00:00 1 740.6931 2300.916 2.91083783
2 2020-04-04 00:00:00 2 743.9984 2304.382 0.06023903
3 2020-04-04 00:00:00 3 739.4804 2304.910 4.17459012
4 2020-04-04 00:00:00 5 744.0402 2304.339 0.06023903
5 2020-04-04 00:00:00 7 743.3575 2302.088 2.35238926
6 2020-04-04 01:00:00 2 744.4320 2304.190 0.20924474
7 2020-04-04 01:00:00 5 744.6046 2304.071 0.20924474
8 2020-04-04 01:00:00 6 746.0659 2301.472 2.98202376
#Option 2: using dplyr - don't know how
dat.2<-as.data.frame(dat%>%
group_by(timestamp)%>%
...dplyr magic ...
summarize? mutate?
)
这是您要找的吗?
# Split the rows by timestamp so neighbours are only searched within a group.
groups <- split(dat, dat$timestamp)
answer <- lapply(groups, function(grp) {
  n <- nrow(grp)
  if (n < 2L) {
    # A lone point has no neighbour within its timestamp.
    grp$closest <- rep(NA_integer_, n)
    return(grp)
  }
  # Full symmetric matrix of Euclidean distances between the group's points.
  dm <- as.matrix(dist(grp[, c("x", "y")]))
  # Mask the self-distances so a point is never its own nearest neighbour.
  # Working on whole rows (rather than dm[i, -i]) keeps the positions valid
  # for two-row groups, where a length-one extract would lose its name.
  diag(dm) <- NA
  nearest <- vapply(seq_len(n), function(i) {
    j <- which.min(dm[i, ])
    # which.min() returns integer(0) when every remaining distance is NA.
    if (length(j) == 0L) NA_integer_ else unname(j)
  }, integer(1))
  # Report the neighbour through the label dist() carried over: the row name,
  # or the position within the group when the row names are automatic.
  grp$closest <- as.integer(rownames(dm)[nearest])
  grp
})
dplyr::bind_rows(answer)
timestamp ID x y closest
1 2020-04-04 00:00:00 1 740.6931 2300.916 5
2 2020-04-04 00:00:00 2 743.9984 2304.382 4
3 2020-04-04 00:00:00 3 739.4804 2304.910 1
4 2020-04-04 00:00:00 5 744.0402 2304.339 2
5 2020-04-04 00:00:00 7 743.3575 2302.088 7
6 2020-04-04 01:00:00 2 744.4320 2304.190 7
7 2020-04-04 01:00:00 5 744.6046 2304.071 6
8 2020-04-04 01:00:00 6 746.0659 2301.472 5
我会尝试这样的事情:
library(data.table)
#' Distance from each point to its nearest other point.
#'
#' @param x Numeric vector of x coordinates.
#' @param y Numeric vector of y coordinates, paired element-wise with `x`.
#' @return Numeric vector with one Euclidean distance per point; `NA` when
#'   there is no other point to compare against.
f <- function(x, y) {
  # cbind() gives dist() a plain numeric matrix with one row per point.
  pts <- cbind(x, y)
  n <- nrow(pts)
  # With fewer than two points there is no neighbour; min() over an empty
  # set would otherwise yield Inf together with a warning.
  if (n < 2L) {
    return(rep(NA_real_, n))
  }
  d <- as.matrix(dist(pts))
  # Blank only the self-distances. A genuine zero distance to another point
  # sharing the same coordinates must remain a valid minimum.
  diag(d) <- NA
  apply(d, 1, min, na.rm = TRUE)
}
# Convert `dat` to a data.table by reference, then add the nearest-neighbour
# distance column, computed separately for each timestamp.
setDT(dat)
dat[, mindist := f(x, y), by = timestamp]
输出:
timestamp ID x y mindist
<POSc> <fctr> <num> <num> <num>
1: 2020-04-04 00:00:00 1 740.6931 2300.916 2.91083783
2: 2020-04-04 00:00:00 2 743.9984 2304.382 0.06023903
3: 2020-04-04 00:00:00 3 739.4804 2304.910 4.17459012
4: 2020-04-04 00:00:00 5 744.0402 2304.339 0.06023903
5: 2020-04-04 00:00:00 7 743.3575 2302.088 2.35238926
6: 2020-04-04 01:00:00 2 744.4320 2304.190 0.20924474
7: 2020-04-04 01:00:00 5 744.6046 2304.071 0.20924474
8: 2020-04-04 01:00:00 6 746.0659 2301.472 2.98202376