计算矩阵 A 和 B 中所有行之间的欧氏距离
Calculate Euclidean distances between all rows in matrices A and B
我有两个矩阵,A
和 B
,分别有 N_a
和 N_b
行。我需要计算 A
(a
) 中一个元素的所有成对组合与 B
(b
) 中另一个元素的所有成对组合之间的欧氏距离,以便计算的输出是一个 Na 乘以 Nb 的矩阵,其中单元格 [a, b]
是 a 到 b 的距离。我在下面开始了一个例子。
# Example
set.seed(1)
A <- matrix(rnorm(1000, 5, 50), ncol = 5)
B <- matrix(rnorm(10000, 0, 50), ncol = 5)
# Return N_a x N_b matrix of euclidean distances, where [a,b] is the
# distance from a to b
# Example
set.seed(1)
A <- matrix(rnorm(1000, 5, 50), ncol = 5)
B <- matrix(rnorm(10000, 0, 50), ncol = 5)
d <- matrix(NA, nrow(A), nrow(B))
for (a in 1:nrow(A)) for (b in 1:nrow(B)) d[a,b] <- sqrt(sum((A[a,]-B[b,])^2))
这是一个使用我的包并进行并行化的解决方案。请注意,github 上的当前构建不稳定,因此您必须从昨天的先前提交中安装。
编辑:v0.7.1 现已稳定,您无需使用 commit-ref
只有当两个矩阵都非常大时,这个解决方案才会更快and/or你有很多核心。但是我写的很有趣,所以:
devtools::install_github("alexwhitworth/imputation",
ref= "75723b769ed2ceae8c915d00089a31f059e447aa")
library(microbenchmark)
library(parallel)
f <- function(a, b) {
nnodes <- detectCores()
cl <- makeCluster(nnodes)
d <- do.call("cbind", clusterApply(cl, x= parallel:::splitRows(a, nnodes),
fun= function(x_sub, b) {
apply(x_sub, 1, function(i, b) {imputation::dist_q.matrix(x= rbind(i, b), ref= 1L, q=2)}, b= b)
}, b= b))
stopCluster(cl)
return(d)
}
a <- matrix(rnorm(50000), 1000)
b <- matrix(rnorm(50000), 1000)
d <- matrix(NA, 1000, 1000)
# run on 4 cores
microbenchmark(jogo= for (i in 1:nrow(a)) for (j in 1:nrow(b)) d[i,j] <- sqrt(sum((a[i,]-a[j,])^2)),
alex= f(a,b), times= 10L)
Unit: seconds
expr min lq mean median uq max neval cld
jogo 4.190531 4.196546 4.289265 4.265351 4.358022 4.486445 10 b
alex 3.585672 3.603485 3.783583 3.760859 3.966435 4.048676 10 a
如果你真的想要的话,你可以使用 library(Rdsm)
来改进这个......但我会推荐使用 jogo 的答案。
没有循环的单行代码,没有额外的包,而且速度更快:
euklDist <- sqrt(apply(array(apply(B,1,function(x){(x-t(A))^2}),c(ncol(A),nrow(A),nrow(B))),2:3,sum))
速度比较:
> microbenchmark(jogo = for (i in 1:nrow(A)) for (j in 1:nrow(B)) d[i,j] <- sqrt(sum((A[i,]-B[j,])^2)),
+ mra68 = sqrt(apply(array(app .... [TRUNCATED]
Unit: seconds
expr min lq mean median uq max neval
jogo 3.601533 4.724619 5.403420 5.549199 6.098734 6.470888 10
mra68 1.334661 1.635258 2.473297 2.542550 3.247981 3.348365 10
interdist_func <- function(x, y){
apply(y, 1, FUN=function(y_i){
sqrt(colSums((t(x)-y_i)^2))
})
}
set.seed(1)
A <- matrix(rnorm(1000, 5, 50), ncol = 5)
B <- matrix(rnorm(10000, 0, 50), ncol = 5)
d <- matrix(NA, nrow(A), nrow(B))
microbenchmark(
jogo =
for (i in 1:nrow(A)) for (j in 1:nrow(B)) d[i,j] <-sqrt(sum((A[i,]-B[j,])^2)),
mra68 =
sqrt(apply(array(apply(B,1,function(x){(x-t(A))^2}),c(ncol(A),nrow(A),nrow(B))),2:3,sum)),
roboshea =
apply(B, 1, FUN=function(B_i){sqrt(colSums((t(A)-B_i)^2))}))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# jogo 486.0123 553.45700 585.69967 580.20000 619.26870 751.2992 100 b
# mra68 512.1435 606.38120 653.00116 639.32560 675.40945 1011.6164 100 c
# roboshea 29.5313 32.95525 42.32124 37.87175 41.27385 128.2292 100 a
我有两个矩阵,A
和 B
,分别有 N_a
和 N_b
行。我需要计算 A
(a
) 中一个元素的所有成对组合与 B
(b
) 中另一个元素的所有成对组合之间的欧氏距离,以便计算的输出是一个 Na 乘以 Nb 的矩阵,其中单元格 [a, b]
是 a 到 b 的距离。我在下面开始了一个例子。
# Example
set.seed(1)
A <- matrix(rnorm(1000, 5, 50), ncol = 5)
B <- matrix(rnorm(10000, 0, 50), ncol = 5)
# Return N_a x N_b matrix of euclidean distances, where [a,b] is the
# distance from a to b
# Example
set.seed(1)
A <- matrix(rnorm(1000, 5, 50), ncol = 5)
B <- matrix(rnorm(10000, 0, 50), ncol = 5)
d <- matrix(NA, nrow(A), nrow(B))
for (a in 1:nrow(A)) for (b in 1:nrow(B)) d[a,b] <- sqrt(sum((A[a,]-B[b,])^2))
这是一个使用我的包并进行并行化的解决方案。请注意,github 上的当前构建不稳定,因此您必须从昨天的先前提交中安装。
编辑:v0.7.1 现已稳定,您无需使用 commit-ref
只有当两个矩阵都非常大时,这个解决方案才会更快and/or你有很多核心。但是我写的很有趣,所以:
devtools::install_github("alexwhitworth/imputation",
ref= "75723b769ed2ceae8c915d00089a31f059e447aa")
library(microbenchmark)
library(parallel)
f <- function(a, b) {
nnodes <- detectCores()
cl <- makeCluster(nnodes)
d <- do.call("cbind", clusterApply(cl, x= parallel:::splitRows(a, nnodes),
fun= function(x_sub, b) {
apply(x_sub, 1, function(i, b) {imputation::dist_q.matrix(x= rbind(i, b), ref= 1L, q=2)}, b= b)
}, b= b))
stopCluster(cl)
return(d)
}
a <- matrix(rnorm(50000), 1000)
b <- matrix(rnorm(50000), 1000)
d <- matrix(NA, 1000, 1000)
# run on 4 cores
microbenchmark(jogo= for (i in 1:nrow(a)) for (j in 1:nrow(b)) d[i,j] <- sqrt(sum((a[i,]-a[j,])^2)),
alex= f(a,b), times= 10L)
Unit: seconds
expr min lq mean median uq max neval cld
jogo 4.190531 4.196546 4.289265 4.265351 4.358022 4.486445 10 b
alex 3.585672 3.603485 3.783583 3.760859 3.966435 4.048676 10 a
如果你真的想要的话,你可以使用 library(Rdsm)
来改进这个......但我会推荐使用 jogo 的答案。
没有循环的单行代码,没有额外的包,而且速度更快:
euklDist <- sqrt(apply(array(apply(B,1,function(x){(x-t(A))^2}),c(ncol(A),nrow(A),nrow(B))),2:3,sum))
速度比较:
> microbenchmark(jogo = for (i in 1:nrow(A)) for (j in 1:nrow(B)) d[i,j] <- sqrt(sum((A[i,]-B[j,])^2)),
+ mra68 = sqrt(apply(array(app .... [TRUNCATED]
Unit: seconds
expr min lq mean median uq max neval
jogo 3.601533 4.724619 5.403420 5.549199 6.098734 6.470888 10
mra68 1.334661 1.635258 2.473297 2.542550 3.247981 3.348365 10
interdist_func <- function(x, y){
apply(y, 1, FUN=function(y_i){
sqrt(colSums((t(x)-y_i)^2))
})
}
set.seed(1)
A <- matrix(rnorm(1000, 5, 50), ncol = 5)
B <- matrix(rnorm(10000, 0, 50), ncol = 5)
d <- matrix(NA, nrow(A), nrow(B))
microbenchmark(
jogo =
for (i in 1:nrow(A)) for (j in 1:nrow(B)) d[i,j] <-sqrt(sum((A[i,]-B[j,])^2)),
mra68 =
sqrt(apply(array(apply(B,1,function(x){(x-t(A))^2}),c(ncol(A),nrow(A),nrow(B))),2:3,sum)),
roboshea =
apply(B, 1, FUN=function(B_i){sqrt(colSums((t(A)-B_i)^2))}))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# jogo 486.0123 553.45700 585.69967 580.20000 619.26870 751.2992 100 b
# mra68 512.1435 606.38120 653.00116 639.32560 675.40945 1011.6164 100 c
# roboshea 29.5313 32.95525 42.32124 37.87175 41.27385 128.2292 100 a