R:将循环转换为向量化执行以实现行之间的相关性
R: transform loops into vectorized execution for correlation between rows
我希望有条件地 select 来自 dt2 的值基于 dt1 中的值以逐行方式,然后成对关联 dt2 中的行并将相关值保存在新矩阵 dt3 中。在我开始用文字解释之前,我想 R 代码更具描述性。我通过遍历数据帧来做到这一点,这非常慢。我确信有可能以矢量化方式执行此操作以提高性能。有没有人有解决方案或建议?非常感谢!
library(data.table)
dt1 <- data.table(a=round(runif(100)), b=round(runif(100)), c=round(runif(100)), d=round(runif(100)), e=round(runif(100)), f=round(runif(100)))
dt2 <- data.table(a=runif(100), b=runif(100), c=runif(100), d=runif(100), e=runif(100), f=runif(100))
m <- nrow(dt2)
n <- m
dt3 <- matrix(nrow=m, ncol=n)
col_vec <- 1:n
for (r in 1:m) {
for (p in col_vec) {
selection <- dt1[r,] > 0 & dt1[p,] > 0
selection <- as.vector(selection)
r_values <- as.numeric(dt2[p, ..selection])
p_values <- as.numeric(dt2[r, ..selection])
correlation_value <- cor(r_values, p_values, method='spearman', use='na.or.complete')
dt3[r,p] <- correlation_value
dt3[p,r] <- correlation_value
print(glue('row {r} vs row {p}'))
}
col_vec <- col_vec[-1]
}
您可以将 built-in NA
排除机制与 use = "pairwise.complete.obs"
一起使用。
将 dt2
中的值设置为缺失
如果对应的dt1
值为0
,则使用一次cor()
调用。
library(data.table)
n <- 4
set.seed(42)
dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
replace(t(dt2), t(dt1) == 0, NA) |>
cor(method = "spearman", use = "pairwise.complete.obs")
#> [,1] [,2] [,3] [,4]
#> [1,] 1.0 1 -1 0.1
#> [2,] 1.0 1 NA -1.0
#> [3,] -1.0 NA 1 NA
#> [4,] 0.1 -1 NA 1.0
基准测试函数中的两种方法:
f_loop <- function(dt1, dt2) {
m <- nrow(dt2)
n <- m
dt3 <- matrix(nrow = m, ncol = n)
col_vec <- 1:n
for (r in 1:m) {
for (p in col_vec) {
selection <- dt1[r, ] > 0 & dt1[p, ] > 0
selection <- as.vector(selection)
r_values <- as.numeric(dt2[p, ..selection])
p_values <- as.numeric(dt2[r, ..selection])
correlation_value <- cor(r_values, p_values, method = "spearman", use = "na.or.complete")
dt3[r, p] <- correlation_value
dt3[p, r] <- correlation_value
# print(glue::glue("row {r} vs row {p}"))
}
col_vec <- col_vec[-1]
}
dt3
}
f_repl <- function(dt1, dt2) {
replace(t(dt2), t(dt1) == 0, NA) |>
cor(method = "spearman", use = "pairwise.complete.obs")
}
并用更大的数据进行测试:
n <- 100
set.seed(42)
dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
# Check that we get the same result
all.equal(f_loop(dt1, dt2), f_repl(dt1, dt2))
#> [1] TRUE
bench::system_time(f_loop(dt1, dt2))
#> process real
#> 5.86s 5.92s
bench::system_time(f_repl(dt1, dt2))
#> process real
#> 188ms 198ms
我希望有条件地 select 来自 dt2 的值基于 dt1 中的值以逐行方式,然后成对关联 dt2 中的行并将相关值保存在新矩阵 dt3 中。在我开始用文字解释之前,我想 R 代码更具描述性。我通过遍历数据帧来做到这一点,这非常慢。我确信有可能以矢量化方式执行此操作以提高性能。有没有人有解决方案或建议?非常感谢!
library(data.table)
dt1 <- data.table(a=round(runif(100)), b=round(runif(100)), c=round(runif(100)), d=round(runif(100)), e=round(runif(100)), f=round(runif(100)))
dt2 <- data.table(a=runif(100), b=runif(100), c=runif(100), d=runif(100), e=runif(100), f=runif(100))
m <- nrow(dt2)
n <- m
dt3 <- matrix(nrow=m, ncol=n)
col_vec <- 1:n
for (r in 1:m) {
for (p in col_vec) {
selection <- dt1[r,] > 0 & dt1[p,] > 0
selection <- as.vector(selection)
r_values <- as.numeric(dt2[p, ..selection])
p_values <- as.numeric(dt2[r, ..selection])
correlation_value <- cor(r_values, p_values, method='spearman', use='na.or.complete')
dt3[r,p] <- correlation_value
dt3[p,r] <- correlation_value
print(glue('row {r} vs row {p}'))
}
col_vec <- col_vec[-1]
}
您可以将 built-in NA
排除机制与 use = "pairwise.complete.obs"
一起使用。
将 dt2
中的值设置为缺失
如果对应的dt1
值为0
,则使用一次cor()
调用。
library(data.table)
n <- 4
set.seed(42)
dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
replace(t(dt2), t(dt1) == 0, NA) |>
cor(method = "spearman", use = "pairwise.complete.obs")
#> [,1] [,2] [,3] [,4]
#> [1,] 1.0 1 -1 0.1
#> [2,] 1.0 1 NA -1.0
#> [3,] -1.0 NA 1 NA
#> [4,] 0.1 -1 NA 1.0
基准测试函数中的两种方法:
f_loop <- function(dt1, dt2) {
m <- nrow(dt2)
n <- m
dt3 <- matrix(nrow = m, ncol = n)
col_vec <- 1:n
for (r in 1:m) {
for (p in col_vec) {
selection <- dt1[r, ] > 0 & dt1[p, ] > 0
selection <- as.vector(selection)
r_values <- as.numeric(dt2[p, ..selection])
p_values <- as.numeric(dt2[r, ..selection])
correlation_value <- cor(r_values, p_values, method = "spearman", use = "na.or.complete")
dt3[r, p] <- correlation_value
dt3[p, r] <- correlation_value
# print(glue::glue("row {r} vs row {p}"))
}
col_vec <- col_vec[-1]
}
dt3
}
f_repl <- function(dt1, dt2) {
replace(t(dt2), t(dt1) == 0, NA) |>
cor(method = "spearman", use = "pairwise.complete.obs")
}
并用更大的数据进行测试:
n <- 100
set.seed(42)
dt1 <- data.table(a = round(runif(n)), b = round(runif(n)), c = round(runif(n)), d = round(runif(n)), e = round(runif(n)), f = round(runif(n)))
dt2 <- data.table(a = runif(n), b = runif(n), c = runif(n), d = runif(n), e = runif(n), f = runif(n))
# Check that we get the same result
all.equal(f_loop(dt1, dt2), f_repl(dt1, dt2))
#> [1] TRUE
bench::system_time(f_loop(dt1, dt2))
#> process real
#> 5.86s 5.92s
bench::system_time(f_repl(dt1, dt2))
#> process real
#> 188ms 198ms