获取R中稀疏带状矩阵和向量的非零元素的乘积

Get the product of non-zero element of sparse banded matrix and vector in R

我有一个大带状稀疏矩阵 S,其行和列中几乎没有非零元素。我有向量 b,我想计算 sum(S*tcrossprod(b))。但是由于数据太大,系统无法计算,我需要将S中的非零部分相乘,怎么办?谢谢! 这是示例数据:

S = matrix(rnorm(64),8,8)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S)+bw))
S[pick] = 0
S.r <- sample(1:8,40,replace = T)
S.c <- sample(1:8,40,replace = T)
for (i in 1:length(S.r)) {
  S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
rowlab = collab = NULL
for (ii in 1:S@Dim[2]) {
     n <- S@i[(S@p[ii]+1):S@p[ii+1]]
     rowlab <-c(rowlab, n)
     collab <- c(collab, rep(ii,length(n)))
     lab.b <- data.frame(rowlab = rowlab+1, collab = collab)
 }
b = rnorm(8)

我想得到

sum(S*tcrossprod(b))

对于真实数据,S的维度为c(20000,20000),b的维度为c(20000,1)。

由于从 tcrossprod(b) 返回的密集矩阵,您 运行 遇到了内存问题。但是由于其中的许多项都乘以零,我们可以避免计算某些值(避免生成完整的 cross-product)。您可以获取稀疏矩阵 S 的 non-zero 元素的索引,使用这些索引向量 b 并相乘。以下是计算 S * tcrossprod(b):

的几种方法
library(Matrix)

# Several functions to calculate the matrix products:
f1 <- function(S, b) S*tcrossprod(b)
f2 <- function(S, b) (S * b) %*% Diagonal(x=b) 
f3 <- function(S, b) {indices = summary(S); S@x = S@x * b[indices$i]*b[indices$j]; S}
f4 <- function(S, b) {dp = diff(S@p); j = rep(seq_along(dp),dp); S@x = S@x * b[S@i+1]*b[j]; S}

# Check equality of function results
all.equal(f1(S, b), f2(S, b))
# [1] TRUE
all.equal(f2(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f4(S, b))
# [1] TRUE
all.equal(f2(S, b), f4(S, b))
# [1] TRUE
all.equal(f3(S, b), f4(S, b))
# [1] TRUE

基准:

rbenchmark::benchmark(f1(S, b), f2(S, b), f3(S, b), f4(S, b), replications=1) # 1 rep as f1() is slow
#       test replications elapsed relative user.self sys.self user.child sys.child
# 1 f1(S, b)            1  12.182    12182     7.972    4.932          0         0
# 2 f2(S, b)            1   0.003        3     0.003    0.000          0         0
# 3 f3(S, b)            1   0.002        2     0.002    0.000          0         0
# 4 f4(S, b)            1   0.001        1     0.001    0.000          0         0

# longer benchmark for the other functions, drop f1()
#      test replications elapsed relative user.self sys.self user.child sys.child
# 1 f2(S, b)          100   0.363    2.556     0.305    0.042          0         0
# 2 f3(S, b)          100   0.279    1.965     0.255    0.024          0         0
# 3 f4(S, b)          100   0.142    1.000     0.142    0.000          0         0

只要你想要sum就够了

dp = diff(S@p); j = rep(seq_along(dp),dp); sum(S@x * b[S@i+1]*b[j])

数据:

n = 1e4 # I dont have enough memeory on my laptop for 20k
set.seed(1)
b = rnorm(n)
S = matrix(rnorm(n*n), n, n)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S)+bw))
S[pick] = 0
S.r <- sample(1:n,40,replace = T)
S.c <- sample(1:n,40,replace = T)
for (i in 1:length(S.r)) {
  S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")