获取R中稀疏带状矩阵和向量的非零元素的乘积
Get the product of non-zero element of sparse banded matrix and vector in R
我有一个大带状稀疏矩阵 S,其行和列中几乎没有非零元素。我有向量 b,我想计算 sum(S*tcrossprod(b))。但是由于数据太大,系统无法计算,我需要将S中的非零部分相乘,怎么办?谢谢!
这是示例数据:
S = matrix(rnorm(64),8,8)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S)+bw))
S[pick] = 0
S.r <- sample(1:8,40,replace = T)
S.c <- sample(1:8,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
rowlab = collab = NULL
for (ii in 1:S@Dim[2]) {
n <- S@i[(S@p[ii]+1):S@p[ii+1]]
rowlab <-c(rowlab, n)
collab <- c(collab, rep(ii,length(n)))
lab.b <- data.frame(rowlab = rowlab+1, collab = collab)
}
b = rnorm(8)
我想得到
sum(S*tcrossprod(b))
对于真实数据,S的维度为c(20000,20000),b的维度为c(20000,1)。
由于从 tcrossprod(b)
返回的密集矩阵,您 运行 遇到了内存问题。但是由于其中的许多项都乘以零,我们可以避免计算某些值(避免生成完整的 cross-product)。您可以获取稀疏矩阵 S
的 non-zero 元素的索引,使用这些索引向量 b
并相乘。以下是计算 S * tcrossprod(b)
:
的几种方法
library(Matrix)
# Several functions to calculate the matrix products:
f1 <- function(S, b) S*tcrossprod(b)
f2 <- function(S, b) (S * b) %*% Diagonal(x=b)
f3 <- function(S, b) {indices = summary(S); S@x = S@x * b[indices$i]*b[indices$j]; S}
f4 <- function(S, b) {dp = diff(S@p); j = rep(seq_along(dp),dp); S@x = S@x * b[S@i+1]*b[j]; S}
# Check equality of function results
all.equal(f1(S, b), f2(S, b))
# [1] TRUE
all.equal(f2(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f4(S, b))
# [1] TRUE
all.equal(f2(S, b), f4(S, b))
# [1] TRUE
all.equal(f3(S, b), f4(S, b))
# [1] TRUE
基准:
rbenchmark::benchmark(f1(S, b), f2(S, b), f3(S, b), f4(S, b), replications=1) # 1 rep as f1() is slow
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f1(S, b) 1 12.182 12182 7.972 4.932 0 0
# 2 f2(S, b) 1 0.003 3 0.003 0.000 0 0
# 3 f3(S, b) 1 0.002 2 0.002 0.000 0 0
# 4 f4(S, b) 1 0.001 1 0.001 0.000 0 0
# longer benchmark for the other functions, drop f1()
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f2(S, b) 100 0.363 2.556 0.305 0.042 0 0
# 2 f3(S, b) 100 0.279 1.965 0.255 0.024 0 0
# 3 f4(S, b) 100 0.142 1.000 0.142 0.000 0 0
只要你想要sum
就够了
dp = diff(S@p); j = rep(seq_along(dp),dp); sum(S@x * b[S@i+1]*b[j])
数据:
n = 1e4 # I dont have enough memeory on my laptop for 20k
set.seed(1)
b = rnorm(n)
S = matrix(rnorm(n*n), n, n)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S)+bw))
S[pick] = 0
S.r <- sample(1:n,40,replace = T)
S.c <- sample(1:n,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
我有一个大带状稀疏矩阵 S,其行和列中几乎没有非零元素。我有向量 b,我想计算 sum(S*tcrossprod(b))。但是由于数据太大,系统无法计算,我需要将S中的非零部分相乘,怎么办?谢谢! 这是示例数据:
S = matrix(rnorm(64),8,8)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S)+bw))
S[pick] = 0
S.r <- sample(1:8,40,replace = T)
S.c <- sample(1:8,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")
rowlab = collab = NULL
for (ii in 1:S@Dim[2]) {
n <- S@i[(S@p[ii]+1):S@p[ii+1]]
rowlab <-c(rowlab, n)
collab <- c(collab, rep(ii,length(n)))
lab.b <- data.frame(rowlab = rowlab+1, collab = collab)
}
b = rnorm(8)
我想得到
sum(S*tcrossprod(b))
对于真实数据,S的维度为c(20000,20000),b的维度为c(20000,1)。
由于从 tcrossprod(b)
返回的密集矩阵,您 运行 遇到了内存问题。但是由于其中的许多项都乘以零,我们可以避免计算某些值(避免生成完整的 cross-product)。您可以获取稀疏矩阵 S
的 non-zero 元素的索引,使用这些索引向量 b
并相乘。以下是计算 S * tcrossprod(b)
:
library(Matrix)
# Several functions to calculate the matrix products:
f1 <- function(S, b) S*tcrossprod(b)
f2 <- function(S, b) (S * b) %*% Diagonal(x=b)
f3 <- function(S, b) {indices = summary(S); S@x = S@x * b[indices$i]*b[indices$j]; S}
f4 <- function(S, b) {dp = diff(S@p); j = rep(seq_along(dp),dp); S@x = S@x * b[S@i+1]*b[j]; S}
# Check equality of function results
all.equal(f1(S, b), f2(S, b))
# [1] TRUE
all.equal(f2(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f3(S, b))
# [1] TRUE
all.equal(f1(S, b), f4(S, b))
# [1] TRUE
all.equal(f2(S, b), f4(S, b))
# [1] TRUE
all.equal(f3(S, b), f4(S, b))
# [1] TRUE
基准:
rbenchmark::benchmark(f1(S, b), f2(S, b), f3(S, b), f4(S, b), replications=1) # 1 rep as f1() is slow
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f1(S, b) 1 12.182 12182 7.972 4.932 0 0
# 2 f2(S, b) 1 0.003 3 0.003 0.000 0 0
# 3 f3(S, b) 1 0.002 2 0.002 0.000 0 0
# 4 f4(S, b) 1 0.001 1 0.001 0.000 0 0
# longer benchmark for the other functions, drop f1()
# test replications elapsed relative user.self sys.self user.child sys.child
# 1 f2(S, b) 100 0.363 2.556 0.305 0.042 0 0
# 2 f3(S, b) 100 0.279 1.965 0.255 0.024 0 0
# 3 f4(S, b) 100 0.142 1.000 0.142 0.000 0 0
只要你想要sum
就够了
dp = diff(S@p); j = rep(seq_along(dp),dp); sum(S@x * b[S@i+1]*b[j])
数据:
n = 1e4 # I dont have enough memeory on my laptop for 20k
set.seed(1)
b = rnorm(n)
S = matrix(rnorm(n*n), n, n)
bw = 3
pick = (row(S)<(col(S) - bw)) | (row(S)>(col(S)+bw))
S[pick] = 0
S.r <- sample(1:n,40,replace = T)
S.c <- sample(1:n,40,replace = T)
for (i in 1:length(S.r)) {
S[S.r[i],S.c[i]] <-0
}
S <- as(S,"sparseMatrix")