作为稀疏矩阵交叉积的结果的稀疏矩阵
Sparse Matrix as a result of crossprod of sparse matrices
我已经解决这个问题一段时间了,但没有找到满意的解决方案。
我在二进制稀疏矩阵 (TermDocumentMatrix) 中有数据,带有暗淡 ([1] 340436 763717
)。我在这里使用摘录作为概念证明:
m = structure(list(i = c(1L, 2L, 5L, 2L, 4L, 3L, 5L, 4L), j = c(1L, 1L, 1L, 2L, 2L,
3L, 3L, 3L), v = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), nrow = 5L, ncol = 3L,
dimnames = list(Terms = c("action", "activities", "advisory", "alike", "almanac"),
Docs = c("1000008721", "1000010083","1000013295"))),
class = c("TermDocumentMatrix", "simple_triplet_matrix"), weighting = c("binary", "bin"))
inspect(m)
<<TermDocumentMatrix (terms: 5, documents: 3)>>
Non-/sparse entries: 8/7
Sparsity : 47%
Maximal term length: 10
Weighting : binary (bin)
Sample :
Docs
Terms 1000008721 1000010083 1000013295
action 1 0 0
activities 1 1 0
advisory 0 0 1
alike 0 1 1
almanac 1 0 1
我想将每个向量化文档标准化为单位长度,然后获得一个(稀疏)矩阵,其中行上的文档和列上的文档以及相应的标准化向量的点积作为值。
预期输出:
Sparse Matrix:
Docs
Docs 1000008721 1000010083 1000013295 ... N
1000008721 1.0000000 0.4082483 0.3333333 .
1000010083 0.4082483 1.0000000 0.4082483 .
1000013295 0.3333333 0.4082483 1.0000000 .
...
N . . .
or also: data.table
ID1 ID2 v
1000008721 1000008721 1
1000010083 1000008721 0.4082483
1000013295 1000008721 0.3333333
... ... ...
在使用将每个值除以范数的函数应用归一化后,crossprod_simple_triplet_matrix(m)
可以很容易地实现这一点。带有二元向量的欧几里德范数减少到 sqrt(col_sums(m))
.
因为我不能通过as.matrix()
转换(“错误:无法分配大小为968.6 Gb的向量”),而且我找不到任何其他方法,所以我使用了data.table可以绕过需要在稀疏矩阵上应用一个函数:
# exploit the triples and manipulate through data.table
dt = as.data.table(list(i=m$i,j=m$j,v=m$v))
# obtain euclidean norm for every column
dt[,e.norm:=list(as.numeric(sqrt(sum(v)))),by=j]
# divide the v for the corresponding group, subset and replace
dt = dt[,v.norm:=v/e.norm][,.(i,j,v.norm)][,v:=v.norm][,.(i,j,v)]
m$v <- dt$v
inspect(m)
Docs
Terms 1000008721 1000010083 1000013295
action 0.5773503 0.0000000 0.0000000
activities 0.5773503 0.7071068 0.0000000
advisory 0.0000000 0.0000000 0.5773503
alike 0.0000000 0.7071068 0.5773503
almanac 0.5773503 0.0000000 0.5773503
(这(可能与 slam)的等价物是什么?)
问题:鉴于 crossprod_simple_triplet_matrix(tdm)
仍然是 return 一个密集矩阵(因此存在内存错误),您能否考虑一个类似的 data.table 解决方案return 一个稀疏矩阵与两个稀疏矩阵的叉积,或任何替代方法?
具有 35879680 non-zero 个元素的 340436 x 763717 稀疏矩阵将产生非常大的对象 (~30 GB)。我的机器无法在 16GB RAM 的内存中保存该对象。但是,叉积很容易零碎地进行。下面的函数 bigcrossprod
分段执行叉积,将结果转换为 data.table
个对象,然后 rbinds
个对象。 crossprod
操作分为 nseg
个单独的操作。
library(data.table)
library(Matrix)
bigcrossprod <- function(m, nseg) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
rbindlist(
lapply(
1:nseg,
function(seg) {
cat("\r", seg) # user feedback
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")
)
}
)
)
}
使用较小的稀疏矩阵进行演示:
idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
j = as.integer(((idx - 1) %/% 34043L) + 1),
key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
j = attr(m2, "j") + 1L,
v = attr(m2, "x"))
# calculate crossprod in 10 chunks
dt3 <- bigcrossprod(m, 10)
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE
为了计算 340436 x 763717 稀疏矩阵与 35879680 个元素的叉积,而不是将 data.table
个对象的列表存储在一个列表中以传递给 rbindlist
,保存单个data.table 个对象供以后使用 fst
包处理。以下版本的 bigcrossprod
returns 不是返回单个 data.table
,而是一个长度为 nseg
的字符向量,其中包含 .fst 文件路径。同样,用较小的矩阵进行演示:
library(data.table)
library(Matrix)
library(fst)
bigcrossprod <- function(m, nseg, path) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
vapply(
1:nseg,
function(seg) {
cat("\r", seg) # user feedback
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
filepath <- file.path(path, paste0("dt", seg, ".fst"))
write.fst(
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")),
filepath
)
filepath
},
character(1)
)
}
idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
j = as.integer(((idx - 1) %/% 34043L) + 1),
key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
j = attr(m2, "j") + 1L,
v = attr(m2, "x"))
# calculate crossprod in 10 chunks, read the resulting .fst files,
# and rbindlist into a single data.table
dt3 <- rbindlist(lapply(bigcrossprod(m, 10, "C:/temp"),
function(x) read.fst(x, as.data.table = TRUE)))
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE
我能够在大约 15 分钟内用 16GB 的 RAM 处理一个 340436 x 763717 的稀疏矩阵和 35879680 non-zero 个元素。
解释:
使用 OP 的 5 x 3 示例矩阵演练 bigcrossprod
中的逻辑:
idx <- c(1,2,5,7,9,13:15)
dt <- data.table(i = as.integer(((idx - 1) %% 5) + 1),
j = as.integer(((idx - 1) %/% 5) + 1),
key = "j")[, v := 1/sqrt(.N), j]
(m <- sparseMatrix(dt$i, dt$j, x = dt$v))
#> 5 x 3 sparse Matrix of class "dgCMatrix"
#>
#> [1,] 0.5773503 . .
#> [2,] 0.5773503 0.7071068 .
#> [3,] . . 0.5773503
#> [4,] . 0.7071068 0.5773503
#> [5,] 0.5773503 . 0.5773503
nseg <- 2 # process the cross product of m in two chunks
jmax <- ncol(m)
sumj <- cumsum(as.numeric(jmax:1))
# In order to balance the processing between chunks, process column 1 first,
# then columns 2:3 next. Each crossprod call will result in 3 elements.
(dtj <- data.table(j = 1:jmax, int = sumj %/% (sumj[jmax]/nseg + 1))[, .(idx1 = min(j), idx2 = max(j)), int][, idx1:idx2])
#> idx1 idx2
#> 1: 1 1
#> 2: 2 3
# process the first chunk
seg <- 1L
j1 <- dtj$idx1[seg]
# cross product of column 1 and the full matrix
(m2 <- as(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax]), "dgTMatrix"))
#> 1 x 3 sparse Matrix of class "dgTMatrix"
#>
#> [1,] 1 0.4082483 0.3333333
# The indices of m2 are 0-based. Add j1 to the i,j indices of m2 when converting
# to a data.table.
(dt1 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#> i j v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
# process the second chunk
seg <- 2L
j1 <- dtj$idx1[seg] # starts at column 2
# Cross product of columns 2:3 and columns 2:jmax (2:3). The end result
# will be a symmetric matrix, so we need only the upper triangular matrix.
(m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax])), "dgTMatrix"))
#> 2 x 2 sparse Matrix of class "dgTMatrix"
#>
#> [1,] 1 0.4082483
#> [2,] . 1.0000000
(dt2 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#> i j v
#> 1: 2 2 1.0000000
#> 2: 2 3 0.4082483
#> 3: 3 3 1.0000000
# the final matrix (in data.table form) is the two data.tables row-bound
# together (in bigcrossprod, the lapply returns a list of data.table objects for
# rbindlist)
(dt3 <- rbindlist(list(dt1, dt2)))
#> i j v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
#> 4: 2 2 1.0000000
#> 5: 2 3 0.4082483
#> 6: 3 3 1.0000000
# dt3 can be converted to a symmetric sparse matrix
(m3 <- forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)))
#> 3 x 3 sparse Matrix of class "dsCMatrix"
#>
#> [1,] 1.0000000 0.4082483 0.3333333
#> [2,] 0.4082483 1.0000000 0.4082483
#> [3,] 0.3333333 0.4082483 1.0000000
最后,bigcrossprod
的并行版本(对于 Linux):
library(data.table)
library(Matrix)
library(parallel)
library(fst)
bigcrossprod <- function(m, nseg, path, nthreads = nseg) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
cl <- makeCluster(nthreads, type = "FORK", outfile = "")
on.exit(stopCluster(cl))
unlist(
parLapply(
cl,
1:nseg,
function(seg) {
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
filepath <- file.path(path, paste0("dt", seg, ".fst"))
write.fst(
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")),
filepath
)
filepath
}
)
)
}
我已经解决这个问题一段时间了,但没有找到满意的解决方案。
我在二进制稀疏矩阵 (TermDocumentMatrix) 中有数据,带有暗淡 ([1] 340436 763717
)。我在这里使用摘录作为概念证明:
m = structure(list(i = c(1L, 2L, 5L, 2L, 4L, 3L, 5L, 4L), j = c(1L, 1L, 1L, 2L, 2L,
3L, 3L, 3L), v = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), nrow = 5L, ncol = 3L,
dimnames = list(Terms = c("action", "activities", "advisory", "alike", "almanac"),
Docs = c("1000008721", "1000010083","1000013295"))),
class = c("TermDocumentMatrix", "simple_triplet_matrix"), weighting = c("binary", "bin"))
inspect(m)
<<TermDocumentMatrix (terms: 5, documents: 3)>>
Non-/sparse entries: 8/7
Sparsity : 47%
Maximal term length: 10
Weighting : binary (bin)
Sample :
Docs
Terms 1000008721 1000010083 1000013295
action 1 0 0
activities 1 1 0
advisory 0 0 1
alike 0 1 1
almanac 1 0 1
我想将每个向量化文档标准化为单位长度,然后获得一个(稀疏)矩阵,其中行上的文档和列上的文档以及相应的标准化向量的点积作为值。
预期输出:
Sparse Matrix:
Docs
Docs 1000008721 1000010083 1000013295 ... N
1000008721 1.0000000 0.4082483 0.3333333 .
1000010083 0.4082483 1.0000000 0.4082483 .
1000013295 0.3333333 0.4082483 1.0000000 .
...
N . . .
or also: data.table
ID1 ID2 v
1000008721 1000008721 1
1000010083 1000008721 0.4082483
1000013295 1000008721 0.3333333
... ... ...
在使用将每个值除以范数的函数应用归一化后,crossprod_simple_triplet_matrix(m)
可以很容易地实现这一点。带有二元向量的欧几里德范数减少到 sqrt(col_sums(m))
.
因为我不能通过as.matrix()
转换(“错误:无法分配大小为968.6 Gb的向量”),而且我找不到任何其他方法,所以我使用了data.table可以绕过需要在稀疏矩阵上应用一个函数:
# exploit the triples and manipulate through data.table
dt = as.data.table(list(i=m$i,j=m$j,v=m$v))
# obtain euclidean norm for every column
dt[,e.norm:=list(as.numeric(sqrt(sum(v)))),by=j]
# divide the v for the corresponding group, subset and replace
dt = dt[,v.norm:=v/e.norm][,.(i,j,v.norm)][,v:=v.norm][,.(i,j,v)]
m$v <- dt$v
inspect(m)
Docs
Terms 1000008721 1000010083 1000013295
action 0.5773503 0.0000000 0.0000000
activities 0.5773503 0.7071068 0.0000000
advisory 0.0000000 0.0000000 0.5773503
alike 0.0000000 0.7071068 0.5773503
almanac 0.5773503 0.0000000 0.5773503
(这(可能与 slam)的等价物是什么?)
问题:鉴于 crossprod_simple_triplet_matrix(tdm)
仍然是 return 一个密集矩阵(因此存在内存错误),您能否考虑一个类似的 data.table 解决方案return 一个稀疏矩阵与两个稀疏矩阵的叉积,或任何替代方法?
具有 35879680 non-zero 个元素的 340436 x 763717 稀疏矩阵将产生非常大的对象 (~30 GB)。我的机器无法在 16GB RAM 的内存中保存该对象。但是,叉积很容易零碎地进行。下面的函数 bigcrossprod
分段执行叉积,将结果转换为 data.table
个对象,然后 rbinds
个对象。 crossprod
操作分为 nseg
个单独的操作。
library(data.table)
library(Matrix)
bigcrossprod <- function(m, nseg) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
rbindlist(
lapply(
1:nseg,
function(seg) {
cat("\r", seg) # user feedback
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")
)
}
)
)
}
使用较小的稀疏矩阵进行演示:
idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
j = as.integer(((idx - 1) %/% 34043L) + 1),
key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
j = attr(m2, "j") + 1L,
v = attr(m2, "x"))
# calculate crossprod in 10 chunks
dt3 <- bigcrossprod(m, 10)
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE
为了计算 340436 x 763717 稀疏矩阵与 35879680 个元素的叉积,而不是将 data.table
个对象的列表存储在一个列表中以传递给 rbindlist
,保存单个data.table 个对象供以后使用 fst
包处理。以下版本的 bigcrossprod
returns 不是返回单个 data.table
,而是一个长度为 nseg
的字符向量,其中包含 .fst 文件路径。同样,用较小的矩阵进行演示:
library(data.table)
library(Matrix)
library(fst)
bigcrossprod <- function(m, nseg, path) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
vapply(
1:nseg,
function(seg) {
cat("\r", seg) # user feedback
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
filepath <- file.path(path, paste0("dt", seg, ".fst"))
write.fst(
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")),
filepath
)
filepath
},
character(1)
)
}
idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
j = as.integer(((idx - 1) %/% 34043L) + 1),
key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
j = attr(m2, "j") + 1L,
v = attr(m2, "x"))
# calculate crossprod in 10 chunks, read the resulting .fst files,
# and rbindlist into a single data.table
dt3 <- rbindlist(lapply(bigcrossprod(m, 10, "C:/temp"),
function(x) read.fst(x, as.data.table = TRUE)))
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE
我能够在大约 15 分钟内用 16GB 的 RAM 处理一个 340436 x 763717 的稀疏矩阵和 35879680 non-zero 个元素。
解释:
使用 OP 的 5 x 3 示例矩阵演练 bigcrossprod
中的逻辑:
idx <- c(1,2,5,7,9,13:15)
dt <- data.table(i = as.integer(((idx - 1) %% 5) + 1),
j = as.integer(((idx - 1) %/% 5) + 1),
key = "j")[, v := 1/sqrt(.N), j]
(m <- sparseMatrix(dt$i, dt$j, x = dt$v))
#> 5 x 3 sparse Matrix of class "dgCMatrix"
#>
#> [1,] 0.5773503 . .
#> [2,] 0.5773503 0.7071068 .
#> [3,] . . 0.5773503
#> [4,] . 0.7071068 0.5773503
#> [5,] 0.5773503 . 0.5773503
nseg <- 2 # process the cross product of m in two chunks
jmax <- ncol(m)
sumj <- cumsum(as.numeric(jmax:1))
# In order to balance the processing between chunks, process column 1 first,
# then columns 2:3 next. Each crossprod call will result in 3 elements.
(dtj <- data.table(j = 1:jmax, int = sumj %/% (sumj[jmax]/nseg + 1))[, .(idx1 = min(j), idx2 = max(j)), int][, idx1:idx2])
#> idx1 idx2
#> 1: 1 1
#> 2: 2 3
# process the first chunk
seg <- 1L
j1 <- dtj$idx1[seg]
# cross product of column 1 and the full matrix
(m2 <- as(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax]), "dgTMatrix"))
#> 1 x 3 sparse Matrix of class "dgTMatrix"
#>
#> [1,] 1 0.4082483 0.3333333
# The indices of m2 are 0-based. Add j1 to the i,j indices of m2 when converting
# to a data.table.
(dt1 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#> i j v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
# process the second chunk
seg <- 2L
j1 <- dtj$idx1[seg] # starts at column 2
# Cross product of columns 2:3 and columns 2:jmax (2:3). The end result
# will be a symmetric matrix, so we need only the upper triangular matrix.
(m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax])), "dgTMatrix"))
#> 2 x 2 sparse Matrix of class "dgTMatrix"
#>
#> [1,] 1 0.4082483
#> [2,] . 1.0000000
(dt2 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#> i j v
#> 1: 2 2 1.0000000
#> 2: 2 3 0.4082483
#> 3: 3 3 1.0000000
# the final matrix (in data.table form) is the two data.tables row-bound
# together (in bigcrossprod, the lapply returns a list of data.table objects for
# rbindlist)
(dt3 <- rbindlist(list(dt1, dt2)))
#> i j v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
#> 4: 2 2 1.0000000
#> 5: 2 3 0.4082483
#> 6: 3 3 1.0000000
# dt3 can be converted to a symmetric sparse matrix
(m3 <- forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)))
#> 3 x 3 sparse Matrix of class "dsCMatrix"
#>
#> [1,] 1.0000000 0.4082483 0.3333333
#> [2,] 0.4082483 1.0000000 0.4082483
#> [3,] 0.3333333 0.4082483 1.0000000
最后,bigcrossprod
的并行版本(对于 Linux):
library(data.table)
library(Matrix)
library(parallel)
library(fst)
bigcrossprod <- function(m, nseg, path, nthreads = nseg) {
jmax <- ncol(m)
# get the indices to split the columns of m into chunks that will have
# approximately the same expense for crossprod
sumj <- cumsum(as.numeric(jmax:1))
dtj <- data.table(
j = 1:jmax,
int = sumj %/% (sumj[jmax]/nseg + 1)
)[
, .(idx1 = min(j), idx2 = max(j)), int
][, idx1:idx2]
cl <- makeCluster(nthreads, type = "FORK", outfile = "")
on.exit(stopCluster(cl))
unlist(
parLapply(
cl,
1:nseg,
function(seg) {
j1 <- dtj$idx1[seg]
m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
m[, j1:jmax])), "dgTMatrix")
filepath <- file.path(path, paste0("dt", seg, ".fst"))
write.fst(
data.table(
i = attr(m2, "i") + j1,
j = attr(m2, "j") + j1,
v = attr(m2, "x")),
filepath
)
filepath
}
)
)
}