作为稀疏矩阵交叉积的结果的稀疏矩阵

Sparse Matrix as a result of crossprod of sparse matrices

我已经解决这个问题一段时间了,但没有找到满意的解决方案。

我在二进制稀疏矩阵 (TermDocumentMatrix) 中有数据,带有暗淡 ([1] 340436 763717)。我在这里使用摘录作为概念证明:

m = structure(list(i = c(1L, 2L, 5L, 2L, 4L, 3L, 5L, 4L), j = c(1L, 1L, 1L, 2L, 2L, 
    3L, 3L, 3L), v = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), nrow = 5L, ncol = 3L, 
    dimnames = list(Terms = c("action", "activities", "advisory", "alike", "almanac"),
                    Docs = c("1000008721", "1000010083","1000013295"))), 
    class = c("TermDocumentMatrix", "simple_triplet_matrix"), weighting = c("binary", "bin"))

inspect(m)
<<TermDocumentMatrix (terms: 5, documents: 3)>>
Non-/sparse entries: 8/7
Sparsity           : 47%
Maximal term length: 10
Weighting          : binary (bin)
Sample             :
            Docs
Terms        1000008721 1000010083 1000013295
  action              1          0          0
  activities          1          1          0
  advisory            0          0          1
  alike               0          1          1
  almanac             1          0          1

我想将每个向量化文档标准化为单位长度,然后获得一个(稀疏)矩阵,其中行上的文档和列上的文档以及相应的标准化向量的点积作为值。

预期输出:

Sparse Matrix:
            Docs
Docs         1000008721 1000010083 1000013295 ... N
  1000008721  1.0000000  0.4082483  0.3333333     .
  1000010083  0.4082483  1.0000000  0.4082483     .  
  1000013295  0.3333333  0.4082483  1.0000000     .
    ...
   N               .          .          .

or also: data.table
 ID1              ID2          v
1000008721     1000008721      1
1000010083     1000008721     0.4082483
1000013295     1000008721     0.3333333
   ...             ...         ...

在使用将每个值除以范数的函数应用归一化后,crossprod_simple_triplet_matrix(m) 可以很容易地实现这一点。带有二元向量的欧几里德范数减少到 sqrt(col_sums(m)).

因为我不能通过as.matrix()转换(“错误:无法分配大小为968.6 Gb的向量”),而且我找不到任何其他方法,所以我使用了data.table可以绕过需要在稀疏矩阵上应用一个函数:

# exploit the triples and manipulate through data.table
dt = as.data.table(list(i=m$i,j=m$j,v=m$v))

# obtain euclidean norm for every column 
dt[,e.norm:=list(as.numeric(sqrt(sum(v)))),by=j]

# divide the v for the corresponding group, subset and replace
dt = dt[,v.norm:=v/e.norm][,.(i,j,v.norm)][,v:=v.norm][,.(i,j,v)]

m$v <- dt$v
inspect(m)
            Docs
Terms        1000008721 1000010083 1000013295
  action      0.5773503  0.0000000  0.0000000
  activities  0.5773503  0.7071068  0.0000000
  advisory    0.0000000  0.0000000  0.5773503
  alike       0.0000000  0.7071068  0.5773503
  almanac     0.5773503  0.0000000  0.5773503

(这(可能与 slam)的等价物是什么?)

问题:鉴于 crossprod_simple_triplet_matrix(tdm) 仍然是 return 一个密集矩阵(因此存在内存错误),您能否考虑一个类似的 data.table 解决方案return 一个稀疏矩阵与两个稀疏矩阵的叉积,或任何替代方法?

具有 35879680 non-zero 个元素的 340436 x 763717 稀疏矩阵将产生非常大的对象 (~30 GB)。我的机器无法在 16GB RAM 的内存中保存该对象。但是,叉积很容易零碎地进行。下面的函数 bigcrossprod 分段执行叉积,将结果转换为 data.table 个对象,然后 rbinds 个对象。 crossprod 操作分为 nseg 个单独的操作。

library(data.table)
library(Matrix)

bigcrossprod <- function(m, nseg) {
  jmax <- ncol(m)
  # get the indices to split the columns of m into chunks that will have
  # approximately the same expense for crossprod
  sumj <- cumsum(as.numeric(jmax:1))
  dtj <- data.table(
    j = 1:jmax,
    int = sumj %/% (sumj[jmax]/nseg + 1)
  )[
    , .(idx1 = min(j), idx2 = max(j)), int
  ][, idx1:idx2]
  rbindlist(
    lapply(
      1:nseg,
      function(seg) {
        cat("\r", seg) # user feedback
        j1 <- dtj$idx1[seg]
        m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
                                m[, j1:jmax])), "dgTMatrix")
        data.table(
          i = attr(m2, "i") + j1,
          j = attr(m2, "j") + j1,
          v = attr(m2, "x")
        )
      }
    )
  )
}

使用较小的稀疏矩阵进行演示:

idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
                 j = as.integer(((idx - 1) %/% 34043L) + 1),
                 key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
                  j = attr(m2, "j") + 1L,
                  v = attr(m2, "x"))
# calculate crossprod in 10 chunks
dt3 <- bigcrossprod(m, 10)
#>  1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE

为了计算 340436 x 763717 稀疏矩阵与 35879680 个元素的叉积,而不是将 data.table 个对象的列表存储在一个列表中以传递给 rbindlist,保存单个data.table 个对象供以后使用 fst 包处理。以下版本的 bigcrossprod returns 不是返回单个 data.table,而是一个长度为 nseg 的字符向量,其中包含 .fst 文件路径。同样,用较小的矩阵进行演示:

library(data.table)
library(Matrix)
library(fst)

bigcrossprod <- function(m, nseg, path) {
  jmax <- ncol(m)
  # get the indices to split the columns of m into chunks that will have
  # approximately the same expense for crossprod
  sumj <- cumsum(as.numeric(jmax:1))
  dtj <- data.table(
    j = 1:jmax,
    int = sumj %/% (sumj[jmax]/nseg + 1)
  )[
    , .(idx1 = min(j), idx2 = max(j)), int
  ][, idx1:idx2]
  vapply(
    1:nseg,
    function(seg) {
      cat("\r", seg) # user feedback
      j1 <- dtj$idx1[seg]
      m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE],
                              m[, j1:jmax])), "dgTMatrix")
      filepath <- file.path(path, paste0("dt", seg, ".fst"))
      write.fst(
        data.table(
          i = attr(m2, "i") + j1,
          j = attr(m2, "j") + j1,
          v = attr(m2, "x")),
        filepath
      )
      filepath
    },
    character(1)
  )
}

idx <- sample(34043*76371, 358796)
dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
                 j = as.integer(((idx - 1) %/% 34043L) + 1),
                 key = "j")[, v := 1/sqrt(.N), j]
m <- sparseMatrix(dt$i, dt$j, x = dt$v)
# calculate crossprod on the full matrix and convert the result to triplet
# form in order to represent as a data.table
m2 <- as(crossprod(m), "dgTMatrix")
dt2 <- data.table(i = attr(m2, "i") + 1L,
                  j = attr(m2, "j") + 1L,
                  v = attr(m2, "x"))
# calculate crossprod in 10 chunks, read the resulting .fst files,
# and rbindlist into a single data.table
dt3 <- rbindlist(lapply(bigcrossprod(m, 10, "C:/temp"),
                        function(x) read.fst(x, as.data.table = TRUE)))
#> 1 2 3 4 5 6 7 8 9 10
# convert the result into a symmetric sparse matrix in triplet form (the same as m2)
m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
# remove elements from the data.table objects that are redundant due to symmetry
dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
# check that the dgTMatrix and data.table objects are identical
identical(m2, m3)
#> [1] TRUE
identical(dt2, dt3)
#> [1] TRUE

我能够在大约 15 分钟内用 16GB 的 RAM 处理一个 340436 x 763717 的稀疏矩阵和 35879680 non-zero 个元素。

解释:

使用 OP 的 5 x 3 示例矩阵演练 bigcrossprod 中的逻辑:

idx <- c(1,2,5,7,9,13:15)
dt <- data.table(i = as.integer(((idx - 1) %% 5) + 1),
                 j = as.integer(((idx - 1) %/% 5) + 1),
                 key = "j")[, v := 1/sqrt(.N), j]
(m <- sparseMatrix(dt$i, dt$j, x = dt$v))
#> 5 x 3 sparse Matrix of class "dgCMatrix"
#>                                   
#> [1,] 0.5773503 .         .        
#> [2,] 0.5773503 0.7071068 .        
#> [3,] .         .         0.5773503
#> [4,] .         0.7071068 0.5773503
#> [5,] 0.5773503 .         0.5773503
nseg <- 2 # process the cross product of m in two chunks
jmax <- ncol(m)
sumj <- cumsum(as.numeric(jmax:1))
# In order to balance the processing between chunks, process column 1 first,
# then columns 2:3 next. Each crossprod call will result in 3 elements.
(dtj <- data.table(j = 1:jmax, int = sumj %/% (sumj[jmax]/nseg + 1))[, .(idx1 = min(j), idx2 = max(j)), int][, idx1:idx2])
#>    idx1 idx2
#> 1:    1    1
#> 2:    2    3
# process the first chunk
seg <- 1L
j1 <- dtj$idx1[seg]
# cross product of column 1 and the full matrix
(m2 <- as(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax]), "dgTMatrix"))
#> 1 x 3 sparse Matrix of class "dgTMatrix"
#>                           
#> [1,] 1 0.4082483 0.3333333
# The indices of m2 are 0-based. Add j1 to the i,j indices of m2 when converting
# to a data.table.
(dt1 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#>    i j         v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
# process the second chunk
seg <- 2L
j1 <- dtj$idx1[seg] # starts at column 2
# Cross product of columns 2:3 and columns 2:jmax (2:3). The end result
# will be a symmetric matrix, so we need only the upper triangular matrix.
(m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax])), "dgTMatrix"))
#> 2 x 2 sparse Matrix of class "dgTMatrix"
#>                 
#> [1,] 1 0.4082483
#> [2,] . 1.0000000
(dt2 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
#>    i j         v
#> 1: 2 2 1.0000000
#> 2: 2 3 0.4082483
#> 3: 3 3 1.0000000
# the final matrix (in data.table form) is the two data.tables row-bound
# together (in bigcrossprod, the lapply returns a list of data.table objects for
# rbindlist)
(dt3 <- rbindlist(list(dt1, dt2)))
#>    i j         v
#> 1: 1 1 1.0000000
#> 2: 1 2 0.4082483
#> 3: 1 3 0.3333333
#> 4: 2 2 1.0000000
#> 5: 2 3 0.4082483
#> 6: 3 3 1.0000000
# dt3 can be converted to a symmetric sparse matrix
(m3 <- forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)))
#> 3 x 3 sparse Matrix of class "dsCMatrix"
#>                                   
#> [1,] 1.0000000 0.4082483 0.3333333
#> [2,] 0.4082483 1.0000000 0.4082483
#> [3,] 0.3333333 0.4082483 1.0000000

最后,bigcrossprod 的并行版本(对于 Linux):

library(data.table)
library(Matrix)
library(parallel)
library(fst)

bigcrossprod <- function(m, nseg, path, nthreads = nseg) {
  jmax <- ncol(m)
  # get the indices to split the columns of m into chunks that will have
  # approximately the same expense for crossprod
  sumj <- cumsum(as.numeric(jmax:1))
  dtj <- data.table(
    j = 1:jmax,
    int = sumj %/% (sumj[jmax]/nseg + 1)
  )[
    , .(idx1 = min(j), idx2 = max(j)), int
  ][, idx1:idx2]
  cl <- makeCluster(nthreads, type = "FORK", outfile = "")
  on.exit(stopCluster(cl))
  unlist(
    parLapply(
      cl,
      1:nseg,
      function(seg) {
        j1 <- dtj$idx1[seg]
        m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
                                m[, j1:jmax])), "dgTMatrix")
        filepath <- file.path(path, paste0("dt", seg, ".fst"))
        write.fst(
          data.table(
            i = attr(m2, "i") + j1,
            j = attr(m2, "j") + j1,
            v = attr(m2, "x")),
          filepath
        )
        filepath
      }
    )
  )
}