组合具有可变列的稀疏矩阵列表
Assemble list of sparse matrices with variable columns
我有一个(命名的)稀疏矩阵列表。它们具有相同的行,并且它们的列都来自同一个集合。但是,每个稀疏矩阵可能只包含所有可能列中的一部分。例如:
library(Matrix)
# Fix the RNG seed so the random example below is reproducible.
set.seed(2)
# Pool of possible column labels, the shared row labels, and one name per matrix.
my_colnames <- LETTERS[seq_len(5)]
my_rownames <- letters[seq_len(3)]
my_mat_names <- month.name[seq_len(3)]
# Build one random example matrix of class "dgCMatrix".
# Rows are always `my_rownames`; columns are a sorted random subset of
# `my_colnames`; entries are Binomial(5, 0.2) draws.
generate_mat <- function() {
  # Draw the number of columns first, then pick which ones (same RNG order
  # as evaluating the draw lazily inside sample()).
  n_cols <- rbinom(1, 5, .7)
  cols_here <- sort(sample(my_colnames, n_cols))
  n_rows <- length(my_rownames)
  entries <- rbinom(length(cols_here) * n_rows, 5, .2)
  mat <- Matrix(
    entries,
    nrow = n_rows,
    ncol = length(cols_here),
    dimnames = list(my_rownames, cols_here)
  )
  as(mat, "dgCMatrix")
}
# Generate one matrix per name in `my_mat_names`, label them, and print the list.
list_of_mat <- setNames(
  replicate(length(my_mat_names), generate_mat()),
  my_mat_names
)
list_of_mat
#> $January
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#> A B D E
#> a 1 . 3 .
#> b . 1 . 1
#> c 2 2 1 1
#>
#> $February
#> 3 x 3 sparse Matrix of class "dgCMatrix"
#> A B E
#> a 3 . 1
#> b . 2 1
#> c . 2 2
#>
#> $March
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#> A B C E
#> a 3 1 . 2
#> b 2 1 . .
#> c 3 2 1 2
我想以我可以使用的方式组合各个矩阵,例如作为(稀疏)数组或长数据框:
library(purrr)
#' Expand a named list of matrices into a dense long data frame
#'
#' Produces one row for every combination of matrix name, row name and
#' column name found anywhere in the list. Combinations whose row or column
#' is absent from a given matrix get the value 0.
#'
#' Note: the result has
#' `length(list_of_mat) * length(all rows) * length(all columns)` rows and
#' each cell is looked up individually, so this is only practical for small
#' inputs.
#'
#' @param list_of_mat A named list of matrices with row and column names.
#' @return A data frame with factor columns `mat_name`, `row_name`,
#'   `col_name` (as produced by `expand.grid()`) and a numeric `value`.
list_of_mat_to_long_df <- function(list_of_mat) {
  if (is.null(names(list_of_mat))) {
    stop("`list_of_mat` must be a named list.", call. = FALSE)
  }
  # unique() still matters when the list holds a single matrix, because
  # reduce() then returns that matrix's names untouched.
  all_rownames <- purrr::map(list_of_mat, rownames) |>
    purrr::reduce(union) |>
    unique()
  all_colnames <- purrr::map(list_of_mat, colnames) |>
    purrr::reduce(union) |>
    unique()
  expanded <- expand.grid(
    mat_name = names(list_of_mat),
    row_name = all_rownames,
    col_name = all_colnames
  )
  # expand.grid() returns factors. Look cells up by their character labels so
  # the result never depends on how a container treats factor indices
  # (base R indexes by the integer codes, not the labels).
  keys <- lapply(expanded, as.character)
  expanded$value <- purrr::pmap_dbl(
    keys,
    \(mat_name, row_name, col_name) {
      m <- list_of_mat[[mat_name]]
      # Only a genuinely absent row/column maps to 0; any other failure is
      # reported instead of being silently turned into a zero.
      if (row_name %in% rownames(m) && col_name %in% colnames(m)) {
        m[row_name, col_name]
      } else {
        0
      }
    }
  )
  expanded
}
# Expand the example list into the long format (one row per matrix/row/column combination) and print it.
list_of_mat_to_long_df(list_of_mat)
#> mat_name row_name col_name value
#> 1 January a A 1
#> 2 February a A 3
#> 3 March a A 3
#> 4 January b A 0
#> 5 February b A 0
#> 6 March b A 2
#> 7 January c A 2
#> 8 February c A 0
#> 9 March c A 3
#> 10 January a B 0
#> 11 February a B 0
#> 12 March a B 1
#> 13 January b B 1
#> 14 February b B 2
#> 15 March b B 1
#> 16 January c B 2
#> 17 February c B 2
#> 18 March c B 2
#> 19 January a D 3
#> 20 February a D 0
#> 21 March a D 0
#> 22 January b D 0
#> 23 February b D 0
#> 24 March b D 0
#> 25 January c D 1
#> 26 February c D 0
#> 27 March c D 0
#> 28 January a E 0
#> 29 February a E 1
#> 30 March a E 2
#> 31 January b E 1
#> 32 February b E 1
#> 33 March b E 0
#> 34 January c E 1
#> 35 February c E 2
#> 36 March c E 2
#> 37 January a C 0
#> 38 February a C 0
#> 39 March a C 0
#> 40 January b C 0
#> 41 February b C 0
#> 42 March b C 0
#> 43 January c C 0
#> 44 February c C 0
#> 45 March c C 1
实际上,这些矩阵很大(约 10 万行/列),因此上面的方案太慢。此外,它们无法被转换为稠密矩阵。有没有高效的方法来做到这一点?
注意:有一种使用列索引调整稀疏矩阵大小的方法,我不能直接使用它,因为我必须依赖列名。
我猜你有几个选择。第一个是用 cbind 把这些稀疏矩阵按列拼接起来,并构造一个因子,用来指明结果(更宽的)稀疏矩阵中每一列来自哪个矩阵:
# Bind all matrices side by side into one wide sparse matrix, then print it.
xx <- do.call(cbind, list_of_mat)
xx
## 3 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names 'A', 'B', 'D' ... ]]
##
## a 1 . 3 . 3 . 1 3 1 . 2
## b . 1 . 1 . 2 1 2 1 . .
## c 2 2 1 1 . 2 2 3 2 1 2
# One factor entry per column of the combined matrix, naming the list element
# that column came from: each name is repeated ncol(<that matrix>) times.
matname <- rep(
  gl(length(list_of_mat), 1L, labels = names(list_of_mat)),
  times = vapply(list_of_mat, ncol, integer(1L))
)
matname
## [1] January January January January February February February March
## [9] March March March
## Levels: January February March
第二个是构建一个像您所展示的那样的长数据框,但排除 value == 0 的行以节省内存。Matrix 包提供了 mat2triplet 用于此目的:
# For every matrix, list its stored entries as a data frame with the value `x`
# and the row/column *names* (instead of the integer positions i, j).
dd <- lapply(list_of_mat, function(M) {
  trip <- mat2triplet(M)
  data.frame(
    x = trip$x,
    rowname = rownames(M)[trip$i],
    colname = colnames(M)[trip$j]
  )
})
dd
## $January
## x rowname colname
## 1 1 a A
## 2 2 c A
## 3 1 b B
## 4 2 c B
## 5 3 a D
## 6 1 c D
## 7 1 b E
## 8 1 c E
##
## $February
## x rowname colname
## 1 3 a A
## 2 2 b B
## 3 2 c B
## 4 1 a E
## 5 1 b E
## 6 2 c E
##
## $March
## x rowname colname
## 1 3 a A
## 2 2 b A
## 3 3 c A
## 4 1 a B
## 5 1 b B
## 6 2 c B
## 7 1 c C
## 8 2 a E
## 9 2 c E
##
# Stack the per-matrix data frames and add a factor column recording which
# list element each row came from (name repeated nrow(<that frame>) times).
tt <- transform(
  do.call(rbind, unname(dd)),
  matname = rep(
    gl(length(dd), 1L, labels = names(dd)),
    times = vapply(dd, nrow, integer(1L))
  )
)
tt
## x rowname colname matname
## 1 1 a A January
## 2 2 c A January
## 3 1 b B January
## 4 2 c B January
## 5 3 a D January
## 6 1 c D January
## 7 1 b E January
## 8 1 c E January
## 9 3 a A February
## 10 2 b B February
## 11 2 c B February
## 12 1 a E February
## 13 1 b E February
## 14 2 c E February
## 15 3 a A March
## 16 2 b A March
## 17 3 c A March
## 18 1 a B March
## 19 1 b B March
## 20 2 c B March
## 21 1 c C March
## 22 2 a E March
## 23 2 c E March
我有一个(命名的)稀疏矩阵列表。它们具有相同的行,并且它们的列都来自同一个集合。但是,每个稀疏矩阵可能只包含所有可能列中的一部分。例如:
library(Matrix)
# Fix the RNG seed so the random example below is reproducible.
set.seed(2)
# Pool of possible column labels, the shared row labels, and one name per matrix.
my_colnames <- LETTERS[seq_len(5)]
my_rownames <- letters[seq_len(3)]
my_mat_names <- month.name[seq_len(3)]
# Build one random example matrix of class "dgCMatrix".
# Rows are always `my_rownames`; columns are a sorted random subset of
# `my_colnames`; entries are Binomial(5, 0.2) draws.
generate_mat <- function() {
  # Draw the number of columns first, then pick which ones (same RNG order
  # as evaluating the draw lazily inside sample()).
  n_cols <- rbinom(1, 5, .7)
  cols_here <- sort(sample(my_colnames, n_cols))
  n_rows <- length(my_rownames)
  entries <- rbinom(length(cols_here) * n_rows, 5, .2)
  mat <- Matrix(
    entries,
    nrow = n_rows,
    ncol = length(cols_here),
    dimnames = list(my_rownames, cols_here)
  )
  as(mat, "dgCMatrix")
}
# Generate one matrix per name in `my_mat_names`, label them, and print the list.
list_of_mat <- setNames(
  replicate(length(my_mat_names), generate_mat()),
  my_mat_names
)
list_of_mat
#> $January
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#> A B D E
#> a 1 . 3 .
#> b . 1 . 1
#> c 2 2 1 1
#>
#> $February
#> 3 x 3 sparse Matrix of class "dgCMatrix"
#> A B E
#> a 3 . 1
#> b . 2 1
#> c . 2 2
#>
#> $March
#> 3 x 4 sparse Matrix of class "dgCMatrix"
#> A B C E
#> a 3 1 . 2
#> b 2 1 . .
#> c 3 2 1 2
我想以我可以使用的方式组合各个矩阵,例如作为(稀疏)数组或长数据框:
library(purrr)
#' Expand a named list of matrices into a dense long data frame
#'
#' Produces one row for every combination of matrix name, row name and
#' column name found anywhere in the list. Combinations whose row or column
#' is absent from a given matrix get the value 0.
#'
#' Note: the result has
#' `length(list_of_mat) * length(all rows) * length(all columns)` rows and
#' each cell is looked up individually, so this is only practical for small
#' inputs.
#'
#' @param list_of_mat A named list of matrices with row and column names.
#' @return A data frame with factor columns `mat_name`, `row_name`,
#'   `col_name` (as produced by `expand.grid()`) and a numeric `value`.
list_of_mat_to_long_df <- function(list_of_mat) {
  if (is.null(names(list_of_mat))) {
    stop("`list_of_mat` must be a named list.", call. = FALSE)
  }
  # unique() still matters when the list holds a single matrix, because
  # reduce() then returns that matrix's names untouched.
  all_rownames <- purrr::map(list_of_mat, rownames) |>
    purrr::reduce(union) |>
    unique()
  all_colnames <- purrr::map(list_of_mat, colnames) |>
    purrr::reduce(union) |>
    unique()
  expanded <- expand.grid(
    mat_name = names(list_of_mat),
    row_name = all_rownames,
    col_name = all_colnames
  )
  # expand.grid() returns factors. Look cells up by their character labels so
  # the result never depends on how a container treats factor indices
  # (base R indexes by the integer codes, not the labels).
  keys <- lapply(expanded, as.character)
  expanded$value <- purrr::pmap_dbl(
    keys,
    \(mat_name, row_name, col_name) {
      m <- list_of_mat[[mat_name]]
      # Only a genuinely absent row/column maps to 0; any other failure is
      # reported instead of being silently turned into a zero.
      if (row_name %in% rownames(m) && col_name %in% colnames(m)) {
        m[row_name, col_name]
      } else {
        0
      }
    }
  )
  expanded
}
# Expand the example list into the long format (one row per matrix/row/column combination) and print it.
list_of_mat_to_long_df(list_of_mat)
#> mat_name row_name col_name value
#> 1 January a A 1
#> 2 February a A 3
#> 3 March a A 3
#> 4 January b A 0
#> 5 February b A 0
#> 6 March b A 2
#> 7 January c A 2
#> 8 February c A 0
#> 9 March c A 3
#> 10 January a B 0
#> 11 February a B 0
#> 12 March a B 1
#> 13 January b B 1
#> 14 February b B 2
#> 15 March b B 1
#> 16 January c B 2
#> 17 February c B 2
#> 18 March c B 2
#> 19 January a D 3
#> 20 February a D 0
#> 21 March a D 0
#> 22 January b D 0
#> 23 February b D 0
#> 24 March b D 0
#> 25 January c D 1
#> 26 February c D 0
#> 27 March c D 0
#> 28 January a E 0
#> 29 February a E 1
#> 30 March a E 2
#> 31 January b E 1
#> 32 February b E 1
#> 33 March b E 0
#> 34 January c E 1
#> 35 February c E 2
#> 36 March c E 2
#> 37 January a C 0
#> 38 February a C 0
#> 39 March a C 0
#> 40 January b C 0
#> 41 February b C 0
#> 42 March b C 0
#> 43 January c C 0
#> 44 February c C 0
#> 45 March c C 1
实际上,这些矩阵很大(约 10 万行/列),因此上面的方案太慢。此外,它们无法被转换为稠密矩阵。有没有高效的方法来做到这一点?
注意:有一种使用列索引调整稀疏矩阵大小的方法,我不能直接使用它,因为我必须依赖列名。
我猜你有几个选择。第一个是用 cbind 把这些稀疏矩阵按列拼接起来,并构造一个因子,用来指明结果(更宽的)稀疏矩阵中每一列来自哪个矩阵:
# Bind all matrices side by side into one wide sparse matrix, then print it.
xx <- do.call(cbind, list_of_mat)
xx
## 3 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names 'A', 'B', 'D' ... ]]
##
## a 1 . 3 . 3 . 1 3 1 . 2
## b . 1 . 1 . 2 1 2 1 . .
## c 2 2 1 1 . 2 2 3 2 1 2
# One factor entry per column of the combined matrix, naming the list element
# that column came from: each name is repeated ncol(<that matrix>) times.
matname <- rep(
  gl(length(list_of_mat), 1L, labels = names(list_of_mat)),
  times = vapply(list_of_mat, ncol, integer(1L))
)
matname
## [1] January January January January February February February March
## [9] March March March
## Levels: January February March
第二个是构建一个像您所展示的那样的长数据框,但排除 value == 0 的行以节省内存。Matrix 包提供了 mat2triplet 用于此目的:
# For every matrix, list its stored entries as a data frame with the value `x`
# and the row/column *names* (instead of the integer positions i, j).
dd <- lapply(list_of_mat, function(M) {
  trip <- mat2triplet(M)
  data.frame(
    x = trip$x,
    rowname = rownames(M)[trip$i],
    colname = colnames(M)[trip$j]
  )
})
dd
## $January
## x rowname colname
## 1 1 a A
## 2 2 c A
## 3 1 b B
## 4 2 c B
## 5 3 a D
## 6 1 c D
## 7 1 b E
## 8 1 c E
##
## $February
## x rowname colname
## 1 3 a A
## 2 2 b B
## 3 2 c B
## 4 1 a E
## 5 1 b E
## 6 2 c E
##
## $March
## x rowname colname
## 1 3 a A
## 2 2 b A
## 3 3 c A
## 4 1 a B
## 5 1 b B
## 6 2 c B
## 7 1 c C
## 8 2 a E
## 9 2 c E
##
# Stack the per-matrix data frames and add a factor column recording which
# list element each row came from (name repeated nrow(<that frame>) times).
tt <- transform(
  do.call(rbind, unname(dd)),
  matname = rep(
    gl(length(dd), 1L, labels = names(dd)),
    times = vapply(dd, nrow, integer(1L))
  )
)
tt
## x rowname colname matname
## 1 1 a A January
## 2 2 c A January
## 3 1 b B January
## 4 2 c B January
## 5 3 a D January
## 6 1 c D January
## 7 1 b E January
## 8 1 c E January
## 9 3 a A February
## 10 2 b B February
## 11 2 c B February
## 12 1 a E February
## 13 1 b E February
## 14 2 c E February
## 15 3 a A March
## 16 2 b A March
## 17 3 c A March
## 18 1 a B March
## 19 1 b B March
## 20 2 c B March
## 21 1 c C March
## 22 2 a E March
## 23 2 c E March