reshape/melt 根据rowKey的非对称矩阵

reshape/melt an asymmetric matrix according to a rowKey

我想 reshape/melt 一个非对称矩阵,这样当两个列单元格根据 rowKey 都不为零时,任意两行跨列求和。我尝试了各种选项,但 none 都有效。我正在寻找适用于大型非对称矩阵的通用解决方案。

#Dummy data    
set.seed(123)    

mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))

mat
  a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
C 1 1 3 0 0
D 2 2 1 2 3

rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")

#Desired output

    N1 N2 N3 value
1   A  C  a     0
2   A  C  b     4
3   A  C  c     4
4   A  C  d     0
5   A  C  e     0
6   B  C  a     3
7   B  C  b     0
8   B  C  c     4
9   B  C  d     0
10  B  C  e     0
11  A  D  a     0
12  A  D  b     5
13  A  D  c     2
14  A  D  d     3
15  A  D  e     0
16  B  D  a     4
17  B  D  b     0
18  B  D  c     2
19  B  D  d     3
20  B  D  e     0

非常感谢任何指点!

temp = expand.grid(c(split(names(rowKey), rowKey), list(N3 = colnames(mat))))

temp2 = sapply(1:nrow(temp), function(i)
    mat[row.names(mat) == temp$N1[i] | row.names(mat) == temp$N2[i],
        colnames(mat) == temp$N3[i]])

temp$value = colSums(temp2) * (colSums(temp2 > 0) == nrow(temp2))
temp
#   N1 N2 N3 value
#1   A  C  a     0
#2   B  C  a     3
#3   A  D  a     0
#4   B  D  a     4
#5   A  C  b     4
#6   B  C  b     0
#7   A  D  b     5
#8   B  D  b     0
#9   A  C  c     4
#10  B  C  c     4
#11  A  D  c     2
#12  B  D  c     2
#13  A  C  d     0
#14  B  C  d     0
#15  A  D  d     3
#16  B  D  d     3
#17  A  C  e     0
#18  B  C  e     0
#19  A  D  e     0
#20  B  D  e     0

这是一个更长的数据处理方法:

set.seed(123)
mat <- matrix(rbinom(20,100,0.01),4,5,dimnames=list(LETTERS[1:4],letters[1:5]))

rowKey <- c("A"="N1","B"="N1","C"="N2","D"="N2")

按行键分割矩阵:

> n1 <- names(which(rowKey=="N1"))
> mat[n1,]
  a b c d e
A 0 3 1 1 0
B 2 0 1 1 0
> n2 <- names(which(rowKey=="N2"))
> mat[n2,]
  a b c d e
C 1 1 3 0 0
D 2 2 1 2 3

然后将数据转换为熔融数据帧。

> library(reshape2)
> mmat1 <- melt(mat[n1,])
> mmat1
   Var1 Var2 value
1     A    a     0
2     B    a     2
3     A    b     3
4     B    b     0
5     A    c     1
6     B    c     1
7     A    d     1
8     B    d     1
9     A    e     0
10    B    e     0
> mmat2 <- melt(mat[n2,])
> mmat2
   Var1 Var2 value
1     C    a     1
2     D    a     2
3     C    b     1
4     D    b     2
5     C    c     3
6     D    c     1
7     C    d     0
8     D    d     2
9     C    e     0
10    D    e     3

然后按列名合并数据框,合并时注意列名

> colnames(mmat1) <- c("N1","N3","Val1")
> colnames(mmat2) <- c("N2","N3","Val2")
> mmat12 <- merge(mmat1,mmat2)

现在我们可以用条件

组成最终矩阵
> res <- cbind(mmat12[c('N1','N2','N3')],mmat12['Val1']+mmat12['Val2'])
> res[(mmat12['Val1'] == 0)|(mmat12['Val2'] == 0),4] <- 0
> res[with(res, order(N1,N2,N3)),]
   N1 N2 N3 Val1
1   A  C  a    0
5   A  C  b    4
9   A  C  c    4
13  A  C  d    0
17  A  C  e    0
2   A  D  a    0
6   A  D  b    5
10  A  D  c    2
14  A  D  d    3
18  A  D  e    0
3   B  C  a    3
7   B  C  b    0
11  B  C  c    4
15  B  C  d    0
19  B  C  e    0
4   B  D  a    4
8   B  D  b    0
12  B  D  c    2
16  B  D  d    3
20  B  D  e    0

这里有一些以 data.frame 为中心的选项,使用 tidyverse 进行操作:

library(tidyverse)
set.seed(123)    

mat <- matrix(rbinom(20, 100, 0.01), 4, 5, 
              dimnames = list(LETTERS[1:4], letters[1:5]))
rowKey <- c("A" = "N1", "B" = "N1", "C" = "N2", "D" = "N2")

output1 <- mat %>% 
    as.data.frame() %>% 
    rownames_to_column('N1') %>% 
    gather(N3, value, -N1) %>%    # reshape to long form
    crossing(N2 = .$N1) %>%    # add combinations of rowname values
    filter(N1 != N2, rowKey[N1] != rowKey[N2]) %>%    # drop unwanted combinations
    mutate(value = na_if(value, 0),    # change 0 values to NA so sum will be 0
           # sort rowname values to make group ID column for aggregation
           id = map2_chr(N1, N2, ~toString(sort(c(.x, .y))))) %>% 
    group_by(id, N3) %>% 
    summarise(N1 = min(N1),    # get alpabetically first rowname for N1
              N2 = max(N2),    # and last for N2
              value = coalesce(sum(value), 0L)) %>%    # sum and replace NAs with 0s again
    # clean up
    ungroup() %>% 
    select(N1, N2, N3, value) %>% 
    arrange(N2)

output1
#> # A tibble: 20 x 4
#>       N1    N2    N3 value
#>    <chr> <chr> <chr> <int>
#>  1     A     C     a     0
#>  2     A     C     b     4
#>  3     A     C     c     4
#>  4     A     C     d     0
#>  5     A     C     e     0
#>  6     B     C     a     3
#>  7     B     C     b     0
#>  8     B     C     c     4
#>  9     B     C     d     0
#> 10     B     C     e     0
#> 11     A     D     a     0
#> 12     A     D     b     5
#> 13     A     D     c     2
#> 14     A     D     d     3
#> 15     A     D     e     0
#> 16     B     D     a     4
#> 17     B     D     b     0
#> 18     B     D     c     2
#> 19     B     D     d     3
#> 20     B     D     e     0

expand.grid 一样,tidyr::crossing 扩展得比必要的多(例如 A/A 组合),这可能会在规模上减慢速度。如果写起来更烦人,基于 combn 的方法可能会更快。

拆分和使用自联接是通过添加列而不是行来创建组合的更直接的方法。它需要一些光表体操,或者 split:

output2 <- mat %>% 
    as.data.frame() %>% 
    rownames_to_column('N') %>% 
    gather(N3, value, -N) %>% 
    mutate(key = rowKey[N],    # add column with key
           value = na_if(value, 0)) %>% 
    split(.$key) %>%    # split list by key
    # join list elements to add N1/N2 and value combinations
    reduce(full_join, by = 'N3', suffix = sub('N', '', names(.))) %>% 
    transmute(N1, N2, N3, 
              value = coalesce(value1 + value2, 0L)) %>%
    arrange(N2, N1)

all_equal(output1, output2)
#> [1] TRUE

...或tidyr::nest:

output3 <- mat %>% 
    as.data.frame() %>% 
    rownames_to_column('N') %>% 
    gather(N3, value, -N) %>% 
    mutate(key = rowKey[N], 
           value = na_if(value, 0)) %>% 
    nest(-key) %>%    # store all but key column as nested data frame
    # join nested data frames by N3 to get N1/N2 and value combinations
    { reduce(.$data, full_join, by = 'N3', suffix = sub('N', '', .$key)) } %>% 
    transmute(N1, N2, N3, 
              value = coalesce(value1 + value2, 0L)) %>% 
    arrange(N2, N1)

all_equal(output1, output3)
#> [1] TRUE

reduce 调用可以替换为 purrr::invoke/do.call,因为 reduce 仅调用 full_join 一次,但减少连接是常见的习语,并可能使该方法更易于扩展。