通过将现有矩阵分成组（按因子）并将函数应用于每个子矩阵来创建矩阵

Question

这个问题最好通过示例解决：

设置

x1 <- c(1,4,5,6,7,1)
x2 <- c(1,1,2,3,4,1)
x3 <- c(3,4,5,6,7,1)
x4 <- c(1,2,3,5,7,2)
x5 <- c(6,2,3,9,7,2)
x6 <- c(5,2,4,3,2,3)
x7 <- c(6,4,3,1,8,3)



matrix <- t(data.frame(x1,x2,x3,x4,x5,x6,x7))
colnames(matrix)[6] <- "factor"```

我的目标是创建一个矩阵，其中矩阵元素是根据组中元素的列总和计算的，不包括第 (i) 行。然后我将其除以组中元素的数量减去 1。因此组 1 中的 3 个对象意味着除以 (3-1) =2.

对于每个组，如第6列或"factor"所述，我想计算一个对应的矩阵，然后将这些矩阵绑定在一起。

对于第 1 组，所需的矩阵是

output1 <- c(1+3/2, 1+4/2, 2+5/2 ...)
output2 <- c(1+3/2, 4+4/2, 5+5/2...)
output3 <- c(1+1/2, 4+1/2, 5+2/2, 6+3/2 ...)

mat_output1 <- rbind(output1, output2, output3)

对于第 2 组，所需的矩阵是

output4 <- c(6/1 , 2/1, 3/1, 9/1, 7/1 ...)
output5 <- c(1/1,2/1,3/1,5/1,7/1,2/1)
mat_output2 <- rbind(output4, output5)

第 3 组

output6 <- c(6/1,4/1,3/1,1/1,8/1,3/1)
output7 <- c(5/1,2/1,4/1,3/1,2/1,3/1)
mat_output3 <- rbind(output6, output7)

所需的输出形式为：

mat_output <- rbind(mat_output1, mat_output2, mat_output3)

Answer 1

此方法改为一次将每一列相加，然后从列总和中减去每个 i。

#here's a pretty quick base R solution:

do.call(rbind
        ,tapply(seq_len(nrow(matrix))
                , matrix[, 'factor']
                , FUN = function(i) sweep(-matrix[i, -length(matrix)]
                                          , 2
                                          , colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
                )
)

   <NA> <NA> <NA> <NA> <NA> factor
x1    2  2.5  3.5  4.5  5.5      1
x2    2  4.0  5.0  6.0  7.0      1
x3    1  2.5  3.5  4.5  5.5      1
x4    6  2.0  3.0  9.0  7.0      2
x5    1  2.0  3.0  5.0  7.0      2
x6    6  4.0  3.0  1.0  8.0      3
x7    5  2.0  4.0  3.0  2.0      3

# similar but MUCH slower

do.call(rbind
        , by(matrix[, -6]
             , matrix[, 6]
             #, function(x) sweep(-x, 2,colSums(x), FUN = '+') / (nrow(x)-1))
             , function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)) #mapply is faster
)

一些表现。请注意，@Ronak 提供了警告，因此我更改为 dplyr 建议代码 mutate_at(vars(-group_cols), ...) 而不是 mutate_all(...)。此外，虽然 data.table 在 @akrun 中，但我正准备在 @akrun 发布之前编辑它，它基本上是 @Ronak 的 dplyr 翻译方式。

@G。 Fernando 的解决方案大约需要 20 秒，因此我将其从配置文件中排除。 @akrun 的基本解决方案是最快的。我个人有点喜欢@Ronak 的基础，因为它读起来很好。

#10,000 rows
#1,000 groups
Unit: milliseconds
        expr       min        lq       mean     median        uq       max neval
  dt_version  191.4823  193.7294  201.39367  200.61610  210.0798  211.0581    10
  cole_base2 7688.4689 7948.5534 8159.32689 8224.02570 8358.9145 8560.0802    10
  cole_base3  760.9410  761.6176  789.35789  791.22520  812.1285  822.8938    10
  Ronak_base  378.2914  381.9018  403.30458  403.65600  418.5159  431.2887    10
 Ronak_dplyr 7025.7606 7045.9863 7217.55143 7150.09070 7395.1977 7505.7091    10
  akrun_base   26.3189   27.2791   28.90526   28.03645   29.3622   33.5207    10

#10,000 rows
#100 groups
Unit: milliseconds
        expr      min       lq      mean    median       uq      max neval
  dt_version  32.6928  33.4362  36.27415  37.34835  38.8137  39.9793    10
  cole_base2 770.1962 817.3142 847.01249 846.13940 893.4095 900.8028    10
  cole_base3  97.5201 101.1023 108.46434 102.01210 105.9185 165.3160    10
  Ronak_base 115.7445 117.9968 128.06018 124.27730 129.9934 170.3994    10
 Ronak_dplyr 721.4570 734.6108 747.46815 735.65990 756.1121 787.0906    10
  akrun_base  23.3171  24.4940  26.79405  26.55190  29.1286  30.2099    10

library(data.table)
library(microbenchmark)
library(dplyr)

n_cols <- 10
n_rows <- 1E5
n_row_per_group <- 100


set.seed(1)
matrix <- matrix(sample(100, n_rows*n_cols, replace = T), ncol = n_cols)
matrix <- cbind(matrix, factor = rep(1:(n_rows / n_row_per_group), each = n_row_per_group))
df <- data.frame(matrix)
dt <- as.data.table(df)

do.call(rbind
        , lapply(unique(matrix[,'factor'])
                 , function(x) {
                   sub_mat <- matrix[matrix[, 'factor'] == x,] 
                   sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
         })
)



microbenchmark(
  # cole_base = { #too slow for lots of little groups
  #   do.call(rbind
  #           ,by(matrix[, -6]
  #               , matrix[, 6]
  #               , function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)
  #           )
  #   )
  # },
   dt_version = {
    dt[, lapply(.SD, function(x) (sum(x) - x) / (.N - 1)) , by = 'factor']
  }
  ,cole_base2 = {
    do.call(rbind
            , lapply(unique(matrix[,'factor'])
                     , function(x) {
                       sub_mat <- matrix[matrix[, 'factor'] == x,] 
                       sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
                     })
    )
  }
  ,cole_base3 = {
    do.call(rbind
            ,tapply(seq_len(nrow(matrix))
                    , matrix[, 'factor']
                    , FUN = function(i) sweep(-matrix[i, -length(matrix)], 2, colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
                    , simplify = F)
    )
  }

  ,Ronak_base = {

    lapply(df[-ncol(df)], function(x) 
      ave(x, df$factor, FUN = function(x) (sum(x) - x)/(length(x) - 1)))
  }
  # ,G_fern_base = { #pretty slow, i hardcoded the factor row - it needs fixed slightly
  #   do.call(rbind,
  #           lapply(levels(factor(matrix[,11])),function(x) {
  #             list=as.list(NULL)
  #             index=which(matrix[,11]==x)
  #             for(i in 1:length(index)){
  #               if(length(index)>2){
  #                 list[[i]]=colSums(matrix[index[-i],])
  #               }else{
  #                 list[[i]]=matrix[index[-i],] 
  #               }
  #               list[[i]]=list[[i]][-11]/(length(index)-1)
  #             }
  #             return(do.call(rbind,list))
  #           })
  #   )
  # }
  , Ronak_dplyr = {
    df %>%
      group_by(factor) %>%
      mutate_at(vars(-group_cols()), ~(sum(.)-.)/ (n() - 1))
  }
  , akrun_base = {
    n1 <- tabulate(matrix[, ncol(matrix)])
    m1 <- rowsum(matrix[,-ncol(matrix)], group = matrix[,ncol(matrix)])
    (m1[rep(seq_len(nrow(m1)), n1),] - matrix[, -ncol(matrix)])/rep(n1 - 1, n1)

  }
  , times = 10
)

Answer 2

如果我理解你的问题正确，下面的代码可能会成功

do.call(rbind,
lapply(levels(factor(matrix[,which(colnames=='factor')])),function(x) {
        list=as.list(NULL)
        index=which(matrix[,which(colnames=='factor')]==x)
        for(i in 1:length(index)){
                if(length(index)>2){
                        list[[i]]=colSums(matrix[index[-i],])
                }else{
                        list[[i]]=matrix[index[-i],] 
                }
                list[[i]]=list[[i]][-which(colnames=='factor')]/(length(index)-1)
        }
        return(do.call(rbind,list))
})
)

从示例中提供的代码得到：

     <NA> <NA> <NA> <NA> <NA>
[1,]    2  2.5  3.5  4.5  5.5
[2,]    2  4.0  5.0  6.0  7.0
[3,]    1  2.5  3.5  4.5  5.5
[4,]    6  2.0  3.0  9.0  7.0
[5,]    1  2.0  3.0  5.0  7.0
[6,]    6  4.0  3.0  1.0  8.0
[7,]    5  2.0  4.0  3.0  2.0

Answer 3

如果将数据转换为数据框而不是具有专有名称的矩阵，会更容易。

df <- setNames(data.frame(matrix), c(letters[1:5], "factor"))

然后我们可以使用 dplyr 并计算组的 sum 并将其减去当前值除以组中的行数减 1。

library(dplyr)

df %>%
  group_by(factor) %>%
  mutate_all(~(sum(.)-.)/ (n() - 1))

#      a     b     c     d     e factor
#  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
#1     2   2.5   3.5   4.5   5.5      1
#2     2   4     5     6     7        1
#3     1   2.5   3.5   4.5   5.5      1
#4     6   2     3     9     7        2
#5     1   2     3     5     7        2
#6     6   4     3     1     8        3
#7     5   2     4     3     2        3

使用 base R 你可以使用 lapply 和 ave

df[-ncol(df)] <- lapply(df[-ncol(df)], function(x) 
      ave(x, df$factor, FUN = function(x) (sum(x) - x)/(length(x) - 1)))

Answer 4

一个选项使用data.table

library(data.table)
setDT(df1)[, lapply(.SD, function(x) (sum(x) - x)/(.N-1)), .(factor)]
#   factor a   b   c   d   e
#1:      1 2 2.5 3.5 4.5 5.5
#2:      1 2 4.0 5.0 6.0 7.0
#3:      1 1 2.5 3.5 4.5 5.5
#4:      2 6 2.0 3.0 9.0 7.0
#5:      2 1 2.0 3.0 5.0 7.0
#6:      3 6 4.0 3.0 1.0 8.0
#7:      3 5 2.0 4.0 3.0 2.0

或者我们可以使用 base R 和 rowsum

n1 <- tabulate(matrix[, 6])
m1 <- rowsum(matrix[,-6], group = matrix[,6])
matrix[, -6] <- (m1[rep(seq_len(nrow(m1)), n1),] - matrix[, -6])/rep(n1 - 1, n1)
matrix
#   [,1] [,2] [,3] [,4] [,5] [,6]
#x1    2  2.5  3.5  4.5  5.5    1
#x2    2  4.0  5.0  6.0  7.0    1
#x3    1  2.5  3.5  4.5  5.5    1
#x4    6  2.0  3.0  9.0  7.0    2
#x5    1  2.0  3.0  5.0  7.0    2
#x6    6  4.0  3.0  1.0  8.0    3
#x7    5  2.0  4.0  3.0  2.0    3

数据

df1 <- as.data.frame(matrix)
names(df1) <- c(letters[1:5], "factor")

通过将现有矩阵分成组（按因子）并将函数应用于每个子矩阵来创建矩阵

create a matrix by seperating an existing matrix into groups(by factor) and applying a function to each sub-matrix

group-by

r

function

apply

column-sum

数据