创建二元向量的组合

Create combinations of a binary vector

我想创建一个由固定数量的 0 和 1 组成的二进制向量的所有可能组合。例如: 昏暗(v)=5x1; n1=3; n0=2; 在这种情况下,我想要类似的东西:

  1,1,1,0,0
  1,1,0,1,0
  1,1,0,0,1
  1,0,1,1,0
  1,0,1,0,1
  1,0,0,1,1
  0,1,1,1,0
  0,1,1,0,1
  0,1,0,1,1
  0,0,1,1,1

我在阅读这篇文章时找到了一些帮助 post 但我只想生成我需要的组合,避免浪费 space (我认为问题会随着 n 呈指数级增长)

你可以试试这个方法:

f <- function(n=5,m=3)
 t(apply(combn(1:n,m=m),2,function(cm) replace(rep(0,n),cm,1)))

f(5,3)
#       [,1] [,2] [,3] [,4] [,5]
#  [1,]    1    1    1    0    0
#  [2,]    1    1    0    1    0
#  [3,]    1    1    0    0    1
#  [4,]    1    0    1    1    0
#  [5,]    1    0    1    0    1
#  [6,]    1    0    0    1    1
#  [7,]    0    1    1    1    0
#  [8,]    0    1    1    0    1
#  [9,]    0    1    0    1    1
# [10,]    0    0    1    1    1

想法是生成 1 的所有索引组合,然后使用它们产生最终结果。

相同方法的另一种形式:

f.2 <- function(n=5,m=3)
  t(combn(1:n,m,FUN=function(cm) replace(rep(0,n),cm,1)))

第二种方法大约快两倍:

library(rbenchmark)
benchmark(f(16,8),f.2(16,8))
#         test replications elapsed relative user.self sys.self user.child sys.child
# 2 f.2(16, 8)          100   5.706    1.000     5.688    0.017          0         0
# 1   f(16, 8)          100  10.802    1.893    10.715    0.082          0         0

基准

f.akrun <- function(n=5,m=3) {

  indx <- combnPrim(1:n,m)

  DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
  for(i in seq_len(nrow(DT))){
    set(DT, i=i, j=indx[,i],value=1) 
  }
  DT  
}

benchmark(f(16,8),f.2(16,8),f.akrun(16,8))
#            test replications elapsed relative user.self sys.self user.child sys.child
# 2     f.2(16, 8)          100   5.464    1.097     5.435    0.028          0         0
# 3 f.akrun(16, 8)          100   4.979    1.000     4.938    0.037          0         0
# 1       f(16, 8)          100  10.854    2.180    10.689    0.129          0         0

@akrun 的解决方案 (f.akrun) 比 f.2.

快 ~10%

[编辑] 另一种方法,更快更简单:

f.3 <- function(n=5,m=3) t(combn(n,m,tabulate,nbins=n))

您可以尝试 gRbase 中的 combnPrim 以及 data.table 中的 set(可能是 faster

source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") 
library(gRbase)
library(data.table)
n <-5
indx <- combnPrim(1:n,3)

DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
 for(i in seq_len(nrow(DT))){
  set(DT, i=i, j=indx[,i],value=1) 
 }
DT
 #   V1 V2 V3 V4 V5
 #1:  1  1  1  0  0
 #2:  1  1  0  1  0
 #3:  1  0  1  1  0
 #4:  0  1  1  1  0
 #5:  1  1  0  0  1
 #6:  1  0  1  0  1
 #7:  0  1  1  0  1
 #8:  1  0  0  1  1
 #9:  0  1  0  1  1
#10:  0  0  1  1  1

Marat 回答的稍微快一点的版本:

f.roland <- function(n, m) {
  ind <- combn(seq_len(n), m)
  ind <- t(ind) + (seq_len(ncol(ind)) - 1) * n
  res <- rep(0, nrow(ind) * n)
  res[ind] <- 1
  matrix(res, ncol = n, nrow = nrow(ind), byrow = TRUE)
}

all.equal(f.2(16, 8), f.roland(16, 8))
#[1] TRUE
library(rbenchmark)
benchmark(f(16,8),f.2(16,8),f.roland(16,8))

#             test replications elapsed relative user.self sys.self user.child sys.child
#2      f.2(16, 8)          100   5.693    1.931     5.670    0.020          0         0
#3 f.roland(16, 8)          100   2.948    1.000     2.929    0.017          0         0
#1        f(16, 8)          100   8.287    2.811     8.214    0.066          0         0

这是另一种方法:

func <- function(n, m) t(combn(n, m, function(a) {z=integer(n);z[a]=1;z}))

func(n = 5, m = 2)

     # [,1] [,2] [,3] [,4] [,5]
 # [1,]    1    1    0    0    0
 # [2,]    1    0    1    0    0
 # [3,]    1    0    0    1    0
 # [4,]    1    0    0    0    1
 # [5,]    0    1    1    0    0
 # [6,]    0    1    0    1    0
 # [7,]    0    1    0    0    1
 # [8,]    0    0    1    1    0
 # [9,]    0    0    1    0    1
# [10,]    0    0    0    1    1

使用二叉树扩展比 f.roland 略有性能改进(n/m 大约等于 2,m << n f.roland 获胜),但代价是更高的内存用法:

f.krassowski = function(n, m) {
    m_minus_n = m - n
    paths = list(
        c(0, rep(NA, n-1)),
        c(1, rep(NA, n-1))
    )
    sums = c(0, 1)
    for (level in 2:n) {
        upper_threshold = level + m_minus_n

        is_worth_adding_0 = (sums <= m) & (upper_threshold <= sums)
        is_worth_adding_1 = (sums <= m - 1) & (upper_threshold - 1 <= sums)

        x = paths[is_worth_adding_0]
        y = paths[is_worth_adding_1]

        for (i in 1:length(x)) {
            x[[i]][[level]] = 0
        }
        for (i in 1:length(y)) {
            y[[i]][[level]] = 1
        }
        paths = c(x, y)
        sums = c(sums[is_worth_adding_0], sums[is_worth_adding_1] + 1)
    }
    matrix(unlist(paths), byrow=TRUE, nrow=length(paths))
}

元素的顺序不同。

n/m = 2 的基准测试:

               expr       min        lq     mean    median        uq      max
           f(16, 8) 47.488731 48.182502 52.04539 48.689082 57.558552 65.26211
         f.2(16, 8) 38.291302 39.533287 43.61786 40.513500 48.673713 54.21076
         f.3(16, 8) 38.289619 39.007766 40.21002 39.273940 39.970907 49.02320
       f.989(16, 8) 35.000941 35.199950 38.09043 35.607685 40.725833 49.61785
    f.roland(16, 8) 14.295560 14.399079 15.02285 14.559891 14.625825 23.54574
f.krassowski(16, 8)  9.343784  9.552871 10.20118  9.614251  9.863443 19.70659

值得注意的是,f.3 的内存占用最小:

expression mem_alloc
f(16, 8) 5.7MB
f.2(16, 8) 3.14MB
f.3(16, 8) 1.57MB
f.989(16, 8) 3.14MB
f.roland(16, 8) 5.25MB
f.krassowski(16, 8) 6.37MB

对于n/m = 10

               expr       min        lq      mean    median        uq      max
           f(30, 3) 14.590784 14.819879 15.061327 14.970385 15.238594 15.74435
         f.2(30, 3) 11.886532 12.164719 14.197877 12.267662 12.450575 32.47237
         f.3(30, 3) 11.458760 11.597360 12.741168 11.706475 11.892549 30.36309
       f.989(30, 3) 10.646286 10.861159 12.922651 10.971200 11.106610 30.86498
    f.roland(30, 3)  3.513980  3.589361  4.559673  3.629923  3.727350 21.58201
f.krassowski(30, 3)  8.861349  8.927388 10.430068  9.022631  9.405705 32.70073

使用 RcppAlgos::permuteGeneral() 的方法。

RcppAlgos::permuteGeneral(1:0, freq=3:2)
#       [,1] [,2] [,3] [,4] [,5]
#  [1,]    1    1    1    0    0
#  [2,]    1    1    0    1    0
#  [3,]    1    1    0    0    1
#  [4,]    1    0    1    1    0
#  [5,]    1    0    1    0    1
#  [6,]    1    0    0    1    1
#  [7,]    0    1    1    1    0
#  [8,]    0    1    1    0    1
#  [9,]    0    1    0    1    1
# [10,]    0    0    1    1    1