创建二元向量的组合
Create combinations of a binary vector
我想创建一个由固定数量的 0 和 1 组成的二进制向量的所有可能组合。例如:
昏暗(v)=5x1; n1=3; n0=2;
在这种情况下,我想要类似的东西:
1,1,1,0,0
1,1,0,1,0
1,1,0,0,1
1,0,1,1,0
1,0,1,0,1
1,0,0,1,1
0,1,1,1,0
0,1,1,0,1
0,1,0,1,1
0,0,1,1,1
我在阅读这篇文章时找到了一些帮助 post
但我只想生成我需要的组合,避免浪费 space (我认为问题会随着 n 呈指数级增长)
你可以试试这个方法:
f <- function(n=5,m=3)
t(apply(combn(1:n,m=m),2,function(cm) replace(rep(0,n),cm,1)))
f(5,3)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 1 1 0 0
# [2,] 1 1 0 1 0
# [3,] 1 1 0 0 1
# [4,] 1 0 1 1 0
# [5,] 1 0 1 0 1
# [6,] 1 0 0 1 1
# [7,] 0 1 1 1 0
# [8,] 0 1 1 0 1
# [9,] 0 1 0 1 1
# [10,] 0 0 1 1 1
想法是生成 1 的所有索引组合,然后使用它们产生最终结果。
相同方法的另一种形式:
f.2 <- function(n=5,m=3)
t(combn(1:n,m,FUN=function(cm) replace(rep(0,n),cm,1)))
第二种方法大约快两倍:
library(rbenchmark)
benchmark(f(16,8),f.2(16,8))
# test replications elapsed relative user.self sys.self user.child sys.child
# 2 f.2(16, 8) 100 5.706 1.000 5.688 0.017 0 0
# 1 f(16, 8) 100 10.802 1.893 10.715 0.082 0 0
基准
f.akrun <- function(n=5,m=3) {
indx <- combnPrim(1:n,m)
DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
for(i in seq_len(nrow(DT))){
set(DT, i=i, j=indx[,i],value=1)
}
DT
}
benchmark(f(16,8),f.2(16,8),f.akrun(16,8))
# test replications elapsed relative user.self sys.self user.child sys.child
# 2 f.2(16, 8) 100 5.464 1.097 5.435 0.028 0 0
# 3 f.akrun(16, 8) 100 4.979 1.000 4.938 0.037 0 0
# 1 f(16, 8) 100 10.854 2.180 10.689 0.129 0 0
@akrun 的解决方案 (f.akrun
) 比 f.2
.
快 ~10%
[编辑]
另一种方法,更快更简单:
f.3 <- function(n=5,m=3) t(combn(n,m,tabulate,nbins=n))
您可以尝试 gRbase
中的 combnPrim
以及 data.table
中的 set
(可能是 faster)
source("http://bioconductor.org/biocLite.R")
biocLite("gRbase")
library(gRbase)
library(data.table)
n <-5
indx <- combnPrim(1:n,3)
DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
for(i in seq_len(nrow(DT))){
set(DT, i=i, j=indx[,i],value=1)
}
DT
# V1 V2 V3 V4 V5
#1: 1 1 1 0 0
#2: 1 1 0 1 0
#3: 1 0 1 1 0
#4: 0 1 1 1 0
#5: 1 1 0 0 1
#6: 1 0 1 0 1
#7: 0 1 1 0 1
#8: 1 0 0 1 1
#9: 0 1 0 1 1
#10: 0 0 1 1 1
Marat 回答的稍微快一点的版本:
f.roland <- function(n, m) {
ind <- combn(seq_len(n), m)
ind <- t(ind) + (seq_len(ncol(ind)) - 1) * n
res <- rep(0, nrow(ind) * n)
res[ind] <- 1
matrix(res, ncol = n, nrow = nrow(ind), byrow = TRUE)
}
all.equal(f.2(16, 8), f.roland(16, 8))
#[1] TRUE
library(rbenchmark)
benchmark(f(16,8),f.2(16,8),f.roland(16,8))
# test replications elapsed relative user.self sys.self user.child sys.child
#2 f.2(16, 8) 100 5.693 1.931 5.670 0.020 0 0
#3 f.roland(16, 8) 100 2.948 1.000 2.929 0.017 0 0
#1 f(16, 8) 100 8.287 2.811 8.214 0.066 0 0
这是另一种方法:
func <- function(n, m) t(combn(n, m, function(a) {z=integer(n);z[a]=1;z}))
func(n = 5, m = 2)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 1 0 0 0
# [2,] 1 0 1 0 0
# [3,] 1 0 0 1 0
# [4,] 1 0 0 0 1
# [5,] 0 1 1 0 0
# [6,] 0 1 0 1 0
# [7,] 0 1 0 0 1
# [8,] 0 0 1 1 0
# [9,] 0 0 1 0 1
# [10,] 0 0 0 1 1
使用二叉树扩展比 f.roland
略有性能改进(n/m
大约等于 2,m << n f.roland
获胜),但代价是更高的内存用法:
f.krassowski = function(n, m) {
m_minus_n = m - n
paths = list(
c(0, rep(NA, n-1)),
c(1, rep(NA, n-1))
)
sums = c(0, 1)
for (level in 2:n) {
upper_threshold = level + m_minus_n
is_worth_adding_0 = (sums <= m) & (upper_threshold <= sums)
is_worth_adding_1 = (sums <= m - 1) & (upper_threshold - 1 <= sums)
x = paths[is_worth_adding_0]
y = paths[is_worth_adding_1]
for (i in 1:length(x)) {
x[[i]][[level]] = 0
}
for (i in 1:length(y)) {
y[[i]][[level]] = 1
}
paths = c(x, y)
sums = c(sums[is_worth_adding_0], sums[is_worth_adding_1] + 1)
}
matrix(unlist(paths), byrow=TRUE, nrow=length(paths))
}
元素的顺序不同。
n/m = 2
的基准测试:
expr min lq mean median uq max
f(16, 8) 47.488731 48.182502 52.04539 48.689082 57.558552 65.26211
f.2(16, 8) 38.291302 39.533287 43.61786 40.513500 48.673713 54.21076
f.3(16, 8) 38.289619 39.007766 40.21002 39.273940 39.970907 49.02320
f.989(16, 8) 35.000941 35.199950 38.09043 35.607685 40.725833 49.61785
f.roland(16, 8) 14.295560 14.399079 15.02285 14.559891 14.625825 23.54574
f.krassowski(16, 8) 9.343784 9.552871 10.20118 9.614251 9.863443 19.70659
值得注意的是,f.3
的内存占用最小:
expression
mem_alloc
f(16, 8)
5.7MB
f.2(16, 8)
3.14MB
f.3(16, 8)
1.57MB
f.989(16, 8)
3.14MB
f.roland(16, 8)
5.25MB
f.krassowski(16, 8)
6.37MB
对于n/m = 10
:
expr min lq mean median uq max
f(30, 3) 14.590784 14.819879 15.061327 14.970385 15.238594 15.74435
f.2(30, 3) 11.886532 12.164719 14.197877 12.267662 12.450575 32.47237
f.3(30, 3) 11.458760 11.597360 12.741168 11.706475 11.892549 30.36309
f.989(30, 3) 10.646286 10.861159 12.922651 10.971200 11.106610 30.86498
f.roland(30, 3) 3.513980 3.589361 4.559673 3.629923 3.727350 21.58201
f.krassowski(30, 3) 8.861349 8.927388 10.430068 9.022631 9.405705 32.70073
使用 RcppAlgos::permuteGeneral()
的方法。
RcppAlgos::permuteGeneral(1:0, freq=3:2)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 1 1 0 0
# [2,] 1 1 0 1 0
# [3,] 1 1 0 0 1
# [4,] 1 0 1 1 0
# [5,] 1 0 1 0 1
# [6,] 1 0 0 1 1
# [7,] 0 1 1 1 0
# [8,] 0 1 1 0 1
# [9,] 0 1 0 1 1
# [10,] 0 0 1 1 1
我想创建一个由固定数量的 0 和 1 组成的二进制向量的所有可能组合。例如: 昏暗(v)=5x1; n1=3; n0=2; 在这种情况下,我想要类似的东西:
1,1,1,0,0
1,1,0,1,0
1,1,0,0,1
1,0,1,1,0
1,0,1,0,1
1,0,0,1,1
0,1,1,1,0
0,1,1,0,1
0,1,0,1,1
0,0,1,1,1
我在阅读这篇文章时找到了一些帮助 post
你可以试试这个方法:
f <- function(n=5,m=3)
t(apply(combn(1:n,m=m),2,function(cm) replace(rep(0,n),cm,1)))
f(5,3)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 1 1 0 0
# [2,] 1 1 0 1 0
# [3,] 1 1 0 0 1
# [4,] 1 0 1 1 0
# [5,] 1 0 1 0 1
# [6,] 1 0 0 1 1
# [7,] 0 1 1 1 0
# [8,] 0 1 1 0 1
# [9,] 0 1 0 1 1
# [10,] 0 0 1 1 1
想法是生成 1 的所有索引组合,然后使用它们产生最终结果。
相同方法的另一种形式:
f.2 <- function(n=5,m=3)
t(combn(1:n,m,FUN=function(cm) replace(rep(0,n),cm,1)))
第二种方法大约快两倍:
library(rbenchmark)
benchmark(f(16,8),f.2(16,8))
# test replications elapsed relative user.self sys.self user.child sys.child
# 2 f.2(16, 8) 100 5.706 1.000 5.688 0.017 0 0
# 1 f(16, 8) 100 10.802 1.893 10.715 0.082 0 0
基准
f.akrun <- function(n=5,m=3) {
indx <- combnPrim(1:n,m)
DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
for(i in seq_len(nrow(DT))){
set(DT, i=i, j=indx[,i],value=1)
}
DT
}
benchmark(f(16,8),f.2(16,8),f.akrun(16,8))
# test replications elapsed relative user.self sys.self user.child sys.child
# 2 f.2(16, 8) 100 5.464 1.097 5.435 0.028 0 0
# 3 f.akrun(16, 8) 100 4.979 1.000 4.938 0.037 0 0
# 1 f(16, 8) 100 10.854 2.180 10.689 0.129 0 0
@akrun 的解决方案 (f.akrun
) 比 f.2
.
[编辑] 另一种方法,更快更简单:
f.3 <- function(n=5,m=3) t(combn(n,m,tabulate,nbins=n))
您可以尝试 gRbase
中的 combnPrim
以及 data.table
中的 set
(可能是 faster)
source("http://bioconductor.org/biocLite.R")
biocLite("gRbase")
library(gRbase)
library(data.table)
n <-5
indx <- combnPrim(1:n,3)
DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
for(i in seq_len(nrow(DT))){
set(DT, i=i, j=indx[,i],value=1)
}
DT
# V1 V2 V3 V4 V5
#1: 1 1 1 0 0
#2: 1 1 0 1 0
#3: 1 0 1 1 0
#4: 0 1 1 1 0
#5: 1 1 0 0 1
#6: 1 0 1 0 1
#7: 0 1 1 0 1
#8: 1 0 0 1 1
#9: 0 1 0 1 1
#10: 0 0 1 1 1
Marat 回答的稍微快一点的版本:
f.roland <- function(n, m) {
ind <- combn(seq_len(n), m)
ind <- t(ind) + (seq_len(ncol(ind)) - 1) * n
res <- rep(0, nrow(ind) * n)
res[ind] <- 1
matrix(res, ncol = n, nrow = nrow(ind), byrow = TRUE)
}
all.equal(f.2(16, 8), f.roland(16, 8))
#[1] TRUE
library(rbenchmark)
benchmark(f(16,8),f.2(16,8),f.roland(16,8))
# test replications elapsed relative user.self sys.self user.child sys.child
#2 f.2(16, 8) 100 5.693 1.931 5.670 0.020 0 0
#3 f.roland(16, 8) 100 2.948 1.000 2.929 0.017 0 0
#1 f(16, 8) 100 8.287 2.811 8.214 0.066 0 0
这是另一种方法:
func <- function(n, m) t(combn(n, m, function(a) {z=integer(n);z[a]=1;z}))
func(n = 5, m = 2)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 1 0 0 0
# [2,] 1 0 1 0 0
# [3,] 1 0 0 1 0
# [4,] 1 0 0 0 1
# [5,] 0 1 1 0 0
# [6,] 0 1 0 1 0
# [7,] 0 1 0 0 1
# [8,] 0 0 1 1 0
# [9,] 0 0 1 0 1
# [10,] 0 0 0 1 1
使用二叉树扩展比 f.roland
略有性能改进(n/m
大约等于 2,m << n f.roland
获胜),但代价是更高的内存用法:
f.krassowski = function(n, m) {
m_minus_n = m - n
paths = list(
c(0, rep(NA, n-1)),
c(1, rep(NA, n-1))
)
sums = c(0, 1)
for (level in 2:n) {
upper_threshold = level + m_minus_n
is_worth_adding_0 = (sums <= m) & (upper_threshold <= sums)
is_worth_adding_1 = (sums <= m - 1) & (upper_threshold - 1 <= sums)
x = paths[is_worth_adding_0]
y = paths[is_worth_adding_1]
for (i in 1:length(x)) {
x[[i]][[level]] = 0
}
for (i in 1:length(y)) {
y[[i]][[level]] = 1
}
paths = c(x, y)
sums = c(sums[is_worth_adding_0], sums[is_worth_adding_1] + 1)
}
matrix(unlist(paths), byrow=TRUE, nrow=length(paths))
}
元素的顺序不同。
n/m = 2
的基准测试:
expr min lq mean median uq max
f(16, 8) 47.488731 48.182502 52.04539 48.689082 57.558552 65.26211
f.2(16, 8) 38.291302 39.533287 43.61786 40.513500 48.673713 54.21076
f.3(16, 8) 38.289619 39.007766 40.21002 39.273940 39.970907 49.02320
f.989(16, 8) 35.000941 35.199950 38.09043 35.607685 40.725833 49.61785
f.roland(16, 8) 14.295560 14.399079 15.02285 14.559891 14.625825 23.54574
f.krassowski(16, 8) 9.343784 9.552871 10.20118 9.614251 9.863443 19.70659
值得注意的是,f.3
的内存占用最小:
expression | mem_alloc |
---|---|
f(16, 8) | 5.7MB |
f.2(16, 8) | 3.14MB |
f.3(16, 8) | 1.57MB |
f.989(16, 8) | 3.14MB |
f.roland(16, 8) | 5.25MB |
f.krassowski(16, 8) | 6.37MB |
对于n/m = 10
:
expr min lq mean median uq max
f(30, 3) 14.590784 14.819879 15.061327 14.970385 15.238594 15.74435
f.2(30, 3) 11.886532 12.164719 14.197877 12.267662 12.450575 32.47237
f.3(30, 3) 11.458760 11.597360 12.741168 11.706475 11.892549 30.36309
f.989(30, 3) 10.646286 10.861159 12.922651 10.971200 11.106610 30.86498
f.roland(30, 3) 3.513980 3.589361 4.559673 3.629923 3.727350 21.58201
f.krassowski(30, 3) 8.861349 8.927388 10.430068 9.022631 9.405705 32.70073
使用 RcppAlgos::permuteGeneral()
的方法。
RcppAlgos::permuteGeneral(1:0, freq=3:2)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 1 1 0 0
# [2,] 1 1 0 1 0
# [3,] 1 1 0 0 1
# [4,] 1 0 1 1 0
# [5,] 1 0 1 0 1
# [6,] 1 0 0 1 1
# [7,] 0 1 1 1 0
# [8,] 0 1 1 0 1
# [9,] 0 1 0 1 1
# [10,] 0 0 1 1 1