生成具有连续出现的向量的所有组合被视为单次出现

Generate all combinations of vector with consecutive occurrences is considered as single occurrence

我想生成包含向量元素所有可能组合的向量,其中元素的连续多次出现被视为该元素的单次出现。

简单案例

对于 n = 2,

original <- c("a","a","a","b","b","b")
      v1 <- c("b","b","b","a","a","a")

所以与 b 交换的所有唯一出现。

对于 n = 3,我们得到

original<-c("a","a","a","b","b","b","c","c","c")
    ver1<-c("a","a","a","c","c","c","b","b","b")
    ver2<-c("b","b","b","a","a","a","c","c","c")
    ver3<-c("b","b","b","c","c","c","a","a","a")
    ver4<-c("c","c","c","b","b","b","a","a","a")
    ver5<-c("c","c","c","a","a","a","b","b","b")

所以 a 的所有唯一出现与 bc 交换,b 的所有唯一出现与 a 和 [=16= 交换] AND 所有唯一出现的 cba 交换。

案例达到 n = 10。(我相信具有不同组合的可能向量是 10!)

此外,可以有多个 a、b、c...

复杂情况

对于 n = 2;

original<-c("a","a","a","b","b","b","a","a","b","b")
    ver1<-c("b","b","b","a","a","a","b","b","a","a")

但是如果我们正确地交换元素,复杂情况和简单情况应该无关紧要。

我在尝试什么:(对于 n=2)

original<-c("a","a","a","b","b","b","a","a","b","b")
ver1<-replace(original,which(original=='a'),'b')
ver1<-replace(ver1,which(original=='b'),'a')
gives ver1<-c("b","b","b","a","a","a","b","b","a","a")

但不确定如何自动执行此操作。

使用chartr,你可以这样做(尽管对于更大的向量这可能会崩溃):

f <- function(vec){
  uni <- unique(vec)
  size <- length(uni)
  pVec <- paste(uni, collapse = "")
  grid <- expand.grid(rep(list(uni), size))
  expanded <- grid[apply(grid, 1, function(x) length(unique(x))) == size,]
  p <- unname(apply(ex, 1, paste0, collapse = ""))
  
  lapply(p, function(x) chartr(pVec, x, vec))
}

输出:

original<-c("a","a","a","b","b","b","c","c","c")
f(original)

# [[1]]
# [1] "c" "c" "c" "b" "b" "b" "a" "a" "a"
# 
# [[2]]
# [1] "b" "b" "b" "c" "c" "c" "a" "a" "a"
# 
# [[3]]
# [1] "c" "c" "c" "a" "a" "a" "b" "b" "b"
# 
# [[4]]
# [1] "a" "a" "a" "c" "c" "c" "b" "b" "b"
# 
# [[5]]
# [1] "b" "b" "b" "a" "a" "a" "c" "c" "c"
# 
# [[6]]
# [1] "a" "a" "a" "b" "b" "b" "c" "c" "c"

上一个答案(不适用于 n > 2)。

使用gtools::permutations。结果是矩阵的每一列。这个想法是从唯一值中获取排列,并重复这些值以匹配所需的组长度。

f <- function(x){
  r <- rle(x)
  l <- length(r$values)
  apply(gtools::permutations(n=l, r=l, v=r$values), 1, function(x) rep(x, each = unique(r$l)))
}

这是一种使用非常快速的 arrangements 包进行排列的方法。我们计算与输入的唯一元素相对应的整数排列,然后进行一些巧妙的索引以输出相应的交换。这在小示例上非常快,在大示例上表现非常好——在我的计算机上,用 10 个唯一元素生成大小为 30 的输入的 10! = 3628800 交换只用了不到 7 秒。结果以 list.

形式方便地返回
library(arrangements)

all_swaps = function(x) {
  ux = unique(x)
  xi = as.integer(factor(x))
  perm = permutations(seq_along(ux))
  apply(perm, MARGIN = 1, FUN = \(p) ux[p][xi], simplify = FALSE)
}

问题的测试用例:

# n = 2
all_swaps(c("a","a","a","b","b","b","a","a","b","b"))
# [[1]]
#  [1] "a" "a" "a" "b" "b" "b" "a" "a" "b" "b"
# 
# [[2]]
#  [1] "b" "b" "b" "a" "a" "a" "b" "b" "a" "a"

## n = 3
all_swaps(c("a","a","a","b","b","b","c","c","c"))
# [[1]]
# [1] "a" "a" "a" "b" "b" "b" "c" "c" "c"
# 
# [[2]]
# [1] "a" "a" "a" "c" "c" "c" "b" "b" "b"
# 
# [[3]]
# [1] "b" "b" "b" "a" "a" "a" "c" "c" "c"
# 
# [[4]]
# [1] "b" "b" "b" "c" "c" "c" "a" "a" "a"
# 
# [[5]]
# [1] "c" "c" "c" "a" "a" "a" "b" "b" "b"
# 
# [[6]]
# [1] "c" "c" "c" "b" "b" "b" "a" "a" "a"

一个较短的演示,在“复杂”的情况下包含 3 个独特的元素,其中元素并非都是连续的:

all_swaps(c("a", "b", "b", "c", "b"))
# [[1]]
# [1] "a" "b" "b" "c" "b"
# 
# [[2]]
# [1] "a" "c" "c" "b" "c"
# 
# [[3]]
# [1] "b" "a" "a" "c" "a"
# 
# [[4]]
# [1] "b" "c" "c" "a" "c"
# 
# [[5]]
# [1] "c" "a" "a" "b" "a"
# 
# [[6]]
# [1] "c" "b" "b" "a" "b"

更大的箱子:

# n = 10
set.seed(47)
start_t = Sys.time()
n10 = all_swaps(sample(letters[1:10], size = 30, replace = TRUE))
end_t = Sys.time()
end_t - start_t
# Time difference of 6.711215 secs
length(n10)
# [1] 3628800

基准测试

将我的答案与 Maël 和 ThomasIsCoding 的答案进行对比,我的方法依赖于 arrangements 包,速度快且内存效率高。 ThomasIsCoding 的答案可以通过从 pracma::perms 更改为 arrangements::permutations 来改进——内存使用率得到了特别改进——但我的版本仍然表现更好。 Maël's 使用大量时间和内存。我将以结果为主导,重现代码如下。

## 5 Unique Elements
arrange(b5, desc(`itr/sec`))
# # A tibble: 4 × 13
#   expression                  min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#   <bch:expr>             <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
# 1 GregorThomas             2.31ms   12.6ms     77.5     5.77KB        0    40     0      516ms
# 2 ThomasIsCodingArr(in5)    9.3ms   20.5ms     47.4    19.55KB        0    24     0      506ms
# 3 ThomasIsCoding(in5)     12.57ms   22.7ms     41.2    45.41KB        0    22     0      534ms
# 4 Mael                   963.64ms  963.6ms      1.04    1.24MB        0     1     0      964ms
# # … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>

## 9 Unique Elements - memory allocation is important
arrange(b9, desc(`itr/sec`))
# # A tibble: 2 × 13
#   expression               min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result
#   <bch:expr>          <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>
# 1 GregorThomas            1.8s     1.8s     0.556    27.7MB    0         1     0       1.8s <NULL>
# 2 ThomasIsCoding(in9)     2.5s     2.5s     0.400   230.8MB    0.400     1     1       2.5s <NULL>
# # … with 3 more variables: memory <list>, time <list>, gc <list>

基准代码:

## Functions
library(arrangements)
library(pracma)
ThomasIsCoding <- function(x) {
  idx <- match(x, unique(x))
  m <- asplit(matrix(unique(x)[perms(1:max(idx))], ncol = max(idx)), 1)
  Map(`[`, m, list(idx))
}
ThomasIsCodingArr <- function(x) {
  idx <- match(x, unique(x))
  m <- asplit(matrix(unique(x)[permutations(1:max(idx))], ncol = max(idx)), 1)
  Map(`[`, m, list(idx))
}
Mael <- function(vec){
  uni <- unique(vec)
  size <- length(uni)
  pVec <- paste(uni, collapse = "")
  grid <- expand.grid(rep(list(uni), size))
  expanded <- grid[apply(grid, 1, function(x) length(unique(x))) == size,]
  p <- unname(apply(expanded, 1, paste0, collapse = ""))
  
  lapply(p, function(x) chartr(pVec, x, vec))
}
all_swaps = function(x) {
  ux = unique(x)
  xi = as.integer(factor(x))
  perm = permutations(seq_along(ux))
  apply(perm, MARGIN = 1, FUN = \(p) ux[p][xi], simplify = FALSE)
}

set.seed(47)
in5 = c(sample(letters[1:5], 5), sample(letters[1:5], 5, replace = TRUE))

b5 = bench::mark(
  GregorThomas = all_swaps(in5),
  Mael = Mael(in5),
  ThomasIsCoding(in5),
  ThomasIsCodingArr(in5),
  check = FALSE
)

更新

这里我们对之前的答案做了一些改进,结果存储在matrix中(而不是list),并应用arrangement::permuations(而不是pracma::perms(感谢的推荐)

f_TIC2 <- function(x) {
  u <- unique(x)
  idx <- match(x, u)
  n <- max(idx)
  m <- matrix(u[perms(1:n)], ncol = n)
  matrix(t(m)[c(outer(idx, (0:(nrow(m) - 1)) * ncol(m), `+`))], nrow = nrow(m), byrow = TRUE)
}

f_TIC2Arr <- function(x) {
  u <- unique(x)
  idx <- match(x, u)
  n <- max(idx)
  m <- matrix(u[permutations(1:n)], ncol = n)
  matrix(t(m)[c(outer(idx, (0:(nrow(m) - 1)) * ncol(m), `+`))], nrow = nrow(m), byrow = TRUE)
}

输出看起来像

> f_TIC2(c("a", "b", "b", "c", "b"))
     [,1] [,2] [,3] [,4] [,5]
[1,] "c"  "b"  "b"  "a"  "b"
[2,] "c"  "a"  "a"  "b"  "a"
[3,] "b"  "c"  "c"  "a"  "c"
[4,] "b"  "a"  "a"  "c"  "a"
[5,] "a"  "b"  "b"  "c"  "b"
[6,] "a"  "c"  "c"  "b"  "c"

> f_TIC2Arr(c("a", "b", "b", "c", "b"))
     [,1] [,2] [,3] [,4] [,5]
[1,] "a"  "b"  "b"  "c"  "b"
[2,] "a"  "c"  "c"  "b"  "c"
[3,] "b"  "a"  "a"  "c"  "a"
[4,] "b"  "c"  "c"  "a"  "c"
[5,] "c"  "a"  "a"  "b"  "a"
[6,] "c"  "b"  "b"  "a"  "b"

基准测试

这里是一些现有答案的基准(Maël 的解决方案计算量大,因此被跳过。)

NB:这个基准是 NOT 100% 公平,因为我改进的解决方案产生矩阵而不是列表,这节省了很多时间.因此,比较是并不是说我的是最快的,而是指出提高性能的可能方法

library(RcppAlgos)
library(arrangements)
library(pracma)
f_TIC1 <- function(x) {
  idx <- match(x, unique(x))
  m <- asplit(matrix(unique(x)[perms(1:max(idx))], ncol = max(idx)), 1)
  Map(`[`, m, list(idx))
}
f_TIC1Arr <- function(x) {
  idx <- match(x, unique(x))
  m <- asplit(matrix(unique(x)[permutations(1:max(idx))], ncol = max(idx)), 1)
  Map(`[`, m, list(idx))
}
f_TIC2 <- function(x) {
  u <- unique(x)
  idx <- match(x, u)
  n <- max(idx)
  m <- matrix(u[perms(1:n)], ncol = n)
  matrix(t(m)[outer(idx, (0:(nrow(m) - 1)) * ncol(m), `+`)], nrow = nrow(m), byrow = TRUE)
}

f_TIC2Arr <- function(x) {
  u <- unique(x)
  idx <- match(x, u)
  n <- max(idx)
  m <- matrix(u[permutations(1:n)], ncol = n)
  matrix(t(m)[outer(idx, (0:(nrow(m) - 1)) * ncol(m), `+`)], nrow = nrow(m), byrow = TRUE)
}

f_GT <- function(x) {
  ux <- unique(x)
  xi <- as.integer(factor(x))
  perm <- permutations(seq_along(ux))
  apply(perm, MARGIN = 1, FUN = \(p) ux[p][xi], simplify = FALSE)
}

f_RS <- function(x) {
  permuteGeneral(uv <- unique(x), length(uv), FUN = \(m) uv[match(x, m)])
}

set.seed(1)
x <- sample(letters[1:10], 10, replace = TRUE)

bm <- bench::mark(
  f_GT = f_GT(x),
  f_TIC1 = f_TIC1(x),
  f_TIC1Arr = f_TIC1Arr(x),
  f_TIC2 = f_TIC2(x),
  f_TIC2Arr = f_TIC2Arr(x),
  f_RS = f_RS(x),
  check = FALSE
)
autoplot(bm)

你会看到

> bm
# A tibble: 6 x 13
  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 f_GT       11.55ms 15.57ms      58.9  315.14KB     7.06    25     3      425ms
2 f_TIC1     17.05ms  20.8ms      45.5    2.58MB    10.1     18     4      396ms
3 f_TIC1Arr  16.45ms 19.62ms      48.9    1.06MB    13.6     18     5      368ms
4 f_TIC2      2.47ms  3.31ms     259.     3.84MB    28.5     91    10      351ms
5 f_TIC2Arr   1.54ms   1.7ms     469.     2.35MB    26.2    197    11      420ms
6 f_RS        5.66ms  7.46ms      93.9   72.75KB     9.63    39     4      415ms
# ... with 4 more variables: result <list>, memory <list>, time <list>,
#   gc <list>


上一个答案

您可以像下面那样尝试pracma::perms

library(pracma)
f <- function(x) {
  idx <- match(x, unique(x))
  m <- asplit(matrix(unique(x)[perms(1:max(idx))], ncol = max(idx)), 1)
  Map(`[`, m, list(idx))
}

你会看到

> f(c("a", "a", "a", "b", "b", "b", "a", "a", "b", "b"))
[[1]]
 [1] "b" "b" "b" "a" "a" "a" "b" "b" "a" "a"

[[2]]
 [1] "a" "a" "a" "b" "b" "b" "a" "a" "b" "b"


> f(c("a", "b", "b", "c", "b"))
[[1]]
[1] "c" "b" "b" "a" "b"

[[2]]
[1] "c" "a" "a" "b" "a"

[[3]]
[1] "b" "c" "c" "a" "c"

[[4]]
[1] "b" "a" "a" "c" "a"

[[5]]
[1] "a" "b" "b" "c" "b"

[[6]]
[1] "a" "c" "c" "b" "c"

此答案采用与已发布的答案相同的一般方法,但使用 RcppAlgos::permuteGeneral(),这不仅速度非常快,而且还允许将函数应用于排列。

library(RcppAlgos)

f <- function(x) permuteGeneral(uv <- unique(x), length(uv), FUN = \(m) uv[match(x, m)])

f(original)
[[1]]
[1] "a" "a" "a" "b" "b" "b" "c" "c" "c"

[[2]]
[1] "a" "a" "a" "c" "c" "c" "b" "b" "b"

[[3]]
[1] "b" "b" "b" "a" "a" "a" "c" "c" "c"

[[4]]
[1] "c" "c" "c" "a" "a" "a" "b" "b" "b"

[[5]]
[1] "b" "b" "b" "c" "c" "c" "a" "a" "a"

[[6]]
[1] "c" "c" "c" "b" "b" "b" "a" "a" "a"

这是基础 R 解决方案:

vec <- c("a","a","a","b","b","b","c","c","c")  # original vector
els <- unique(vec)                             # unique elements

pers <- do.call(expand.grid, args=rep(list(els), length(els)))  # all permutations
pers <- as.matrix(pers[apply(pers, 1, anyDuplicated) == 0,])    # no repeated cases
colnames(pers) <- els

unname(pers[,vec])

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "c"  "c"  "c"  "b"  "b"  "b"  "a"  "a"  "a"
[2,] "b"  "b"  "b"  "c"  "c"  "c"  "a"  "a"  "a"
[3,] "c"  "c"  "c"  "a"  "a"  "a"  "b"  "b"  "b"
[4,] "a"  "a"  "a"  "c"  "c"  "c"  "b"  "b"  "b"
[5,] "b"  "b"  "b"  "a"  "a"  "a"  "c"  "c"  "c"
[6,] "a"  "a"  "a"  "b"  "b"  "b"  "c"  "c"  "c"