如何总结组合列表

How to summarize a list of combination

我有一个 2 元素组合的列表,如下所示。

cbnl <- list(
  c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
  c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
  c("D", "E"), c("C", "E")
)

我想总结一下上面的列表。预期结果如下表所示。向量中元素的顺序在这里无关紧要。

[[1]]
[1] "A" "B"

[[2]]
[1] "C" "D" "E"

[[3]]
[1] "F" "G"

[[4]]
[1] "H" "I"

[[5]]
[1] "J" "K"

(规则 1){A, B} 等价于 {B, A}。对应这个我觉得我可以做到。

cbnl <- unique(lapply(cbnl, function(i) { sort(i) }))

(规则2) {A, B}, {B, C} (其中一个元素是公共的)然后取两个集合的并集。结果是 {A, B, C}。我没有明确的好主意来做这个。

有什么有效的方法吗?

我知道这个答案更像是一个传统的编程而不是“R like”,但它解决了这个问题。

cbnl <- unique(lapply(cbnl, sort))

i <- 1
count <- 1
out <- list()

while (i <= length(cbnl) - 1) {
  if (sum(cbnl[[i]] %in% cbnl[[i + 1]]) == 0) {
    out[[count]] <- cbnl[[i]]
    } else {
      out[[count]] <- sort(unique(c(cbnl[[i]], cbnl[[i + 1]])))
      i <- i + 1        
    }
  count <- count + 1
  i <- i + 1 
}

out

给予,

[[1]]
[1] "A" "B"

[[2]]
[1] "C" "D" "E"

[[3]]
[1] "F" "G"

[[4]]
[1] "H" "I"

[[5]]
[1] "J" "K"

您可以尝试以下igraph选项

library(igraph)

graph_from_data_frame(do.call(rbind, cbnl)) %>%
  components() %>%
  membership() %>%
  stack() %>%
  with(., split(as.character(ind), values))

这给出了

$`1`
[1] "A" "B"

$`2`
[1] "C" "E" "D"

$`3`
[1] "F" "G"

$`4`
[1] "H" "I"

$`5`
[1] "J" "K"

短一点

graph_from_data_frame(do.call(rbind, cbnl)) %>%
  decompose() %>%
  Map(function(x) names(V(x)), .)

这给出了

[[1]]
[1] "A" "B"

[[2]]
[1] "C" "E" "D"

[[3]]
[1] "F" "G"

[[4]]
[1] "H" "I"

[[5]]
[1] "J" "K"

Base R: sorting union as FUN= in combn, 然后部分填充基于独特的矩阵元素 u 并删除 duplicated 行,最后强制转换 as.list.

u <- Reduce(union, cbnl)  ## get unique elements

res <- combn(cbnl, 2, \(x) {
  if (length(intersect(x[[1]], x[[2]])) > 0) {
    union(x[[1]], x[[2]])
  } else {
    el(x)
  }
}, simplify=FALSE) |>
  unique() |>
  (\(x) sapply(x, \(i) replace(rep(NA, length(u)), match(i, u), i)))() |>
  (\(x) x[, !colSums(duplicated(x, MARGIN=1:2)) == nrow(x)])() |>
  (\(x) unname(lapply(as.list(as.data.frame(x)), \(x) x[!is.na(x)])))()

res
# [[1]]
# [1] "A" "B"
# 
# [[2]]
# [1] "C" "D" "E"
# 
# [[3]]
# [1] "F" "G"
# 
# [[4]]
# [1] "H" "I"
# 
# [[5]]
# [1] "J" "K"

注:

> R.version.string
[1] "R version 4.1.2 (2021-11-01)"

我从@ThomasIsCoding 获取了一行代码,并想证明我们可以使用我的包 dedupewider.

library(dedupewider)
library(purrr)
library(magrittr)

cbnl <- list(
  c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
  c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
  c("D", "E"), c("C", "E")
)

cbnl_df <- data.frame(do.call(rbind, cbnl))

result <- dedupe_wide(cbnl_df, names(cbnl_df)) # it performs deduplication by connecting elements which are linked by transitive relation

result_list <- as.list(as.data.frame(t(result)))

result_list <- map(result_list, ~ .x[!is.na(.x)]) # remove NA
result_list
#> $V1
#> [1] "A" "B"
#> 
#> $V2
#> [1] "C" "E" "D"
#> 
#> $V3
#> [1] "F" "G"
#> 
#> $V4
#> [1] "H" "I"
#> 
#> $V5
#> [1] "J" "K"

需要很多步骤,因为列表是输入和输出,所以使用 data.frame 我们的代码会比上面少。

感谢各位支持者的精彩解答

让我 post 我自己的基础 R 解决方案如下;

cbnl <- list(
  c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
  c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
  c("D", "E"), c("C", "E")
)

repeat {
  # Get A Count Table
  tbl <- table(unlist(cbnl))
  # No Duplicated Items Then break Out
  if (length(tbl[tbl > 1]) == 0) { break }
  # Take A First Duplicated Item And Get the Index
  idx <- which(sapply(seq_len(length(cbnl)), function(i) {
    any(cbnl[[i]] == names(tbl[tbl > 1])[1])
  }))
  # Create New vector By Taking Union
  newvec <- sort(unique(unlist(cbnl[idx])))
  # Append newvec To cbnl And Remove Original vectors
  cbnl <- c(cbnl, list(newvec))[-idx]
}

cbnl

结果是

[[1]]
[1] "A" "B"

[[2]]
[1] "C" "D" "E"

[[3]]
[1] "F" "G"

[[4]]
[1] "H" "I"

[[5]]
[1] "J" "K"

这里是 data.table 版本。

cbn <- data.table(
  item1 = c("A", "B", "C", "E", "F", "H", "J", "I", "K", "G", "D", "E", "D", "C"),
  item2 = c("B", "A", "D", "D", "G", "I", "K", "H", "J", "F", "C", "C", "E", "E")
)

repeat {
  # Get A Count Table
  tbl <- table(as.vector(as.matrix(cbn)))
  # No Duplicated Items Then break Out
  if (length(tbl[tbl > 1]) == 0) { break }
  # Take A First Duplicated Item And Get Row Numbers Where The Item Is Located
  idx <- which(cbn == names(tbl[tbl > 1])[1], arr.ind = TRUE)[, 1]
  # Create New Row By Taking Union
  newrow <- setDT(as.list(sort(unique(as.vector(as.matrix(cbn[idx]))))))
  names(newrow) <- paste0("item", seq_len(ncol(newrow)))
  # Append newrow To cbn And Remove Original Rows
  cbn <- rbindlist(l = list(cbn, newrow), use.names = TRUE, fill = TRUE)[-idx]
}

cbn

结果如下

   item1 item2 item3
1:     A     B  <NA>
2:     C     D     E
3:     F     G  <NA>
4:     H     I  <NA>
5:     J     K  <NA>