如何总结组合列表
How to summarize a list of combination
我有一个 2 元素组合的列表,如下所示。
cbnl <- list(
c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
c("D", "E"), c("C", "E")
)
我想总结一下上面的列表。预期结果如下表所示。向量中元素的顺序在这里无关紧要。
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
(规则 1){A, B} 等价于 {B, A}。对应这个我觉得我可以做到。
cbnl <- unique(lapply(cbnl, function(i) { sort(i) }))
(规则2) {A, B}, {B, C} (其中一个元素是公共的)然后取两个集合的并集。结果是 {A, B, C}。我没有明确的好主意来做这个。
有什么有效的方法吗?
我知道这个答案更像是一个传统的编程而不是“R like”,但它解决了这个问题。
cbnl <- unique(lapply(cbnl, sort))
i <- 1
count <- 1
out <- list()
while (i <= length(cbnl) - 1) {
if (sum(cbnl[[i]] %in% cbnl[[i + 1]]) == 0) {
out[[count]] <- cbnl[[i]]
} else {
out[[count]] <- sort(unique(c(cbnl[[i]], cbnl[[i + 1]])))
i <- i + 1
}
count <- count + 1
i <- i + 1
}
out
给予,
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
您可以尝试以下igraph
选项
library(igraph)
graph_from_data_frame(do.call(rbind, cbnl)) %>%
components() %>%
membership() %>%
stack() %>%
with(., split(as.character(ind), values))
这给出了
$`1`
[1] "A" "B"
$`2`
[1] "C" "E" "D"
$`3`
[1] "F" "G"
$`4`
[1] "H" "I"
$`5`
[1] "J" "K"
短一点
graph_from_data_frame(do.call(rbind, cbnl)) %>%
decompose() %>%
Map(function(x) names(V(x)), .)
这给出了
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "E" "D"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
Base R: sort
ing union
as FUN=
in combn
, 然后部分填充基于独特的矩阵元素 u
并删除 duplicated
行,最后强制转换 as.list
.
u <- Reduce(union, cbnl) ## get unique elements
res <- combn(cbnl, 2, \(x) {
if (length(intersect(x[[1]], x[[2]])) > 0) {
union(x[[1]], x[[2]])
} else {
el(x)
}
}, simplify=FALSE) |>
unique() |>
(\(x) sapply(x, \(i) replace(rep(NA, length(u)), match(i, u), i)))() |>
(\(x) x[, !colSums(duplicated(x, MARGIN=1:2)) == nrow(x)])() |>
(\(x) unname(lapply(as.list(as.data.frame(x)), \(x) x[!is.na(x)])))()
res
# [[1]]
# [1] "A" "B"
#
# [[2]]
# [1] "C" "D" "E"
#
# [[3]]
# [1] "F" "G"
#
# [[4]]
# [1] "H" "I"
#
# [[5]]
# [1] "J" "K"
注:
> R.version.string
[1] "R version 4.1.2 (2021-11-01)"
我从@ThomasIsCoding 获取了一行代码,并想证明我们可以使用我的包 dedupewider
.
library(dedupewider)
library(purrr)
library(magrittr)
cbnl <- list(
c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
c("D", "E"), c("C", "E")
)
cbnl_df <- data.frame(do.call(rbind, cbnl))
result <- dedupe_wide(cbnl_df, names(cbnl_df)) # it performs deduplication by connecting elements which are linked by transitive relation
result_list <- as.list(as.data.frame(t(result)))
result_list <- map(result_list, ~ .x[!is.na(.x)]) # remove NA
result_list
#> $V1
#> [1] "A" "B"
#>
#> $V2
#> [1] "C" "E" "D"
#>
#> $V3
#> [1] "F" "G"
#>
#> $V4
#> [1] "H" "I"
#>
#> $V5
#> [1] "J" "K"
需要很多步骤,因为列表是输入和输出,所以使用 data.frame 我们的代码会比上面少。
感谢各位支持者的精彩解答
让我 post 我自己的基础 R 解决方案如下;
cbnl <- list(
c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
c("D", "E"), c("C", "E")
)
repeat {
# Get A Count Table
tbl <- table(unlist(cbnl))
# No Duplicated Items Then break Out
if (length(tbl[tbl > 1]) == 0) { break }
# Take A First Duplicated Item And Get the Index
idx <- which(sapply(seq_len(length(cbnl)), function(i) {
any(cbnl[[i]] == names(tbl[tbl > 1])[1])
}))
# Create New vector By Taking Union
newvec <- sort(unique(unlist(cbnl[idx])))
# Append newvec To cbnl And Remove Original vectors
cbnl <- c(cbnl, list(newvec))[-idx]
}
cbnl
结果是
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
这里是 data.table 版本。
cbn <- data.table(
item1 = c("A", "B", "C", "E", "F", "H", "J", "I", "K", "G", "D", "E", "D", "C"),
item2 = c("B", "A", "D", "D", "G", "I", "K", "H", "J", "F", "C", "C", "E", "E")
)
repeat {
# Get A Count Table
tbl <- table(as.vector(as.matrix(cbn)))
# No Duplicated Items Then break Out
if (length(tbl[tbl > 1]) == 0) { break }
# Take A First Duplicated Item And Get Row Numbers Where The Item Is Located
idx <- which(cbn == names(tbl[tbl > 1])[1], arr.ind = TRUE)[, 1]
# Create New Row By Taking Union
newrow <- setDT(as.list(sort(unique(as.vector(as.matrix(cbn[idx]))))))
names(newrow) <- paste0("item", seq_len(ncol(newrow)))
# Append newrow To cbn And Remove Original Rows
cbn <- rbindlist(l = list(cbn, newrow), use.names = TRUE, fill = TRUE)[-idx]
}
cbn
结果如下
item1 item2 item3
1: A B <NA>
2: C D E
3: F G <NA>
4: H I <NA>
5: J K <NA>
我有一个 2 元素组合的列表,如下所示。
cbnl <- list(
c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
c("D", "E"), c("C", "E")
)
我想总结一下上面的列表。预期结果如下表所示。向量中元素的顺序在这里无关紧要。
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
(规则 1){A, B} 等价于 {B, A}。对应这个我觉得我可以做到。
cbnl <- unique(lapply(cbnl, function(i) { sort(i) }))
(规则2) {A, B}, {B, C} (其中一个元素是公共的)然后取两个集合的并集。结果是 {A, B, C}。我没有明确的好主意来做这个。
有什么有效的方法吗?
我知道这个答案更像是一个传统的编程而不是“R like”,但它解决了这个问题。
cbnl <- unique(lapply(cbnl, sort))
i <- 1
count <- 1
out <- list()
while (i <= length(cbnl) - 1) {
if (sum(cbnl[[i]] %in% cbnl[[i + 1]]) == 0) {
out[[count]] <- cbnl[[i]]
} else {
out[[count]] <- sort(unique(c(cbnl[[i]], cbnl[[i + 1]])))
i <- i + 1
}
count <- count + 1
i <- i + 1
}
out
给予,
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
您可以尝试以下igraph
选项
library(igraph)
graph_from_data_frame(do.call(rbind, cbnl)) %>%
components() %>%
membership() %>%
stack() %>%
with(., split(as.character(ind), values))
这给出了
$`1`
[1] "A" "B"
$`2`
[1] "C" "E" "D"
$`3`
[1] "F" "G"
$`4`
[1] "H" "I"
$`5`
[1] "J" "K"
短一点
graph_from_data_frame(do.call(rbind, cbnl)) %>%
decompose() %>%
Map(function(x) names(V(x)), .)
这给出了
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "E" "D"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
Base R: sort
ing union
as FUN=
in combn
, 然后部分填充基于独特的矩阵元素 u
并删除 duplicated
行,最后强制转换 as.list
.
u <- Reduce(union, cbnl) ## get unique elements
res <- combn(cbnl, 2, \(x) {
if (length(intersect(x[[1]], x[[2]])) > 0) {
union(x[[1]], x[[2]])
} else {
el(x)
}
}, simplify=FALSE) |>
unique() |>
(\(x) sapply(x, \(i) replace(rep(NA, length(u)), match(i, u), i)))() |>
(\(x) x[, !colSums(duplicated(x, MARGIN=1:2)) == nrow(x)])() |>
(\(x) unname(lapply(as.list(as.data.frame(x)), \(x) x[!is.na(x)])))()
res
# [[1]]
# [1] "A" "B"
#
# [[2]]
# [1] "C" "D" "E"
#
# [[3]]
# [1] "F" "G"
#
# [[4]]
# [1] "H" "I"
#
# [[5]]
# [1] "J" "K"
注:
> R.version.string
[1] "R version 4.1.2 (2021-11-01)"
我从@ThomasIsCoding 获取了一行代码,并想证明我们可以使用我的包 dedupewider
.
library(dedupewider)
library(purrr)
library(magrittr)
cbnl <- list(
c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
c("D", "E"), c("C", "E")
)
cbnl_df <- data.frame(do.call(rbind, cbnl))
result <- dedupe_wide(cbnl_df, names(cbnl_df)) # it performs deduplication by connecting elements which are linked by transitive relation
result_list <- as.list(as.data.frame(t(result)))
result_list <- map(result_list, ~ .x[!is.na(.x)]) # remove NA
result_list
#> $V1
#> [1] "A" "B"
#>
#> $V2
#> [1] "C" "E" "D"
#>
#> $V3
#> [1] "F" "G"
#>
#> $V4
#> [1] "H" "I"
#>
#> $V5
#> [1] "J" "K"
需要很多步骤,因为列表是输入和输出,所以使用 data.frame 我们的代码会比上面少。
感谢各位支持者的精彩解答
让我 post 我自己的基础 R 解决方案如下;
cbnl <- list(
c("A", "B"), c("B", "A"), c("C", "D"), c("E", "D"), c("F", "G"), c("H", "I"),
c("J", "K"), c("I", "H"), c("K", "J"), c("G", "F"), c("D", "C"), c("E", "C"),
c("D", "E"), c("C", "E")
)
repeat {
# Get A Count Table
tbl <- table(unlist(cbnl))
# No Duplicated Items Then break Out
if (length(tbl[tbl > 1]) == 0) { break }
# Take A First Duplicated Item And Get the Index
idx <- which(sapply(seq_len(length(cbnl)), function(i) {
any(cbnl[[i]] == names(tbl[tbl > 1])[1])
}))
# Create New vector By Taking Union
newvec <- sort(unique(unlist(cbnl[idx])))
# Append newvec To cbnl And Remove Original vectors
cbnl <- c(cbnl, list(newvec))[-idx]
}
cbnl
结果是
[[1]]
[1] "A" "B"
[[2]]
[1] "C" "D" "E"
[[3]]
[1] "F" "G"
[[4]]
[1] "H" "I"
[[5]]
[1] "J" "K"
这里是 data.table 版本。
cbn <- data.table(
item1 = c("A", "B", "C", "E", "F", "H", "J", "I", "K", "G", "D", "E", "D", "C"),
item2 = c("B", "A", "D", "D", "G", "I", "K", "H", "J", "F", "C", "C", "E", "E")
)
repeat {
# Get A Count Table
tbl <- table(as.vector(as.matrix(cbn)))
# No Duplicated Items Then break Out
if (length(tbl[tbl > 1]) == 0) { break }
# Take A First Duplicated Item And Get Row Numbers Where The Item Is Located
idx <- which(cbn == names(tbl[tbl > 1])[1], arr.ind = TRUE)[, 1]
# Create New Row By Taking Union
newrow <- setDT(as.list(sort(unique(as.vector(as.matrix(cbn[idx]))))))
names(newrow) <- paste0("item", seq_len(ncol(newrow)))
# Append newrow To cbn And Remove Original Rows
cbn <- rbindlist(l = list(cbn, newrow), use.names = TRUE, fill = TRUE)[-idx]
}
cbn
结果如下
item1 item2 item3
1: A B <NA>
2: C D E
3: F G <NA>
4: H I <NA>
5: J K <NA>