按组生成长格式的组合
Combinations in long format by group
我有一个包含父母(parents)及其子女(children)的数据集,形式如下:
Children_id Parent_id
10 1
11 1
12 1
13 2
14 2
我想要的是每个 child 的兄弟姐妹的长格式数据集,即
id sibling_id
10 11
10 12
11 10
11 12
12 10
12 11
13 14
14 13
实现此目的的最佳方法是什么?最好使用 data.table。
示例数据:
# Example data: one row per child, together with the id of that child's parent.
df <- data.frame(
  Children_id = c(10, 11, 12, 13, 14),
  Parent_id = rep(c(1, 2), times = c(3, 2))
)
在 base R 中,我们可以在按组拆分(split)之后使用 expand.grid:
# Split the child ids by parent, cross each group with itself, drop the
# self-pairs, and stack the per-parent results into one data frame.
out <- do.call(
  rbind,
  lapply(
    split(df$Children_id, df$Parent_id),
    function(ids) {
      pairs <- expand.grid(ids, ids)
      # Swap the two columns so the first one varies slowest.
      subset(pairs, Var1 != Var2)[2:1]
    }
  )
)
rownames(out) <- NULL
names(out) <- c("id", "sibling_id")
-输出
> out
id sibling_id
1 10 11
2 10 12
3 11 10
4 11 12
5 12 10
6 12 11
7 13 14
8 14 13
或使用 data.table 的 CJ(交叉连接):
library(data.table)
# Convert df to a data.table by reference, build every within-parent
# (id, sibling_id) pair with a cross join, then drop the self-pairs.
setDT(df)
df[, CJ(id = Children_id, sibling_id = Children_id), by = Parent_id][
  id != sibling_id,
  .(id, sibling_id)
]
id sibling_id
<num> <num>
1: 10 11
2: 10 12
3: 11 10
4: 11 12
5: 12 10
6: 12 11
7: 13 14
8: 14 13
图论(graph)方面的专家也许有更好的解决方案,不过这里给出一个 data.table 解决方案:
library(data.table)
# Self-join df on Parent_id so every child is paired with every child of the
# same parent, then drop the rows pairing a child with itself.
# The two steps are chained with data.table's own `[...][...]` syntax, so no
# pipe operator (and therefore no extra package) is needed.
setDT(df)[df, on = .(Parent_id), allow.cartesian = TRUE][
  Children_id != i.Children_id,
  # i.Children_id comes from the table in the `i` position of the join.
  .(id = i.Children_id, sibling = Children_id)
]
输出:
id sibling
<num> <num>
1: 10 11
2: 10 12
3: 11 10
4: 11 12
5: 12 10
6: 12 11
7: 13 14
8: 14 13
使用 inner_join 的 dplyr 解决方案:
library(dplyr)
# Self-join on the parent so every child is paired with every child of the
# same parent, then keep only the pairs made of two different children.
inner_join(
  df, df,
  by = "Parent_id",
  # Each parent legitimately matches several rows on both sides; declare it
  # so dplyr >= 1.1.0 does not warn about a many-to-many join.
  relationship = "many-to-many"
) %>%
  select(id = Children_id.x, siblings = Children_id.y) %>%
  filter(id != siblings)
id siblings
1 10 11
2 10 12
3 11 10
4 11 12
5 12 10
6 12 11
7 13 14
8 14 13
或其他策略:
library(dplyr)
# For each parent, attach the full list of that parent's children to every
# child row, expand the list into one row per (child, candidate sibling),
# and drop the rows pairing a child with itself.
df %>%
  group_by(Parent_id) %>%
  mutate(siblings = list(unique(Children_id))) %>%
  # Drop the grouping so the result is a plain, ungrouped table.
  ungroup() %>%
  # unnest() is a tidyr function; call it by namespace because only dplyr
  # is attached in this snippet.
  tidyr::unnest(siblings) %>%
  filter(Children_id != siblings)
我有一个包含父母(parents)及其子女(children)的数据集,形式如下:
Children_id Parent_id
10 1
11 1
12 1
13 2
14 2
我想要的是每个 child 的兄弟姐妹的长格式数据集,即
id sibling_id
10 11
10 12
11 10
11 12
12 10
12 11
13 14
14 13
实现此目的的最佳方法是什么?最好使用 data.table。
示例数据:
# Example data: one row per child, together with the id of that child's parent.
df <- data.frame(
  Children_id = c(10, 11, 12, 13, 14),
  Parent_id = rep(c(1, 2), times = c(3, 2))
)
在 base R 中,我们可以在按组拆分(split)之后使用 expand.grid:
# Split the child ids by parent, cross each group with itself, drop the
# self-pairs, and stack the per-parent results into one data frame.
out <- do.call(
  rbind,
  lapply(
    split(df$Children_id, df$Parent_id),
    function(ids) {
      pairs <- expand.grid(ids, ids)
      # Swap the two columns so the first one varies slowest.
      subset(pairs, Var1 != Var2)[2:1]
    }
  )
)
rownames(out) <- NULL
names(out) <- c("id", "sibling_id")
-输出
> out
id sibling_id
1 10 11
2 10 12
3 11 10
4 11 12
5 12 10
6 12 11
7 13 14
8 14 13
或使用 data.table 的 CJ(交叉连接):
library(data.table)
# Convert df to a data.table by reference, build every within-parent
# (id, sibling_id) pair with a cross join, then drop the self-pairs.
setDT(df)
df[, CJ(id = Children_id, sibling_id = Children_id), by = Parent_id][
  id != sibling_id,
  .(id, sibling_id)
]
id sibling_id
<num> <num>
1: 10 11
2: 10 12
3: 11 10
4: 11 12
5: 12 10
6: 12 11
7: 13 14
8: 14 13
图论(graph)方面的专家也许有更好的解决方案,不过这里给出一个 data.table 解决方案:
library(data.table)
# Self-join df on Parent_id so every child is paired with every child of the
# same parent, then drop the rows pairing a child with itself.
# The two steps are chained with data.table's own `[...][...]` syntax, so no
# pipe operator (and therefore no extra package) is needed.
setDT(df)[df, on = .(Parent_id), allow.cartesian = TRUE][
  Children_id != i.Children_id,
  # i.Children_id comes from the table in the `i` position of the join.
  .(id = i.Children_id, sibling = Children_id)
]
输出:
id sibling
<num> <num>
1: 10 11
2: 10 12
3: 11 10
4: 11 12
5: 12 10
6: 12 11
7: 13 14
8: 14 13
使用 inner_join 的 dplyr 解决方案:
library(dplyr)
# Self-join on the parent so every child is paired with every child of the
# same parent, then keep only the pairs made of two different children.
inner_join(
  df, df,
  by = "Parent_id",
  # Each parent legitimately matches several rows on both sides; declare it
  # so dplyr >= 1.1.0 does not warn about a many-to-many join.
  relationship = "many-to-many"
) %>%
  select(id = Children_id.x, siblings = Children_id.y) %>%
  filter(id != siblings)
id siblings
1 10 11
2 10 12
3 11 10
4 11 12
5 12 10
6 12 11
7 13 14
8 14 13
或其他策略:
library(dplyr)
# For each parent, attach the full list of that parent's children to every
# child row, expand the list into one row per (child, candidate sibling),
# and drop the rows pairing a child with itself.
df %>%
  group_by(Parent_id) %>%
  mutate(siblings = list(unique(Children_id))) %>%
  # Drop the grouping so the result is a plain, ungrouped table.
  ungroup() %>%
  # unnest() is a tidyr function; call it by namespace because only dplyr
  # is attached in this snippet.
  tidyr::unnest(siblings) %>%
  filter(Children_id != siblings)