如何为拥有共同成员的两个组创建一个通用 ID
How to create a common ID for two groups that share members in common
我正在尝试为群组创建一个新 ID,这些群组应该链接在一起,因为它们有共同的成员。下面的示例显示了问题。
第一个 table 告诉我 dedupe id 1 有成员 12,23 和 34
Dedupe ID # | Member |
---|---|
1 | 12 |
1 | 23 |
1 | 34 |
2 | 56 |
... | ... |
而第二个 table 告诉我 dedupe ID 5 有成员 12、23 和 47。请注意,其中的成员 12 和 23 与第一个 table 中 dedupe ID 1 的成员相同。
Dedupe ID # | Member |
---|---|
5 | 12 |
5 | 23 |
5 | 47 |
2 | 7 |
... | ... |
由于第 1 组和第 5 组有重叠,我想确保成员 12、23、34 和 47 归入同一个 dedupe ID,如下所示:
dedupe ID # | Member |
---|---|
1 | 12 |
1 | 23 |
1 | 34 |
1 | 47 |
2 | 56 |
... | ... |
@akrun,问得好。也许更好的方法是更新 table 1 以便 dedupeid 1 映射到成员 47(参见上面的示例)。 table 1 中的所有其他映射都应保留。
我担心当存在多个共同成员时这种方法可能会出问题,而且它在循环中反复向 dat1 追加行,
数据规模较大时效率不高,但是……
# Merge every dat2 group that shares at least one member with a dat1 group
# into that dat1 group; dat2 groups without any overlap are kept unchanged.
# Note: dat1 still grows via rbind() inside the loop, which is slow at scale.
members2 <- split(dat2$Member, dat2$Group)
for (G in unique(dat1$Group)) {
  m1 <- dat1$Member[dat1$Group == G]
  # Names of the dat2 groups that have at least one member in common with G.
  shared <- names(members2)[lengths(lapply(members2, intersect, m1)) > 0]
  overlapping <- dat2$Group %in% shared
  if (any(overlapping)) {
    # Only members not already present in G need to be added to dat1.
    move2 <- dat2[overlapping & !dat2$Member %in% m1, , drop = FALSE]
    if (nrow(move2) > 0) {
      dat1 <- rbind(dat1, transform(move2, Group = G))
    }
    # Drop the merged groups even when they contributed no new member;
    # otherwise a dat2 group fully contained in G would stay unmerged.
    dat2 <- dat2[!overlapping, , drop = FALSE]
  }
}
rbind(dat1, dat2)
# Group Member
# 1 A 1
# 2 A 2
# 3 A 3
# 4 C 5
# 31 A 4
# 41 D 7
数据
# Example input: one row per (Group, Member) pair for each of the two tables.
dat1 <- data.frame(
  Group = c("A", "A", "A", "C"),
  Member = c(1L, 2L, 3L, 5L),
  stringsAsFactors = FALSE
)
dat2 <- data.frame(
  Group = c("B", "B", "B", "D"),
  Member = c(2L, 3L, 4L, 7L),
  stringsAsFactors = FALSE
)
library(tidyverse)
# For every group of df1 that has at least one member also present in df2,
# replace the group label by label + member; other groups keep their label.
# The result stays grouped by the new Group column.
df1 %>%
  group_by(Group) %>%
  mutate(
    Group = if (any(Member %in% df2$Member)) {
      paste0(Group, Member)
    } else {
      Group
    }
  )
# A tibble: 4 x 2
# Groups: Group [4]
Group Member
<chr> <int>
1 A1 1
2 A2 2
3 A3 3
4 C 5
我想到的另一种方法
library(dplyr)
library(tidyr)
# Relabel every df2 row with the df1 id of its group (when the group shares a
# member with df1), then combine the result with df1. full_join() is called
# without `by`, so it joins on the common columns id and member.
df1 %>%
  full_join(
    df2 %>%
      # id.x is the df2 id; id.y is the df1 id of the same member (NA if the
      # member does not occur in df1).
      left_join(df1, by = "member") %>%
      group_by(id.x) %>%
      # Fill in both directions so the result does not depend on row order:
      # a non-shared member listed before the shared ones is relabelled too.
      fill(id.y, .direction = "downup") %>%
      ungroup() %>%
      # Groups without any shared member keep their own df2 id.
      mutate(id.y = coalesce(id.y, id.x)) %>%
      select(id = id.y, member)
  )
Joining, by = c("id", "member")
id member
1 1 12
2 1 23
3 1 34
4 2 56
5 1 47
6 2 7
数据
# Example input from the question: dedupe id and member for each table.
df1 <- data.frame(
  id = c(1, 1, 1, 2),
  member = c(12, 23, 34, 56)
)
df2 <- data.frame(
  id = c(5, 5, 5, 2),
  member = c(12, 23, 47, 7)
)
我正在尝试为群组创建一个新 ID,这些群组应该链接在一起,因为它们有共同的成员。下面的示例显示了问题。
第一个 table 告诉我 dedupe id 1 有成员 12,23 和 34
Dedupe ID # | Member |
---|---|
1 | 12 |
1 | 23 |
1 | 34 |
2 | 56 |
... | ... |
而第二个 table 告诉我 dedupe ID 5 有成员 12、23 和 47。请注意,其中的成员 12 和 23 与第一个 table 中 dedupe ID 1 的成员相同。
Dedupe ID # | Member |
---|---|
5 | 12 |
5 | 23 |
5 | 47 |
2 | 7 |
... | ... |
由于第 1 组和第 5 组有重叠,我想确保成员 12、23、34 和 47 归入同一个 dedupe ID,如下所示:
dedupe ID # | Member |
---|---|
1 | 12 |
1 | 23 |
1 | 34 |
1 | 47 |
2 | 56 |
... | ... |
@akrun,问得好。也许更好的方法是更新 table 1 以便 dedupeid 1 映射到成员 47(参见上面的示例)。 table 1 中的所有其他映射都应保留。
我担心当存在多个共同成员时这种方法可能会出问题,而且它在循环中反复向 dat1 追加行,
数据规模较大时效率不高,但是……
# Merge every dat2 group that shares at least one member with a dat1 group
# into that dat1 group; dat2 groups without any overlap are kept unchanged.
# Note: dat1 still grows via rbind() inside the loop, which is slow at scale.
members2 <- split(dat2$Member, dat2$Group)
for (G in unique(dat1$Group)) {
  m1 <- dat1$Member[dat1$Group == G]
  # Names of the dat2 groups that have at least one member in common with G.
  shared <- names(members2)[lengths(lapply(members2, intersect, m1)) > 0]
  overlapping <- dat2$Group %in% shared
  if (any(overlapping)) {
    # Only members not already present in G need to be added to dat1.
    move2 <- dat2[overlapping & !dat2$Member %in% m1, , drop = FALSE]
    if (nrow(move2) > 0) {
      dat1 <- rbind(dat1, transform(move2, Group = G))
    }
    # Drop the merged groups even when they contributed no new member;
    # otherwise a dat2 group fully contained in G would stay unmerged.
    dat2 <- dat2[!overlapping, , drop = FALSE]
  }
}
rbind(dat1, dat2)
# Group Member
# 1 A 1
# 2 A 2
# 3 A 3
# 4 C 5
# 31 A 4
# 41 D 7
数据
# Example input: one row per (Group, Member) pair for each of the two tables.
dat1 <- data.frame(
  Group = c("A", "A", "A", "C"),
  Member = c(1L, 2L, 3L, 5L),
  stringsAsFactors = FALSE
)
dat2 <- data.frame(
  Group = c("B", "B", "B", "D"),
  Member = c(2L, 3L, 4L, 7L),
  stringsAsFactors = FALSE
)
library(tidyverse)
# For every group of df1 that has at least one member also present in df2,
# replace the group label by label + member; other groups keep their label.
# The result stays grouped by the new Group column.
df1 %>%
  group_by(Group) %>%
  mutate(
    Group = if (any(Member %in% df2$Member)) {
      paste0(Group, Member)
    } else {
      Group
    }
  )
# A tibble: 4 x 2
# Groups: Group [4]
Group Member
<chr> <int>
1 A1 1
2 A2 2
3 A3 3
4 C 5
我想到的另一种方法
library(dplyr)
library(tidyr)
# Relabel every df2 row with the df1 id of its group (when the group shares a
# member with df1), then combine the result with df1. full_join() is called
# without `by`, so it joins on the common columns id and member.
df1 %>%
  full_join(
    df2 %>%
      # id.x is the df2 id; id.y is the df1 id of the same member (NA if the
      # member does not occur in df1).
      left_join(df1, by = "member") %>%
      group_by(id.x) %>%
      # Fill in both directions so the result does not depend on row order:
      # a non-shared member listed before the shared ones is relabelled too.
      fill(id.y, .direction = "downup") %>%
      ungroup() %>%
      # Groups without any shared member keep their own df2 id.
      mutate(id.y = coalesce(id.y, id.x)) %>%
      select(id = id.y, member)
  )
Joining, by = c("id", "member")
id member
1 1 12
2 1 23
3 1 34
4 2 56
5 1 47
6 2 7
数据
# Example input from the question: dedupe id and member for each table.
df1 <- data.frame(
  id = c(1, 1, 1, 2),
  member = c(12, 23, 34, 56)
)
df2 <- data.frame(
  id = c(5, 5, 5, 2),
  member = c(12, 23, 47, 7)
)