在 R 中连接两个 data.table:删除两个数据集之间重叠的重复项,同时保留各数据集内部的重复项
Joining two data.tables in r: removing overlap duplicates while keeping duplicates in each separate dataset
关于连接(join)两个数据集的问答已经有很多,但我没能找到适用于我当前问题的解决方法。我在这里使用的是 data.table。
我有两个数据集,dtx 和 dty,提供了玩具示例:
library(data.table)
dtx <-
structure(list(ID = c("a", "a", "a", "a", "c", "c", "c", "c",
"d", "d", "d", "d", "d", "d", "d", "d", "e", "e", "f", "f", "f",
"f", "f", "f", "f", "f", "g", "g", "g", "g", "g", "g", "g", "g"
), date = structure(c(939340800, 939340800, 949622400, 949622400,
887414400, 887414400, 920332800, 920332800, 831686400, 831686400,
831686400, 845078400, 845078400, 969062400, 969062400, 975369600,
979689600, 979689600, 892598400, 892598400, 921801600, 921974400,
968284800, 968284800, 968284800, 968284800, 927158400, 927158400,
993081600, 993081600, 993081600, 993081600, 1057190400, 1057190400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), code = c(1215,
1220, 1320, 1320, 236, 300, 1245, 850, 1415, 1415, 1279, 230,
230, 1115, 1215, 8749, 1212, 1212, 1112, 1112, 4561, 8145, 8145,
1497, 1112, 1112, 258, 1112, 230, 240, 1112, 1445, 260, 1112),
dataset = c("x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x")), row.names = c(NA, -34L), class = c("data.table", "data.frame"
))
dty <-
structure(list(ID = c("b", "b", "b", "b", "b", "b", "c", "c",
"c", "c", "c", "d", "d", "d", "d", "d", "e", "e", "e", "e", "e",
"e", "e", "e", "f", "f", "f", "f", "g", "g", "g", "g", "g"),
date = structure(c(1055203200, 1055203200, 1055635200, 1058918400,
1058918400, 1074211200, 974764800, 974764800, 974764800,
979516800, 979516800, 975369600, 983491200, 983491200, 984528000,
987984000, 979689600, 979689600, 992217600, 992217600, 994896000,
995068800, 999043200, 999043200, 968284800, 968284800, 968284800,
968284800, 993081600, 993081600, 993081600, 993081600, 1057190400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), code = c(1112,
890, 8125, 2369, 1485, 1112, 1645, 1645, 794, 236, 4578,
8749, 230, 1114, 690, 720, 1212, 1212, 1112, 1112, 2060,
310, 1415, 310, 8145, 1497, 1112, 1112, 230, 240, 1112, 1445,
1112), dataset = c("y", "y", "y", "y", "y", "y", "y", "y",
"y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",
"y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",
"y")), row.names = c(NA, -33L), class = c("data.table", "data.frame"
))
在每个数据集中,有一个与特定个体对应的 ID,一个表示事件发生时间的 date,一个表示事件类型的 code,还有一个标记每行所属数据集的虚拟变量 dataset(用于在合并之后追踪每一行来自哪个数据集)。
一个人可以有多行代表多个日期的多个事件。此外,一个人可以在同一日期经历多个事件,包括多个相同类型的事件(即,同一事件代码可以在特定日期对一个人出现多次)。
我想合并这两个数据集,它们之间既有各自独有的行,也有共享的行。合并时,我想删除重复的条目:如果某个 ID-date-code 组合同时出现在两个数据集中,我只想保留 dtx 中的版本。此外,我想记录每一行最初来自哪个数据集,这就是我添加 dataset 虚拟列的原因。
这是所需的输出:
dtresult <- structure(list(ID = c("a", "a", "a", "a", "b", "b", "b", "b",
"b", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "d", "d",
"d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "e", "e", "e",
"e", "e", "e", "e", "e", "f", "f", "f", "f", "f", "f", "f", "f",
"g", "g", "g", "g", "g", "g", "g", "g"), date = structure(c(939340800,
939340800, 949622400, 949622400, 1055203200, 1055203200, 1055635200,
1058918400, 1058918400, 1074211200, 887414400, 887414400, 920332800,
920332800, 974764800, 974764800, 974764800, 979516800, 979516800,
831686400, 831686400, 831686400, 845078400, 845078400, 969062400,
969062400, 975369600, 983491200, 983491200, 984528000, 987984000,
979689600, 979689600, 992217600, 992217600, 994896000, 995068800,
999043200, 999043200, 892598400, 892598400, 921801600, 921974400,
968284800, 968284800, 968284800, 968284800, 927158400, 927158400,
993081600, 993081600, 993081600, 993081600, 1057190400, 1057190400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), code = c(1215,
1220, 1320, 1320, 1112, 890, 8125, 2369, 1485, 1112, 236, 300,
1245, 850, 1645, 1645, 794, 236, 4578, 1415, 1415, 1279, 230,
230, 1115, 1215, 8749, 230, 1114, 690, 720, 1212, 1212, 1112,
1112, 2060, 310, 1415, 310, 1112, 1112, 4561, 8145, 8145, 1497,
1112, 1112, 258, 1112, 230, 240, 1112, 1445, 260, 1112), dataset = c("x",
"x", "x", "x", "y", "y", "y", "y", "y", "y", "x", "x", "x", "x",
"y", "y", "y", "y", "y", "x", "x", "x", "x", "x", "x", "x", "x",
"y", "y", "y", "y", "x", "x", "y", "y", "y", "y", "y", "y", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", "x")), row.names = c(NA, -55L), class = c("data.table",
"data.frame"))
在这种情况下,我无法直接合并两个数据集,因为一个人可能在同一日期有多个相同类型的事件。这意味着我不能简单地使用 unique 或 duplicated 来识别重复的行。
在第一次尝试合并数据集时,我想到使用 rbindlist:
# First attempt: stack both tables row-wise, then sort so that rows sharing an
# ID-date-code key end up next to each other.
l <- list(dtx, dty)
dtxy <- rbindlist(l, use.names = TRUE)[order(ID, date, code, dataset)]
但后来我意识到,如果不排除一个人在一个日期有多个相同事件的情况,我就无法删除重叠的行。
然后我尝试使用 merge:
# Second attempt: full outer join of the two tables on the event key
# (ID, date, code); rows from either side without a partner are kept.
dtxy <- merge(
  dtx, dty,
  by = c("ID", "date", "code"),
  all = TRUE
)
但此方法不处理同一事件类型在同一日期多次发生并且在两个数据集中重复的情况!
我真的被这个问题难住了——我也想过使用 funion,但它与 merge 本质上是一样的。
任何帮助都将不胜感激(尤其是使用 base R 或 data.table 的方法)。
虽然很乱,但是
library(dplyr)

# Keep every row of dtx (including its within-dataset duplicates) and add only
# those rows of dty whose ID-date-code key never occurs in dtx. An anti-join
# filters dty without multiplying rows, so within-dataset duplicates survive
# however many times an event is repeated.
event_key <- c("ID", "date", "code")
y_only <- anti_join(dty, dtx, by = event_key)

# The name `c` is kept because the surrounding text refers to it. Binding a
# data frame to `c` does not break calls to c(): R skips non-function bindings
# when it looks up a function.
c <- bind_rows(dtx, y_only) %>%
  arrange(ID, date)

# Sanity check: number of cells that differ from the expected result once both
# tables are sorted the same way (0 means the contents match).
sum(!(c %>% arrange(ID, date, code) == dtresult %>% arrange(ID, date, code)))
[1] 0
c 就是你想要的结果。
> str(c)
Classes ‘data.table’ and 'data.frame': 55 obs. of 4 variables:
$ ID : chr "a" "a" "a" "a" ...
$ date : POSIXct, format: "1999-10-08" "1999-10-08" "2000-02-04" "2000-02-04" ...
$ code : num 1215 1220 1320 1320 1112 ...
$ dataset: chr "x" "x" "x" "x" ...
- attr(*, ".internal.selfref")=<externalptr>
您可以从合并结果中删除来自 y 的重叠行(overlaps):
# Stack x on top of y; rows of y that overlap with x are removed further down.
l <- list(dtx, dty)
dtxy <- rbindlist(l, use.names = TRUE)

# An inner join on the event key finds the keys present in both tables. The
# result is tagged with y's dataset label so that, in the lookup below, it can
# only match the y rows of the stacked table.
overlaps <- merge(dtx, dty, by = c("ID", "date", "code"))[
  , .(ID, date, code, dataset = dataset.y)
]

# Look up every stacked row in `overlaps`. x.dataset comes from `overlaps` and
# is NA when the row has no match there; i.dataset is the row's own label.
lookup <- overlaps[
  dtxy,
  .(ID, date, code, x.dataset, i.dataset),
  on = .(ID, date, code, dataset)
]

# Keep only the unmatched rows and restore the plain `dataset` column name.
dtresultnew <- lookup[
  is.na(x.dataset),
  .(ID, date, code, dataset = i.dataset)
]

# Compare with the expected output after sorting both the same way.
identical(
  dtresult[order(ID, date, code)],
  dtresultnew[order(ID, date, code)]
)
[1] TRUE
关于连接(join)两个数据集的问答已经有很多,但我没能找到适用于我当前问题的解决方法。我在这里使用的是 data.table。
我有两个数据集,dtx 和 dty,提供了玩具示例:
library(data.table)
dtx <-
structure(list(ID = c("a", "a", "a", "a", "c", "c", "c", "c",
"d", "d", "d", "d", "d", "d", "d", "d", "e", "e", "f", "f", "f",
"f", "f", "f", "f", "f", "g", "g", "g", "g", "g", "g", "g", "g"
), date = structure(c(939340800, 939340800, 949622400, 949622400,
887414400, 887414400, 920332800, 920332800, 831686400, 831686400,
831686400, 845078400, 845078400, 969062400, 969062400, 975369600,
979689600, 979689600, 892598400, 892598400, 921801600, 921974400,
968284800, 968284800, 968284800, 968284800, 927158400, 927158400,
993081600, 993081600, 993081600, 993081600, 1057190400, 1057190400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), code = c(1215,
1220, 1320, 1320, 236, 300, 1245, 850, 1415, 1415, 1279, 230,
230, 1115, 1215, 8749, 1212, 1212, 1112, 1112, 4561, 8145, 8145,
1497, 1112, 1112, 258, 1112, 230, 240, 1112, 1445, 260, 1112),
dataset = c("x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x")), row.names = c(NA, -34L), class = c("data.table", "data.frame"
))
dty <-
structure(list(ID = c("b", "b", "b", "b", "b", "b", "c", "c",
"c", "c", "c", "d", "d", "d", "d", "d", "e", "e", "e", "e", "e",
"e", "e", "e", "f", "f", "f", "f", "g", "g", "g", "g", "g"),
date = structure(c(1055203200, 1055203200, 1055635200, 1058918400,
1058918400, 1074211200, 974764800, 974764800, 974764800,
979516800, 979516800, 975369600, 983491200, 983491200, 984528000,
987984000, 979689600, 979689600, 992217600, 992217600, 994896000,
995068800, 999043200, 999043200, 968284800, 968284800, 968284800,
968284800, 993081600, 993081600, 993081600, 993081600, 1057190400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), code = c(1112,
890, 8125, 2369, 1485, 1112, 1645, 1645, 794, 236, 4578,
8749, 230, 1114, 690, 720, 1212, 1212, 1112, 1112, 2060,
310, 1415, 310, 8145, 1497, 1112, 1112, 230, 240, 1112, 1445,
1112), dataset = c("y", "y", "y", "y", "y", "y", "y", "y",
"y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",
"y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y", "y",
"y")), row.names = c(NA, -33L), class = c("data.table", "data.frame"
))
在每个数据集中,有一个与特定个体对应的 ID,一个表示事件发生时间的 date,一个表示事件类型的 code,还有一个标记每行所属数据集的虚拟变量 dataset(用于在合并之后追踪每一行来自哪个数据集)。
一个人可以有多行代表多个日期的多个事件。此外,一个人可以在同一日期经历多个事件,包括多个相同类型的事件(即,同一事件代码可以在特定日期对一个人出现多次)。
我想合并这两个数据集,它们之间既有各自独有的行,也有共享的行。合并时,我想删除重复的条目:如果某个 ID-date-code 组合同时出现在两个数据集中,我只想保留 dtx 中的版本。此外,我想记录每一行最初来自哪个数据集,这就是我添加 dataset 虚拟列的原因。
这是所需的输出:
dtresult <- structure(list(ID = c("a", "a", "a", "a", "b", "b", "b", "b",
"b", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "d", "d",
"d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "e", "e", "e",
"e", "e", "e", "e", "e", "f", "f", "f", "f", "f", "f", "f", "f",
"g", "g", "g", "g", "g", "g", "g", "g"), date = structure(c(939340800,
939340800, 949622400, 949622400, 1055203200, 1055203200, 1055635200,
1058918400, 1058918400, 1074211200, 887414400, 887414400, 920332800,
920332800, 974764800, 974764800, 974764800, 979516800, 979516800,
831686400, 831686400, 831686400, 845078400, 845078400, 969062400,
969062400, 975369600, 983491200, 983491200, 984528000, 987984000,
979689600, 979689600, 992217600, 992217600, 994896000, 995068800,
999043200, 999043200, 892598400, 892598400, 921801600, 921974400,
968284800, 968284800, 968284800, 968284800, 927158400, 927158400,
993081600, 993081600, 993081600, 993081600, 1057190400, 1057190400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), code = c(1215,
1220, 1320, 1320, 1112, 890, 8125, 2369, 1485, 1112, 236, 300,
1245, 850, 1645, 1645, 794, 236, 4578, 1415, 1415, 1279, 230,
230, 1115, 1215, 8749, 230, 1114, 690, 720, 1212, 1212, 1112,
1112, 2060, 310, 1415, 310, 1112, 1112, 4561, 8145, 8145, 1497,
1112, 1112, 258, 1112, 230, 240, 1112, 1445, 260, 1112), dataset = c("x",
"x", "x", "x", "y", "y", "y", "y", "y", "y", "x", "x", "x", "x",
"y", "y", "y", "y", "y", "x", "x", "x", "x", "x", "x", "x", "x",
"y", "y", "y", "y", "x", "x", "y", "y", "y", "y", "y", "y", "x",
"x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x", "x",
"x", "x")), row.names = c(NA, -55L), class = c("data.table",
"data.frame"))
在这种情况下,我无法直接合并两个数据集,因为一个人可能在同一日期有多个相同类型的事件。这意味着我不能简单地使用 unique 或 duplicated 来识别重复的行。
在第一次尝试合并数据集时,我想到使用 rbindlist:
# First attempt: stack both tables row-wise, then sort so that rows sharing an
# ID-date-code key end up next to each other.
l <- list(dtx, dty)
dtxy <- rbindlist(l, use.names = TRUE)[order(ID, date, code, dataset)]
但后来我意识到,如果不排除一个人在一个日期有多个相同事件的情况,我就无法删除重叠的行。
然后我尝试使用 merge:
# Second attempt: full outer join of the two tables on the event key
# (ID, date, code); rows from either side without a partner are kept.
dtxy <- merge(
  dtx, dty,
  by = c("ID", "date", "code"),
  all = TRUE
)
但此方法不处理同一事件类型在同一日期多次发生并且在两个数据集中重复的情况!
我真的被这个问题难住了——我也想过使用 funion,但它与 merge 本质上是一样的。
任何帮助都将不胜感激(尤其是使用 base R 或 data.table 的方法)。
虽然很乱,但是
library(dplyr)

# Keep every row of dtx (including its within-dataset duplicates) and add only
# those rows of dty whose ID-date-code key never occurs in dtx. An anti-join
# filters dty without multiplying rows, so within-dataset duplicates survive
# however many times an event is repeated.
event_key <- c("ID", "date", "code")
y_only <- anti_join(dty, dtx, by = event_key)

# The name `c` is kept because the surrounding text refers to it. Binding a
# data frame to `c` does not break calls to c(): R skips non-function bindings
# when it looks up a function.
c <- bind_rows(dtx, y_only) %>%
  arrange(ID, date)

# Sanity check: number of cells that differ from the expected result once both
# tables are sorted the same way (0 means the contents match).
sum(!(c %>% arrange(ID, date, code) == dtresult %>% arrange(ID, date, code)))
[1] 0
c 就是你想要的结果。
> str(c)
Classes ‘data.table’ and 'data.frame': 55 obs. of 4 variables:
$ ID : chr "a" "a" "a" "a" ...
$ date : POSIXct, format: "1999-10-08" "1999-10-08" "2000-02-04" "2000-02-04" ...
$ code : num 1215 1220 1320 1320 1112 ...
$ dataset: chr "x" "x" "x" "x" ...
- attr(*, ".internal.selfref")=<externalptr>
您可以从合并结果中删除来自 y 的重叠行(overlaps):
# Stack x on top of y; rows of y that overlap with x are removed further down.
l <- list(dtx, dty)
dtxy <- rbindlist(l, use.names = TRUE)

# An inner join on the event key finds the keys present in both tables. The
# result is tagged with y's dataset label so that, in the lookup below, it can
# only match the y rows of the stacked table.
overlaps <- merge(dtx, dty, by = c("ID", "date", "code"))[
  , .(ID, date, code, dataset = dataset.y)
]

# Look up every stacked row in `overlaps`. x.dataset comes from `overlaps` and
# is NA when the row has no match there; i.dataset is the row's own label.
lookup <- overlaps[
  dtxy,
  .(ID, date, code, x.dataset, i.dataset),
  on = .(ID, date, code, dataset)
]

# Keep only the unmatched rows and restore the plain `dataset` column name.
dtresultnew <- lookup[
  is.na(x.dataset),
  .(ID, date, code, dataset = i.dataset)
]

# Compare with the expected output after sorting both the same way.
identical(
  dtresult[order(ID, date, code)],
  dtresultnew[order(ID, date, code)]
)
[1] TRUE