根据条件在 R 中使用 data.table 合并两行的值
Merging values of two rows based on conditions using data.table in R
在我有一个 activity、开始和停止时间以及一个 ID 的数据集中,我想合并同一列中两行的值,并在多个条件适用时更新其他列。
首先是一个数据示例:
library(data.table)
# Sample activity log: one row per activity, with the person who performed it
# and the interval during which it took place.
DT <- data.table(
  person = rep(c(1, 2, 3), times = c(4, 5, 4)),
  activity = c(
    "grab", "walk", "remove", "delete",
    "run", "talk", "walk", "remove", "grab",
    "walk", "delete", "talk", "remove"
  ),
  start_time = c(0, 1, 3, 6, 0, 2, 2, 3, 3, 3, 6, 6, 7),
  stop_time = c(1, 3, 5, 7, 1, 4, 4, 8, 4, 5, 7, 7, 8)
)
DT
我想为每个人更新开始和停止时间并合并列 'activity',如果:
- 活动并行进行。具体来说,如果后一个 activity 的 start_time 早于前一个 activity 的 stop_time。或者:
- 如果一个人的活动开始时间或结束时间相同。
更新的行应反映组合 activity 的开始和停止时间,除更新的行外,所有行都应删除。以下是我希望通过我提供的数据样本实现的目标:
# Desired result: overlapping activities of a person collapsed into one row
# that spans the combined interval.
DT.goal <- data.table(
  person = rep(c(1, 2, 3), each = 2),
  activity = c(
    "grab + walk + remove", "delete",
    "run", "talk + walk + grab + remove",
    "walk", "delete + talk + remove"
  ),
  start_time = c(0, 6, 0, 2, 3, 6),
  stop_time = c(5, 7, 1, 8, 5, 8)
)
DT.goal
到目前为止,我已经进行了以下不完整的尝试:
# Incomplete attempt, kept as posted. Each step is annotated with what it does
# and where it falls short of the goal.

# Step 1: flag rows that start no later than the previous row's stop_time.
# Rows where the comparison is FALSE or NA (the very first row has no
# predecessor) are not selected, so their `cond` stays NA.
# NOTE(review): data.table evaluates `i` on the whole table before `by` is
# applied, so this shift() presumably looks across person boundaries - confirm.
# NOTE(review): `:=` modifies by reference, so DT.test is presumably the same
# object as DT rather than a copy - confirm.
DT.test <- DT[start_time <= shift(stop_time, 1L, type="lag"),
cond := T, by=person]
# Step 2: for flagged rows only, paste each activity with the previous flagged
# activity of the same person. The first flagged row of each person has no
# predecessor, so shift() yields NA there and paste() writes a literal "NA".
DT.test <- DT.test[cond==T,
new_activity := paste(activity, shift(activity, 1L, type="lag")), by=person]
# Step 3: default new_start to the row's own start_time, then overwrite the
# flagged rows with the minimum start over ALL flagged rows of the person
# (not per overlapping cluster).
DT.test <- DT.test[, new_start := start_time, by=person][cond==T, new_start := min(start_time), by=person]
# Step 4: same pattern for the stop time, using the per-person maximum.
DT.test <- DT.test[, new_stop := stop_time, by=person][cond==T, new_stop := max(stop_time), by=person]
但是,使用 shift(..., type="lag") 对于每个人的第一行不是很有用,因为它没有前一行可供查看。此外,如果条件未计算为 TRUE,paste() 将粘贴 NA。
有人可以帮我吗?
请检查以下内容:
library(data.table)
# Sample activity log: one row per activity, with the person who performed it
# and the interval during which it took place.
DT <- data.table(
  person = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3),
  activity = c(
    "grab", "walk", "remove", "delete",
    "run", "talk", "walk", "remove", "grab",
    "walk", "delete", "talk", "remove"
  ),
  start_time = c(0, 1, 3, 6, 0, 2, 2, 3, 3, 3, 6, 6, 7),
  stop_time = c(1, 3, 5, 7, 1, 4, 4, 8, 4, 5, 7, 7, 8)
)

# Sort in place by start time within each person; the grouping below relies
# on rows being visited in start order.
setorder(DT, person, start_time)

# Number the clusters of overlapping/touching activities per person.
# A row opens a new cluster only when it starts strictly after the latest
# stop seen so far (the running maximum, not just the previous row's stop).
# Matching on shared endpoints alone would miss overlaps that share no
# endpoint and would fuse neighbouring but disjoint clusters.
DT[,
  concatenate_grp := cumsum(
    c(TRUE, start_time[-1L] > cummax(stop_time)[-.N])
  ),
  by = "person"
]

# Preview of the concatenated activity labels per cluster.
DT[, paste(activity, collapse = " + "), by = c("person", "concatenate_grp")]

# Collapse each cluster to one row spanning the combined interval, then drop
# the helper grouping column from the result.
DT.goal <- DT[,
  .(
    activity = paste(activity, collapse = " + "),
    start_time = min(start_time),
    stop_time = max(stop_time)
  ),
  by = c("person", "concatenate_grp")
][, concatenate_grp := NULL]
这导致:
person activity start_time stop_time
1: 1 grab + walk + remove 0 5
2: 1 delete 6 7
3: 2 run 0 1
4: 2 talk + walk + remove + grab 2 8
5: 3 walk 3 5
6: 3 delete + talk + remove 6 8
使用 dplyr,我们先按 person 以及开始和停止时间对数据进行 arrange。然后创建一个 group,使每个 person 中时间重叠的行归入同一组;在每个组中取最早的 start_time 和最晚的 stop_time,并将该组中的所有活动串联起来。
library(dplyr)
# Merge overlapping activities per person with dplyr.
# Rows are visited in start order within each person. A new group opens only
# when a row starts strictly after the latest stop seen so far (cummax), so an
# earlier, longer activity keeps later nested ones in its group. The merged
# interval uses min()/max() because the last row of a group does not
# necessarily carry the latest stop_time.
DT %>%
  arrange(person, start_time, stop_time) %>%
  group_by(person) %>%
  mutate(
    group = cumsum(
      start_time > lag(cummax(stop_time), default = first(stop_time))
    )
  ) %>%
  group_by(person, group) %>%
  summarise(
    start_time = min(start_time),
    stop_time = max(stop_time),
    activity = paste(activity, collapse = " + ")
  ) %>%
  # Return a plain, ungrouped result so later verbs are not silently grouped.
  ungroup() %>%
  select(-group)
# person start_time stop_time activity
# <dbl> <dbl> <dbl> <chr>
#1 1 0 5 grab + walk + remove
#2 1 6 7 delete
#3 2 0 1 run
#4 2 2 8 talk + walk + grab + remove
#5 3 3 5 walk
#6 3 6 8 delete + talk + remove
# Sort in place by start time (stop time breaks ties) within each person.
# Sorting by stop time instead can place a long, early-starting activity after
# the short ones it contains, which breaks both the grouping and the start.
setorder(DT, person, start_time, stop_time)
DT[,
  # TRUE where a row starts strictly after the latest stop seen so far for
  # that person, i.e. where a new cluster of overlapping activities begins.
  break_here := c(FALSE, start_time[-1L] > cummax(stop_time)[-.N]),
  by = person
][,
  .(
    activity = paste(activity, collapse = " + "),
    start_time = min(start_time),
    stop_time = max(stop_time)
  ),
  # The running count of breaks, combined with person, identifies a cluster.
  keyby = .(person, helper_var = cumsum(break_here))
][, !"helper_var"]
person activity start_time stop_time
1: 1 grab + walk + remove 0 5
2: 1 delete 6 7
3: 2 run 0 1
4: 2 talk + walk + grab + remove 2 8
5: 3 walk 3 5
6: 3 delete + talk + remove 6 8
另一种选择是借鉴 David Arenburg 的解决方案:
# Sort in place so each person's activities are visited in start order.
setorder(DT, person, start_time, stop_time)

# g is a per-person cluster id: it increases by one on the row after each
# position where the next activity starts later than every stop seen so far.
DT[, g := {
  latest_stop <- cummax(stop_time)
  next_start <- shift(start_time, n = 1L, type = "lead")
  gap_follows <- next_start > latest_stop
  c(0L, cumsum(gap_follows)[-.N])
}, by = person]

# Collapse each cluster to one row spanning the combined interval.
DT[,
  .(
    activity = paste(activity, collapse = " + "),
    start_time = min(start_time),
    stop_time = max(stop_time)
  ),
  by = .(person, g)
]
输出:
person g activity start_time stop_time
1: 1 0 grab + walk + remove 0 5
2: 1 1 delete 6 7
3: 2 0 run 0 1
4: 2 1 talk + walk + grab + remove 2 8
5: 3 0 walk 3 5
6: 3 1 delete + talk + remove 6 8
在我有一个 activity、开始和停止时间以及一个 ID 的数据集中,我想合并同一列中两行的值,并在多个条件适用时更新其他列。 首先是一个数据示例:
library(data.table)
# Sample activity log: one row per activity, with the person who performed it
# and the interval during which it took place.
DT <- data.table(
  person = rep(c(1, 2, 3), times = c(4, 5, 4)),
  activity = c(
    "grab", "walk", "remove", "delete",
    "run", "talk", "walk", "remove", "grab",
    "walk", "delete", "talk", "remove"
  ),
  start_time = c(0, 1, 3, 6, 0, 2, 2, 3, 3, 3, 6, 6, 7),
  stop_time = c(1, 3, 5, 7, 1, 4, 4, 8, 4, 5, 7, 7, 8)
)
DT
我想为每个人更新开始和停止时间并合并列 'activity',如果:
- 活动并行进行。具体来说,如果后一个 activity 的 start_time 早于前一个 activity 的 stop_time。或者:
- 如果一个人的活动开始时间或结束时间相同。
更新的行应反映组合 activity 的开始和停止时间,除更新的行外,所有行都应删除。以下是我希望通过我提供的数据样本实现的目标:
# Desired result: overlapping activities of a person collapsed into one row
# that spans the combined interval.
DT.goal <- data.table(
  person = rep(c(1, 2, 3), each = 2),
  activity = c(
    "grab + walk + remove", "delete",
    "run", "talk + walk + grab + remove",
    "walk", "delete + talk + remove"
  ),
  start_time = c(0, 6, 0, 2, 3, 6),
  stop_time = c(5, 7, 1, 8, 5, 8)
)
DT.goal
到目前为止,我已经进行了以下不完整的尝试:
# Incomplete attempt, kept as posted. Each step is annotated with what it does
# and where it falls short of the goal.

# Step 1: flag rows that start no later than the previous row's stop_time.
# Rows where the comparison is FALSE or NA (the very first row has no
# predecessor) are not selected, so their `cond` stays NA.
# NOTE(review): data.table evaluates `i` on the whole table before `by` is
# applied, so this shift() presumably looks across person boundaries - confirm.
# NOTE(review): `:=` modifies by reference, so DT.test is presumably the same
# object as DT rather than a copy - confirm.
DT.test <- DT[start_time <= shift(stop_time, 1L, type="lag"),
cond := T, by=person]
# Step 2: for flagged rows only, paste each activity with the previous flagged
# activity of the same person. The first flagged row of each person has no
# predecessor, so shift() yields NA there and paste() writes a literal "NA".
DT.test <- DT.test[cond==T,
new_activity := paste(activity, shift(activity, 1L, type="lag")), by=person]
# Step 3: default new_start to the row's own start_time, then overwrite the
# flagged rows with the minimum start over ALL flagged rows of the person
# (not per overlapping cluster).
DT.test <- DT.test[, new_start := start_time, by=person][cond==T, new_start := min(start_time), by=person]
# Step 4: same pattern for the stop time, using the per-person maximum.
DT.test <- DT.test[, new_stop := stop_time, by=person][cond==T, new_stop := max(stop_time), by=person]
但是,使用 shift(..., type="lag") 对于每个人的第一行不是很有用,因为它没有前一行可供查看。此外,如果条件未计算为 TRUE,paste() 将粘贴 NA。
有人可以帮我吗?
请检查以下内容:
library(data.table)
# Sample activity log: one row per activity, with the person who performed it
# and the interval during which it took place.
DT <- data.table(
  person = c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3),
  activity = c(
    "grab", "walk", "remove", "delete",
    "run", "talk", "walk", "remove", "grab",
    "walk", "delete", "talk", "remove"
  ),
  start_time = c(0, 1, 3, 6, 0, 2, 2, 3, 3, 3, 6, 6, 7),
  stop_time = c(1, 3, 5, 7, 1, 4, 4, 8, 4, 5, 7, 7, 8)
)

# Sort in place by start time within each person; the grouping below relies
# on rows being visited in start order.
setorder(DT, person, start_time)

# Number the clusters of overlapping/touching activities per person.
# A row opens a new cluster only when it starts strictly after the latest
# stop seen so far (the running maximum, not just the previous row's stop).
# Matching on shared endpoints alone would miss overlaps that share no
# endpoint and would fuse neighbouring but disjoint clusters.
DT[,
  concatenate_grp := cumsum(
    c(TRUE, start_time[-1L] > cummax(stop_time)[-.N])
  ),
  by = "person"
]

# Preview of the concatenated activity labels per cluster.
DT[, paste(activity, collapse = " + "), by = c("person", "concatenate_grp")]

# Collapse each cluster to one row spanning the combined interval, then drop
# the helper grouping column from the result.
DT.goal <- DT[,
  .(
    activity = paste(activity, collapse = " + "),
    start_time = min(start_time),
    stop_time = max(stop_time)
  ),
  by = c("person", "concatenate_grp")
][, concatenate_grp := NULL]
这导致:
person activity start_time stop_time
1: 1 grab + walk + remove 0 5
2: 1 delete 6 7
3: 2 run 0 1
4: 2 talk + walk + remove + grab 2 8
5: 3 walk 3 5
6: 3 delete + talk + remove 6 8
使用 dplyr,我们先按 person 以及开始和停止时间对数据进行 arrange。然后创建一个 group,使每个 person 中时间重叠的行归入同一组;在每个组中取最早的 start_time 和最晚的 stop_time,并将该组中的所有活动串联起来。
library(dplyr)
# Merge overlapping activities per person with dplyr.
# Rows are visited in start order within each person. A new group opens only
# when a row starts strictly after the latest stop seen so far (cummax), so an
# earlier, longer activity keeps later nested ones in its group. The merged
# interval uses min()/max() because the last row of a group does not
# necessarily carry the latest stop_time.
DT %>%
  arrange(person, start_time, stop_time) %>%
  group_by(person) %>%
  mutate(
    group = cumsum(
      start_time > lag(cummax(stop_time), default = first(stop_time))
    )
  ) %>%
  group_by(person, group) %>%
  summarise(
    start_time = min(start_time),
    stop_time = max(stop_time),
    activity = paste(activity, collapse = " + ")
  ) %>%
  # Return a plain, ungrouped result so later verbs are not silently grouped.
  ungroup() %>%
  select(-group)
# person start_time stop_time activity
# <dbl> <dbl> <dbl> <chr>
#1 1 0 5 grab + walk + remove
#2 1 6 7 delete
#3 2 0 1 run
#4 2 2 8 talk + walk + grab + remove
#5 3 3 5 walk
#6 3 6 8 delete + talk + remove
# Sort in place by start time (stop time breaks ties) within each person.
# Sorting by stop time instead can place a long, early-starting activity after
# the short ones it contains, which breaks both the grouping and the start.
setorder(DT, person, start_time, stop_time)
DT[,
  # TRUE where a row starts strictly after the latest stop seen so far for
  # that person, i.e. where a new cluster of overlapping activities begins.
  break_here := c(FALSE, start_time[-1L] > cummax(stop_time)[-.N]),
  by = person
][,
  .(
    activity = paste(activity, collapse = " + "),
    start_time = min(start_time),
    stop_time = max(stop_time)
  ),
  # The running count of breaks, combined with person, identifies a cluster.
  keyby = .(person, helper_var = cumsum(break_here))
][, !"helper_var"]
person activity start_time stop_time
1: 1 grab + walk + remove 0 5
2: 1 delete 6 7
3: 2 run 0 1
4: 2 talk + walk + grab + remove 2 8
5: 3 walk 3 5
6: 3 delete + talk + remove 6 8
另一种选择是借鉴 David Arenburg 的解决方案:
# Sort in place so each person's activities are visited in start order.
setorder(DT, person, start_time, stop_time)

# g is a per-person cluster id: it increases by one on the row after each
# position where the next activity starts later than every stop seen so far.
DT[, g := {
  latest_stop <- cummax(stop_time)
  next_start <- shift(start_time, n = 1L, type = "lead")
  gap_follows <- next_start > latest_stop
  c(0L, cumsum(gap_follows)[-.N])
}, by = person]

# Collapse each cluster to one row spanning the combined interval.
DT[,
  .(
    activity = paste(activity, collapse = " + "),
    start_time = min(start_time),
    stop_time = max(stop_time)
  ),
  by = .(person, g)
]
输出:
person g activity start_time stop_time
1: 1 0 grab + walk + remove 0 5
2: 1 1 delete 6 7
3: 2 0 run 0 1
4: 2 1 talk + walk + grab + remove 2 8
5: 3 0 walk 3 5
6: 3 1 delete + talk + remove 6 8