合并具有连续时间间隔的数据行
merge data rows if they have sequential time intervals
我有一个包含 Start.Date 和 Stop.Date 的患者用药数据集,每条用药记录占一行。我想合并时间间隔前后相接的行,示例数据如下:
# Example data: one row per medication interval, dates stored as
# month/day/year strings.
ID <- c(2, 2, 2, 2, 3, 5)
Medication <- c(
  "aspirin", "aspirin", "aspirin",
  "tylenol", "lipitor", "advil"
)
Start.Date <- c(
  "05/01/2017", "05/05/2017", "06/20/2017",
  "05/01/2017", "05/06/2017", "05/28/2017"
)
Stop.Date <- c(
  "05/04/2017", "05/10/2017", "06/27/2017",
  "05/15/2017", "05/12/2017", "06/13/2017"
)
# Column names are taken from the vector names.
df <- data.frame(ID, Medication, Start.Date, Stop.Date)
ID Medication Start.Date Stop.Date
2 aspirin 05/01/2017 05/04/2017
2 aspirin 05/05/2017 05/10/2017
2 aspirin 06/20/2017 06/27/2017
2 tylenol 05/01/2017 05/15/2017
3 lipitor 05/06/2017 05/12/2017
5 advil 05/28/2017 06/13/2017
如果一个 Stop.Date 是下一个 Start.Date 的前一天,我想按 ID 和药物减少行数。它应该如下所示:
ID Medication Start.Date Stop.Date
2 aspirin 05/01/2017 05/10/2017
2 aspirin 06/20/2017 06/27/2017
2 tylenol 05/01/2017 05/15/2017
3 lipitor 05/06/2017 05/12/2017
5 advil 05/28/2017 06/13/2017
使用 mdy(来自 lubridate)将 'Start' 和 'Stop' 日期列转换为 Date 类,按 'ID'、'Medication' 分组,然后 filter 出 'Start.Date' 的 'lead' 与 'Stop.Date' 之差的绝对值(abs)不等于 1 的行
library(dplyr)
library(lubridate)

# Merge rows of the same ID/Medication whose Start.Date is exactly one day
# after the previous row's Stop.Date.
#
# Filtering on the lead/lag gap alone only drops rows: the surviving row keeps
# its own Start.Date, so the merged interval would start too late. Instead,
# label each run of back-to-back intervals and collapse every run to one row.
df %>%
  mutate_at(3:4, mdy) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  # A row opens a new run when it is the first of its group (gap is NA) or
  # when it does not start the day after the previous interval stopped.
  mutate(run = cumsum(coalesce(Start.Date - lag(Stop.Date) != 1, TRUE))) %>%
  group_by(ID, Medication, run) %>%
  summarise(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
  ungroup() %>%
  select(-run)
# A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2 aspirin 2017-05-01 2017-05-10
#2 2 aspirin 2017-06-20 2017-06-27
#3 2 tylenol 2017-05-01 2017-05-15
#4 3 lipitor 2017-05-06 2017-05-12
#5 5 advil 2017-05-28 2017-06-13
或使用 data.table
中的类似方法
library(data.table)

# Merge back-to-back intervals per ID and Medication with data.table.
# mdy() comes from lubridate, which must be attached.
#
# The dates are parsed into a new table, so apart from setDT() turning df
# into a data.table by reference, df itself is left untouched.
setDT(df)
df[, .(ID, Medication,
       Start.Date = mdy(Start.Date),
       Stop.Date = mdy(Stop.Date))][
  # Runs are detected from neighbouring rows, so order intervals in time.
  order(ID, Medication, Start.Date)][
  # The first row of a group always opens a run; a later row opens one
  # unless it starts exactly one day after the previous row stopped.
  , run := cumsum(c(TRUE, Start.Date[-1L] - Stop.Date[-.N] != 1)),
  by = .(ID, Medication)][
  , .(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)),
  by = .(ID, Medication, run)][
  , run := NULL][]
# ID Medication Start.Date Stop.Date
#1: 2 aspirin 2017-05-01 2017-05-10
#2: 2 aspirin 2017-06-20 2017-06-27
#3: 2 tylenol 2017-05-01 2017-05-15
#4: 3 lipitor 2017-05-06 2017-05-12
#5: 5 advil 2017-05-28 2017-06-13
注意:我们可以像以前一样先将日期列转换为 Date
class
NOTE2: 都是基于OP提供的例子的简单方法
这个怎么样?
# Merge rows of the same ID/Medication whose Start.Date is exactly one day
# after the previous row's Stop.Date; other columns keep the values of the
# first row of each run.
df %>%
  mutate_at(
    vars(ends_with("Date")),
    function(x) as.Date(x, format = "%m/%d/%Y")
  ) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  mutate(
    # TRUE where a row opens a new run: the first row of a group (no previous
    # Stop.Date) or a row that does not follow the previous one by one day.
    # Counting run starts, rather than "is consecutive" flags, keeps chains of
    # three or more intervals together and keeps unrelated rows apart.
    startsNewRun = is.na(lag(Stop.Date)) | Start.Date - lag(Stop.Date) != 1,
    grp = cumsum(startsNewRun)
  ) %>%
  group_by(ID, Medication, grp) %>%
  mutate(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
  slice(1) %>%
  ungroup() %>%
  select(-startsNewRun, -grp)
## A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2. aspirin 2017-05-01 2017-05-10
#2 2. aspirin 2017-06-20 2017-06-27
#3 2. tylenol 2017-05-01 2017-05-15
#4 3. lipitor 2017-05-06 2017-05-12
#5 5. advil 2017-05-28 2017-06-13
最好再用几个例子来测试这个,以确保稳健性。让我们尝试一个更复杂的例子
# Extended example: eight intervals, with text columns stored as factors
# (levels in sorted order).
df <- data.frame(
  ID = c(2, 2, 2, 2, 2, 3, 5, 5),
  Medication = c(
    "aspirin", "aspirin", "aspirin", "aspirin",
    "tylenol", "lipitor", "advil", "advil"
  ),
  Start.Date = c(
    "05/01/2017", "05/05/2017", "06/20/2017", "06/28/2017",
    "05/01/2017", "05/06/2017", "05/28/2017", "06/14/2017"
  ),
  Stop.Date = c(
    "05/04/2017", "05/10/2017", "06/27/2017", "04/30/2017",
    "05/15/2017", "05/12/2017", "06/13/2017", "06/20/2017"
  ),
  stringsAsFactors = TRUE
)
# Print the data frame.
df
# ID Medication Start.Date Stop.Date
#1 2 aspirin 05/01/2017 05/04/2017
#2 2 aspirin 05/05/2017 05/10/2017
#3 2 aspirin 06/20/2017 06/27/2017
#4 2 aspirin 06/28/2017 04/30/2017
#5 2 tylenol 05/01/2017 05/15/2017
#6 3 lipitor 05/06/2017 05/12/2017
#7 5 advil 05/28/2017 06/13/2017
#8 5 advil 06/14/2017 06/20/2017
请注意,这里我们有两个连续的 ID=2
块(第 1+2 行和第 3+4 行),以及一个连续的 ID=5
块(第7+8).
输出是
# Merge rows of the same ID/Medication whose Start.Date is exactly one day
# after the previous row's Stop.Date; other columns keep the values of the
# first row of each run.
df %>%
  mutate_at(
    vars(ends_with("Date")),
    function(x) as.Date(x, format = "%m/%d/%Y")
  ) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  mutate(
    # TRUE where a row opens a new run: the first row of a group (no previous
    # Stop.Date) or a row that does not follow the previous one by one day.
    # Counting run starts, rather than "is consecutive" flags, keeps chains of
    # three or more intervals together and keeps unrelated rows apart.
    startsNewRun = is.na(lag(Stop.Date)) | Start.Date - lag(Stop.Date) != 1,
    grp = cumsum(startsNewRun)
  ) %>%
  group_by(ID, Medication, grp) %>%
  mutate(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
  slice(1) %>%
  ungroup() %>%
  select(-startsNewRun, -grp)
## A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2. aspirin 2017-05-01 2017-05-10
#2 2. aspirin 2017-06-20 2017-06-27
#3 2. tylenol 2017-05-01 2017-05-15
#4 3. lipitor 2017-05-06 2017-05-12
#5 5. advil 2017-05-28 2017-06-20
结果似乎很稳健。
library(tidyverse)
library(lubridate)

# Merge back-to-back intervals per ID and Medication: carry the first
# Start.Date of each run onto every row of that run, then keep only the last
# row of the run (which holds the run's Stop.Date). The result stays grouped
# by ID and Medication.
df %>%
  # Parse the dates by name before grouping, so the selection does not depend
  # on column positions.
  mutate_at(vars(Start.Date, Stop.Date), mdy) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  mutate(
    # A row opens a new run when it is the first of its group (gap is NA) or
    # when it does not start the day after the previous interval stopped.
    new_run = coalesce(Start.Date - lag(Stop.Date) != 1, TRUE),
    # Start.Date[new_run] holds one start per run; cumsum(new_run) is the run
    # number of each row, so every row receives the start of its own run.
    Start.Date = Start.Date[new_run][cumsum(new_run)],
    # A row closes its run when the next row opens one, or when it is last.
    s = lead(new_run, default = TRUE)
  ) %>%
  filter(s) %>%
  select(-new_run, -s)
# A tibble: 5 x 4
# Groups: ID, Medication [4]
ID Medication Start.Date Stop.Date
<dbl> <chr> <date> <date>
1 2 aspirin 2017-05-01 2017-05-10
2 2 aspirin 2017-06-20 2017-06-27
3 2 tylenol 2017-05-01 2017-05-15
4 3 lipitor 2017-05-06 2017-05-12
5 5 advil 2017-05-28 2017-06-13
我有一个包含 Start.Date 和 Stop.Date 的患者用药数据集,每条用药记录占一行。我想合并时间间隔前后相接的行,示例数据如下:
# Example data: one row per medication interval, dates stored as
# month/day/year strings.
ID <- c(2, 2, 2, 2, 3, 5)
Medication <- c(
  "aspirin", "aspirin", "aspirin",
  "tylenol", "lipitor", "advil"
)
Start.Date <- c(
  "05/01/2017", "05/05/2017", "06/20/2017",
  "05/01/2017", "05/06/2017", "05/28/2017"
)
Stop.Date <- c(
  "05/04/2017", "05/10/2017", "06/27/2017",
  "05/15/2017", "05/12/2017", "06/13/2017"
)
# Column names are taken from the vector names.
df <- data.frame(ID, Medication, Start.Date, Stop.Date)
ID Medication Start.Date Stop.Date
2 aspirin 05/01/2017 05/04/2017
2 aspirin 05/05/2017 05/10/2017
2 aspirin 06/20/2017 06/27/2017
2 tylenol 05/01/2017 05/15/2017
3 lipitor 05/06/2017 05/12/2017
5 advil 05/28/2017 06/13/2017
如果一个 Stop.Date 是下一个 Start.Date 的前一天,我想按 ID 和药物减少行数。它应该如下所示:
ID Medication Start.Date Stop.Date
2 aspirin 05/01/2017 05/10/2017
2 aspirin 06/20/2017 06/27/2017
2 tylenol 05/01/2017 05/15/2017
3 lipitor 05/06/2017 05/12/2017
5 advil 05/28/2017 06/13/2017
使用 mdy(来自 lubridate)将 'Start' 和 'Stop' 日期列转换为 Date 类,按 'ID'、'Medication' 分组,然后 filter 出 'Start.Date' 的 'lead' 与 'Stop.Date' 之差的绝对值(abs)不等于 1 的行
library(dplyr)
library(lubridate)

# Merge rows of the same ID/Medication whose Start.Date is exactly one day
# after the previous row's Stop.Date.
#
# Filtering on the lead/lag gap alone only drops rows: the surviving row keeps
# its own Start.Date, so the merged interval would start too late. Instead,
# label each run of back-to-back intervals and collapse every run to one row.
df %>%
  mutate_at(3:4, mdy) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  # A row opens a new run when it is the first of its group (gap is NA) or
  # when it does not start the day after the previous interval stopped.
  mutate(run = cumsum(coalesce(Start.Date - lag(Stop.Date) != 1, TRUE))) %>%
  group_by(ID, Medication, run) %>%
  summarise(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
  ungroup() %>%
  select(-run)
# A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2 aspirin 2017-05-01 2017-05-10
#2 2 aspirin 2017-06-20 2017-06-27
#3 2 tylenol 2017-05-01 2017-05-15
#4 3 lipitor 2017-05-06 2017-05-12
#5 5 advil 2017-05-28 2017-06-13
或使用 data.table
library(data.table)

# Merge back-to-back intervals per ID and Medication with data.table.
# mdy() comes from lubridate, which must be attached.
#
# The dates are parsed into a new table, so apart from setDT() turning df
# into a data.table by reference, df itself is left untouched.
setDT(df)
df[, .(ID, Medication,
       Start.Date = mdy(Start.Date),
       Stop.Date = mdy(Stop.Date))][
  # Runs are detected from neighbouring rows, so order intervals in time.
  order(ID, Medication, Start.Date)][
  # The first row of a group always opens a run; a later row opens one
  # unless it starts exactly one day after the previous row stopped.
  , run := cumsum(c(TRUE, Start.Date[-1L] - Stop.Date[-.N] != 1)),
  by = .(ID, Medication)][
  , .(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)),
  by = .(ID, Medication, run)][
  , run := NULL][]
# ID Medication Start.Date Stop.Date
#1: 2 aspirin 2017-05-01 2017-05-10
#2: 2 aspirin 2017-06-20 2017-06-27
#3: 2 tylenol 2017-05-01 2017-05-15
#4: 3 lipitor 2017-05-06 2017-05-12
#5: 5 advil 2017-05-28 2017-06-13
注意:我们可以像以前一样先将日期列转换为 Date
class
NOTE2: 都是基于OP提供的例子的简单方法
这个怎么样?
# Merge rows of the same ID/Medication whose Start.Date is exactly one day
# after the previous row's Stop.Date; other columns keep the values of the
# first row of each run.
df %>%
  mutate_at(
    vars(ends_with("Date")),
    function(x) as.Date(x, format = "%m/%d/%Y")
  ) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  mutate(
    # TRUE where a row opens a new run: the first row of a group (no previous
    # Stop.Date) or a row that does not follow the previous one by one day.
    # Counting run starts, rather than "is consecutive" flags, keeps chains of
    # three or more intervals together and keeps unrelated rows apart.
    startsNewRun = is.na(lag(Stop.Date)) | Start.Date - lag(Stop.Date) != 1,
    grp = cumsum(startsNewRun)
  ) %>%
  group_by(ID, Medication, grp) %>%
  mutate(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
  slice(1) %>%
  ungroup() %>%
  select(-startsNewRun, -grp)
## A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2. aspirin 2017-05-01 2017-05-10
#2 2. aspirin 2017-06-20 2017-06-27
#3 2. tylenol 2017-05-01 2017-05-15
#4 3. lipitor 2017-05-06 2017-05-12
#5 5. advil 2017-05-28 2017-06-13
最好再用几个例子来测试这个,以确保稳健性。让我们尝试一个更复杂的例子
# Extended example: eight intervals, with text columns stored as factors
# (levels in sorted order).
df <- data.frame(
  ID = c(2, 2, 2, 2, 2, 3, 5, 5),
  Medication = c(
    "aspirin", "aspirin", "aspirin", "aspirin",
    "tylenol", "lipitor", "advil", "advil"
  ),
  Start.Date = c(
    "05/01/2017", "05/05/2017", "06/20/2017", "06/28/2017",
    "05/01/2017", "05/06/2017", "05/28/2017", "06/14/2017"
  ),
  Stop.Date = c(
    "05/04/2017", "05/10/2017", "06/27/2017", "04/30/2017",
    "05/15/2017", "05/12/2017", "06/13/2017", "06/20/2017"
  ),
  stringsAsFactors = TRUE
)
# Print the data frame.
df
# ID Medication Start.Date Stop.Date
#1 2 aspirin 05/01/2017 05/04/2017
#2 2 aspirin 05/05/2017 05/10/2017
#3 2 aspirin 06/20/2017 06/27/2017
#4 2 aspirin 06/28/2017 04/30/2017
#5 2 tylenol 05/01/2017 05/15/2017
#6 3 lipitor 05/06/2017 05/12/2017
#7 5 advil 05/28/2017 06/13/2017
#8 5 advil 06/14/2017 06/20/2017
请注意,这里我们有两个连续的 ID=2
块(第 1+2 行和第 3+4 行),以及一个连续的 ID=5
块(第7+8).
输出是
# Merge rows of the same ID/Medication whose Start.Date is exactly one day
# after the previous row's Stop.Date; other columns keep the values of the
# first row of each run.
df %>%
  mutate_at(
    vars(ends_with("Date")),
    function(x) as.Date(x, format = "%m/%d/%Y")
  ) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  mutate(
    # TRUE where a row opens a new run: the first row of a group (no previous
    # Stop.Date) or a row that does not follow the previous one by one day.
    # Counting run starts, rather than "is consecutive" flags, keeps chains of
    # three or more intervals together and keeps unrelated rows apart.
    startsNewRun = is.na(lag(Stop.Date)) | Start.Date - lag(Stop.Date) != 1,
    grp = cumsum(startsNewRun)
  ) %>%
  group_by(ID, Medication, grp) %>%
  mutate(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
  slice(1) %>%
  ungroup() %>%
  select(-startsNewRun, -grp)
## A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2. aspirin 2017-05-01 2017-05-10
#2 2. aspirin 2017-06-20 2017-06-27
#3 2. tylenol 2017-05-01 2017-05-15
#4 3. lipitor 2017-05-06 2017-05-12
#5 5. advil 2017-05-28 2017-06-20
结果似乎很稳健。
library(tidyverse)
library(lubridate)

# Merge back-to-back intervals per ID and Medication: carry the first
# Start.Date of each run onto every row of that run, then keep only the last
# row of the run (which holds the run's Stop.Date). The result stays grouped
# by ID and Medication.
df %>%
  # Parse the dates by name before grouping, so the selection does not depend
  # on column positions.
  mutate_at(vars(Start.Date, Stop.Date), mdy) %>%
  # Runs are detected from neighbouring rows, so order intervals in time.
  arrange(ID, Medication, Start.Date) %>%
  group_by(ID, Medication) %>%
  mutate(
    # A row opens a new run when it is the first of its group (gap is NA) or
    # when it does not start the day after the previous interval stopped.
    new_run = coalesce(Start.Date - lag(Stop.Date) != 1, TRUE),
    # Start.Date[new_run] holds one start per run; cumsum(new_run) is the run
    # number of each row, so every row receives the start of its own run.
    Start.Date = Start.Date[new_run][cumsum(new_run)],
    # A row closes its run when the next row opens one, or when it is last.
    s = lead(new_run, default = TRUE)
  ) %>%
  filter(s) %>%
  select(-new_run, -s)
# A tibble: 5 x 4
# Groups: ID, Medication [4]
ID Medication Start.Date Stop.Date
<dbl> <chr> <date> <date>
1 2 aspirin 2017-05-01 2017-05-10
2 2 aspirin 2017-06-20 2017-06-27
3 2 tylenol 2017-05-01 2017-05-15
4 3 lipitor 2017-05-06 2017-05-12
5 5 advil 2017-05-28 2017-06-13