计算 R 中的重叠日期 (dplyr)
Calculating overlapping dates in R (dplyr)
我有一个个人数据集 (CSN),每个人在入院期间都进行过从零到多次的干预(在本例中,中心线放置),每个都有开始和结束日期。我正在尝试生成一个新的日期范围来计算任何重叠的日期。换句话说,我正在尝试计算一个人有中心线时的总日期范围。
数据举例:
structure(list(CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), First_day = structure(c(1514937600,
1514937600, 1515024000, 1515024000, 1515110400, 1515974400, 1516147200,
1516147200, 1516147200, 1516233600, 1516233600, 1517097600, 1517097600,
1517702400, 1517356800, 1518220800, 1519257600, 1519948800, 1520812800,
1521504000, 1522022400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Last_day = structure(c(1515628800, 1515110400, 1515542400,
1515542400, 1515628800, 1516579200, 1516320000, 1517184000, 1516233600,
1517184000, 1517702400, 1517184000, 1517616000, 1517702400, 1518220800,
1518825600, 1519689600, 1520812800, 1521763200, 1522108800, 1522108800
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
理想情况下,输出将 return 所有重叠日期的单个日期范围,但如果每个日期都错过了一段日子,则会创建一个新的间隔。因此,对于第 1 组,第 1-5 行的开始 = 2018-01-03 和结束 = 2018-01-11,但是第 6 行的开始 = 2018-01-15 和结束 = 2018-01-22 .
我已尝试执行以下操作:
df %>%
arrange(CSN_id, First_day) %>%
mutate(First_day = ymd(First_day),
Last_day = ymd(Last_day),
start = ymd("1970-01-01"),
end = ymd("1970-01-01")) %>%
group_by(CSN_id) %>%
rowwise() %>%
mutate(test = if_else(row_number() == 1, interval(First_day, Last_day), interval(lag(start), lag(end))),
start = if_else(row_number() == 1, First_day,
if_else(First_day <= lag(end), lag(First_day), First_day)),
end = if_else(row_number() == 1, Last_day,
if_else(Last_day %within% lag(test) == TRUE, lag(end), Last_day)))
但是,我认为滞后函数没有按预期工作,并且出于某种原因它总是 returns Last_day。我尝试摆脱 rowwise,但随后间隔变得混乱(一直停留在 1970 年代)。
我得到的输出是:
structure(list(CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), First_day = structure(c(17534,
17534, 17535, 17535, 17536, 17546, 17548, 17548, 17548, 17549,
17549, 17559, 17559, 17566, 17562, 17572, 17584, 17592, 17602,
17610, 17616), class = "Date"), Last_day = structure(c(17542,
17536, 17541, 17541, 17542, 17553, 17550, 17560, 17549, 17560,
17566, 17560, 17565, 17566, 17572, 17579, 17589, 17602, 17613,
17617, 17617), class = "Date"), start = structure(c(17534, 17534,
17535, 17535, 17536, 17546, 17548, 17548, 17548, 17549, 17549,
17559, 17559, 17566, 17562, 17572, 17584, 17592, 17602, 17610,
17616), class = "Date"), end = structure(c(17542, 17536, 17541,
17541, 17542, 17553, 17550, 17560, 17549, 17560, 17566, 17560,
17565, 17566, 17572, 17579, 17589, 17602, 17613, 17617, 17617
), class = "Date"), test = new("Interval", .Data = c(691200,
172800, 518400, 518400, 518400, 604800, 172800, 1036800, 86400,
950400, 1468800, 86400, 518400, 0, 864000, 604800, 432000, 864000,
950400, 604800, 86400), start = structure(c(1514937600, 1514937600,
1515024000, 1515024000, 1515110400, 1515974400, 1516147200, 1516147200,
1516147200, 1516233600, 1516233600, 1517097600, 1517097600, 1517702400,
1517356800, 1518220800, 1519257600, 1519948800, 1520812800, 1521504000,
1522022400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
tzone = "UTC")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -21L), groups = structure(list(
CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame")))
我有什么明显的遗漏吗?任何帮助将不胜感激!
我不太确定你想要的输出是什么,但你可以试试这个方法:
dat %>%
arrange(CSN_id,First_day,Last_day) %>%
group_by(CSN_id, First_day) %>%
summarize(Last_day=max(Last_day,na.rm=T)) %>%
mutate(interval=as.numeric(First_day- lag(Last_day))>0,
interval=cumsum(if_else(is.na(interval),FALSE,interval))+1) %>%
group_by(CSN_id,interval) %>%
summarize(start = min(First_day),
end = max(Last_day))
输出:
CSN_id interval start end
<int> <dbl> <dttm> <dttm>
1 1 1 2018-01-03 00:00:00 2018-01-11 00:00:00
2 1 2 2018-01-15 00:00:00 2018-01-22 00:00:00
3 2 1 2018-01-17 00:00:00 2018-01-19 00:00:00
4 3 1 2018-01-17 00:00:00 2018-02-04 00:00:00
5 3 2 2018-02-04 00:00:00 2018-02-04 00:00:00
6 4 1 2018-01-31 00:00:00 2018-02-17 00:00:00
7 4 2 2018-02-22 00:00:00 2018-02-27 00:00:00
8 4 3 2018-03-02 00:00:00 2018-03-27 00:00:00
如果您希望保留所有原始行,并且所有日期都是日期而不是日期时间,您也可以这样做:
dat %>%
mutate(across(First_day:Last_day, ~as.Date(.x))) %>%
arrange(CSN_id,First_day,Last_day) %>%
group_by(CSN_id) %>%
mutate(interval=as.numeric(First_day- lag(Last_day))>0,
interval=cumsum(if_else(is.na(interval),FALSE,interval))+1) %>%
group_by(CSN_id,interval) %>%
mutate(start = min(First_day),
end = max(Last_day))
输出:
CSN_id First_day Last_day interval start end
<int> <date> <date> <dbl> <date> <date>
1 1 2018-01-03 2018-01-05 1 2018-01-03 2018-01-11
2 1 2018-01-03 2018-01-11 1 2018-01-03 2018-01-11
3 1 2018-01-04 2018-01-10 1 2018-01-03 2018-01-11
4 1 2018-01-04 2018-01-10 1 2018-01-03 2018-01-11
5 1 2018-01-05 2018-01-11 1 2018-01-03 2018-01-11
6 1 2018-01-15 2018-01-22 2 2018-01-15 2018-01-22
7 2 2018-01-17 2018-01-19 1 2018-01-17 2018-01-19
8 3 2018-01-17 2018-01-18 1 2018-01-17 2018-02-04
9 3 2018-01-17 2018-01-29 1 2018-01-17 2018-02-04
10 3 2018-01-18 2018-01-29 1 2018-01-17 2018-02-04
# ... with 11 more rows
这是另一个使用 IRanges
package on Bioconductor. The collapse_date_ranges
function is taken from 的选项,我只是根据
进行了调整
library(data.table)
library(tidyverse)
collapse_date_ranges <- function(w, min.gapwidth = 1L) {
IRanges::IRanges(start = as.integer(as.Date(w$First_day)),
end = as.integer(as.Date(w$Last_day))) %>%
IRanges::reduce(min.gapwidth = min.gapwidth) %>%
as.data.table() %>%
.[, lapply(.SD, lubridate::as_date),
.SDcols = c("start", "end")]
}
split(df, df$CSN_id) %>%
map(., ~collapse_date_ranges(., 0L)) %>%
bind_rows(., .id = 'id')
输出
id start end
1: 1 2018-01-03 2018-01-11
2: 1 2018-01-15 2018-01-22
3: 2 2018-01-17 2018-01-19
4: 3 2018-01-17 2018-02-04
5: 4 2018-01-31 2018-02-17
6: 4 2018-02-22 2018-02-27
7: 4 2018-03-02 2018-03-27
如果你想在原始数据框中包含这个,那么我们可以将数据连接回原始数据框,然后使用 fill
将日期添加到每一行。
split(df, df$CSN_id) %>%
map(., ~collapse_date_ranges(., 0L)) %>%
bind_rows(., .id = 'CSN_id2') %>%
data.frame %>%
mutate(CSN_id2 = as.integer(CSN_id2)) %>%
full_join(df, ., by = c("CSN_id" = "CSN_id2", "First_day" = "start"), keep = TRUE) %>%
select(-CSN_id2) %>%
group_by(CSN_id) %>%
fill(start, end, .direction = "down")
输出
CSN_id First_day Last_day start end
<int> <dttm> <dttm> <date> <date>
1 1 2018-01-03 00:00:00 2018-01-11 00:00:00 2018-01-03 2018-01-11
2 1 2018-01-03 00:00:00 2018-01-05 00:00:00 2018-01-03 2018-01-11
3 1 2018-01-04 00:00:00 2018-01-10 00:00:00 2018-01-03 2018-01-11
4 1 2018-01-04 00:00:00 2018-01-10 00:00:00 2018-01-03 2018-01-11
5 1 2018-01-05 00:00:00 2018-01-11 00:00:00 2018-01-03 2018-01-11
6 1 2018-01-15 00:00:00 2018-01-22 00:00:00 2018-01-15 2018-01-22
7 2 2018-01-17 00:00:00 2018-01-19 00:00:00 2018-01-17 2018-01-19
8 3 2018-01-17 00:00:00 2018-01-29 00:00:00 2018-01-17 2018-02-04
9 3 2018-01-17 00:00:00 2018-01-18 00:00:00 2018-01-17 2018-02-04
10 3 2018-01-18 00:00:00 2018-01-29 00:00:00 2018-01-17 2018-02-04
# … with 11 more rows
我有一个个人数据集 (CSN),每个人在入院期间都进行过从零到多次的干预(在本例中,中心线放置),每个都有开始和结束日期。我正在尝试生成一个新的日期范围来计算任何重叠的日期。换句话说,我正在尝试计算一个人有中心线时的总日期范围。
数据举例:
structure(list(CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), First_day = structure(c(1514937600,
1514937600, 1515024000, 1515024000, 1515110400, 1515974400, 1516147200,
1516147200, 1516147200, 1516233600, 1516233600, 1517097600, 1517097600,
1517702400, 1517356800, 1518220800, 1519257600, 1519948800, 1520812800,
1521504000, 1522022400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Last_day = structure(c(1515628800, 1515110400, 1515542400,
1515542400, 1515628800, 1516579200, 1516320000, 1517184000, 1516233600,
1517184000, 1517702400, 1517184000, 1517616000, 1517702400, 1518220800,
1518825600, 1519689600, 1520812800, 1521763200, 1522108800, 1522108800
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-21L), class = c("tbl_df", "tbl", "data.frame"))
理想情况下,输出将 return 所有重叠日期的单个日期范围,但如果每个日期都错过了一段日子,则会创建一个新的间隔。因此,对于第 1 组,第 1-5 行的开始 = 2018-01-03 和结束 = 2018-01-11,但是第 6 行的开始 = 2018-01-15 和结束 = 2018-01-22 .
我已尝试执行以下操作:
df %>%
arrange(CSN_id, First_day) %>%
mutate(First_day = ymd(First_day),
Last_day = ymd(Last_day),
start = ymd("1970-01-01"),
end = ymd("1970-01-01")) %>%
group_by(CSN_id) %>%
rowwise() %>%
mutate(test = if_else(row_number() == 1, interval(First_day, Last_day), interval(lag(start), lag(end))),
start = if_else(row_number() == 1, First_day,
if_else(First_day <= lag(end), lag(First_day), First_day)),
end = if_else(row_number() == 1, Last_day,
if_else(Last_day %within% lag(test) == TRUE, lag(end), Last_day)))
但是,我认为滞后函数没有按预期工作,并且出于某种原因它总是 returns Last_day。我尝试摆脱 rowwise,但随后间隔变得混乱(一直停留在 1970 年代)。
我得到的输出是:
structure(list(CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), First_day = structure(c(17534,
17534, 17535, 17535, 17536, 17546, 17548, 17548, 17548, 17549,
17549, 17559, 17559, 17566, 17562, 17572, 17584, 17592, 17602,
17610, 17616), class = "Date"), Last_day = structure(c(17542,
17536, 17541, 17541, 17542, 17553, 17550, 17560, 17549, 17560,
17566, 17560, 17565, 17566, 17572, 17579, 17589, 17602, 17613,
17617, 17617), class = "Date"), start = structure(c(17534, 17534,
17535, 17535, 17536, 17546, 17548, 17548, 17548, 17549, 17549,
17559, 17559, 17566, 17562, 17572, 17584, 17592, 17602, 17610,
17616), class = "Date"), end = structure(c(17542, 17536, 17541,
17541, 17542, 17553, 17550, 17560, 17549, 17560, 17566, 17560,
17565, 17566, 17572, 17579, 17589, 17602, 17613, 17617, 17617
), class = "Date"), test = new("Interval", .Data = c(691200,
172800, 518400, 518400, 518400, 604800, 172800, 1036800, 86400,
950400, 1468800, 86400, 518400, 0, 864000, 604800, 432000, 864000,
950400, 604800, 86400), start = structure(c(1514937600, 1514937600,
1515024000, 1515024000, 1515110400, 1515974400, 1516147200, 1516147200,
1516147200, 1516233600, 1516233600, 1517097600, 1517097600, 1517702400,
1517356800, 1518220800, 1519257600, 1519948800, 1520812800, 1521504000,
1522022400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
tzone = "UTC")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -21L), groups = structure(list(
CSN_id = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -21L), class = c("tbl_df",
"tbl", "data.frame")))
我有什么明显的遗漏吗?任何帮助将不胜感激!
我不太确定你想要的输出是什么,但你可以试试这个方法:
dat %>%
arrange(CSN_id,First_day,Last_day) %>%
group_by(CSN_id, First_day) %>%
summarize(Last_day=max(Last_day,na.rm=T)) %>%
mutate(interval=as.numeric(First_day- lag(Last_day))>0,
interval=cumsum(if_else(is.na(interval),FALSE,interval))+1) %>%
group_by(CSN_id,interval) %>%
summarize(start = min(First_day),
end = max(Last_day))
输出:
CSN_id interval start end
<int> <dbl> <dttm> <dttm>
1 1 1 2018-01-03 00:00:00 2018-01-11 00:00:00
2 1 2 2018-01-15 00:00:00 2018-01-22 00:00:00
3 2 1 2018-01-17 00:00:00 2018-01-19 00:00:00
4 3 1 2018-01-17 00:00:00 2018-02-04 00:00:00
5 3 2 2018-02-04 00:00:00 2018-02-04 00:00:00
6 4 1 2018-01-31 00:00:00 2018-02-17 00:00:00
7 4 2 2018-02-22 00:00:00 2018-02-27 00:00:00
8 4 3 2018-03-02 00:00:00 2018-03-27 00:00:00
如果您希望保留所有原始行,并且所有日期都是日期而不是日期时间,您也可以这样做:
dat %>%
mutate(across(First_day:Last_day, ~as.Date(.x))) %>%
arrange(CSN_id,First_day,Last_day) %>%
group_by(CSN_id) %>%
mutate(interval=as.numeric(First_day- lag(Last_day))>0,
interval=cumsum(if_else(is.na(interval),FALSE,interval))+1) %>%
group_by(CSN_id,interval) %>%
mutate(start = min(First_day),
end = max(Last_day))
输出:
CSN_id First_day Last_day interval start end
<int> <date> <date> <dbl> <date> <date>
1 1 2018-01-03 2018-01-05 1 2018-01-03 2018-01-11
2 1 2018-01-03 2018-01-11 1 2018-01-03 2018-01-11
3 1 2018-01-04 2018-01-10 1 2018-01-03 2018-01-11
4 1 2018-01-04 2018-01-10 1 2018-01-03 2018-01-11
5 1 2018-01-05 2018-01-11 1 2018-01-03 2018-01-11
6 1 2018-01-15 2018-01-22 2 2018-01-15 2018-01-22
7 2 2018-01-17 2018-01-19 1 2018-01-17 2018-01-19
8 3 2018-01-17 2018-01-18 1 2018-01-17 2018-02-04
9 3 2018-01-17 2018-01-29 1 2018-01-17 2018-02-04
10 3 2018-01-18 2018-01-29 1 2018-01-17 2018-02-04
# ... with 11 more rows
这是另一个使用 IRanges
package on Bioconductor. The collapse_date_ranges
function is taken from
library(data.table)
library(tidyverse)
collapse_date_ranges <- function(w, min.gapwidth = 1L) {
IRanges::IRanges(start = as.integer(as.Date(w$First_day)),
end = as.integer(as.Date(w$Last_day))) %>%
IRanges::reduce(min.gapwidth = min.gapwidth) %>%
as.data.table() %>%
.[, lapply(.SD, lubridate::as_date),
.SDcols = c("start", "end")]
}
split(df, df$CSN_id) %>%
map(., ~collapse_date_ranges(., 0L)) %>%
bind_rows(., .id = 'id')
输出
id start end
1: 1 2018-01-03 2018-01-11
2: 1 2018-01-15 2018-01-22
3: 2 2018-01-17 2018-01-19
4: 3 2018-01-17 2018-02-04
5: 4 2018-01-31 2018-02-17
6: 4 2018-02-22 2018-02-27
7: 4 2018-03-02 2018-03-27
如果你想在原始数据框中包含这个,那么我们可以将数据连接回原始数据框,然后使用 fill
将日期添加到每一行。
split(df, df$CSN_id) %>%
map(., ~collapse_date_ranges(., 0L)) %>%
bind_rows(., .id = 'CSN_id2') %>%
data.frame %>%
mutate(CSN_id2 = as.integer(CSN_id2)) %>%
full_join(df, ., by = c("CSN_id" = "CSN_id2", "First_day" = "start"), keep = TRUE) %>%
select(-CSN_id2) %>%
group_by(CSN_id) %>%
fill(start, end, .direction = "down")
输出
CSN_id First_day Last_day start end
<int> <dttm> <dttm> <date> <date>
1 1 2018-01-03 00:00:00 2018-01-11 00:00:00 2018-01-03 2018-01-11
2 1 2018-01-03 00:00:00 2018-01-05 00:00:00 2018-01-03 2018-01-11
3 1 2018-01-04 00:00:00 2018-01-10 00:00:00 2018-01-03 2018-01-11
4 1 2018-01-04 00:00:00 2018-01-10 00:00:00 2018-01-03 2018-01-11
5 1 2018-01-05 00:00:00 2018-01-11 00:00:00 2018-01-03 2018-01-11
6 1 2018-01-15 00:00:00 2018-01-22 00:00:00 2018-01-15 2018-01-22
7 2 2018-01-17 00:00:00 2018-01-19 00:00:00 2018-01-17 2018-01-19
8 3 2018-01-17 00:00:00 2018-01-29 00:00:00 2018-01-17 2018-02-04
9 3 2018-01-17 00:00:00 2018-01-18 00:00:00 2018-01-17 2018-02-04
10 3 2018-01-18 00:00:00 2018-01-29 00:00:00 2018-01-17 2018-02-04
# … with 11 more rows