如何从 R 中的进入和退出日期最有效地计算每日入学人数?
How to most efficiently calculate daily enrollment from an entry and exit date in R?
以下代码有效,但似乎效率很低。有没有更直接的方法来计算从进入和退出日期开始按站点计算的每日注册人数。
数据:
df <- data.frame(
id <- seq_along(1:10),
entry_date <- seq(as.Date("2022-01-01"), as.Date("2022-01-10"), by = 1),
exit_date <- seq(as.Date("2022-05-20"), as.Date("2022-05-30"), by = 1),
site <- rep(c("Loc1", "Loc2"), times= 5)
) %>%
set_names("id", "entry_date", "exit_date", "site") %>%
mutate(exit_date = as.character(exit_date))
df[8:10, 3] <- rep("NA", times = 3)
创建一个 ytd 序列:
date <- seq(as.Date("2022-01-01"), today(), by = 1)
并用以下函数迭代下面的函数:
enrolled_ytd <- map_dfr(date, ~ daily_enrollment_fun(.), .id = "date")
函数:
daily_enrollment_fun <- function(date){
df %>%
select(id, entry_date, exit_date, site) %>%
drop_na(site) %>%
mutate(enrolled_int = interval((entry_date), (exit_date))) %>%
distinct(id, .keep_all = T) %>%
mutate(enrolled = date %within% enrolled_int)
}
输出是一个数据框,每个 ID 和日期序列中的每一天都有一个 TRUE/FALSE 注册。
为了清理数据,我 运行 函数:
daily_enrollment_clean_fun <- function(date, origin = "2022-01-01") {
enrolled_ytd %>%
mutate(date = as.numeric(date) -1,
date = as.Date(date, origin = origin)) %>%
group_by(date, site, enrolled) %>%
count() %>%
filter(enrolled == "TRUE") %>%
ungroup() %>%
select(-enrolled) %>%
arrange(desc(date))
}
这能满足您的需求吗?我希望这应该更高效,因为一旦我们将进入流和退出流变成更长的形式,这里的计算就会被矢量化。
library(tidyverse)
df %>%
pivot_longer(entry_date:exit_date) %>%
filter(!is.na(value)) %>%
mutate(change = if_else(name == "entry_date", 1, -1)) %>%
group_by(site) %>%
arrange(site, value) %>%
mutate(enrollment = cumsum(change)) %>%
complete(value = seq.Date(min(value, na.rm = TRUE),
max(value, na.rm = TRUE),
by = "day")) %>%
fill(enrollment) %>%
ungroup()
结果为 ggplot %>% ggplot(aes(value, enrollment, color = site)) + geom_point()
修改输入(将两个日期列都保留为日期)
library(tidyverse)
df <- data.frame(
id <- seq_along(1:10),
entry_date <- seq(as.Date("2022-01-01"), as.Date("2022-01-10"), by = 1),
exit_date <- seq(as.Date("2022-05-21"), as.Date("2022-05-30"), by = 1),
site <- rep(c("Loc1", "Loc2"), times= 5)
) %>%
set_names("id", "entry_date", "exit_date", "site")
df[8:10, 3] <- rep(NA_real_, times = 3)
这是使用 data.table
的有效方法
- setDT()
- 更新缺少 exit_date 以等于
date
中的最终日期
- 加入 id 和日期值的组合 (
CJ
),按 id
- 将注册人数设置为
date
属于进入和退出日期的行的总和,按日期和地点
library(data.table)
setDT(df)[is.na(exit_date), exit_date:=max(date)][CJ(date,id=df$id), on=.(id)][
, .(enrollment= sum(date>=entry_date & date<=exit_date)), by=.(date,site)]
输出:
date site enrollment
1: 2022-01-01 Loc1 1
2: 2022-01-01 Loc2 0
3: 2022-01-02 Loc1 1
4: 2022-01-02 Loc2 1
5: 2022-01-03 Loc1 2
---
286: 2022-05-23 Loc2 4
287: 2022-05-24 Loc1 3
288: 2022-05-24 Loc2 4
289: 2022-05-25 Loc1 3
290: 2022-05-25 Loc2 3
输入数据:
library(lubridate)
df <- data.frame(
id <- seq_along(1:10),
entry_date <- seq(as.Date("2022-01-01"), as.Date("2022-01-10"), by = 1),
exit_date <- seq(as.Date("2022-05-21"), as.Date("2022-05-30"), by = 1),
site <- rep(c("Loc1", "Loc2"), times= 5)
) %>%
set_names("id", "entry_date", "exit_date", "site") %>%
mutate(exit_date = as.character(exit_date))
df[8:10, 3] <- rep("NA", times = 3)
date <- seq(as.Date("2022-01-01"), lubridate::today(), by = 1
以下代码有效,但似乎效率很低。有没有更直接的方法来计算从进入和退出日期开始按站点计算的每日注册人数。
数据:
df <- data.frame(
id <- seq_along(1:10),
entry_date <- seq(as.Date("2022-01-01"), as.Date("2022-01-10"), by = 1),
exit_date <- seq(as.Date("2022-05-20"), as.Date("2022-05-30"), by = 1),
site <- rep(c("Loc1", "Loc2"), times= 5)
) %>%
set_names("id", "entry_date", "exit_date", "site") %>%
mutate(exit_date = as.character(exit_date))
df[8:10, 3] <- rep("NA", times = 3)
创建一个 ytd 序列:
date <- seq(as.Date("2022-01-01"), today(), by = 1)
并用以下函数迭代下面的函数:
enrolled_ytd <- map_dfr(date, ~ daily_enrollment_fun(.), .id = "date")
函数:
daily_enrollment_fun <- function(date){
df %>%
select(id, entry_date, exit_date, site) %>%
drop_na(site) %>%
mutate(enrolled_int = interval((entry_date), (exit_date))) %>%
distinct(id, .keep_all = T) %>%
mutate(enrolled = date %within% enrolled_int)
}
输出是一个数据框,每个 ID 和日期序列中的每一天都有一个 TRUE/FALSE 注册。 为了清理数据,我 运行 函数:
daily_enrollment_clean_fun <- function(date, origin = "2022-01-01") {
enrolled_ytd %>%
mutate(date = as.numeric(date) -1,
date = as.Date(date, origin = origin)) %>%
group_by(date, site, enrolled) %>%
count() %>%
filter(enrolled == "TRUE") %>%
ungroup() %>%
select(-enrolled) %>%
arrange(desc(date))
}
这能满足您的需求吗?我希望这应该更高效,因为一旦我们将进入流和退出流变成更长的形式,这里的计算就会被矢量化。
library(tidyverse)
df %>%
pivot_longer(entry_date:exit_date) %>%
filter(!is.na(value)) %>%
mutate(change = if_else(name == "entry_date", 1, -1)) %>%
group_by(site) %>%
arrange(site, value) %>%
mutate(enrollment = cumsum(change)) %>%
complete(value = seq.Date(min(value, na.rm = TRUE),
max(value, na.rm = TRUE),
by = "day")) %>%
fill(enrollment) %>%
ungroup()
结果为 ggplot %>% ggplot(aes(value, enrollment, color = site)) + geom_point()
修改输入(将两个日期列都保留为日期)
library(tidyverse)
df <- data.frame(
id <- seq_along(1:10),
entry_date <- seq(as.Date("2022-01-01"), as.Date("2022-01-10"), by = 1),
exit_date <- seq(as.Date("2022-05-21"), as.Date("2022-05-30"), by = 1),
site <- rep(c("Loc1", "Loc2"), times= 5)
) %>%
set_names("id", "entry_date", "exit_date", "site")
df[8:10, 3] <- rep(NA_real_, times = 3)
这是使用 data.table
- setDT()
- 更新缺少 exit_date 以等于
date
中的最终日期
- 加入 id 和日期值的组合 (
CJ
),按 id - 将注册人数设置为
date
属于进入和退出日期的行的总和,按日期和地点
library(data.table)
setDT(df)[is.na(exit_date), exit_date:=max(date)][CJ(date,id=df$id), on=.(id)][
, .(enrollment= sum(date>=entry_date & date<=exit_date)), by=.(date,site)]
输出:
date site enrollment
1: 2022-01-01 Loc1 1
2: 2022-01-01 Loc2 0
3: 2022-01-02 Loc1 1
4: 2022-01-02 Loc2 1
5: 2022-01-03 Loc1 2
---
286: 2022-05-23 Loc2 4
287: 2022-05-24 Loc1 3
288: 2022-05-24 Loc2 4
289: 2022-05-25 Loc1 3
290: 2022-05-25 Loc2 3
输入数据:
library(lubridate)
df <- data.frame(
id <- seq_along(1:10),
entry_date <- seq(as.Date("2022-01-01"), as.Date("2022-01-10"), by = 1),
exit_date <- seq(as.Date("2022-05-21"), as.Date("2022-05-30"), by = 1),
site <- rep(c("Loc1", "Loc2"), times= 5)
) %>%
set_names("id", "entry_date", "exit_date", "site") %>%
mutate(exit_date = as.character(exit_date))
df[8:10, 3] <- rep("NA", times = 3)
date <- seq(as.Date("2022-01-01"), lubridate::today(), by = 1