对于日期列表中的每个日期,统计有多少个时间间隔包含该日期
Count, for a list of dates, how many time intervals contain each date
即使看了 Stack Overflow 上的几个答案,我也无法解决这个问题。
我有一个包含日期列表和间隔列表的数据集。
对于给定的日期,我需要找出有多少个间隔包含该日期。
我可以找到几个关于间隔中包含多少日期的问题,但这不是我要找的。
这是一个可重现的例子
# Reproducible example: 29 rows, each with an ID and a date, plus a
# start/end date pair that is NA on the rows that have none.
d <- data.frame(
  ID = c(
    80, 736, 54, 259, 826, 446, 950, 841, 433, 518,
    1357, 3686, 4042, 749, 2716, 4568, 1424, 332, 1000, 575,
    1815, 3074, 3768, 932, 4, 3872, 2033, 2495, 3310
  ),
  date = ymd(c(
    "2022-02-20", "2022-02-21", "2022-02-22", "2022-02-23",
    "2022-02-24", "2022-02-25", "2022-02-26", "2022-02-27",
    "2022-02-28", "2022-03-01", "2022-03-02", "2022-03-02",
    "2022-03-03", "2022-03-04", "2022-03-05", "2022-03-05",
    "2022-03-06", "2022-03-07", "2022-03-08", "2022-03-09",
    "2022-03-10", "2022-03-10", "2022-03-10", "2022-03-11",
    "2022-03-12", "2022-03-12", "2022-03-13", "2022-03-13",
    "2022-03-13"
  )),
  start.date = ymd(c(
    rep(NA, 10),
    "2022-03-02", "2022-03-02", "2022-03-03", "2022-03-04",
    "2022-03-05", "2022-03-05", "2022-03-06", NA,
    "2022-03-08", "2022-03-09", "2022-03-10", "2022-03-10",
    "2022-03-10", NA, "2022-03-12", "2022-03-12",
    "2022-03-13", "2022-03-13", "2022-03-13"
  )),
  end.date = ymd(c(
    rep(NA, 10),
    "2022-03-15", "2022-03-10", "2022-03-07", "2022-03-14",
    "2022-03-29", "2022-03-17", "2022-03-21", NA,
    "2022-03-27", "2022-03-16", "2022-03-16", "2022-03-24",
    "2022-03-18", NA, "2022-03-22", "2022-03-18",
    "2022-03-22", "2022-03-30", "2022-03-19"
  ))
)

# Fold the two endpoint columns into a single lubridate interval column and
# drop the originals.
d <- d %>%
  mutate(interval = interval(start.date, end.date)) %>%
  select(-c(start.date, end.date))
我想得到的是,在一个新列中,对于每个日期,包含该日期的间隔数。
我通常使用 dplyr-lubridate,我试图用 purrr 解决这个问题但没有成功。
有什么建议吗?
谢谢
编辑:我尝试使用如下所示的 purrr 解决方案
# Asker's attempt: for each element of `interval`, sum how many entries of
# `.$date` (the piped-in data frame's dates) fall %within% it. This yields
# dates per interval, not intervals per date.
d %>% mutate(dates_in_intv = map_int(interval, function(x) sum(.$date %within% x)))
这是计算一个间隔跨越多少个日期,但我需要的是
# Pseudo-code for the desired result: for each `date`, count the entries of
# `.$interval` that contain it. NOTE: `"contains"` is a placeholder, not a
# real operator, so this line does not parse as written.
d %>% mutate(intv_contains_dates= map_int(date, function(x) sum(.$interval "contains" x)))
这是您要找的吗?
library(tidyverse)
library(lubridate)
library(ivs)
# Example data: 29 rows, each with an ID and a date, plus a start/end date
# pair that is NA on the rows that have none.
d <- data.frame(
  ID = c(
    80, 736, 54, 259, 826, 446, 950, 841, 433, 518,
    1357, 3686, 4042, 749, 2716, 4568, 1424, 332, 1000, 575,
    1815, 3074, 3768, 932, 4, 3872, 2033, 2495, 3310
  ),
  date = ymd(c(
    "2022-02-20", "2022-02-21", "2022-02-22", "2022-02-23",
    "2022-02-24", "2022-02-25", "2022-02-26", "2022-02-27",
    "2022-02-28", "2022-03-01", "2022-03-02", "2022-03-02",
    "2022-03-03", "2022-03-04", "2022-03-05", "2022-03-05",
    "2022-03-06", "2022-03-07", "2022-03-08", "2022-03-09",
    "2022-03-10", "2022-03-10", "2022-03-10", "2022-03-11",
    "2022-03-12", "2022-03-12", "2022-03-13", "2022-03-13",
    "2022-03-13"
  )),
  start.date = ymd(c(
    rep(NA, 10),
    "2022-03-02", "2022-03-02", "2022-03-03", "2022-03-04",
    "2022-03-05", "2022-03-05", "2022-03-06", NA,
    "2022-03-08", "2022-03-09", "2022-03-10", "2022-03-10",
    "2022-03-10", NA, "2022-03-12", "2022-03-12",
    "2022-03-13", "2022-03-13", "2022-03-13"
  )),
  end.date = ymd(c(
    rep(NA, 10),
    "2022-03-15", "2022-03-10", "2022-03-07", "2022-03-14",
    "2022-03-29", "2022-03-17", "2022-03-21", NA,
    "2022-03-27", "2022-03-16", "2022-03-16", "2022-03-24",
    "2022-03-18", NA, "2022-03-22", "2022-03-18",
    "2022-03-22", "2022-03-30", "2022-03-19"
  ))
)
# Build an ivs interval vector from the start/end columns, then count, for
# every `date`, how many of those intervals it falls in.
# NOTE(review): the printed `iv` column shows half-open `[start, end)` ranges,
# so a date equal to an interval's `end.date` is not counted for that
# interval -- confirm this matches the intended "contains" semantics.
d %>%
  mutate(iv = iv(start.date, end.date)) %>%
  mutate(count = iv_count_between(date, iv))
#> ID date start.date end.date iv count
#> 1 80 2022-02-20 <NA> <NA> [NA, NA) 0
#> 2 736 2022-02-21 <NA> <NA> [NA, NA) 0
#> 3 54 2022-02-22 <NA> <NA> [NA, NA) 0
#> 4 259 2022-02-23 <NA> <NA> [NA, NA) 0
#> 5 826 2022-02-24 <NA> <NA> [NA, NA) 0
#> 6 446 2022-02-25 <NA> <NA> [NA, NA) 0
#> 7 950 2022-02-26 <NA> <NA> [NA, NA) 0
#> 8 841 2022-02-27 <NA> <NA> [NA, NA) 0
#> 9 433 2022-02-28 <NA> <NA> [NA, NA) 0
#> 10 518 2022-03-01 <NA> <NA> [NA, NA) 0
#> 11 1357 2022-03-02 2022-03-02 2022-03-15 [2022-03-02, 2022-03-15) 2
#> 12 3686 2022-03-02 2022-03-02 2022-03-10 [2022-03-02, 2022-03-10) 2
#> 13 4042 2022-03-03 2022-03-03 2022-03-07 [2022-03-03, 2022-03-07) 3
#> 14 749 2022-03-04 2022-03-04 2022-03-14 [2022-03-04, 2022-03-14) 4
#> 15 2716 2022-03-05 2022-03-05 2022-03-29 [2022-03-05, 2022-03-29) 6
#> 16 4568 2022-03-05 2022-03-05 2022-03-17 [2022-03-05, 2022-03-17) 6
#> 17 1424 2022-03-06 2022-03-06 2022-03-21 [2022-03-06, 2022-03-21) 7
#> 18 332 2022-03-07 <NA> <NA> [NA, NA) 6
#> 19 1000 2022-03-08 2022-03-08 2022-03-27 [2022-03-08, 2022-03-27) 7
#> 20 575 2022-03-09 2022-03-09 2022-03-16 [2022-03-09, 2022-03-16) 8
#> 21 1815 2022-03-10 2022-03-10 2022-03-16 [2022-03-10, 2022-03-16) 10
#> 22 3074 2022-03-10 2022-03-10 2022-03-24 [2022-03-10, 2022-03-24) 10
#> 23 3768 2022-03-10 2022-03-10 2022-03-18 [2022-03-10, 2022-03-18) 10
#> 24 932 2022-03-11 <NA> <NA> [NA, NA) 10
#> 25 4 2022-03-12 2022-03-12 2022-03-22 [2022-03-12, 2022-03-22) 12
#> 26 3872 2022-03-12 2022-03-12 2022-03-18 [2022-03-12, 2022-03-18) 12
#> 27 2033 2022-03-13 2022-03-13 2022-03-22 [2022-03-13, 2022-03-22) 15
#> 28 2495 2022-03-13 2022-03-13 2022-03-30 [2022-03-13, 2022-03-30) 15
#> 29 3310 2022-03-13 2022-03-13 2022-03-19 [2022-03-13, 2022-03-19) 15
由 reprex package (v2.0.1)
于 2022-05-28 创建
还有一个 purrr 方法:
# Keep only rows that define a complete period. Rows missing either endpoint
# are dropped: an NA bound makes the comparison below NA, which would turn the
# summed count of the affected dates into NA.
period_df <- d |>
  select(start.date, end.date) |>
  drop_na(start.date, end.date)

# The set of dates to score is the same for every period, so build it once
# instead of recomputing it inside the loop.
unique_dates <- d |>
  distinct(date)

# For each period, flag (1/0) every distinct date lying within [start, end]
# (both ends inclusive), stack the per-period results, then add the flags up
# per date and list the most-covered dates first.
map2_dfr(period_df$start.date, period_df$end.date, function(x, y) {
  unique_dates |>
    mutate(count = if_else(date >= x & date <= y, 1, 0))
}) |>
  group_by(date) |>
  summarise(count = sum(count)) |>
  arrange(desc(count))
即使看了 Stack Overflow 上的几个答案,我也无法解决这个问题。 我有一个包含日期列表和间隔列表的数据集。 对于给定的日期,我需要找出有多少个间隔包含该日期。 我可以找到几个关于间隔中包含多少日期的问题,但这不是我要找的。
这是一个可重现的例子
# Reproducible example: 29 rows, each with an ID and a date, plus a
# start/end date pair that is NA on the rows that have none.
d <- data.frame(
  ID = c(
    80, 736, 54, 259, 826, 446, 950, 841, 433, 518,
    1357, 3686, 4042, 749, 2716, 4568, 1424, 332, 1000, 575,
    1815, 3074, 3768, 932, 4, 3872, 2033, 2495, 3310
  ),
  date = ymd(c(
    "2022-02-20", "2022-02-21", "2022-02-22", "2022-02-23",
    "2022-02-24", "2022-02-25", "2022-02-26", "2022-02-27",
    "2022-02-28", "2022-03-01", "2022-03-02", "2022-03-02",
    "2022-03-03", "2022-03-04", "2022-03-05", "2022-03-05",
    "2022-03-06", "2022-03-07", "2022-03-08", "2022-03-09",
    "2022-03-10", "2022-03-10", "2022-03-10", "2022-03-11",
    "2022-03-12", "2022-03-12", "2022-03-13", "2022-03-13",
    "2022-03-13"
  )),
  start.date = ymd(c(
    rep(NA, 10),
    "2022-03-02", "2022-03-02", "2022-03-03", "2022-03-04",
    "2022-03-05", "2022-03-05", "2022-03-06", NA,
    "2022-03-08", "2022-03-09", "2022-03-10", "2022-03-10",
    "2022-03-10", NA, "2022-03-12", "2022-03-12",
    "2022-03-13", "2022-03-13", "2022-03-13"
  )),
  end.date = ymd(c(
    rep(NA, 10),
    "2022-03-15", "2022-03-10", "2022-03-07", "2022-03-14",
    "2022-03-29", "2022-03-17", "2022-03-21", NA,
    "2022-03-27", "2022-03-16", "2022-03-16", "2022-03-24",
    "2022-03-18", NA, "2022-03-22", "2022-03-18",
    "2022-03-22", "2022-03-30", "2022-03-19"
  ))
)

# Fold the two endpoint columns into a single lubridate interval column and
# drop the originals.
d <- d %>%
  mutate(interval = interval(start.date, end.date)) %>%
  select(-c(start.date, end.date))
我想得到的是,在一个新列中,对于每个日期,包含该日期的间隔数。
我通常使用 dplyr-lubridate,我试图用 purrr 解决这个问题但没有成功。
有什么建议吗?
谢谢
编辑:我尝试使用如下所示的 purrr 解决方案
# Asker's attempt: for each element of `interval`, sum how many entries of
# `.$date` (the piped-in data frame's dates) fall %within% it. This yields
# dates per interval, not intervals per date.
d %>% mutate(dates_in_intv = map_int(interval, function(x) sum(.$date %within% x)))
这是计算一个间隔跨越多少个日期,但我需要的是
# Pseudo-code for the desired result: for each `date`, count the entries of
# `.$interval` that contain it. NOTE: `"contains"` is a placeholder, not a
# real operator, so this line does not parse as written.
d %>% mutate(intv_contains_dates= map_int(date, function(x) sum(.$interval "contains" x)))
这是您要找的吗?
library(tidyverse)
library(lubridate)
library(ivs)
# Example data: 29 rows, each with an ID and a date, plus a start/end date
# pair that is NA on the rows that have none.
d <- data.frame(
  ID = c(
    80, 736, 54, 259, 826, 446, 950, 841, 433, 518,
    1357, 3686, 4042, 749, 2716, 4568, 1424, 332, 1000, 575,
    1815, 3074, 3768, 932, 4, 3872, 2033, 2495, 3310
  ),
  date = ymd(c(
    "2022-02-20", "2022-02-21", "2022-02-22", "2022-02-23",
    "2022-02-24", "2022-02-25", "2022-02-26", "2022-02-27",
    "2022-02-28", "2022-03-01", "2022-03-02", "2022-03-02",
    "2022-03-03", "2022-03-04", "2022-03-05", "2022-03-05",
    "2022-03-06", "2022-03-07", "2022-03-08", "2022-03-09",
    "2022-03-10", "2022-03-10", "2022-03-10", "2022-03-11",
    "2022-03-12", "2022-03-12", "2022-03-13", "2022-03-13",
    "2022-03-13"
  )),
  start.date = ymd(c(
    rep(NA, 10),
    "2022-03-02", "2022-03-02", "2022-03-03", "2022-03-04",
    "2022-03-05", "2022-03-05", "2022-03-06", NA,
    "2022-03-08", "2022-03-09", "2022-03-10", "2022-03-10",
    "2022-03-10", NA, "2022-03-12", "2022-03-12",
    "2022-03-13", "2022-03-13", "2022-03-13"
  )),
  end.date = ymd(c(
    rep(NA, 10),
    "2022-03-15", "2022-03-10", "2022-03-07", "2022-03-14",
    "2022-03-29", "2022-03-17", "2022-03-21", NA,
    "2022-03-27", "2022-03-16", "2022-03-16", "2022-03-24",
    "2022-03-18", NA, "2022-03-22", "2022-03-18",
    "2022-03-22", "2022-03-30", "2022-03-19"
  ))
)
# Build an ivs interval vector from the start/end columns, then count, for
# every `date`, how many of those intervals it falls in.
# NOTE(review): the printed `iv` column shows half-open `[start, end)` ranges,
# so a date equal to an interval's `end.date` is not counted for that
# interval -- confirm this matches the intended "contains" semantics.
d %>%
  mutate(iv = iv(start.date, end.date)) %>%
  mutate(count = iv_count_between(date, iv))
#> ID date start.date end.date iv count
#> 1 80 2022-02-20 <NA> <NA> [NA, NA) 0
#> 2 736 2022-02-21 <NA> <NA> [NA, NA) 0
#> 3 54 2022-02-22 <NA> <NA> [NA, NA) 0
#> 4 259 2022-02-23 <NA> <NA> [NA, NA) 0
#> 5 826 2022-02-24 <NA> <NA> [NA, NA) 0
#> 6 446 2022-02-25 <NA> <NA> [NA, NA) 0
#> 7 950 2022-02-26 <NA> <NA> [NA, NA) 0
#> 8 841 2022-02-27 <NA> <NA> [NA, NA) 0
#> 9 433 2022-02-28 <NA> <NA> [NA, NA) 0
#> 10 518 2022-03-01 <NA> <NA> [NA, NA) 0
#> 11 1357 2022-03-02 2022-03-02 2022-03-15 [2022-03-02, 2022-03-15) 2
#> 12 3686 2022-03-02 2022-03-02 2022-03-10 [2022-03-02, 2022-03-10) 2
#> 13 4042 2022-03-03 2022-03-03 2022-03-07 [2022-03-03, 2022-03-07) 3
#> 14 749 2022-03-04 2022-03-04 2022-03-14 [2022-03-04, 2022-03-14) 4
#> 15 2716 2022-03-05 2022-03-05 2022-03-29 [2022-03-05, 2022-03-29) 6
#> 16 4568 2022-03-05 2022-03-05 2022-03-17 [2022-03-05, 2022-03-17) 6
#> 17 1424 2022-03-06 2022-03-06 2022-03-21 [2022-03-06, 2022-03-21) 7
#> 18 332 2022-03-07 <NA> <NA> [NA, NA) 6
#> 19 1000 2022-03-08 2022-03-08 2022-03-27 [2022-03-08, 2022-03-27) 7
#> 20 575 2022-03-09 2022-03-09 2022-03-16 [2022-03-09, 2022-03-16) 8
#> 21 1815 2022-03-10 2022-03-10 2022-03-16 [2022-03-10, 2022-03-16) 10
#> 22 3074 2022-03-10 2022-03-10 2022-03-24 [2022-03-10, 2022-03-24) 10
#> 23 3768 2022-03-10 2022-03-10 2022-03-18 [2022-03-10, 2022-03-18) 10
#> 24 932 2022-03-11 <NA> <NA> [NA, NA) 10
#> 25 4 2022-03-12 2022-03-12 2022-03-22 [2022-03-12, 2022-03-22) 12
#> 26 3872 2022-03-12 2022-03-12 2022-03-18 [2022-03-12, 2022-03-18) 12
#> 27 2033 2022-03-13 2022-03-13 2022-03-22 [2022-03-13, 2022-03-22) 15
#> 28 2495 2022-03-13 2022-03-13 2022-03-30 [2022-03-13, 2022-03-30) 15
#> 29 3310 2022-03-13 2022-03-13 2022-03-19 [2022-03-13, 2022-03-19) 15
由 reprex package (v2.0.1)
于 2022-05-28 创建
还有一个 purrr 方法:
# Keep only rows that define a complete period. Rows missing either endpoint
# are dropped: an NA bound makes the comparison below NA, which would turn the
# summed count of the affected dates into NA.
period_df <- d |>
  select(start.date, end.date) |>
  drop_na(start.date, end.date)

# The set of dates to score is the same for every period, so build it once
# instead of recomputing it inside the loop.
unique_dates <- d |>
  distinct(date)

# For each period, flag (1/0) every distinct date lying within [start, end]
# (both ends inclusive), stack the per-period results, then add the flags up
# per date and list the most-covered dates first.
map2_dfr(period_df$start.date, period_df$end.date, function(x, y) {
  unique_dates |>
    mutate(count = if_else(date >= x & date <= y, 1, 0))
}) |>
  group_by(date) |>
  summarise(count = sum(count)) |>
  arrange(desc(count))