如何创建一个变量,该变量是给定时间范围内按 id 连续行的总和
How to create a variable that is the sum of consecutive rows within a given time frame and by id
我正在尝试在 R 中按唯一标识符分组,对彼此相隔不超过 365 天的连续值求和。例如,对于某个 ID 的日期 1,我们将把同一 ID 在其后 365 天内的日期 2、3、4 的成本加起来,得到日期 1 的总成本;然后对于日期 2,我们将加上日期 3 和 4 得到总成本,依此类推。我尝试了几种带有某些限制的滚动 sum() 以及 dplyr 中对连续值求和的类似解决方案(参见 "Calculate sum of a column if the difference between consecutive rows meets a condition"),但无法得到能够按天数进行区分的代码。我已经包含了一个示例数据集和一个解决方案数据集,作为我正在寻找的结果的示例。
起始数据集
# Example data: one row per admission, keyed by ID.
ID <- c(rep(1, 6), rep(2, 4), 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
cost <- c(
  2000, 14077, 5000, 200, 560, 5000,
  888, 5959, 1819, 7508,
  6406
)
# Expected result column, kept alongside the inputs for comparison.
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508,
  6406
)
df2 <- data.frame(ID = ID, admitdt = admitdt, cost = cost, cost365 = cost365)
ID admitdt cost
1 1 2014-10-19 2000
2 1 2014-10-24 14077
3 1 2015-01-31 5000
4 1 2016-01-20 200
5 1 2017-06-30 560
6 1 2017-07-17 5000
7 2 2015-04-21 888
8 2 2015-04-22 5959
9 2 2015-05-04 1819
10 2 2015-07-25 7508
11 3 2014-11-11 6406
解决方案:
# Desired result: the input data plus the expected 365-day forward total
# (cost365) for every row.
ID <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
# The sixth cost is 5000 (it was mistyped as 500): the starting dataset, the
# printed table and cost365[5] = 560 + 5000 = 5560 all require 5000.
# NOTE(review): printed results elsewhere that show 500 / 1060 for ID 1 were
# produced from the mistyped value and are stale.
cost <- c(2000, 14077, 5000, 200, 560, 5000, 888, 5959, 1819, 7508, 6406)
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508, 6406
)
df2 <- data.frame(ID, admitdt, cost, cost365)
ID admitdt cost cost365
1 1 2014-10-19 2000 21077
2 1 2014-10-24 14077 19077
3 1 2015-01-31 5000 5200
4 1 2016-01-20 200 200
5 1 2017-06-30 560 5560
6 1 2017-07-17 5000 5000
7 2 2015-04-21 888 16174
8 2 2015-04-22 5959 15286
9 2 2015-05-04 1819 9327
10 2 2015-07-25 7508 7508
11 3 2014-11-11 6406 6406
我在 slider
和 runner
中分别给出了 2 个方法。其中我喜欢 slider
因为它的语法清晰。尽管如此,两者的策略几乎相同,
date
列将在两者中充当 index
。
- slider 提供更多控制,因为它具有
.before
和 .after
参数,在当前情况下您只需要 after = days(365)
(与 lubridate 集成)
- 在 runner 中,窗口 k 总是向后的,所以我在那里使用 lag =
-364
。
- 剩下的很清楚。如果需要进一步说明,请询问。
在slider
你可以做到
library(tidyverse)
# Example input for the slider approach: ID, admission date and cost.
ID <- c(rep(1, 6), rep(2, 4), 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
cost <- c(
  2000, 14077, 5000, 200, 560, 5000,
  888, 5959, 1819, 7508,
  6406
)
# Expected totals; defined for reference only and not added to `df`.
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508,
  6406
)
df <- data.frame(ID = ID, admitdt = admitdt, cost = cost)
df
#> ID admitdt cost
#> 1 1 2014-10-19 2000
#> 2 1 2014-10-24 14077
#> 3 1 2015-01-31 5000
#> 4 1 2016-01-20 200
#> 5 1 2017-06-30 560
#> 6 1 2017-07-17 5000
#> 7 2 2015-04-21 888
#> 8 2 2015-04-22 5959
#> 9 2 2015-05-04 1819
#> 10 2 2015-07-25 7508
#> 11 3 2014-11-11 6406
library(slider)
library(lubridate)
# 365-day forward-looking cost total per ID, computed with slider.
# The window is indexed by admission date and extends days(365) after each
# row's own date.
df %>%
  mutate(admitdt = as.Date(admitdt)) %>%
  group_by(ID) %>%
  mutate(
    cost365 = slider::slide_index_sum(cost, admitdt, after = days(365))
  )
#> # A tibble: 11 x 4
#> # Groups: ID [3]
#> ID admitdt cost cost365
#> <dbl> <date> <dbl> <dbl>
#> 1 1 2014-10-19 2000 21077
#> 2 1 2014-10-24 14077 19077
#> 3 1 2015-01-31 5000 5200
#> 4 1 2016-01-20 200 200
#> 5 1 2017-06-30 560 5560
#> 6 1 2017-07-17 5000 5000
#> 7 2 2015-04-21 888 16174
#> 8 2 2015-04-22 5959 15286
#> 9 2 2015-05-04 1819 9327
#> 10 2 2015-07-25 7508 7508
#> 11 3 2014-11-11 6406 6406
或在runner
# Spell out FALSE: the shorthand F is an ordinary variable and can be
# reassigned.
library(dplyr, warn.conflicts = FALSE)

# Example input: ID, admission date and cost.
ID <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
cost <- c(2000, 14077, 5000, 200, 560, 5000, 888, 5959, 1819, 7508, 6406)
# Expected totals; defined for reference only and not added to `df`.
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508, 6406
)
df <- data.frame(ID, admitdt, cost)

library(runner)

# Forward-looking cost total per ID, computed with runner.
# The k-wide window is shifted forward with a negative lag so that it starts
# at the current row's date instead of ending there.
# NOTE(review): k = 365 with lag = -364 appears to cover idx .. idx + 364,
# one day shorter than a window reaching idx + 365 - confirm against the
# runner documentation if rows exactly 365 days apart must be included.
df %>%
  group_by(ID) %>%
  mutate(
    admitdt = as.Date(admitdt),
    cost365 = runner::sum_run(
      x = cost,
      idx = admitdt,
      k = 365,
      lag = -364
    )
  )
#> # A tibble: 11 x 4
#> # Groups: ID [3]
#> ID admitdt cost cost365
#> <dbl> <date> <dbl> <dbl>
#> 1 1 2014-10-19 2000 21077
#> 2 1 2014-10-24 14077 19077
#> 3 1 2015-01-31 5000 5200
#> 4 1 2016-01-20 200 200
#> 5 1 2017-06-30 560 5560
#> 6 1 2017-07-17 5000 5000
#> 7 2 2015-04-21 888 16174
#> 8 2 2015-04-22 5959 15286
#> 9 2 2015-05-04 1819 9327
#> 10 2 2015-07-25 7508 7508
#> 11 3 2014-11-11 6406 6406
由 reprex package (v2.0.0)
于 2021-07-19 创建
这是 purrr::map
的方法:
library(dplyr); library(purrr)
# 365-day forward-looking cost total per ID using purrr::map_dbl.
# For every admission date within a group, add up the costs of the group's
# rows dated from that day through 365 days later.
df2 %>%
  mutate(admitdt = as.Date(admitdt)) %>%
  group_by(ID) %>%
  mutate(
    cost365 = map_dbl(
      admitdt,
      ~ sum(cost[admitdt >= .x & admitdt <= .x + 365])
    )
  )
# A tibble: 11 x 4
# Groups: ID [3]
ID admitdt cost cost365
<dbl> <date> <dbl> <dbl>
1 1 2014-10-19 2000 21077
2 1 2014-10-24 14077 19077
3 1 2015-01-31 5000 5200
4 1 2016-01-20 200 200
5 1 2017-06-30 560 1060
6 1 2017-07-17 500 500
7 2 2015-04-21 888 16174
8 2 2015-04-22 5959 15286
9 2 2015-05-04 1819 9327
10 2 2015-07-25 7508 7508
11 3 2014-11-11 6406 6406
我们也可以采用以下解决方案:
library(dplyr)
library(purrr)
library(lubridate)
# 365-day forward-looking cost total per row via purrr + lubridate.
# For each (date, ID) pair, keep the rows of df2 with the same ID whose date
# falls within [date, date + 365] and sum their cost.
# map2_dbl() returns a plain numeric column (map2() would give a list
# column), and sum() replaces the manual reduce(`+`).
df2 %>%
  mutate(rolls = map2_dbl(ymd(admitdt), ID, ~ df2 %>%
    filter(ID == .y & ymd(admitdt) %within% interval(.x, .x + 365)) %>%
    pull(cost) %>%
    sum()))
ID admitdt cost cost365 rolls
1 1 2014-10-19 2000 21077 21077
2 1 2014-10-24 14077 19077 19077
3 1 2015-01-31 5000 5200 5200
4 1 2016-01-20 200 200 200
5 1 2017-06-30 560 5560 5560
6 1 2017-07-17 5000 5000 5000
7 2 2015-04-21 888 16174 16174
8 2 2015-04-22 5959 15286 15286
9 2 2015-05-04 1819 9327 9327
10 2 2015-07-25 7508 7508 7508
11 3 2014-11-11 6406 6406 6406
或在 base R:
# 365-day forward-looking cost total per row, base R only.
# For each row, sum `cost` over the rows sharing its ID whose admission date
# lies in [row date, row date + 365].
df2$rolls <- local({
  # Parse the dates once instead of re-converting the whole data frame
  # inside the per-row function.
  admit_dates <- as.Date(df2$admitdt, format = "%Y-%m-%d")
  mapply(function(id, start) {
    # which() drops NA comparisons, matching how subset() treats them.
    in_window <- which(
      df2$ID == id & admit_dates >= start & admit_dates <= start + 365
    )
    sum(df2$cost[in_window])
  }, df2$ID, admit_dates)
})
我正在尝试在 R 中按唯一标识符分组,对彼此相隔不超过 365 天的连续值求和。例如,对于某个 ID 的日期 1,我们将把同一 ID 在其后 365 天内的日期 2、3、4 的成本加起来,得到日期 1 的总成本;然后对于日期 2,我们将加上日期 3 和 4 得到总成本,依此类推。我尝试了几种带有某些限制的滚动 sum() 以及 dplyr 中对连续值求和的类似解决方案,但无法得到能够按天数进行区分的代码。
起始数据集
# Example data: one row per admission, keyed by ID.
ID <- c(rep(1, 6), rep(2, 4), 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
cost <- c(
  2000, 14077, 5000, 200, 560, 5000,
  888, 5959, 1819, 7508,
  6406
)
# Expected result column, kept alongside the inputs for comparison.
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508,
  6406
)
df2 <- data.frame(ID = ID, admitdt = admitdt, cost = cost, cost365 = cost365)
ID admitdt cost
1 1 2014-10-19 2000
2 1 2014-10-24 14077
3 1 2015-01-31 5000
4 1 2016-01-20 200
5 1 2017-06-30 560
6 1 2017-07-17 5000
7 2 2015-04-21 888
8 2 2015-04-22 5959
9 2 2015-05-04 1819
10 2 2015-07-25 7508
11 3 2014-11-11 6406
解决方案:
# Desired result: the input data plus the expected 365-day forward total
# (cost365) for every row.
ID <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
# The sixth cost is 5000 (it was mistyped as 500): the starting dataset, the
# printed table and cost365[5] = 560 + 5000 = 5560 all require 5000.
# NOTE(review): printed results elsewhere that show 500 / 1060 for ID 1 were
# produced from the mistyped value and are stale.
cost <- c(2000, 14077, 5000, 200, 560, 5000, 888, 5959, 1819, 7508, 6406)
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508, 6406
)
df2 <- data.frame(ID, admitdt, cost, cost365)
ID admitdt cost cost365
1 1 2014-10-19 2000 21077
2 1 2014-10-24 14077 19077
3 1 2015-01-31 5000 5200
4 1 2016-01-20 200 200
5 1 2017-06-30 560 5560
6 1 2017-07-17 5000 5000
7 2 2015-04-21 888 16174
8 2 2015-04-22 5959 15286
9 2 2015-05-04 1819 9327
10 2 2015-07-25 7508 7508
11 3 2014-11-11 6406 6406
我在 slider
和 runner
中分别给出了 2 个方法。其中我喜欢 slider
因为它的语法清晰。尽管如此,两者的策略几乎相同,
date
列将在两者中充当index
。- slider 提供更多控制,因为它具有
.before
和.after
参数,在当前情况下您只需要after = days(365)
(与 lubridate 集成) - 在 runner 中,窗口 k 总是向后的,所以我在那里使用 lag =
-364
。 - 剩下的很清楚。如果需要进一步说明,请询问。
在slider
你可以做到
library(tidyverse)
# Example input for the slider approach: ID, admission date and cost.
ID <- c(rep(1, 6), rep(2, 4), 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
cost <- c(
  2000, 14077, 5000, 200, 560, 5000,
  888, 5959, 1819, 7508,
  6406
)
# Expected totals; defined for reference only and not added to `df`.
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508,
  6406
)
df <- data.frame(ID = ID, admitdt = admitdt, cost = cost)
df
#> ID admitdt cost
#> 1 1 2014-10-19 2000
#> 2 1 2014-10-24 14077
#> 3 1 2015-01-31 5000
#> 4 1 2016-01-20 200
#> 5 1 2017-06-30 560
#> 6 1 2017-07-17 5000
#> 7 2 2015-04-21 888
#> 8 2 2015-04-22 5959
#> 9 2 2015-05-04 1819
#> 10 2 2015-07-25 7508
#> 11 3 2014-11-11 6406
library(slider)
library(lubridate)
# 365-day forward-looking cost total per ID, computed with slider.
# The window is indexed by admission date and extends days(365) after each
# row's own date.
df %>%
  mutate(admitdt = as.Date(admitdt)) %>%
  group_by(ID) %>%
  mutate(
    cost365 = slider::slide_index_sum(cost, admitdt, after = days(365))
  )
#> # A tibble: 11 x 4
#> # Groups: ID [3]
#> ID admitdt cost cost365
#> <dbl> <date> <dbl> <dbl>
#> 1 1 2014-10-19 2000 21077
#> 2 1 2014-10-24 14077 19077
#> 3 1 2015-01-31 5000 5200
#> 4 1 2016-01-20 200 200
#> 5 1 2017-06-30 560 5560
#> 6 1 2017-07-17 5000 5000
#> 7 2 2015-04-21 888 16174
#> 8 2 2015-04-22 5959 15286
#> 9 2 2015-05-04 1819 9327
#> 10 2 2015-07-25 7508 7508
#> 11 3 2014-11-11 6406 6406
或在runner
# Spell out FALSE: the shorthand F is an ordinary variable and can be
# reassigned.
library(dplyr, warn.conflicts = FALSE)

# Example input: ID, admission date and cost.
ID <- c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3)
admitdt <- c(
  "2014-10-19", "2014-10-24", "2015-01-31", "2016-01-20",
  "2017-06-30", "2017-07-17", "2015-04-21", "2015-04-22",
  "2015-05-04", "2015-07-25", "2014-11-11"
)
cost <- c(2000, 14077, 5000, 200, 560, 5000, 888, 5959, 1819, 7508, 6406)
# Expected totals; defined for reference only and not added to `df`.
cost365 <- c(
  21077, 19077, 5200, 200, 5560, 5000,
  16174, 15286, 9327, 7508, 6406
)
df <- data.frame(ID, admitdt, cost)

library(runner)

# Forward-looking cost total per ID, computed with runner.
# The k-wide window is shifted forward with a negative lag so that it starts
# at the current row's date instead of ending there.
# NOTE(review): k = 365 with lag = -364 appears to cover idx .. idx + 364,
# one day shorter than a window reaching idx + 365 - confirm against the
# runner documentation if rows exactly 365 days apart must be included.
df %>%
  group_by(ID) %>%
  mutate(
    admitdt = as.Date(admitdt),
    cost365 = runner::sum_run(
      x = cost,
      idx = admitdt,
      k = 365,
      lag = -364
    )
  )
#> # A tibble: 11 x 4
#> # Groups: ID [3]
#> ID admitdt cost cost365
#> <dbl> <date> <dbl> <dbl>
#> 1 1 2014-10-19 2000 21077
#> 2 1 2014-10-24 14077 19077
#> 3 1 2015-01-31 5000 5200
#> 4 1 2016-01-20 200 200
#> 5 1 2017-06-30 560 5560
#> 6 1 2017-07-17 5000 5000
#> 7 2 2015-04-21 888 16174
#> 8 2 2015-04-22 5959 15286
#> 9 2 2015-05-04 1819 9327
#> 10 2 2015-07-25 7508 7508
#> 11 3 2014-11-11 6406 6406
由 reprex package (v2.0.0)
于 2021-07-19 创建这是 purrr::map
的方法:
library(dplyr); library(purrr)
# 365-day forward-looking cost total per ID using purrr::map_dbl.
# For every admission date within a group, add up the costs of the group's
# rows dated from that day through 365 days later.
df2 %>%
  mutate(admitdt = as.Date(admitdt)) %>%
  group_by(ID) %>%
  mutate(
    cost365 = map_dbl(
      admitdt,
      ~ sum(cost[admitdt >= .x & admitdt <= .x + 365])
    )
  )
# A tibble: 11 x 4
# Groups: ID [3]
ID admitdt cost cost365
<dbl> <date> <dbl> <dbl>
1 1 2014-10-19 2000 21077
2 1 2014-10-24 14077 19077
3 1 2015-01-31 5000 5200
4 1 2016-01-20 200 200
5 1 2017-06-30 560 1060
6 1 2017-07-17 500 500
7 2 2015-04-21 888 16174
8 2 2015-04-22 5959 15286
9 2 2015-05-04 1819 9327
10 2 2015-07-25 7508 7508
11 3 2014-11-11 6406 6406
我们也可以采用以下解决方案:
library(dplyr)
library(purrr)
library(lubridate)
# 365-day forward-looking cost total per row via purrr + lubridate.
# For each (date, ID) pair, keep the rows of df2 with the same ID whose date
# falls within [date, date + 365] and sum their cost.
# map2_dbl() returns a plain numeric column (map2() would give a list
# column), and sum() replaces the manual reduce(`+`).
df2 %>%
  mutate(rolls = map2_dbl(ymd(admitdt), ID, ~ df2 %>%
    filter(ID == .y & ymd(admitdt) %within% interval(.x, .x + 365)) %>%
    pull(cost) %>%
    sum()))
ID admitdt cost cost365 rolls
1 1 2014-10-19 2000 21077 21077
2 1 2014-10-24 14077 19077 19077
3 1 2015-01-31 5000 5200 5200
4 1 2016-01-20 200 200 200
5 1 2017-06-30 560 5560 5560
6 1 2017-07-17 5000 5000 5000
7 2 2015-04-21 888 16174 16174
8 2 2015-04-22 5959 15286 15286
9 2 2015-05-04 1819 9327 9327
10 2 2015-07-25 7508 7508 7508
11 3 2014-11-11 6406 6406 6406
或在 base R:
# 365-day forward-looking cost total per row, base R only.
# For each row, sum `cost` over the rows sharing its ID whose admission date
# lies in [row date, row date + 365].
df2$rolls <- local({
  # Parse the dates once instead of re-converting the whole data frame
  # inside the per-row function.
  admit_dates <- as.Date(df2$admitdt, format = "%Y-%m-%d")
  mapply(function(id, start) {
    # which() drops NA comparisons, matching how subset() treats them.
    in_window <- which(
      df2$ID == id & admit_dates >= start & admit_dates <= start + 365
    )
    sum(df2$cost[in_window])
  }, df2$ID, admit_dates)
})