在分组数据中选取前 X 个观测值进行运算
Select preceding X observations for operation in grouped data
我有若干天内不同客户的购买记录(1 = 购买,0 = 未购买)。现在对于每一天,我想对每位客户当天及之前两天的购买次数求和,也就是总共 3 天的滚动窗口。
示例数据:
# Example data: four customers, each observed on the same five consecutive
# days. `purchase` is 1 when the customer bought on that day and 0 otherwise.
da <- data.frame(
  customer_id = rep(c(1, 2, 3, 4), each = 5),
  day = rep(
    c("2016-04-11", "2016-04-12", "2016-04-13", "2016-04-14", "2016-04-15"),
    times = 4
  ),
  purchase = c(
    1, 1, 1, 0, 1, # customer 1
    0, 0, 1, 1, 0, # customer 2
    1, 1, 1, 1, 0, # customer 3
    1, 0, 1, 0, 1  # customer 4
  )
)
> da
customer_id day purchase
1 1 2016-04-11 1
2 1 2016-04-12 1
3 1 2016-04-13 1
4 1 2016-04-14 0
5 1 2016-04-15 1
6 2 2016-04-11 0
7 2 2016-04-12 0
8 2 2016-04-13 1
9 2 2016-04-14 1
10 2 2016-04-15 0
11 3 2016-04-11 1
12 3 2016-04-12 1
13 3 2016-04-13 1
14 3 2016-04-14 1
15 3 2016-04-15 0
16 4 2016-04-11 1
17 4 2016-04-12 0
18 4 2016-04-13 1
19 4 2016-04-14 0
20 4 2016-04-15 1
我正在寻找的输出:
# Expected result: the example data plus `purchases_last_3_days`, the number
# of purchases a customer made on the current row and the two rows before it.
output_da <- data.frame(
  customer_id = rep(c(1, 2, 3, 4), each = 5),
  day = rep(
    c("2016-04-11", "2016-04-12", "2016-04-13", "2016-04-14", "2016-04-15"),
    times = 4
  ),
  purchase = c(
    1, 1, 1, 0, 1, # customer 1
    0, 0, 1, 1, 0, # customer 2
    1, 1, 1, 1, 0, # customer 3
    1, 0, 1, 0, 1  # customer 4
  ),
  purchases_last_3_days = c(
    1, 2, 3, 2, 2, # customer 1
    0, 0, 1, 2, 2, # customer 2
    1, 2, 3, 3, 2, # customer 3
    1, 1, 2, 1, 2  # customer 4
  )
)
output_da
customer_id day purchase purchases_last_3_days
1 1 2016-04-11 1 1
2 1 2016-04-12 1 2
3 1 2016-04-13 1 3
4 1 2016-04-14 0 2
5 1 2016-04-15 1 2
6 2 2016-04-11 0 0
7 2 2016-04-12 0 0
8 2 2016-04-13 1 1
9 2 2016-04-14 1 2
10 2 2016-04-15 0 2
11 3 2016-04-11 1 1
12 3 2016-04-12 1 2
13 3 2016-04-13 1 3
14 3 2016-04-14 1 3
15 3 2016-04-15 0 2
16 4 2016-04-11 1 1
17 4 2016-04-12 0 1
18 4 2016-04-13 1 2
19 4 2016-04-14 0 1
20 4 2016-04-15 1 2
我知道 cumsum 函数,但不知道如何针对每一天只选取它之前的 X 行来求和。
如果您只需要最近 3 行(当前行及其之前的两行),可以手动完成:
library(dplyr)

# Rolling 3-row purchase count per customer: the current row's purchase plus
# the purchases on the two preceding rows of the same customer.
# NOTE(review): this relies on rows being ordered by day within each customer
# (true for the example `da`); sort first if that is not guaranteed.
output_da <- da %>%
  group_by(customer_id) %>%
  mutate(
    # `default = 0` treats the rows before a customer's first observation as
    # "no purchase" instead of NA. Summing the lags directly avoids creating
    # (and then deleting) temporary helper columns in the data.
    purchases_last_3_days = purchase +
      lag(purchase, default = 0) +
      lag(purchase, n = 2, default = 0)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per customer.
  ungroup()
更新
如果您需要的不仅仅是最后 3 行,可以稍微自动化一点,将 3 更改为合适的数字:
library(dplyr)
library(purrr)

# Size of the rolling window in rows, counting the current row itself.
last_day_rows <- 3
stopifnot(last_day_rows >= 1)

# Rolling purchase count per customer over the last `last_day_rows` rows.
# The column keeps the name `purchases_last_3_days` even when the window size
# is changed, to match the expected output shown in the question.
da %>%
  group_by(customer_id) %>%
  mutate(
    purchases_last_3_days = reduce(
      # One lagged copy of `purchase` per offset 0, 1, ..., last_day_rows - 1;
      # `default = 0` counts rows before the first observation as no purchase.
      map(
        seq_len(last_day_rows) - 1,
        function(n_back) lag(purchase, n = n_back, default = 0)
      ),
      # Element-wise sum of the lagged copies.
      `+`
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per customer.
  ungroup()
这是一个使用 ave 的 base R 方案:
# Base R alternative: ave() applies FUN to `purchase` separately for each
# `customer_id` and puts the results back in the original row order.
output_da <- within(
  da,
  purchases_last_3_days <- ave(
    purchase,
    customer_id,
    FUN = function(v) {
      # For position k, sum the window from two rows back (clamped to the
      # first row of the group) up to k. vapply() pins the result to one
      # number per position, unlike sapply(), whose return type varies.
      vapply(
        seq_along(v),
        function(k) sum(v[max(k - 2, 1):k]),
        numeric(1)
      )
    }
  )
)
我有若干天内不同客户的购买记录(1 = 购买,0 = 未购买)。现在对于每一天,我想对每位客户当天及之前两天的购买次数求和,也就是总共 3 天的滚动窗口。
示例数据:
# Example data: four customers, each observed on the same five consecutive
# days. `purchase` is 1 when the customer bought on that day and 0 otherwise.
da <- data.frame(
  customer_id = rep(c(1, 2, 3, 4), each = 5),
  day = rep(
    c("2016-04-11", "2016-04-12", "2016-04-13", "2016-04-14", "2016-04-15"),
    times = 4
  ),
  purchase = c(
    1, 1, 1, 0, 1, # customer 1
    0, 0, 1, 1, 0, # customer 2
    1, 1, 1, 1, 0, # customer 3
    1, 0, 1, 0, 1  # customer 4
  )
)
> da
customer_id day purchase
1 1 2016-04-11 1
2 1 2016-04-12 1
3 1 2016-04-13 1
4 1 2016-04-14 0
5 1 2016-04-15 1
6 2 2016-04-11 0
7 2 2016-04-12 0
8 2 2016-04-13 1
9 2 2016-04-14 1
10 2 2016-04-15 0
11 3 2016-04-11 1
12 3 2016-04-12 1
13 3 2016-04-13 1
14 3 2016-04-14 1
15 3 2016-04-15 0
16 4 2016-04-11 1
17 4 2016-04-12 0
18 4 2016-04-13 1
19 4 2016-04-14 0
20 4 2016-04-15 1
我正在寻找的输出:
# Expected result: the example data plus `purchases_last_3_days`, the number
# of purchases a customer made on the current row and the two rows before it.
output_da <- data.frame(
  customer_id = rep(c(1, 2, 3, 4), each = 5),
  day = rep(
    c("2016-04-11", "2016-04-12", "2016-04-13", "2016-04-14", "2016-04-15"),
    times = 4
  ),
  purchase = c(
    1, 1, 1, 0, 1, # customer 1
    0, 0, 1, 1, 0, # customer 2
    1, 1, 1, 1, 0, # customer 3
    1, 0, 1, 0, 1  # customer 4
  ),
  purchases_last_3_days = c(
    1, 2, 3, 2, 2, # customer 1
    0, 0, 1, 2, 2, # customer 2
    1, 2, 3, 3, 2, # customer 3
    1, 1, 2, 1, 2  # customer 4
  )
)
output_da
customer_id day purchase purchases_last_3_days
1 1 2016-04-11 1 1
2 1 2016-04-12 1 2
3 1 2016-04-13 1 3
4 1 2016-04-14 0 2
5 1 2016-04-15 1 2
6 2 2016-04-11 0 0
7 2 2016-04-12 0 0
8 2 2016-04-13 1 1
9 2 2016-04-14 1 2
10 2 2016-04-15 0 2
11 3 2016-04-11 1 1
12 3 2016-04-12 1 2
13 3 2016-04-13 1 3
14 3 2016-04-14 1 3
15 3 2016-04-15 0 2
16 4 2016-04-11 1 1
17 4 2016-04-12 0 1
18 4 2016-04-13 1 2
19 4 2016-04-14 0 1
20 4 2016-04-15 1 2
我知道 cumsum 函数,但不知道如何针对每一天只选取它之前的 X 行来求和。
如果您只需要最近 3 行(当前行及其之前的两行),可以手动完成:
library(dplyr)

# Rolling 3-row purchase count per customer: the current row's purchase plus
# the purchases on the two preceding rows of the same customer.
# NOTE(review): this relies on rows being ordered by day within each customer
# (true for the example `da`); sort first if that is not guaranteed.
output_da <- da %>%
  group_by(customer_id) %>%
  mutate(
    # `default = 0` treats the rows before a customer's first observation as
    # "no purchase" instead of NA. Summing the lags directly avoids creating
    # (and then deleting) temporary helper columns in the data.
    purchases_last_3_days = purchase +
      lag(purchase, default = 0) +
      lag(purchase, n = 2, default = 0)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per customer.
  ungroup()
更新
如果您需要的不仅仅是最后 3 行,可以稍微自动化一点,将 3 更改为合适的数字:
library(dplyr)
library(purrr)

# Size of the rolling window in rows, counting the current row itself.
last_day_rows <- 3
stopifnot(last_day_rows >= 1)

# Rolling purchase count per customer over the last `last_day_rows` rows.
# The column keeps the name `purchases_last_3_days` even when the window size
# is changed, to match the expected output shown in the question.
da %>%
  group_by(customer_id) %>%
  mutate(
    purchases_last_3_days = reduce(
      # One lagged copy of `purchase` per offset 0, 1, ..., last_day_rows - 1;
      # `default = 0` counts rows before the first observation as no purchase.
      map(
        seq_len(last_day_rows) - 1,
        function(n_back) lag(purchase, n = n_back, default = 0)
      ),
      # Element-wise sum of the lagged copies.
      `+`
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per customer.
  ungroup()
这是一个使用 ave 的 base R 方案:
# Base R alternative: ave() applies FUN to `purchase` separately for each
# `customer_id` and puts the results back in the original row order.
output_da <- within(
  da,
  purchases_last_3_days <- ave(
    purchase,
    customer_id,
    FUN = function(v) {
      # For position k, sum the window from two rows back (clamped to the
      # first row of the group) up to k. vapply() pins the result to one
      # number per position, unlike sapply(), whose return type varies.
      vapply(
        seq_along(v),
        function(k) sum(v[max(k - 2, 1):k]),
        numeric(1)
      )
    }
  )
)