计算最近观测值的累积平均值

Calculating cumulative mean of recent observations

我的数据集具有以下特征:玩家 ID、周数和积分。

我想计算前几周的平均点数,但不是过去所有周,只计算最后 5 个或更少(如果当前周小于 5)。

示例:对于 player_id = 5,周 = 7,结果将是 player_id = 5 和第 2、3、4、5 和 6 周的 POINTS 的平均值。

以下代码已经计算了前一周的平均值,因此我需要进行调整以使其仅适用于前 5 周。

player_id<-c(rep(1,30),rep(2,30),rep(3,30),rep(4,30),rep(5,30))
week<-1:30
points<-round(runif(150,1,10),0) 
mydata<- data.frame(player_id=player_id,week=rep(week,5),points)


mydata<-mydata %>% 
        group_by(player_id) %>%    # the group to perform the stat on
        arrange(week) %>%          # order the weeks within each group
        mutate(previous_mean = cummean(points) ) %>% # for each week get the 
cumulative mean
        mutate(previous_mean = lag(previous_mean) ) %>% # shift cumulative 
mean back one week
        arrange(player_id) # sort by player_id

您可以使用 slice 到 select 每个组的最后 5 周。试试这个:

player_id<-c(rep(1,30),rep(2,30),rep(3,30),rep(4,30),rep(5,30))
week<-1:30
points<-round(runif(150,1,10),0) 
mydata<- data.frame(player_id=player_id,week=rep(week,5),points)

library(dplyr)

mydata <- mydata %>% 
    group_by(player_id) %>%    # the group to perform the stat on
    arrange(week) %>% # order the weeks within each group
    slice( (n()-4):n() ) %>%  # "slice" the last 5 rows (weeks) of every group
    mutate(previous_mean = cummean(points) ) %>% # for each week get the cumulative mean
mutate(previous_mean = lag(previous_mean) ) %>% # shift cumulative mean back one week
arrange(player_id) # sort by player_id

slice( (n()-4):n() )

selects 行在范围 [(last row - 4) : last row] 内,每个组

编辑:为避免当前周小于 5 时出现问题,请使用 ifelse 语句来验证:

mydata %>% 
    group_by(player_id) %>%    # the group to perform the stat on
    arrange(week) %>% # order the weeks within each group
    slice(ifelse(n() < 5, 1:n(), n()-4):n()) %>%  # "slice" the last 5 rows (weeks) of every group
    mutate(previous_mean = cummean(points) ) %>% # for each week get the cumulative mean
    mutate(previous_mean = lag(previous_mean) ) %>% # shift cumulative mean back one week
    arrange(player_id) # sort by player_id

HAVB's approach is great, but depending on what you want, here is another. This approach is adapted from this answer 到另一个问题,但根据您的情况进行了更改:

library(dplyr)
library(zoo)
# set the seed for reproducibility
set.seed(123)
player_id<-c(rep(1,30),rep(2,30),rep(3,30),rep(4,30),rep(5,30))
week<-1:30
points<-round(runif(150,1,10),0) 
mydata<- data.frame(player_id=player_id,week=rep(week,5),points)

roll_mean <- function(x, k) {
    result <- rollapplyr(x, k, mean, partial=TRUE, na.rm=TRUE)
    result[is.nan(result)] <- NA
    return( result )
}

mydata<- data.frame(player_id=player_id,week=rep(week,5),points)

mydata<-mydata %>% 
    group_by(player_id) %>%
    arrange(week) %>%
    mutate(rolling_mean = roll_mean(x=lag(points), k=5) ) %>%
    arrange(player_id)

然后我们可以查看一个子集来证明它有效:

mydata[mydata$player_id %in% 1:2 & mydata$week %in% 1:6, ]
# A tibble: 12 x 4
# Groups:   player_id [2]
   player_id  week points rolling_mean
       <dbl> <int>  <dbl>        <dbl>
 1         1     1      4           NA
 2         1     2      8     4.000000
 3         1     3      5     6.000000
 4         1     4      9     5.666667
 5         1     5      9     6.500000
 6         1     6      1     7.000000
 7         2     1     10           NA
 8         2     2      9    10.000000
 9         2     3      7     9.500000
10         2     4      8     8.666667
11         2     5      1     8.500000
12         2     6      5     7.000000

所以我们可以看到每次 t,玩家 irolling_mean 将是 points 玩家 i 在 {t - 1, ..., min(1, t - 5)}.