滚动平均值、1、2 和 3 滞后统计 - R

Rolling Average, 1, 2, and 3 Lag of Statistics - R

我的数据集图片如下:

对于 HomeTeam 右侧的每个变量(不含 HomeTeam 本身),我想按球队计算:截至上一场比赛的全部历史滚动平均值、滞后 1 条记录的平均值、滞后 2 条记录的平均值以及滞后 3 条记录的平均值。下面是我在 Excel 中针对 2016 年 Texas Tech 球队、对 HomeTeam 右侧的一个变量所做的示例。

几个注意事项:

  1. 我需要对 HomeTeam 右侧的每个变量执行此操作;HomeTeam 左侧的变量显然不需要处理。
  2. 这里有多年的数据,所以我们需要按年份和团队代码分组才能准确地做到这一点。

提前谢谢大家。

这里使用的是伪造的、经粗略 OCR 识别得到的数据(等 OP 提供可重现的示例后,我很乐意更新):

使用 tidyverse 进行数据整理(wrangling)和长宽格式转换(pivot),使用 zoo 计算滚动平均值,可以尝试:

library(tidyverse)
library(zoo)

# Fake example data: 21 games for three teams (TA: 8 games, TB: 6, TC: 7),
# all in 2016, one game per week. Each vector below is laid out with one line
# per team so the per-team rows are easy to match up across columns.
df_games <- tibble(
  # Identifier columns
  game_code = 1:21,
  team_code = rep(c(1, 2, 3), times = c(8, 6, 7)),
  opponent_code = c(
    231, 306, 129, 27, 694, 277, 107, 1320,
    440, 314, 465, 380, 164, 295,
    528, 428, 458, 367, 736, 574, 772
  ),
  year = 2016,
  date = seq(as.Date("2016-01-01"), by = 7, length.out = 21),
  team = rep(c("TA", "TB", "TC"), times = c(8, 6, 7)),
  opp = c(
    "FI", "I", "CM", "AS", "T", "Hi", "C", "P",
    "MS", "JS", "U", "M", "C", "I2",
    "OS", "M", "C2", "L", "V", "R", "WK"
  ),
  home_team = c(
    0, 1, 0, 1, 0, 0, 0, 1,
    1, 1, 0, 1, 0, 0,
    1, 0, 1, 0, 0, 1, 0
  ),
  # Statistic columns (everything to the right of home_team)
  points = c(
    34, 13, 3, 20, 13, 51, 31, 49,
    20, 63, 13, 24, 21, 17,
    30, 23, 70, 14, 13, 46, 14
  ),
  opp_points = c(
    13, 34, 49, 13, 20, 31, 51, 3,
    17, 13, 63, 21, 24, 20,
    23, 30, 14, 70, 10, 14, 46
  ),
  total = c(
    47, 47, 52, 33, 33, 82, 82, 52,
    37, 76, 76, 45, 45, 37,
    53, 53, 84, 84, 23, 60, 60
  ),
  mov = c(
    -21, 21, 46, -7, 7, -20, 20, -46,
    -3, -50, 50, -3, 3, 3,
    -7, 7, -56, 56, -3, -32, 32
  ),
  spread = c(
    -4, 4, 32, -22.5, 22.5, -22, 22, -32.1,
    -6.5, -40, 40, -27, 27, 65,
    -9.5, 9.5, -38.5, 38.5, 2.5, -16.5, 16.5
  ),
  book_total = c(
    61, 61, 52, 56, 56, 63, 63, 52,
    37, 76, 76, 45, 45, 37,
    57, 57, 59.5, 59.5, 46.5, 60, 60
  ),
  book_pts = c(
    32.5, 28.5, 10, 39.25, 16.75, 42.5, 20.5, 42,
    21.75, 58, 18, 36, 9, 15.25,
    33.25, 23.75, 49, 10.5, 22, 38.25, 21.75
  ),
  book_opp_pts = c(
    28.5, 32.5, 42, 16.75, 39.25, 20.5, 42.5, 10,
    15.25, 18, 58, 9, 36, 21.75,
    23.75, 33.25, 10.5, 49, 24.5, 21.75, 38.25
  ),
  rush_att = c(
    52, 21, 33, 43, 43, 35, 38, 38,
    35, 44, 35, 50, 19, 38,
    47, 34, 35, 31, 32, 30, 33
  ),
  rush_yard = c(
    246, 63, 44, 127, 184, 189, 248, 255,
    225, 280, 108, 200, 40, 105,
    150, 89, 272, 48, 121, 97, 105
  )
)


# Build, for every statistic column, a lagged cumulative mean plus three
# right-aligned rolling means, computed separately per team and year, and
# return them in wide format (one row per game, one column per
# <summary>_<metric> combination).
df_games %>%
  # Pivot every column that is not in game_code:home_team into long format so
  # a single mutate() creates the summaries for all metrics at once.
  pivot_longer(
    cols = !c(game_code:home_team),
    names_to = "metric",
    values_to = "values"
  ) %>%
  # Rolling/cumulative functions depend on row order: sort chronologically
  # within each team and metric before grouping.
  arrange(team_code, metric, date) %>%
  group_by(team_code, year, metric) %>%
  mutate(
    # Cumulative mean of all earlier games in the group; lag() shifts it by
    # one row so the current game is excluded (first game of a group is NA).
    cumean = lag(cummean(values)),
    # Right-aligned rolling means over the current game and the previous
    # 1, 2 and 3 games (window sizes k = 2, 3, 4); rows without a full
    # window are filled with NA.
    # NOTE(review): unlike cumean, these windows include the current game;
    # wrap them in lag() if strictly prior-game averages are wanted.
    lag_01 = zoo::rollmean(x = values, k = 2, fill = NA, align = "right"),
    lag_02 = zoo::rollmean(x = values, k = 3, fill = NA, align = "right"),
    lag_03 = zoo::rollmean(x = values, k = 4, fill = NA, align = "right")
  ) %>%
  select(-values) %>%
  # Pivot back into wide format. cumean must be listed in values_from:
  # otherwise it is treated as an identifier column, which prevents the rows
  # from collapsing to one per game and yields no cumean_<metric> columns.
  pivot_wider(
    names_from = metric,
    values_from = c(cumean, lag_01:lag_03)
  ) %>%
  arrange(team_code, date)

# A tibble: 21 x 48
# Groups:   team_code, year [3]
   game_code team_code opponent_code  year date       team  opp   home_team cumean_book_opp_pts cumean_book_pts
       <int>     <dbl>         <dbl> <dbl> <date>     <chr> <chr>     <dbl>               <dbl>           <dbl>
 1         1         1           231  2016 2016-01-01 TA    FI            0                NA              NA  
 2         2         1           306  2016 2016-01-08 TA    I             1                28.5            32.5
 3         3         1           129  2016 2016-01-15 TA    CM            0                30.5            30.5
 4         4         1            27  2016 2016-01-22 TA    AS            1                34.3            23.7
 5         5         1           694  2016 2016-01-29 TA    T             0                29.9            27.6
 6         6         1           277  2016 2016-02-05 TA    Hi            0                31.8            25.4
 7         7         1           107  2016 2016-02-12 TA    C             0                29.9            28.2
 8         8         1          1320  2016 2016-02-19 TA    P             1                31.7            27.1
 9         9         2           440  2016 2016-02-26 TB    MS            1                NA              NA  
10        10         2           314  2016 2016-03-04 TB    JS            1                15.2            21.8
# ... with 11 more rows, and 38 more variables: cumean_book_total <dbl>, cumean_mov <dbl>,
#   cumean_opp_points <dbl>, cumean_points <dbl>, cumean_rush_att <dbl>, cumean_rush_yard <dbl>,
#   cumean_spread <dbl>, cumean_total <dbl>, lag_01_book_opp_pts <dbl>, lag_01_book_pts <dbl>,
#   lag_01_book_total <dbl>, lag_01_mov <dbl>, lag_01_opp_points <dbl>, lag_01_points <dbl>,
#   lag_01_rush_att <dbl>, lag_01_rush_yard <dbl>, lag_01_spread <dbl>, lag_01_total <dbl>,
#   lag_02_book_opp_pts <dbl>, lag_02_book_pts <dbl>, lag_02_book_total <dbl>, lag_02_mov <dbl>,
#   lag_02_opp_points <dbl>, lag_02_points <dbl>, lag_02_rush_att <dbl>, lag_02_rush_yard <dbl>,
#   lag_02_spread <dbl>, lag_02_total <dbl>, lag_03_book_opp_pts <dbl>, lag_03_book_pts <dbl>,
#   lag_03_book_total <dbl>, lag_03_mov <dbl>, lag_03_opp_points <dbl>, lag_03_points <dbl>,
#   lag_03_rush_att <dbl>, lag_03_rush_yard <dbl>, lag_03_spread <dbl>, lag_03_total <dbl>