在 R 的数据框中计算具有非连续天数的变量的滚动总和

Question

我有一些数据（约 7 年的结果），想据此计算滚动 14 天窗口内的获胜百分比。这些日期是不连续的，所以每当我按 'Trainer' 变量分组并运行 rollapplyr 或 runSum / sum_run 时，汇总的都是过去 14 个事件，而我不知道如何改为按 14 天来划定窗口。当我尝试用日期来定义 width 或 k 值时，出现以下错误：

invalid time series parameters specified

或 'vec' must be sorted non-decreasingly and not contain NAs

编辑：下面的代码会产生上述错误

# Attempt from the question: the rollapplyr width is derived from Date via findInterval. The question reports that this call fails with the errors quoted earlier.
df %>% group_by(Trainer) %>% mutate(Fourteen_day_wins =             rollapplyr(Wins, width = 1:n() - findInterval( Date %d-% Days(14), Date), sum)) %>%  ungroup

我想在按 Trainer 分组的情况下，把 14 天滚动窗口内的总获胜次数和事件数作为新列添加到 df 中。有人能给我指个方向吗？我还是 R 新手，这个问题到目前为止一直难住了我！

样本 df:

structure(list(Trainer = c("Appleby, Charlie", "Haggas, W J",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J",  "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J",  "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie",  "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J",  "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie",  "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie",  "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie",  "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J",  "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie",  "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J",  "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie",  "Haggas, W J"), Wins = c(1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,  0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,  0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 
1, 0, 0, 1, 0, 0,  1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,  1, 0, 1, 0, 0), Date = structure(c(1508025600, 1508112000, 1508112000,  1508112000, 1508198400, 1508284800, 1508284800, 1508284800, 1508457600,  1508457600, 1508544000, 1508544000, 1508544000, 1508716800, 1508716800,  1508716800, 1508803200, 1508803200, 1508803200, 1508889600, 1508889600,  1508889600, 1508889600, 1508889600, 1508889600, 1508889600, 1509062400,  1509062400, 1509062400, 1509062400, 1509062400, 1509148800, 1509148800,  1509148800, 1509148800, 1509148800, 1509148800, 1509321600, 1509321600,  1509321600, 1509321600, 1509494400, 1509667200, 1509667200, 1509753600,  1509753600, 1509753600, 1509753600, 1509753600, 1509753600, 1509753600,  1510099200, 1510099200, 1510099200, 1510358400, 1510358400, 1510358400,  1521936000, 1521936000, 1523923200, 1523923200, 1523923200, 1524009600,  1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600,  1524009600, 1524009600, 1524009600, 1524096000, 1524096000, 1524096000,  1524096000, 1524096000, 1524096000, 1524096000, 1524182400, 1524182400,  1524182400, 1524268800, 1524268800, 1524268800, 1524528000, 1524528000,  1524528000, 1524528000, 1524614400, 1524614400, 1524614400, 1524787200,  1524787200, 1524787200, 1524787200, 1524787200, 1525132800, 1525219200,  1525219200, 1525219200), tzone = "UTC", class = c("POSIXct",  "POSIXt"))), row.names = c(NA, -101L), class = c("tbl_df", "tbl",  "data.frame"))

Answer 1

一种做法是先生成所有日期与所有 Trainer 的完整组合，再将其与原始数据连接（join），然后使用 14 天的窗口：

library(zoo)
#> 
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
library(tidyverse)
df <- structure(list(Trainer = c("Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J", "Haggas, W J", "Appleby, Charlie", "Haggas, W J", "Appleby, Charlie", "Appleby, Charlie", "Haggas, W J"), Wins = c(1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0), Date = structure(c(1508025600, 1508112000, 1508112000, 1508112000, 1508198400, 1508284800, 1508284800, 1508284800, 1508457600, 1508457600, 1508544000, 1508544000, 1508544000, 1508716800, 1508716800, 1508716800, 1508803200, 1508803200, 1508803200, 1508889600, 1508889600, 1508889600, 1508889600, 1508889600, 1508889600, 1508889600, 1509062400, 1509062400, 1509062400, 1509062400, 1509062400, 1509148800, 1509148800, 1509148800, 1509148800, 1509148800, 1509148800, 1509321600, 1509321600, 1509321600, 1509321600, 1509494400, 1509667200, 1509667200, 1509753600, 1509753600, 1509753600, 1509753600, 1509753600, 1509753600, 1509753600, 1510099200, 1510099200, 1510099200, 1510358400, 1510358400, 1510358400, 1521936000, 1521936000, 1523923200, 1523923200, 1523923200, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524009600, 1524096000, 1524096000, 1524096000, 1524096000, 1524096000, 1524096000, 1524096000, 1524182400, 1524182400, 1524182400, 1524268800, 1524268800, 1524268800, 1524528000, 1524528000, 1524528000, 1524528000, 1524614400, 1524614400, 1524614400, 1524787200, 1524787200, 1524787200, 1524787200, 1524787200, 1525132800, 1525219200, 1525219200, 1525219200), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, -101L), class = c("tbl_df", "tbl", "data.frame"))

# Build the full Trainer x calendar-day grid spanning the observed date range,
# then attach the recorded results. Days without a result get NA in Wins.
# The join keys are left implicit, so dplyr reports the columns it joins on.
all_dates <- expand_grid(
  Trainer = unique(df$Trainer),
  Date = seq(min(df$Date), max(df$Date), by = "1 day")
) %>%
  left_join(df)
#> Joining, by = c("Trainer", "Date")

# Rolling win percentage per trainer over the trailing 14 rows of the
# completed grid.
# NOTE(review): `width` counts rows, not calendar days. Dates with several
# events occupy several rows after the join, so a 14-row window can cover
# fewer than 14 days -- confirm this is acceptable or aggregate per day first.
all_dates %>%
  group_by(Trainer) %>%
  mutate(
    win_pct = rollapplyr(
      Wins,
      width = 14,
      FUN = mean,
      # partial = TRUE evaluates the shorter windows at the start of each
      # group, so no position is left to pad and `fill` is not needed;
      # rollapplyr is already right-aligned, so `align` is not needed either.
      partial = TRUE,
      # skip the NA placeholders of days without a recorded result
      na.rm = TRUE
    )
  )
#> # A tibble: 460 × 4
#> # Groups:   Trainer [2]
#>    Trainer          Date                 Wins win_pct
#>    <chr>            <dttm>              <dbl>   <dbl>
#>  1 Appleby, Charlie 2017-10-15 00:00:00     1   1    
#>  2 Appleby, Charlie 2017-10-16 00:00:00    NA   1    
#>  3 Appleby, Charlie 2017-10-17 00:00:00    NA   1    
#>  4 Appleby, Charlie 2017-10-18 00:00:00     1   1    
#>  5 Appleby, Charlie 2017-10-18 00:00:00     0   0.667
#>  6 Appleby, Charlie 2017-10-19 00:00:00    NA   0.667
#>  7 Appleby, Charlie 2017-10-20 00:00:00    NA   0.667
#>  8 Appleby, Charlie 2017-10-21 00:00:00    NA   0.667
#>  9 Appleby, Charlie 2017-10-22 00:00:00    NA   0.667
#> 10 Appleby, Charlie 2017-10-23 00:00:00     0   0.5  
#> # … with 450 more rows

由 reprex package (v2.0.1) 创建于 2022-05-31

Answer 2

问题是 findInterval 的参数应该是数字并且是有序的。

为了解决这个问题，先将日期转换为 Date 类，再转换为数值，这样下面的 d 就是自纪元（1970-01-01）以来的天数。现在就可以像下面这样把它与 findInterval 一起使用。如果数据已经排好序，则可以省略 arrange 那一行。

library(dplyr, exclude = c("filter", "lag"))
library(zoo)

# For each trainer, sum Wins over the trailing 14 calendar days.
DF %>%
  arrange(Trainer, Date) %>%
  group_by(Trainer) %>%
  mutate(
    # whole days since the epoch, so window bounds are plain numbers
    d = as.numeric(as.Date(Date)),
    # findInterval() counts the rows dated on or before d - 14; the window
    # width is the current row index minus that count
    Wins14 = rollapplyr(Wins, seq_len(n()) - findInterval(d - 14, d), sum)
  ) %>%
  ungroup() %>%
  select(-d)

结果如下：

# A tibble: 101 x 4
   Trainer           Wins Date                Wins14
   <chr>            <dbl> <dttm>               <dbl>
 1 Appleby, Charlie     1 2017-10-15 00:00:00      1
 2 Appleby, Charlie     1 2017-10-18 00:00:00      2
 3 Appleby, Charlie     0 2017-10-18 00:00:00      2
 4 Appleby, Charlie     0 2017-10-23 00:00:00      2
 5 Appleby, Charlie     1 2017-10-25 00:00:00      3
 6 Appleby, Charlie     0 2017-10-25 00:00:00      3
 7 Appleby, Charlie     0 2017-10-25 00:00:00      3
 8 Appleby, Charlie     1 2017-10-25 00:00:00      4
 9 Appleby, Charlie     0 2017-10-27 00:00:00      4
10 Appleby, Charlie     0 2017-10-27 00:00:00      4
# ... with 91 more rows

Answer 3

您可以先用 complete 补全缺失的日期，然后使用长度为 14 的窗口：

# Fill in every calendar day between each trainer's first and last record,
# then take a 14-row rolling mean of Wins.
# NOTE(review): rollmean is center-aligned by default, and `fill = 0` sets the
# positions where the window does not fit to 0 -- confirm that a centered
# window (rather than a trailing one) is intended.
df %>%
  group_by(Trainer) %>%
  complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
  mutate(runMeans = zoo::rollmean(Wins, k = 14, fill = 0, na.rm = TRUE))
    # A tibble: 459 x 4
    # Groups:   Trainer [2]
       Trainer          Date                 Wins runMeans
       <chr>            <dttm>              <dbl>    <dbl>
     1 Appleby, Charlie 2017-10-15 00:00:00     1    0    
     2 Appleby, Charlie 2017-10-16 00:00:00    NA    0    
     3 Appleby, Charlie 2017-10-17 00:00:00    NA    0    
     4 Appleby, Charlie 2017-10-18 00:00:00     1    0    
     5 Appleby, Charlie 2017-10-18 00:00:00     0    0    
     6 Appleby, Charlie 2017-10-19 00:00:00    NA    0    
     7 Appleby, Charlie 2017-10-20 00:00:00    NA    0.429
     8 Appleby, Charlie 2017-10-21 00:00:00    NA    0.429
     9 Appleby, Charlie 2017-10-22 00:00:00    NA    0.429
    10 Appleby, Charlie 2017-10-23 00:00:00     0    0.375

在 R 的数据框中计算具有非连续天数的变量的滚动总和

Work out rolling sums for variables with non-consecutive days in a dataframe in R

r

zoo