滚动申请最后报告的值而不是最后报告的时间段

rollapply for last reported values and not last reported time periods

我有一些数据如下:

   date       ID       var1    var2    var3
   <date>     <chr>   <dbl>   <dbl>   <dbl>
 1 2005-02-22 5D0EAE -0.682 -0.682  -0.682 
 2 2005-04-29 5D0EAE  0.458  0.458   0.458 
 3 2005-06-28 80D368  0.178  0.0276  0.0435
 4 2005-06-29 80D368  0.563  0.54    0.575 
 5 2005-07-06 7CCD06  0.36   0.36    0.36  
 6 2005-07-08 7CCD06  0.64   0.64    0.64  
 7 2005-07-12 7CCD06 -0.74  NA      NA    

我有 3 个不同的变量报告不常见日期的数据。我想使用 'rollapply' 函数获取最后 n 报告的值并对其应用时间序列计算。

即运行:

df %>% 
  group_by(ID) %>% 
  summarise(n = n())

给予,

# A tibble: 3 × 2
  ID         n
  <chr>  <int>
1 5D0EAE     2
2 7CCD06    71
3 80D368    29

所以如果n = 5第一个'ID'将被“忽略”,其他2个ID将应用5个时间段的滚动功能。

我可以 pad 使用以下日期:

df2 <- df %>% 
  complete(date = seq.Date(min(date), max(date), by = "day")) %>% # pad the dates so all companies have daily time series (make the time series complete, filling missing obs with NA's 
  na_if(0) %>% 
  arrange(date, ID)

我不一定要像通常在 rollapply 函数中那样使用最后 5 天,但我想使用最后 5 个报告值(按组)

df2 %>% 
  group_by(ID) %>% 
  mutate(
    myOut = zoo::rollapply(., width = 5, FUN = mean, by = 1, by.column = FALSE)
  ) # gives an error

所以,我的问题是,如何将 zoo rollapply 函数应用于给定 ID 的最后 5 个报告值,而不是最后 5 个报告的时间段。忽略报告值的数量小于阈值(即 5)的情况。

每个 ID 的长度都不同。

数据:

df <- structure(list(date = structure(c(12836, 12902, 12962, 12963, 
12970, 12972, 12976, 12986, 12989, 12991, 12999, 13000, 13004, 
13011, 13020, 13021, 13024, 13032, 13033, 13047, 13049, 13053, 
13053, 13054, 13062, 13063, 13068, 13069, 13070, 13073, 13074, 
13087, 13090, 13090, 13091, 13096, 13101, 13110, 13117, 13117, 
13118, 13119, 13126, 13138, 13139, 13152, 13153, 13154, 13157, 
13181, 13182, 13193, 13207, 13207, 13228, 13229, 13230, 13245, 
13245, 13250, 13251, 13271, 13272, 13273, 13280, 13300, 13307, 
13313, 13315, 13320, 13321, 13339, 13363, 13364, 13383, 13391, 
13399, 13410, 13437, 13438, 13440, 13445, 13455, 13459, 13516, 
13522, 13552, 13553, 13557, 13558, 13566, 13567, 13571, 13572, 
13573, 13574, 13577, 13578, 13580, 13581, 13584, 13585), class = "Date"), 
    ID = c("5D0EAE", "5D0EAE", "80D368", "80D368", "7CCD06", 
    "7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", "80D368", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", 
    "80D368", "7CCD06", "7CCD06", "80D368", "80D368", "80D368", 
    "80D368", "80D368", "7CCD06", "7CCD06", "80D368", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", 
    "7CCD06", "7CCD06", "80D368", "7CCD06", "80D368", "80D368", 
    "80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "7CCD06", 
    "80D368", "80D368", "80D368", "7CCD06", "7CCD06", "80D368", 
    "7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06", 
    "80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", 
    "7CCD06"), var1 = c(-0.681666666666667, 0.458, 0.1784375, 
    0.563333333333333, 0.36, 0.64, -0.74, 0.94, 0.95, 0.95, -0.0876923076923077, 
    -0.633333333333333, 0, -0.58, 0.52, 0.61, -0.74, 0.55, 0.55, 
    0.01, -0.478823529411765, -0.58, -0.74, 0.32, -0.74, 0, 0.32, 
    0.246666666666667, 0.0000000000000000130678250083694, 0, 
    0, -0.58, -0.0248, 0.95, -0.428, 0.94, -0.74, 0.94, 0.39, 
    0.25, 0.61, 0.01, 0, 0.32, 0.65, 0.32, 0.228888888888889, 
    0.18, 0, 0.112962962962963, 0.256923076923077, 0.94, 0.63, 
    0, 0.262380952380952, 0.7, 0.7, 0, 0, 0, 0.46, -0.58, 0.27, 
    0.648, 0.61, 0.305, 0.64, 0.035, 0.7, 0.18037037037037, 0.413333333333333, 
    0, 0.23, 0.656, 0.55, -0.9, -0.98, -0.58, -0.98, -0.98, -0.58, 
    -0.98, -0.272068965517241, 0.88, 0.554, 0, -0.30125, -0.4025, 
    0.62, -0.67, 0.62, 0.62, -0.67, -0.825, 0.62, 0.24, -0.5336364, 
    -0.08, 0.61, -0.9, -0.5146154, 0.16), var2 = c(-0.681666666666667, 
    0.458, 0.0276470588235294, 0.54, 0.36, 0.64, NA, 0.94, 0.95, 
    0.95, -0.0744444444444445, -0.633333333333333, NA, -0.58, 
    0.514, NA, -0.74, NA, NA, 0.01, -0.37, -0.58, NA, 0.32, -0.74, 
    0, NA, -0.0825, 0.04625, 0, 0, -0.58, -0.14875, 0.95, -0.35, 
    0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.95, NA, 
    0.126315789473684, 0.18, 0, 0.281111111111111, 0.256923076923077, 
    0.94, 0.63, 0, 0.295714285714286, 0.7, 0.7, NA, NA, NA, 0.46, 
    -0.58, 0.5725, 0.648, 0.61, 0.305, 0.64, 0.69, NA, 0.218846153846154, 
    0.7, 0, 0.26, 0.656, NA, -0.9, -0.98, -0.58, NA, NA, -0.58, 
    -0.98, -0.192857142857143, 0.88, 0.56, 0, -0.363913, -0.475, 
    0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, -0.53875, 
    0.08666667, 0.61, NA, -0.4157143, 0.2), var3 = c(-0.681666666666667, 
    0.458, 0.0435, 0.575, 0.36, 0.64, NA, 0.94, 0.95, 0.95, -0.00500000000000001, 
    -0.633333333333333, NA, -0.58, 0.514, NA, -0.74, 0.55, NA, 
    0.01, -0.317142857142857, -0.58, NA, 0.32, -0.74, 0, NA, 
    0.123076923076923, 0.04625, 0, 0, -0.58, -0.1025, 0.95, -0.35, 
    0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.75, NA, 
    0.126315789473684, 0.18, 0, 0.177857142857143, 0.256923076923077, 
    0.94, 0.63, 0, 0.28551724137931, 0.7, 0.7, 0, NA, NA, 0.46, 
    -0.58, 0.234285714285714, 0.648, 0.61, 0.305, 0.64, -0.156666666666667, 
    NA, 0.221333333333333, 0.62, 0, 0.26, 0.656, 0.55, -0.9, 
    -0.98, -0.58, NA, NA, -0.58, -0.98, -0.18, 0.88, 0.56, 0, 
    -0.3584615, -0.475, 0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, 
    -0.53875, 0.08666667, 0.61, -0.82, -0.36375, 0.2)), row.names = c(NA, 
-102L), class = c("tbl_df", "tbl", "data.frame"))

我们可以使用 across - 在 complete 之后 'ID' 未被 fill 编辑。也许,我们执行 fill 然后按 'ID' 分组,循环 across 'var' 列并应用 rollapply

library(zoo)
library(dplyr)
library(tidyr)
df2 %>%
   fill(ID) %>%
   group_by(ID) %>% 
   mutate(across(starts_with('var'),
    ~ zoo::rollapply(., width = 5, FUN = mean, na.rm = TRUE, 
        fill = NA, by = 1))) %>%
   ungroup

或仅对最后 5 个非 NA 元素 (i1)

执行 rollapply 来更正上述内容
library(tidyr)
df2 %>% 
   fill(ID) %>% 
   group_by(ID) %>% 
   mutate((across(starts_with('var'), ~ {
     i1 <- tail(row_number()[!is.na(.)], 5)
    replace(., i1, rollapply(.[i1], width = 5, FUN = mean, partial = TRUE))
   })))