R:面板数据的移动平均公式

R: Formula for moving average for panel data

我有一个包含面板数据的数据框。

一个例子:

date    code_ID name_ID new_value
2021-03-10T17:00:00 13  Alpha   372
2021-03-11T17:00:00 13  Alpha   608
2021-03-12T17:00:00 13  Alpha   515
2021-03-13T17:00:00 13  Alpha   320
2021-03-14T17:00:00 13  Alpha   323
2021-03-15T17:00:00 13  Alpha   329
2021-03-16T17:00:00 13  Alpha   212
2021-03-17T17:00:00 13  Alpha   304
2021-03-18T17:00:00 13  Alpha   462
2021-03-10T17:00:00 17  Beta    115
2021-03-11T17:00:00 17  Beta    151
2021-03-12T17:00:00 17  Beta    141
2021-03-13T17:00:00 17  Beta    137
2021-03-14T17:00:00 17  Beta    106
2021-03-15T17:00:00 17  Beta    67
2021-03-16T17:00:00 17  Beta    166
2021-03-17T17:00:00 17  Beta    126
2021-03-18T17:00:00 17  Beta    179
2021-03-10T17:00:00 8   eta-firm    2155
2021-03-11T17:00:00 8   eta-firm    2845
2021-03-12T17:00:00 8   eta-firm    3477
2021-03-13T17:00:00 8   eta-firm    2950
2021-03-14T17:00:00 8   eta-firm    3023
2021-03-15T17:00:00 8   eta-firm    2822
2021-03-16T17:00:00 8   eta-firm    2184
2021-03-17T17:00:00 8   eta-firm    2026
2021-03-18T17:00:00 8   eta-firm    2531
2021-03-10T17:00:00 6   phi hotel   866
2021-03-11T17:00:00 6   phi hotel   991
2021-03-12T17:00:00 6   phi hotel   971
2021-03-13T17:00:00 6   phi hotel   953
2021-03-14T17:00:00 6   phi hotel   604
2021-03-15T17:00:00 6   phi hotel   398
2021-03-16T17:00:00 6   phi hotel   672
2021-03-17T17:00:00 6   phi hotel   986
2021-03-18T17:00:00 6   phi hotel   1058

如何使用 R 的基础包,根据 code_ID 和日期制作计算移动平均值的公式?

公式为(latex格式):

 {\hat{y}_{t}} = \frac{y_{t-6} + y_{t-5} + y_{t-4} + y_{t-3} + y_{t-2} + y_{t-1} + y_{t}}{7}

这个怎么样:

dat <- tibble::tribble(~date,    ~code_ID, ~name_ID, ~new_value,
"2021-03-10 17:00:00",  13,   "Alpha",   372, 
"2021-03-11 17:00:00",  13,   "Alpha",   608, 
"2021-03-12 17:00:00",  13,   "Alpha",   515, 
"2021-03-13 17:00:00",  13,   "Alpha",   320, 
"2021-03-14 17:00:00",  13,   "Alpha",   323, 
"2021-03-15 17:00:00",  13,   "Alpha",   329, 
"2021-03-16 17:00:00",  13,   "Alpha",   212, 
"2021-03-17 17:00:00",  13,   "Alpha",   304, 
"2021-03-18 17:00:00",  13,   "Alpha",   462, 
"2021-03-10 17:00:00",  17,   "Beta",    115, 
"2021-03-11 17:00:00",  17,   "Beta",    151, 
"2021-03-12 17:00:00",  17,   "Beta",    141, 
"2021-03-13 17:00:00",  17,   "Beta",    137, 
"2021-03-14 17:00:00",  17,   "Beta",    106, 
"2021-03-15 17:00:00",  17,   "Beta",    67, 
"2021-03-16 17:00:00",  17,   "Beta",    166, 
"2021-03-17 17:00:00",  17,   "Beta",    126, 
"2021-03-18 17:00:00",  17,   "Beta",    179, 
"2021-03-10 17:00:00",  8 ,   "eta-firm",    2155, 
"2021-03-11 17:00:00",  8 ,   "eta-firm",    2845, 
"2021-03-12 17:00:00",  8 ,   "eta-firm",    3477, 
"2021-03-13 17:00:00",  8 ,   "eta-firm",    2950, 
"2021-03-14 17:00:00",  8 ,   "eta-firm",    3023, 
"2021-03-15 17:00:00",  8 ,   "eta-firm",    2822, 
"2021-03-16 17:00:00",  8 ,   "eta-firm",    2184, 
"2021-03-17 17:00:00",  8 ,   "eta-firm",    2026, 
"2021-03-18 17:00:00",  8 ,   "eta-firm",    2531, 
"2021-03-10 17:00:00",  6 ,   "phi hotel",   866, 
"2021-03-11 17:00:00",  6 ,   "phi hotel",   991, 
"2021-03-12 17:00:00",  6 ,   "phi hotel",   971, 
"2021-03-13 17:00:00",  6 ,   "phi hotel",   953, 
"2021-03-14 17:00:00",  6 ,   "phi hotel",   604, 
"2021-03-15 17:00:00",  6 ,   "phi hotel",   398, 
"2021-03-16 17:00:00",  6 ,   "phi hotel",   672, 
"2021-03-17 17:00:00",  6 ,   "phi hotel",   986, 
"2021-03-18 17:00:00",  6 ,   "phi hotel",   1058)

dat$date <- anytime::anytime(dat$date)
id <- dat$code_ID
s <- split(dat, id)
l <- lapply(s, function(x)cbind(x, ma=rowMeans(sapply(0:6, function(t)lag(x$new_value, t)))))
out <- do.call(bind_rows, l)
out
#                   date code_ID   name_ID new_value        ma
# 1  2021-03-10 17:00:00       6 phi hotel       866        NA
# 2  2021-03-11 17:00:00       6 phi hotel       991        NA
# 3  2021-03-12 17:00:00       6 phi hotel       971        NA
# 4  2021-03-13 17:00:00       6 phi hotel       953        NA
# 5  2021-03-14 17:00:00       6 phi hotel       604        NA
# 6  2021-03-15 17:00:00       6 phi hotel       398        NA
# 7  2021-03-16 17:00:00       6 phi hotel       672  779.2857
# 8  2021-03-17 17:00:00       6 phi hotel       986  796.4286
# 9  2021-03-18 17:00:00       6 phi hotel      1058  806.0000
# 10 2021-03-10 17:00:00       8  eta-firm      2155        NA
# 11 2021-03-11 17:00:00       8  eta-firm      2845        NA
# 12 2021-03-12 17:00:00       8  eta-firm      3477        NA
# 13 2021-03-13 17:00:00       8  eta-firm      2950        NA
# 14 2021-03-14 17:00:00       8  eta-firm      3023        NA
# 15 2021-03-15 17:00:00       8  eta-firm      2822        NA
# 16 2021-03-16 17:00:00       8  eta-firm      2184 2779.4286
# 17 2021-03-17 17:00:00       8  eta-firm      2026 2761.0000
# 18 2021-03-18 17:00:00       8  eta-firm      2531 2716.1429
# 19 2021-03-10 17:00:00      13     Alpha       372        NA
# 20 2021-03-11 17:00:00      13     Alpha       608        NA
# 21 2021-03-12 17:00:00      13     Alpha       515        NA
# 22 2021-03-13 17:00:00      13     Alpha       320        NA
# 23 2021-03-14 17:00:00      13     Alpha       323        NA
# 24 2021-03-15 17:00:00      13     Alpha       329        NA
# 25 2021-03-16 17:00:00      13     Alpha       212  382.7143
# 26 2021-03-17 17:00:00      13     Alpha       304  373.0000
# 27 2021-03-18 17:00:00      13     Alpha       462  352.1429
# 28 2021-03-10 17:00:00      17      Beta       115        NA
# 29 2021-03-11 17:00:00      17      Beta       151        NA
# 30 2021-03-12 17:00:00      17      Beta       141        NA
# 31 2021-03-13 17:00:00      17      Beta       137        NA
# 32 2021-03-14 17:00:00      17      Beta       106        NA
# 33 2021-03-15 17:00:00      17      Beta        67        NA
# 34 2021-03-16 17:00:00      17      Beta       166  126.1429
# 35 2021-03-17 17:00:00      17      Beta       126  127.7143
# 36 2021-03-18 17:00:00      17      Beta       179  131.7143

以上是基本的 R 解决方案。如果你愿意使用dplyrzoo,你可以这样做:

dat %>% 
  group_by(code_ID) %>% 
  mutate(ma = zoo::rollmean(new_value, k=7, fill=NA, align="right"))

这里有两种使用包 zoo.

中的函数 rollmeanr 的方法

第一个不将输出分配给新列,第二个分配。

library(zoo)

by(df1$new_value, df1$code_ID, function(x)
  rollmeanr(x, k = 7, fill = NA)
)

df1$mean6 <- with(df1, ave(new_value, code_ID, FUN = function(x) rollmeanr(x, k = 7, fill = NA)))
head(df1, 10)
#                  date code_ID name_ID new_value    mean6
#1  2021-03-10 17:00:00      13   Alpha       372       NA
#2  2021-03-11 17:00:00      13   Alpha       608       NA
#3  2021-03-12 17:00:00      13   Alpha       515       NA
#4  2021-03-13 17:00:00      13   Alpha       320       NA
#5  2021-03-14 17:00:00      13   Alpha       323       NA
#6  2021-03-15 17:00:00      13   Alpha       329       NA
#7  2021-03-16 17:00:00      13   Alpha       212 382.7143
#8  2021-03-17 17:00:00      13   Alpha       304 373.0000
#9  2021-03-18 17:00:00      13   Alpha       462 352.1429
#10 2021-03-10 17:00:00      17    Beta       115       NA

dput 格式的数据。

df1 <-
structure(list(date = structure(c(1615395600, 1615482000, 1615568400, 
1615654800, 1615741200, 1615827600, 1615914000, 1616000400, 1616086800, 
1615395600, 1615482000, 1615568400, 1615654800, 1615741200, 1615827600, 
1615914000, 1616000400, 1616086800, 1615395600, 1615482000, 1615568400, 
1615654800, 1615741200, 1615827600, 1615914000, 1616000400, 1616086800, 
1615395600, 1615482000, 1615568400, 1615654800, 1615741200, 1615827600, 
1615914000, 1616000400, 1616086800), class = c("POSIXct", "POSIXt"
), tzone = ""), code_ID = c(13L, 13L, 13L, 13L, 13L, 13L, 13L, 
13L, 13L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L
), name_ID = c("Alpha", "Alpha", "Alpha", "Alpha", "Alpha", "Alpha", 
"Alpha", "Alpha", "Alpha", "Beta", "Beta", "Beta", "Beta", "Beta", 
"Beta", "Beta", "Beta", "Beta", "eta-firm", "eta-firm", "eta-firm", 
"eta-firm", "eta-firm", "eta-firm", "eta-firm", "eta-firm", "eta-firm", 
"phi hotel", "phi hotel", "phi hotel", "phi hotel", "phi hotel", 
"phi hotel", "phi hotel", "phi hotel", "phi hotel"), new_value = c(372L, 
608L, 515L, 320L, 323L, 329L, 212L, 304L, 462L, 115L, 151L, 141L, 
137L, 106L, 67L, 166L, 126L, 179L, 2155L, 2845L, 3477L, 2950L, 
3023L, 2822L, 2184L, 2026L, 2531L, 866L, 991L, 971L, 953L, 604L, 
398L, 672L, 986L, 1058L), mean6 = c(NA, NA, NA, NA, NA, NA, 382.714285714286, 
373, 352.142857142857, NA, NA, NA, NA, NA, NA, 126.142857142857, 
127.714285714286, 131.714285714286, NA, NA, NA, NA, NA, NA, 2779.42857142857, 
2761, 2716.14285714286, NA, NA, NA, NA, NA, NA, 779.285714285714, 
796.428571428571, 806)), row.names = c(NA, -36L), class = "data.frame")

如果你愿意使用包会稍微容易一些,但由于问题要求基础 R,因此仅使用我们有以下内容。 ave 通过 code_ID 将滚动应用到 new_value 并且滚动是通过采用嵌入的行方式或使用过滤器或 cumsum 和 diff

的组合来实现的
roll <- function(x, n = 7) c(rep(NA, n-1), rowMeans(embed(x, n)))
dat2 <- transform(dat, mean7 = ave(new_value, code_ID, FUN = roll))

或使用以下替代方法之一进行滚动:

roll2 <- function(x, n = 7) stats::filter(x, rep(1, n) / n, sides = 1)

roll3 <- function(x, n = 7) c(rep(NA, n-1), diff(cumsum(c(0, x)), n)/n)

roll4 <- function(x, n = 7) c(rep(NA, n-1), apply(embed(x, n), 1, mean))