滚动申请最后报告的值而不是最后报告的时间段
rollapply for last reported values and not last reported time periods
我有一些数据如下:
date ID var1 var2 var3
<date> <chr> <dbl> <dbl> <dbl>
1 2005-02-22 5D0EAE -0.682 -0.682 -0.682
2 2005-04-29 5D0EAE 0.458 0.458 0.458
3 2005-06-28 80D368 0.178 0.0276 0.0435
4 2005-06-29 80D368 0.563 0.54 0.575
5 2005-07-06 7CCD06 0.36 0.36 0.36
6 2005-07-08 7CCD06 0.64 0.64 0.64
7 2005-07-12 7CCD06 -0.74 NA NA
我有 3 个不同的变量报告不常见日期的数据。我想使用 'rollapply' 函数获取最后 n
报告的值并对其应用时间序列计算。
即运行:
df %>%
group_by(ID) %>%
summarise(n = n())
给予,
# A tibble: 3 × 2
ID n
<chr> <int>
1 5D0EAE 2
2 7CCD06 71
3 80D368 29
所以如果n = 5
第一个'ID'将被“忽略”,其他2个ID将应用5个时间段的滚动功能。
我可以 pad
使用以下日期:
df2 <- df %>%
complete(date = seq.Date(min(date), max(date), by = "day")) %>% # pad the dates so all companies have daily time series (make the time series complete, filling missing obs with NA's
na_if(0) %>%
arrange(date, ID)
我不一定要像通常在 rollapply 函数中那样使用最后 5 天,但我想使用最后 5 个报告值(按组)
df2 %>%
group_by(ID) %>%
mutate(
myOut = zoo::rollapply(., width = 5, FUN = mean, by = 1, by.column = FALSE)
) # gives an error
所以,我的问题是,如何将 zoo
rollapply 函数应用于给定 ID 的最后 5 个报告值,而不是最后 5 个报告的时间段。忽略报告值的数量小于阈值(即 5)的情况。
每个 ID 的长度都不同。
数据:
df <- structure(list(date = structure(c(12836, 12902, 12962, 12963,
12970, 12972, 12976, 12986, 12989, 12991, 12999, 13000, 13004,
13011, 13020, 13021, 13024, 13032, 13033, 13047, 13049, 13053,
13053, 13054, 13062, 13063, 13068, 13069, 13070, 13073, 13074,
13087, 13090, 13090, 13091, 13096, 13101, 13110, 13117, 13117,
13118, 13119, 13126, 13138, 13139, 13152, 13153, 13154, 13157,
13181, 13182, 13193, 13207, 13207, 13228, 13229, 13230, 13245,
13245, 13250, 13251, 13271, 13272, 13273, 13280, 13300, 13307,
13313, 13315, 13320, 13321, 13339, 13363, 13364, 13383, 13391,
13399, 13410, 13437, 13438, 13440, 13445, 13455, 13459, 13516,
13522, 13552, 13553, 13557, 13558, 13566, 13567, 13571, 13572,
13573, 13574, 13577, 13578, 13580, 13581, 13584, 13585), class = "Date"),
ID = c("5D0EAE", "5D0EAE", "80D368", "80D368", "7CCD06",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368",
"80D368", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"80D368", "80D368", "7CCD06", "7CCD06", "80D368", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "80D368", "80D368",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "7CCD06",
"80D368", "80D368", "80D368", "7CCD06", "7CCD06", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06"), var1 = c(-0.681666666666667, 0.458, 0.1784375,
0.563333333333333, 0.36, 0.64, -0.74, 0.94, 0.95, 0.95, -0.0876923076923077,
-0.633333333333333, 0, -0.58, 0.52, 0.61, -0.74, 0.55, 0.55,
0.01, -0.478823529411765, -0.58, -0.74, 0.32, -0.74, 0, 0.32,
0.246666666666667, 0.0000000000000000130678250083694, 0,
0, -0.58, -0.0248, 0.95, -0.428, 0.94, -0.74, 0.94, 0.39,
0.25, 0.61, 0.01, 0, 0.32, 0.65, 0.32, 0.228888888888889,
0.18, 0, 0.112962962962963, 0.256923076923077, 0.94, 0.63,
0, 0.262380952380952, 0.7, 0.7, 0, 0, 0, 0.46, -0.58, 0.27,
0.648, 0.61, 0.305, 0.64, 0.035, 0.7, 0.18037037037037, 0.413333333333333,
0, 0.23, 0.656, 0.55, -0.9, -0.98, -0.58, -0.98, -0.98, -0.58,
-0.98, -0.272068965517241, 0.88, 0.554, 0, -0.30125, -0.4025,
0.62, -0.67, 0.62, 0.62, -0.67, -0.825, 0.62, 0.24, -0.5336364,
-0.08, 0.61, -0.9, -0.5146154, 0.16), var2 = c(-0.681666666666667,
0.458, 0.0276470588235294, 0.54, 0.36, 0.64, NA, 0.94, 0.95,
0.95, -0.0744444444444445, -0.633333333333333, NA, -0.58,
0.514, NA, -0.74, NA, NA, 0.01, -0.37, -0.58, NA, 0.32, -0.74,
0, NA, -0.0825, 0.04625, 0, 0, -0.58, -0.14875, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.95, NA,
0.126315789473684, 0.18, 0, 0.281111111111111, 0.256923076923077,
0.94, 0.63, 0, 0.295714285714286, 0.7, 0.7, NA, NA, NA, 0.46,
-0.58, 0.5725, 0.648, 0.61, 0.305, 0.64, 0.69, NA, 0.218846153846154,
0.7, 0, 0.26, 0.656, NA, -0.9, -0.98, -0.58, NA, NA, -0.58,
-0.98, -0.192857142857143, 0.88, 0.56, 0, -0.363913, -0.475,
0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, -0.53875,
0.08666667, 0.61, NA, -0.4157143, 0.2), var3 = c(-0.681666666666667,
0.458, 0.0435, 0.575, 0.36, 0.64, NA, 0.94, 0.95, 0.95, -0.00500000000000001,
-0.633333333333333, NA, -0.58, 0.514, NA, -0.74, 0.55, NA,
0.01, -0.317142857142857, -0.58, NA, 0.32, -0.74, 0, NA,
0.123076923076923, 0.04625, 0, 0, -0.58, -0.1025, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.75, NA,
0.126315789473684, 0.18, 0, 0.177857142857143, 0.256923076923077,
0.94, 0.63, 0, 0.28551724137931, 0.7, 0.7, 0, NA, NA, 0.46,
-0.58, 0.234285714285714, 0.648, 0.61, 0.305, 0.64, -0.156666666666667,
NA, 0.221333333333333, 0.62, 0, 0.26, 0.656, 0.55, -0.9,
-0.98, -0.58, NA, NA, -0.58, -0.98, -0.18, 0.88, 0.56, 0,
-0.3584615, -0.475, 0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667,
-0.53875, 0.08666667, 0.61, -0.82, -0.36375, 0.2)), row.names = c(NA,
-102L), class = c("tbl_df", "tbl", "data.frame"))
我们可以使用 across
- 在 complete
之后 'ID' 未被 fill
编辑。也许,我们执行 fill
然后按 'ID' 分组,循环 across
'var' 列并应用 rollapply
library(zoo)
library(dplyr)
library(tidyr)
df2 %>%
fill(ID) %>%
group_by(ID) %>%
mutate(across(starts_with('var'),
~ zoo::rollapply(., width = 5, FUN = mean, na.rm = TRUE,
fill = NA, by = 1))) %>%
ungroup
或仅对最后 5 个非 NA 元素 (i1
)
执行 rollapply
来更正上述内容
library(tidyr)
df2 %>%
fill(ID) %>%
group_by(ID) %>%
mutate((across(starts_with('var'), ~ {
i1 <- tail(row_number()[!is.na(.)], 5)
replace(., i1, rollapply(.[i1], width = 5, FUN = mean, partial = TRUE))
})))
我有一些数据如下:
date ID var1 var2 var3
<date> <chr> <dbl> <dbl> <dbl>
1 2005-02-22 5D0EAE -0.682 -0.682 -0.682
2 2005-04-29 5D0EAE 0.458 0.458 0.458
3 2005-06-28 80D368 0.178 0.0276 0.0435
4 2005-06-29 80D368 0.563 0.54 0.575
5 2005-07-06 7CCD06 0.36 0.36 0.36
6 2005-07-08 7CCD06 0.64 0.64 0.64
7 2005-07-12 7CCD06 -0.74 NA NA
我有 3 个不同的变量报告不常见日期的数据。我想使用 'rollapply' 函数获取最后 n
报告的值并对其应用时间序列计算。
即运行:
df %>%
group_by(ID) %>%
summarise(n = n())
给予,
# A tibble: 3 × 2
ID n
<chr> <int>
1 5D0EAE 2
2 7CCD06 71
3 80D368 29
所以如果n = 5
第一个'ID'将被“忽略”,其他2个ID将应用5个时间段的滚动功能。
我可以 pad
使用以下日期:
df2 <- df %>%
complete(date = seq.Date(min(date), max(date), by = "day")) %>% # pad the dates so all companies have daily time series (make the time series complete, filling missing obs with NA's
na_if(0) %>%
arrange(date, ID)
我不一定要像通常在 rollapply 函数中那样使用最后 5 天,但我想使用最后 5 个报告值(按组)
df2 %>%
group_by(ID) %>%
mutate(
myOut = zoo::rollapply(., width = 5, FUN = mean, by = 1, by.column = FALSE)
) # gives an error
所以,我的问题是,如何将 zoo
rollapply 函数应用于给定 ID 的最后 5 个报告值,而不是最后 5 个报告的时间段。忽略报告值的数量小于阈值(即 5)的情况。
每个 ID 的长度都不同。
数据:
df <- structure(list(date = structure(c(12836, 12902, 12962, 12963,
12970, 12972, 12976, 12986, 12989, 12991, 12999, 13000, 13004,
13011, 13020, 13021, 13024, 13032, 13033, 13047, 13049, 13053,
13053, 13054, 13062, 13063, 13068, 13069, 13070, 13073, 13074,
13087, 13090, 13090, 13091, 13096, 13101, 13110, 13117, 13117,
13118, 13119, 13126, 13138, 13139, 13152, 13153, 13154, 13157,
13181, 13182, 13193, 13207, 13207, 13228, 13229, 13230, 13245,
13245, 13250, 13251, 13271, 13272, 13273, 13280, 13300, 13307,
13313, 13315, 13320, 13321, 13339, 13363, 13364, 13383, 13391,
13399, 13410, 13437, 13438, 13440, 13445, 13455, 13459, 13516,
13522, 13552, 13553, 13557, 13558, 13566, 13567, 13571, 13572,
13573, 13574, 13577, 13578, 13580, 13581, 13584, 13585), class = "Date"),
ID = c("5D0EAE", "5D0EAE", "80D368", "80D368", "7CCD06",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368",
"80D368", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"80D368", "80D368", "7CCD06", "7CCD06", "80D368", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "80D368", "80D368",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "7CCD06",
"80D368", "80D368", "80D368", "7CCD06", "7CCD06", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06"), var1 = c(-0.681666666666667, 0.458, 0.1784375,
0.563333333333333, 0.36, 0.64, -0.74, 0.94, 0.95, 0.95, -0.0876923076923077,
-0.633333333333333, 0, -0.58, 0.52, 0.61, -0.74, 0.55, 0.55,
0.01, -0.478823529411765, -0.58, -0.74, 0.32, -0.74, 0, 0.32,
0.246666666666667, 0.0000000000000000130678250083694, 0,
0, -0.58, -0.0248, 0.95, -0.428, 0.94, -0.74, 0.94, 0.39,
0.25, 0.61, 0.01, 0, 0.32, 0.65, 0.32, 0.228888888888889,
0.18, 0, 0.112962962962963, 0.256923076923077, 0.94, 0.63,
0, 0.262380952380952, 0.7, 0.7, 0, 0, 0, 0.46, -0.58, 0.27,
0.648, 0.61, 0.305, 0.64, 0.035, 0.7, 0.18037037037037, 0.413333333333333,
0, 0.23, 0.656, 0.55, -0.9, -0.98, -0.58, -0.98, -0.98, -0.58,
-0.98, -0.272068965517241, 0.88, 0.554, 0, -0.30125, -0.4025,
0.62, -0.67, 0.62, 0.62, -0.67, -0.825, 0.62, 0.24, -0.5336364,
-0.08, 0.61, -0.9, -0.5146154, 0.16), var2 = c(-0.681666666666667,
0.458, 0.0276470588235294, 0.54, 0.36, 0.64, NA, 0.94, 0.95,
0.95, -0.0744444444444445, -0.633333333333333, NA, -0.58,
0.514, NA, -0.74, NA, NA, 0.01, -0.37, -0.58, NA, 0.32, -0.74,
0, NA, -0.0825, 0.04625, 0, 0, -0.58, -0.14875, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.95, NA,
0.126315789473684, 0.18, 0, 0.281111111111111, 0.256923076923077,
0.94, 0.63, 0, 0.295714285714286, 0.7, 0.7, NA, NA, NA, 0.46,
-0.58, 0.5725, 0.648, 0.61, 0.305, 0.64, 0.69, NA, 0.218846153846154,
0.7, 0, 0.26, 0.656, NA, -0.9, -0.98, -0.58, NA, NA, -0.58,
-0.98, -0.192857142857143, 0.88, 0.56, 0, -0.363913, -0.475,
0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, -0.53875,
0.08666667, 0.61, NA, -0.4157143, 0.2), var3 = c(-0.681666666666667,
0.458, 0.0435, 0.575, 0.36, 0.64, NA, 0.94, 0.95, 0.95, -0.00500000000000001,
-0.633333333333333, NA, -0.58, 0.514, NA, -0.74, 0.55, NA,
0.01, -0.317142857142857, -0.58, NA, 0.32, -0.74, 0, NA,
0.123076923076923, 0.04625, 0, 0, -0.58, -0.1025, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.75, NA,
0.126315789473684, 0.18, 0, 0.177857142857143, 0.256923076923077,
0.94, 0.63, 0, 0.28551724137931, 0.7, 0.7, 0, NA, NA, 0.46,
-0.58, 0.234285714285714, 0.648, 0.61, 0.305, 0.64, -0.156666666666667,
NA, 0.221333333333333, 0.62, 0, 0.26, 0.656, 0.55, -0.9,
-0.98, -0.58, NA, NA, -0.58, -0.98, -0.18, 0.88, 0.56, 0,
-0.3584615, -0.475, 0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667,
-0.53875, 0.08666667, 0.61, -0.82, -0.36375, 0.2)), row.names = c(NA,
-102L), class = c("tbl_df", "tbl", "data.frame"))
我们可以使用 across
- 在 complete
之后 'ID' 未被 fill
编辑。也许,我们执行 fill
然后按 'ID' 分组,循环 across
'var' 列并应用 rollapply
library(zoo)
library(dplyr)
library(tidyr)
df2 %>%
fill(ID) %>%
group_by(ID) %>%
mutate(across(starts_with('var'),
~ zoo::rollapply(., width = 5, FUN = mean, na.rm = TRUE,
fill = NA, by = 1))) %>%
ungroup
或仅对最后 5 个非 NA 元素 (i1
)
rollapply
来更正上述内容
library(tidyr)
df2 %>%
fill(ID) %>%
group_by(ID) %>%
mutate((across(starts_with('var'), ~ {
i1 <- tail(row_number()[!is.na(.)], 5)
replace(., i1, rollapply(.[i1], width = 5, FUN = mean, partial = TRUE))
})))