用前 16 天值的 mean/max 填充缺失值
filling missing values with mean/max of previous 16 days values
我有一个给定的时间序列数据集:
# Example time series: irregularly spaced dates (month/day/year strings,
# newest first) with two numeric measurements, x and y, that contain gaps (NA).
d1 <- data.frame(
  date = c(
    "8/11/2020", "8/7/2020", "8/4/2020", "7/28/2020", "7/27/2020",
    "7/23/2020", "7/20/2020", "7/13/2020", "7/8/2020", "6/25/2020",
    "6/24/2020", "6/23/2020", "6/18/2020", "6/15/2020", "6/10/2020",
    "6/9/2020", "6/8/2020", "6/5/2020", "6/3/2020", "6/1/2020",
    "5/31/2020", "5/24/2020", "5/21/2020", "5/14/2020", "5/11/2020",
    "5/9/2020", "5/8/2020", "5/7/2020", "5/4/2020", "4/22/2020",
    "4/21/2020", "4/16/2020", "4/14/2020", "4/9/2020", "4/6/2020",
    "4/5/2020"
  ),
  x = c(
    -3.444434096, -5.554643467, -5.819128168, -6.528574452, -6.528574452,
    -8.59555826, -3.025536602, -2.698376659, -3.483474963, -7.696235263,
    NA, -7.572170935, -7.185040842, -7.492766547, -3.728460293,
    -7.773342378, -7.773342378, NA, -6.601276462, NA,
    -6.315658227, -5.421106712, -5.421106712, -2.3212135, -3.40345796,
    -2.942817915, -2.942817915, NA, -1.858551108, -0.264005923,
    -0.264005923, 0.192899359, -0.204841155, -0.107794142, -0.087664372,
    NA
  ),
  y = c(
    -0.095663228, -2.228724135, NA, -1.287448285, -2.090320147,
    -2.090320147, 1.269873112, -2.64716097, -2.680828961, -2.335090584,
    -4.117893947, 0.375126608, 0.375126608, NA, -3.939176552,
    NA, -1.797405353, -5.27192525, -5.27192525, -4.937367195,
    -4.885790778, -5.611024982, -5.215911023, -9.277147196, -9.277147196,
    -4.896429851, -7.053994787, 1.022381641, -1.398690039, 0.058918339,
    0.058918339, -1.900802177, -1.253476157, 1.283432302, 0.519769206,
    0.519769206
  ),
  # Keep the dates as plain character strings on every R version.
  stringsAsFactors = FALSE
)
现在,我想用前 16 天的平均值(不是前 15 个观测值,而是按日期计算的前 16 天)来填充变量 x 中的 NA 值。
此外,我想用 16 天的最大值(考虑前 7 天和后 7 天)来填充变量 y 中的 NA 值。
我尝试用 dplyr 包来实现,但没有成功。
我可以用 dplyr 或 tidyr 来完成这个操作吗?或者用其他任何包或基础 R 代码?
非常感谢任何帮助!
您可以尝试使用 dplyr 的 group_modify 函数。
代码
library(tidyverse)

# Parse the month/day/year strings into Date objects so that the window
# bounds below (date - 16, date + 7) are computed in whole days.
d1 <- d1 %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y"))

# Aggregate the non-missing `values` with `fun`.
# An empty window yields NA instead of NaN (mean) or -Inf plus a warning (max).
window_stat <- function(values, fun) {
  values <- values[!is.na(values)]
  if (length(values) == 0) {
    return(NA_real_)
  }
  fun(values)
}

# Fill missing values one row at a time. Grouping by row number makes
# group_modify() pass each row to the lambda as a one-row tibble (.x), while
# the look-up windows are always taken from the full, unmodified d1.
d2 <- d1 %>%
  group_by(row = row_number()) %>%
  group_modify(~ {
    row_date <- .x$date

    .x %>%
      mutate(
        # x: mean of the x values dated within the 16 days up to and
        # including this row's date. Only computed when x is missing.
        x = if (is.na(x)) {
          d1 %>%
            filter(between(date, row_date - 16, row_date)) %>%
            pull(x) %>%
            window_stat(mean)
        } else {
          x
        },
        # y: maximum of the y values dated from 7 days before to 7 days
        # after this row's date. Only computed when y is missing.
        y = if (is.na(y)) {
          d1 %>%
            filter(between(date, row_date - 7, row_date + 7)) %>%
            pull(y) %>%
            window_stat(max)
        } else {
          y
        }
      )
  }) %>%
  ungroup() %>%
  # Drop the helper grouping column so d2 has the same columns as d1.
  select(-row)
前20行的输出
# A tibble: 20 x 3
date x y
<date> <dbl> <dbl>
1 2020-08-11 -3.44 -0.0957
2 2020-08-07 -5.55 -2.23
3 2020-08-04 -5.82 -0.0957
4 2020-07-28 -6.53 -1.29
5 2020-07-27 -6.53 -2.09
6 2020-07-23 -8.60 -2.09
7 2020-07-20 -3.03 1.27
8 2020-07-13 -2.70 -2.65
9 2020-07-08 -3.48 -2.68
10 2020-06-25 -7.70 -2.34
11 2020-06-24 -6.92 -4.12
12 2020-06-23 -7.57 0.375
13 2020-06-18 -7.19 0.375
14 2020-06-15 -7.49 0.375
15 2020-06-10 -3.73 -3.94
16 2020-06-09 -7.77 -1.80
17 2020-06-08 -7.77 -1.80
18 2020-06-05 -5.94 -5.27
19 2020-06-03 -6.60 -5.27
20 2020-06-01 -5.72 -4.94
我有一个给定的时间序列数据集:
# Example time series: irregularly spaced dates (month/day/year strings,
# newest first) with two numeric measurements, x and y, that contain gaps (NA).
d1 <- data.frame(
  date = c(
    "8/11/2020", "8/7/2020", "8/4/2020", "7/28/2020", "7/27/2020",
    "7/23/2020", "7/20/2020", "7/13/2020", "7/8/2020", "6/25/2020",
    "6/24/2020", "6/23/2020", "6/18/2020", "6/15/2020", "6/10/2020",
    "6/9/2020", "6/8/2020", "6/5/2020", "6/3/2020", "6/1/2020",
    "5/31/2020", "5/24/2020", "5/21/2020", "5/14/2020", "5/11/2020",
    "5/9/2020", "5/8/2020", "5/7/2020", "5/4/2020", "4/22/2020",
    "4/21/2020", "4/16/2020", "4/14/2020", "4/9/2020", "4/6/2020",
    "4/5/2020"
  ),
  x = c(
    -3.444434096, -5.554643467, -5.819128168, -6.528574452, -6.528574452,
    -8.59555826, -3.025536602, -2.698376659, -3.483474963, -7.696235263,
    NA, -7.572170935, -7.185040842, -7.492766547, -3.728460293,
    -7.773342378, -7.773342378, NA, -6.601276462, NA,
    -6.315658227, -5.421106712, -5.421106712, -2.3212135, -3.40345796,
    -2.942817915, -2.942817915, NA, -1.858551108, -0.264005923,
    -0.264005923, 0.192899359, -0.204841155, -0.107794142, -0.087664372,
    NA
  ),
  y = c(
    -0.095663228, -2.228724135, NA, -1.287448285, -2.090320147,
    -2.090320147, 1.269873112, -2.64716097, -2.680828961, -2.335090584,
    -4.117893947, 0.375126608, 0.375126608, NA, -3.939176552,
    NA, -1.797405353, -5.27192525, -5.27192525, -4.937367195,
    -4.885790778, -5.611024982, -5.215911023, -9.277147196, -9.277147196,
    -4.896429851, -7.053994787, 1.022381641, -1.398690039, 0.058918339,
    0.058918339, -1.900802177, -1.253476157, 1.283432302, 0.519769206,
    0.519769206
  ),
  # Keep the dates as plain character strings on every R version.
  stringsAsFactors = FALSE
)
现在,我想用前 16 天的平均值(不是前 15 个观测值,而是按日期计算的前 16 天)来填充变量 x 中的 NA 值。
此外,我想用 16 天的最大值(考虑前 7 天和后 7 天)来填充变量 y 中的 NA 值。
我尝试用 dplyr 包来实现,但没有成功。
我可以用 dplyr 或 tidyr 来完成这个操作吗?或者用其他任何包或基础 R 代码?
非常感谢任何帮助!
您可以尝试使用 dplyr 的 group_modify 函数。
代码
library(tidyverse)

# Parse the month/day/year strings into Date objects so that the window
# bounds below (date - 16, date + 7) are computed in whole days.
d1 <- d1 %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y"))

# Aggregate the non-missing `values` with `fun`.
# An empty window yields NA instead of NaN (mean) or -Inf plus a warning (max).
window_stat <- function(values, fun) {
  values <- values[!is.na(values)]
  if (length(values) == 0) {
    return(NA_real_)
  }
  fun(values)
}

# Fill missing values one row at a time. Grouping by row number makes
# group_modify() pass each row to the lambda as a one-row tibble (.x), while
# the look-up windows are always taken from the full, unmodified d1.
d2 <- d1 %>%
  group_by(row = row_number()) %>%
  group_modify(~ {
    row_date <- .x$date

    .x %>%
      mutate(
        # x: mean of the x values dated within the 16 days up to and
        # including this row's date. Only computed when x is missing.
        x = if (is.na(x)) {
          d1 %>%
            filter(between(date, row_date - 16, row_date)) %>%
            pull(x) %>%
            window_stat(mean)
        } else {
          x
        },
        # y: maximum of the y values dated from 7 days before to 7 days
        # after this row's date. Only computed when y is missing.
        y = if (is.na(y)) {
          d1 %>%
            filter(between(date, row_date - 7, row_date + 7)) %>%
            pull(y) %>%
            window_stat(max)
        } else {
          y
        }
      )
  }) %>%
  ungroup() %>%
  # Drop the helper grouping column so d2 has the same columns as d1.
  select(-row)
前20行的输出
# A tibble: 20 x 3
date x y
<date> <dbl> <dbl>
1 2020-08-11 -3.44 -0.0957
2 2020-08-07 -5.55 -2.23
3 2020-08-04 -5.82 -0.0957
4 2020-07-28 -6.53 -1.29
5 2020-07-27 -6.53 -2.09
6 2020-07-23 -8.60 -2.09
7 2020-07-20 -3.03 1.27
8 2020-07-13 -2.70 -2.65
9 2020-07-08 -3.48 -2.68
10 2020-06-25 -7.70 -2.34
11 2020-06-24 -6.92 -4.12
12 2020-06-23 -7.57 0.375
13 2020-06-18 -7.19 0.375
14 2020-06-15 -7.49 0.375
15 2020-06-10 -3.73 -3.94
16 2020-06-09 -7.77 -1.80
17 2020-06-08 -7.77 -1.80
18 2020-06-05 -5.94 -5.27
19 2020-06-03 -6.60 -5.27
20 2020-06-01 -5.72 -4.94