估算中间缺失值
Impute Intermediate Missing Values
我有组级别的数据。 data 看起来像下面这样。
我的实际数据是 "Value" & 所需数据是 "Expected_Value".
我尝试了以下代码:
# OP's attempt: the inner na.locf(na.rm = FALSE) carries the last value forward
# within each group, then the outer na.locf(fromLast = TRUE) carries values
# backward, so leading and trailing NAs get filled as well as interior ones.
# NOTE(review): na.locf is presumably zoo::na.locf; the sample data names the
# second key GROUP_B -- confirm that "Group_B" matches the real table.
setDT(file_to_share)[,Expected_Value := na.locf(na.locf(Value, na.rm=FALSE), fromLast=TRUE),by = c("Group_A", "Group_B")]
但这段代码会把所有缺失值都填补上。我只想填补位于两个已有值之间的缺失值,填补时复制前一个可用值。
如果有人能指导我怎么做,那将是一个很大的帮助。
注意 :我尝试使用 data.table
和 zoo
进行计算。但任何其他方法也可以。
即使您正在寻找 data.table
解决方案,这里也有一个使用 tidyverse
方法的解决方案。 (如果时间允许,我可能会尝试翻译成 data.table
)。
思路是:先创建一个标识星期的分组变量(此处称为 grp),然后在 Group_A、GROUP_B 和 grp 的分组内用 fill 向下填充 Value。
我们还复制了一份 Value(即 value1),并从后往前填充它(tidyr 的说法是 .direction = 'up')。
然后,我们用 NA 值的累积和创建另一个分组变量 grp1;当新分组(Group_A、GROUP_B、grp 和 grp1)的大小为 1,或者 value1 为 NA 时,把 Value 重新设为 NA。这样就得到了预期的结果。
library(tidyverse)
# Fill NAs that sit between two observed values, leaving leading and trailing
# NAs of each (Group_A, GROUP_B, grp) block untouched.
df2 <- df1 %>%
  # Parse the date strings and keep a second copy of Value for the upward fill.
  mutate(
    Date = as.POSIXct(Date, format = '%m/%d/%Y'),
    value1 = Value
  ) %>%
  # grp increases by one at every row whose day-of-month is "01".
  group_by(Group_A, GROUP_B, grp = cumsum(format(Date, '%d') == '01')) %>%
  # Value is carried forward; value1 is carried backward, so value1 stays NA
  # only on rows that have no later observation in their block.
  fill(Value, .direction = 'down') %>%
  fill(value1, .direction = 'up') %>%
  # Count the NAs that survive the forward fill.
  mutate(grp1 = cumsum(is.na(Value))) %>%
  group_by(Group_A, GROUP_B, grp, grp1) %>%
  # Reset Value to NA for single-row sub-groups and for rows with nothing
  # after them (value1 is NA).
  mutate(Value = replace(Value, n() == 1 | is.na(value1), NA)) %>%
  ungroup() %>%
  # Drop the helper columns again.
  select(-c(value1, grp, grp1))
这给出了,
# A tibble: 42 × 5
Group_A GROUP_B Date Value Expected_Value
<chr> <chr> <dttm> <int> <int>
1 GROUP_1 Group_1_1 2017-01-01 NA NA
2 GROUP_1 Group_1_1 2017-01-02 NA NA
3 GROUP_1 Group_1_1 2017-01-03 34 34
4 GROUP_1 Group_1_1 2017-01-04 20 20
5 GROUP_1 Group_1_1 2017-01-05 20 20
6 GROUP_1 Group_1_1 2017-01-06 20 20
7 GROUP_1 Group_1_1 2017-01-07 38 38
8 GROUP_1 Group_1_2 2017-01-01 35 35
9 GROUP_1 Group_1_2 2017-01-02 28 28
10 GROUP_1 Group_1_2 2017-01-03 28 28
# ... with 32 more rows
# Check that the filled Value column matches Expected_Value:
identical(df2$Value, df2$Expected_Value)
#[1] TRUE
OP 要求只填充每个组中位于其他值之间的 NA 值。这意味着在应用 zoo::na.locf() 时,需要跳过每组开头或结尾处的所有 NA 值序列。
使用 data.table,这可以通过找出要跳过的行的索引,再做一种反连接(anti-join)来完成:
library(data.table)
# Fill only the NAs that lie between two observed values of the same group.
#
# Step 1 (inner DT[...]): per group, number the runs of NA / non-NA values and
#   collect the row indices (.I) of the first and last run, but only when that
#   run consists of NAs. A leading or trailing run of real values has to stay
#   in the update set; otherwise the NA run next to it has no value from its
#   own group to copy.
# Step 2 (outer DT[!idx, ...]): anti-join on those indices and carry the last
#   observation forward group by group. Grouping the fill keeps a value from
#   spilling over from the preceding group (previously row 10 received 20
#   from Group_1_1 instead of 28). na.rm = FALSE keeps the result the same
#   length as the input.
setDT(DT)[!DT[, {
  na_grp <- rleid(is.na(Value))
  .I[is.na(Value) & na_grp %in% c(1L, max(na_grp))]
}, by = .(Group_A, GROUP_B)]$V1,
Value := zoo::na.locf(Value, na.rm = FALSE),
by = .(Group_A, GROUP_B)][]
Group_A GROUP_B Date Value Expected_Value
1: GROUP_1 Group_1_1 1/1/2017 NA NA
2: GROUP_1 Group_1_1 1/2/2017 NA NA
3: GROUP_1 Group_1_1 1/3/2017 34 34
4: GROUP_1 Group_1_1 1/4/2017 20 20
5: GROUP_1 Group_1_1 1/5/2017 20 20
6: GROUP_1 Group_1_1 1/6/2017 20 20
7: GROUP_1 Group_1_1 1/7/2017 38 38
8: GROUP_1 Group_1_2 1/1/2017 35 35
9: GROUP_1 Group_1_2 1/2/2017 28 28
10: GROUP_1 Group_1_2 1/3/2017 20 28
11: GROUP_1 Group_1_2 1/4/2017 32 32
12: GROUP_1 Group_1_2 1/5/2017 39 39
13: GROUP_1 Group_1_2 1/6/2017 28 28
14: GROUP_1 Group_1_2 1/7/2017 NA NA
15: GROUP_2 Group_1_11 1/1/2017 NA NA
16: GROUP_2 Group_1_11 1/2/2017 NA NA
17: GROUP_2 Group_1_11 1/3/2017 40 40
18: GROUP_2 Group_1_11 1/4/2017 32 32
19: GROUP_2 Group_1_11 1/5/2017 20 20
20: GROUP_2 Group_1_11 1/6/2017 NA NA
21: GROUP_2 Group_1_11 1/7/2017 NA NA
22: GROUP_2 Group_1_21 1/1/2017 NA NA
23: GROUP_2 Group_1_21 1/2/2017 32 32
24: GROUP_2 Group_1_21 1/3/2017 36 36
25: GROUP_2 Group_1_21 1/4/2017 36 36
26: GROUP_2 Group_1_21 1/5/2017 28 28
27: GROUP_2 Group_1_21 1/6/2017 33 33
28: GROUP_2 Group_1_21 1/7/2017 40 40
29: GROUP_3 Group_1_13 1/1/2017 NA NA
30: GROUP_3 Group_1_13 1/2/2017 NA NA
31: GROUP_3 Group_1_13 1/3/2017 NA NA
32: GROUP_3 Group_1_13 1/4/2017 29 29
33: GROUP_3 Group_1_13 1/5/2017 31 31
34: GROUP_3 Group_1_13 1/6/2017 31 31
35: GROUP_3 Group_1_13 1/7/2017 34 34
36: GROUP_3 Group_1_23 1/1/2017 26 26
37: GROUP_3 Group_1_23 1/2/2017 33 33
38: GROUP_3 Group_1_23 1/3/2017 27 27
39: GROUP_3 Group_1_23 1/4/2017 23 23
40: GROUP_3 Group_1_23 1/5/2017 25 25
41: GROUP_3 Group_1_23 1/6/2017 41 41
42: GROUP_3 Group_1_23 1/7/2017 25 25
Group_A GROUP_B Date Value Expected_Value
说明
- 对于每个组,
NA
/非NA
值的连续段(run)被依次编号
- 挑选每组中的第一个和最后一个连续段,并从特殊符号
.I
中检索对应的行索引。 (由于 Value
将 就地 进行更新,因此第一个或最后一个连续段是否包含 NA
并不重要;无论如何它们都不会被更新。)
- 找到的索引
DT[, {na_grp <- rleid(is.na(Value)); .I[na_grp %in% c(1L, max(na_grp))]}, by = .(Group_A, GROUP_B)]$V1
被排除在外,因此 zoo::na.locf(Value)
仅应用于每个组的“内部”连续段。
数据
# Sample data: 3 values of Group_A x 2 sub-groups each x 7 daily rows.
# Value is the raw series; Expected_Value is the desired result.
DT <- data.frame(
  Group_A = rep(c("GROUP_1", "GROUP_2", "GROUP_3"), each = 14L),
  GROUP_B = rep(
    c("Group_1_1", "Group_1_2", "Group_1_11",
      "Group_1_21", "Group_1_13", "Group_1_23"),
    each = 7L
  ),
  Date = rep(paste0("1/", 1:7, "/2017"), times = 6L),
  Value = c(
    NA, NA, 34L, 20L, NA, NA, 38L,
    35L, 28L, NA, 32L, 39L, 28L, NA,
    NA, NA, 40L, 32L, 20L, NA, NA,
    NA, 32L, 36L, NA, 28L, 33L, 40L,
    NA, NA, NA, 29L, 31L, NA, 34L,
    26L, 33L, 27L, 23L, 25L, 41L, 25L
  ),
  Expected_Value = c(
    NA, NA, 34L, 20L, 20L, 20L, 38L,
    35L, 28L, 28L, 32L, 39L, 28L, NA,
    NA, NA, 40L, 32L, 20L, NA, NA,
    NA, 32L, 36L, 36L, 28L, 33L, 40L,
    NA, NA, NA, 29L, 31L, 31L, 34L,
    26L, 33L, 27L, 23L, 25L, 41L, 25L
  ),
  stringsAsFactors = FALSE
)
我有组级别的数据。 data 看起来像下面这样。
我的实际数据是 "Value" & 所需数据是 "Expected_Value".
我尝试了以下代码:
# OP's attempt: the inner na.locf(na.rm = FALSE) carries the last value forward
# within each group, then the outer na.locf(fromLast = TRUE) carries values
# backward, so leading and trailing NAs get filled as well as interior ones.
# NOTE(review): na.locf is presumably zoo::na.locf; the sample data names the
# second key GROUP_B -- confirm that "Group_B" matches the real table.
setDT(file_to_share)[,Expected_Value := na.locf(na.locf(Value, na.rm=FALSE), fromLast=TRUE),by = c("Group_A", "Group_B")]
但这段代码会把所有缺失值都填补上。我只想填补位于两个已有值之间的缺失值,填补时复制前一个可用值。
如果有人能指导我怎么做,那将是一个很大的帮助。
注意 :我尝试使用 data.table
和 zoo
进行计算。但任何其他方法也可以。
即使您正在寻找 data.table
解决方案,这里也有一个使用 tidyverse
方法的解决方案。 (如果时间允许,我可能会尝试翻译成 data.table
)。
思路是:先创建一个标识星期的分组变量(此处称为 grp),然后在 Group_A、GROUP_B 和 grp 的分组内用 fill 向下填充 Value。
我们还复制了一份 Value(即 value1),并从后往前填充它(tidyr 的说法是 .direction = 'up')。
然后,我们用 NA 值的累积和创建另一个分组变量 grp1;当新分组(Group_A、GROUP_B、grp 和 grp1)的大小为 1,或者 value1 为 NA 时,把 Value 重新设为 NA。这样就得到了预期的结果。
library(tidyverse)
# Fill NAs that sit between two observed values, leaving leading and trailing
# NAs of each (Group_A, GROUP_B, grp) block untouched.
df2 <- df1 %>%
  # Parse the date strings and keep a second copy of Value for the upward fill.
  mutate(
    Date = as.POSIXct(Date, format = '%m/%d/%Y'),
    value1 = Value
  ) %>%
  # grp increases by one at every row whose day-of-month is "01".
  group_by(Group_A, GROUP_B, grp = cumsum(format(Date, '%d') == '01')) %>%
  # Value is carried forward; value1 is carried backward, so value1 stays NA
  # only on rows that have no later observation in their block.
  fill(Value, .direction = 'down') %>%
  fill(value1, .direction = 'up') %>%
  # Count the NAs that survive the forward fill.
  mutate(grp1 = cumsum(is.na(Value))) %>%
  group_by(Group_A, GROUP_B, grp, grp1) %>%
  # Reset Value to NA for single-row sub-groups and for rows with nothing
  # after them (value1 is NA).
  mutate(Value = replace(Value, n() == 1 | is.na(value1), NA)) %>%
  ungroup() %>%
  # Drop the helper columns again.
  select(-c(value1, grp, grp1))
这给出了,
# A tibble: 42 × 5 Group_A GROUP_B Date Value Expected_Value <chr> <chr> <dttm> <int> <int> 1 GROUP_1 Group_1_1 2017-01-01 NA NA 2 GROUP_1 Group_1_1 2017-01-02 NA NA 3 GROUP_1 Group_1_1 2017-01-03 34 34 4 GROUP_1 Group_1_1 2017-01-04 20 20 5 GROUP_1 Group_1_1 2017-01-05 20 20 6 GROUP_1 Group_1_1 2017-01-06 20 20 7 GROUP_1 Group_1_1 2017-01-07 38 38 8 GROUP_1 Group_1_2 2017-01-01 35 35 9 GROUP_1 Group_1_2 2017-01-02 28 28 10 GROUP_1 Group_1_2 2017-01-03 28 28 # ... with 32 more rows
# Check that the filled Value column matches Expected_Value:
identical(df2$Value, df2$Expected_Value)
#[1] TRUE
OP 要求只填充每个组中位于其他值之间的 NA 值。这意味着在应用 zoo::na.locf() 时,需要跳过每组开头或结尾处的所有 NA 值序列。
使用 data.table,这可以通过找出要跳过的行的索引,再做一种反连接(anti-join)来完成:
library(data.table)
# Fill only the NAs that lie between two observed values of the same group.
#
# Step 1 (inner DT[...]): per group, number the runs of NA / non-NA values and
#   collect the row indices (.I) of the first and last run, but only when that
#   run consists of NAs. A leading or trailing run of real values has to stay
#   in the update set; otherwise the NA run next to it has no value from its
#   own group to copy.
# Step 2 (outer DT[!idx, ...]): anti-join on those indices and carry the last
#   observation forward group by group. Grouping the fill keeps a value from
#   spilling over from the preceding group (previously row 10 received 20
#   from Group_1_1 instead of 28). na.rm = FALSE keeps the result the same
#   length as the input.
setDT(DT)[!DT[, {
  na_grp <- rleid(is.na(Value))
  .I[is.na(Value) & na_grp %in% c(1L, max(na_grp))]
}, by = .(Group_A, GROUP_B)]$V1,
Value := zoo::na.locf(Value, na.rm = FALSE),
by = .(Group_A, GROUP_B)][]
Group_A GROUP_B Date Value Expected_Value 1: GROUP_1 Group_1_1 1/1/2017 NA NA 2: GROUP_1 Group_1_1 1/2/2017 NA NA 3: GROUP_1 Group_1_1 1/3/2017 34 34 4: GROUP_1 Group_1_1 1/4/2017 20 20 5: GROUP_1 Group_1_1 1/5/2017 20 20 6: GROUP_1 Group_1_1 1/6/2017 20 20 7: GROUP_1 Group_1_1 1/7/2017 38 38 8: GROUP_1 Group_1_2 1/1/2017 35 35 9: GROUP_1 Group_1_2 1/2/2017 28 28 10: GROUP_1 Group_1_2 1/3/2017 20 28 11: GROUP_1 Group_1_2 1/4/2017 32 32 12: GROUP_1 Group_1_2 1/5/2017 39 39 13: GROUP_1 Group_1_2 1/6/2017 28 28 14: GROUP_1 Group_1_2 1/7/2017 NA NA 15: GROUP_2 Group_1_11 1/1/2017 NA NA 16: GROUP_2 Group_1_11 1/2/2017 NA NA 17: GROUP_2 Group_1_11 1/3/2017 40 40 18: GROUP_2 Group_1_11 1/4/2017 32 32 19: GROUP_2 Group_1_11 1/5/2017 20 20 20: GROUP_2 Group_1_11 1/6/2017 NA NA 21: GROUP_2 Group_1_11 1/7/2017 NA NA 22: GROUP_2 Group_1_21 1/1/2017 NA NA 23: GROUP_2 Group_1_21 1/2/2017 32 32 24: GROUP_2 Group_1_21 1/3/2017 36 36 25: GROUP_2 Group_1_21 1/4/2017 36 36 26: GROUP_2 Group_1_21 1/5/2017 28 28 27: GROUP_2 Group_1_21 1/6/2017 33 33 28: GROUP_2 Group_1_21 1/7/2017 40 40 29: GROUP_3 Group_1_13 1/1/2017 NA NA 30: GROUP_3 Group_1_13 1/2/2017 NA NA 31: GROUP_3 Group_1_13 1/3/2017 NA NA 32: GROUP_3 Group_1_13 1/4/2017 29 29 33: GROUP_3 Group_1_13 1/5/2017 31 31 34: GROUP_3 Group_1_13 1/6/2017 31 31 35: GROUP_3 Group_1_13 1/7/2017 34 34 36: GROUP_3 Group_1_23 1/1/2017 26 26 37: GROUP_3 Group_1_23 1/2/2017 33 33 38: GROUP_3 Group_1_23 1/3/2017 27 27 39: GROUP_3 Group_1_23 1/4/2017 23 23 40: GROUP_3 Group_1_23 1/5/2017 25 25 41: GROUP_3 Group_1_23 1/6/2017 41 41 42: GROUP_3 Group_1_23 1/7/2017 25 25 Group_A GROUP_B Date Value Expected_Value
说明
- 对于每个组,
NA
/非NA
值的连续段(run)被依次编号
- 挑选每组中的第一个和最后一个连续段,并从特殊符号
.I
中检索对应的行索引。 (由于 Value
将 就地 进行更新,因此第一个或最后一个连续段是否包含 NA
并不重要;无论如何它们都不会被更新。)
- 找到的索引
DT[, {na_grp <- rleid(is.na(Value)); .I[na_grp %in% c(1L, max(na_grp))]}, by = .(Group_A, GROUP_B)]$V1
被排除在外,因此 zoo::na.locf(Value)
仅应用于每个组的“内部”连续段。
数据
# Sample data: 3 values of Group_A x 2 sub-groups each x 7 daily rows.
# Value is the raw series; Expected_Value is the desired result.
DT <- data.frame(
  Group_A = rep(c("GROUP_1", "GROUP_2", "GROUP_3"), each = 14L),
  GROUP_B = rep(
    c("Group_1_1", "Group_1_2", "Group_1_11",
      "Group_1_21", "Group_1_13", "Group_1_23"),
    each = 7L
  ),
  Date = rep(paste0("1/", 1:7, "/2017"), times = 6L),
  Value = c(
    NA, NA, 34L, 20L, NA, NA, 38L,
    35L, 28L, NA, 32L, 39L, 28L, NA,
    NA, NA, 40L, 32L, 20L, NA, NA,
    NA, 32L, 36L, NA, 28L, 33L, 40L,
    NA, NA, NA, 29L, 31L, NA, 34L,
    26L, 33L, 27L, 23L, 25L, 41L, 25L
  ),
  Expected_Value = c(
    NA, NA, 34L, 20L, 20L, 20L, 38L,
    35L, 28L, 28L, 32L, 39L, 28L, NA,
    NA, NA, 40L, 32L, 20L, NA, NA,
    NA, 32L, 36L, 36L, 28L, 33L, 40L,
    NA, NA, NA, 29L, 31L, 31L, 34L,
    26L, 33L, 27L, 23L, 25L, 41L, 25L
  ),
  stringsAsFactors = FALSE
)