在具有嵌套组的数据框中插入带零的行
Insert rows with zeros in data frames with nested groups
我有一个包含嵌套组且缺少一些行的数据集:
set.seed(123)
df <- data.frame(Gr1 = rep(c("x", "y"), each = 10),
Gr2 = rep(c("x1", "x2", "y1", "y2"), each = 5),
ID = paste0(rep(c("x", "y"), each = 10), letters[1:5]),
var1 = round(rnorm(20), 2),
var2 = round(rnorm(20), 2))
rmv.rows <- sample(1:20, 5)
df <- df[-rmv.rows, ]
Gr1 Gr2 ID var1 var2
1 x x1 xa -0.56 -1.07
3 x x1 xc 1.56 -1.03
4 x x1 xd 0.07 -0.73
6 x x2 xa 1.72 -1.69
7 x x2 xb 0.46 0.84
9 x x2 xd -0.69 -1.14
10 x x2 xe -0.45 1.25
11 y y1 ya 1.22 0.43
12 y y1 yb 0.36 -0.30
15 y y1 ye -0.56 0.82
16 y y2 ya 1.79 0.69
17 y y2 yb 0.50 0.55
18 y y2 yc -1.97 -0.06
19 y y2 yd 0.70 -0.31
20 y y2 ye -0.47 -0.38
我想用零填充缺失的行(即 Gr1
、Gr2
和 ID
的组合)。
我尝试了 建议的方法,但是 returns Gr1
、Gr2
和 ID
的所有可能组合,而不是数据中存在的组合.换句话说,我只想插入 Gr1
、Gr2
和 ID
的现有组合。所需的输出应为:
Gr1 Gr2 ID var1 var2
1 x x1 xa -0.56 -1.07
2 x x1 xb 0.00 0.00
3 x x1 xc 1.56 -1.03
4 x x1 xd 0.07 -0.73
5 x x1 xe 0.00 0.00
6 x x2 xa 1.72 -1.69
7 x x2 xb 0.46 0.84
8 x x2 xc 0.00 0.00
9 x x2 xd -0.69 -1.14
10 x x2 xe -0.45 1.25
11 y y1 ya 1.22 0.43
12 y y1 yb 0.36 -0.30
13 y y1 yc 0.00 0.00
14 y y1 yd 0.00 0.00
15 y y1 ye -0.56 0.82
16 y y2 ya 1.79 0.69
17 y y2 yb 0.50 0.55
18 y y2 yc -1.97 -0.06
19 y y2 yd 0.70 -0.31
20 y y2 ye -0.47 -0.38
这是一个使用data.table
的选项:
library(data.table)
setDT(df)
all_comb <- df[, CJ(Gr2, ID, unique = TRUE), by = Gr1]
df_out <- df[all_comb, on = .(Gr1, Gr2, ID)]
df_out[is.na(df_out)] <- 0
df_out
# Gr1 Gr2 ID var1 var2
# 1: x x1 xa -0.56 -1.07
# 2: x x1 xb -0.23 -0.22
# 3: x x1 xc 1.56 -1.03
# 4: x x1 xd 0.07 -0.73
# 5: x x1 xe 0.13 -0.63
# 6: x x2 xa 0.00 0.00
# 7: x x2 xb 0.00 0.00
# 8: x x2 xc 0.00 0.00
# 9: x x2 xd -0.69 -1.14
# 10: x x2 xe -0.45 1.25
# 11: y y1 ya 0.00 0.00
# 12: y y1 yb 0.36 -0.30
# 13: y y1 yc 0.40 0.90
# 14: y y1 yd 0.11 0.88
# 15: y y1 ye 0.00 0.00
# 16: y y2 ya 1.79 0.69
# 17: y y2 yb 0.50 0.55
# 18: y y2 yc -1.97 -0.06
# 19: y y2 yd 0.70 -0.31
# 20: y y2 ye -0.47 -0.38
PS.
对于尚未更新到 R 3.6 的用户,这里是当前版本 R 中 OP 代码生成的数据:
df <- structure(list(Gr1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", "y"), class = "factor"),
Gr2 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L), .Label = c("x1", "x2", "y1", "y2"), class = "factor"),
ID = structure(c(1L, 2L, 3L, 4L, 5L, 4L, 5L, 7L, 8L, 9L,
6L, 7L, 8L, 9L, 10L), .Label = c("xa", "xb", "xc", "xd",
"xe", "ya", "yb", "yc", "yd", "ye"), class = "factor"), var1 = c(-0.56,
-0.23, 1.56, 0.07, 0.13, -0.69, -0.45, 0.36, 0.4, 0.11, 1.79,
0.5, -1.97, 0.7, -0.47), var2 = c(-1.07, -0.22, -1.03, -0.73,
-0.63, -1.14, 1.25, -0.3, 0.9, 0.88, 0.69, 0.55, -0.06, -0.31,
-0.38)), row.names = c(1L, 2L, 3L, 4L, 5L, 9L, 10L, 12L,
13L, 14L, 16L, 17L, 18L, 19L, 20L), class = "data.frame")
我们可以用 complete
和 nesting
从 tidyr
:
library(dplyr)
library(tidyr)
df %>%
group_by(Gr1) %>%
complete(nesting(ID), nesting(Gr2), fill = list(var1 = 0, var2 = 0)) %>%
arrange(Gr1, Gr2, ID) %>%
select(Gr1, Gr2, ID, everything())
输出:
# A tibble: 20 x 5
# Groups: Gr1 [2]
Gr1 Gr2 ID var1 var2
<fct> <fct> <fct> <dbl> <dbl>
1 x x1 xa -0.56 -1.07
2 x x1 xb 0 0
3 x x1 xc 1.56 -1.03
4 x x1 xd 0.07 -0.73
5 x x1 xe 0 0
6 x x2 xa 1.72 -1.69
7 x x2 xb 0.46 0.84
8 x x2 xc 0 0
9 x x2 xd -0.69 -1.14
10 x x2 xe -0.45 1.25
11 y y1 ya 1.22 0.43
12 y y1 yb 0.36 -0.3
13 y y1 yc 0 0
14 y y1 yd 0 0
15 y y1 ye -0.56 0.82
16 y y2 ya 1.79 0.69
17 y y2 yb 0.5 0.55
18 y y2 yc -1.97 -0.06
19 y y2 yd 0.7 -0.31
20 y y2 ye -0.47 -0.38
带有 expand
和 left_join
的选项
library(tidyverse)
df %>%
group_by(Gr1) %>%
expand(nesting(ID), nesting(Gr2)) %>%
left_join(df) %>%
mutate_at(vars(var1:var2), replace_na, 0) %>%
arrange(Gr1, Gr2, ID) %>%
select(names(df))
# A tibble: 20 x 5
# Groups: Gr1 [2]
# Gr1 Gr2 ID var1 var2
# <chr> <chr> <chr> <dbl> <dbl>
# 1 x x1 xa -0.56 -1.07
# 2 x x1 xb 0 0
# 3 x x1 xc 1.56 -1.03
# 4 x x1 xd 0.07 -0.73
# 5 x x1 xe 0 0
# 6 x x2 xa 1.72 -1.69
# 7 x x2 xb 0.46 0.84
# 8 x x2 xc 0 0
# 9 x x2 xd -0.69 -1.14
#10 x x2 xe -0.45 1.25
#11 y y1 ya 1.22 0.43
#12 y y1 yb 0.36 -0.3
#13 y y1 yc 0 0
#14 y y1 yd 0 0
#15 y y1 ye -0.56 0.82
#16 y y2 ya 1.79 0.69
#17 y y2 yb 0.5 0.55
#18 y y2 yc -1.97 -0.06
#19 y y2 yd 0.7 -0.31
#20 y y2 ye -0.47 -0.38
我有一个包含嵌套组且缺少一些行的数据集:
set.seed(123)
df <- data.frame(Gr1 = rep(c("x", "y"), each = 10),
Gr2 = rep(c("x1", "x2", "y1", "y2"), each = 5),
ID = paste0(rep(c("x", "y"), each = 10), letters[1:5]),
var1 = round(rnorm(20), 2),
var2 = round(rnorm(20), 2))
rmv.rows <- sample(1:20, 5)
df <- df[-rmv.rows, ]
Gr1 Gr2 ID var1 var2
1 x x1 xa -0.56 -1.07
3 x x1 xc 1.56 -1.03
4 x x1 xd 0.07 -0.73
6 x x2 xa 1.72 -1.69
7 x x2 xb 0.46 0.84
9 x x2 xd -0.69 -1.14
10 x x2 xe -0.45 1.25
11 y y1 ya 1.22 0.43
12 y y1 yb 0.36 -0.30
15 y y1 ye -0.56 0.82
16 y y2 ya 1.79 0.69
17 y y2 yb 0.50 0.55
18 y y2 yc -1.97 -0.06
19 y y2 yd 0.70 -0.31
20 y y2 ye -0.47 -0.38
我想用零填充缺失的行(即 Gr1
、Gr2
和 ID
的组合)。
我尝试了 Gr1
、Gr2
和 ID
的所有可能组合,而不是数据中存在的组合.换句话说,我只想插入 Gr1
、Gr2
和 ID
的现有组合。所需的输出应为:
Gr1 Gr2 ID var1 var2
1 x x1 xa -0.56 -1.07
2 x x1 xb 0.00 0.00
3 x x1 xc 1.56 -1.03
4 x x1 xd 0.07 -0.73
5 x x1 xe 0.00 0.00
6 x x2 xa 1.72 -1.69
7 x x2 xb 0.46 0.84
8 x x2 xc 0.00 0.00
9 x x2 xd -0.69 -1.14
10 x x2 xe -0.45 1.25
11 y y1 ya 1.22 0.43
12 y y1 yb 0.36 -0.30
13 y y1 yc 0.00 0.00
14 y y1 yd 0.00 0.00
15 y y1 ye -0.56 0.82
16 y y2 ya 1.79 0.69
17 y y2 yb 0.50 0.55
18 y y2 yc -1.97 -0.06
19 y y2 yd 0.70 -0.31
20 y y2 ye -0.47 -0.38
这是一个使用data.table
的选项:
library(data.table)
setDT(df)
all_comb <- df[, CJ(Gr2, ID, unique = TRUE), by = Gr1]
df_out <- df[all_comb, on = .(Gr1, Gr2, ID)]
df_out[is.na(df_out)] <- 0
df_out
# Gr1 Gr2 ID var1 var2
# 1: x x1 xa -0.56 -1.07
# 2: x x1 xb -0.23 -0.22
# 3: x x1 xc 1.56 -1.03
# 4: x x1 xd 0.07 -0.73
# 5: x x1 xe 0.13 -0.63
# 6: x x2 xa 0.00 0.00
# 7: x x2 xb 0.00 0.00
# 8: x x2 xc 0.00 0.00
# 9: x x2 xd -0.69 -1.14
# 10: x x2 xe -0.45 1.25
# 11: y y1 ya 0.00 0.00
# 12: y y1 yb 0.36 -0.30
# 13: y y1 yc 0.40 0.90
# 14: y y1 yd 0.11 0.88
# 15: y y1 ye 0.00 0.00
# 16: y y2 ya 1.79 0.69
# 17: y y2 yb 0.50 0.55
# 18: y y2 yc -1.97 -0.06
# 19: y y2 yd 0.70 -0.31
# 20: y y2 ye -0.47 -0.38
PS.
对于尚未更新到 R 3.6 的用户,这里是当前版本 R 中 OP 代码生成的数据:
df <- structure(list(Gr1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", "y"), class = "factor"),
Gr2 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L), .Label = c("x1", "x2", "y1", "y2"), class = "factor"),
ID = structure(c(1L, 2L, 3L, 4L, 5L, 4L, 5L, 7L, 8L, 9L,
6L, 7L, 8L, 9L, 10L), .Label = c("xa", "xb", "xc", "xd",
"xe", "ya", "yb", "yc", "yd", "ye"), class = "factor"), var1 = c(-0.56,
-0.23, 1.56, 0.07, 0.13, -0.69, -0.45, 0.36, 0.4, 0.11, 1.79,
0.5, -1.97, 0.7, -0.47), var2 = c(-1.07, -0.22, -1.03, -0.73,
-0.63, -1.14, 1.25, -0.3, 0.9, 0.88, 0.69, 0.55, -0.06, -0.31,
-0.38)), row.names = c(1L, 2L, 3L, 4L, 5L, 9L, 10L, 12L,
13L, 14L, 16L, 17L, 18L, 19L, 20L), class = "data.frame")
我们可以用 complete
和 nesting
从 tidyr
:
library(dplyr)
library(tidyr)
df %>%
group_by(Gr1) %>%
complete(nesting(ID), nesting(Gr2), fill = list(var1 = 0, var2 = 0)) %>%
arrange(Gr1, Gr2, ID) %>%
select(Gr1, Gr2, ID, everything())
输出:
# A tibble: 20 x 5
# Groups: Gr1 [2]
Gr1 Gr2 ID var1 var2
<fct> <fct> <fct> <dbl> <dbl>
1 x x1 xa -0.56 -1.07
2 x x1 xb 0 0
3 x x1 xc 1.56 -1.03
4 x x1 xd 0.07 -0.73
5 x x1 xe 0 0
6 x x2 xa 1.72 -1.69
7 x x2 xb 0.46 0.84
8 x x2 xc 0 0
9 x x2 xd -0.69 -1.14
10 x x2 xe -0.45 1.25
11 y y1 ya 1.22 0.43
12 y y1 yb 0.36 -0.3
13 y y1 yc 0 0
14 y y1 yd 0 0
15 y y1 ye -0.56 0.82
16 y y2 ya 1.79 0.69
17 y y2 yb 0.5 0.55
18 y y2 yc -1.97 -0.06
19 y y2 yd 0.7 -0.31
20 y y2 ye -0.47 -0.38
带有 expand
和 left_join
library(tidyverse)
df %>%
group_by(Gr1) %>%
expand(nesting(ID), nesting(Gr2)) %>%
left_join(df) %>%
mutate_at(vars(var1:var2), replace_na, 0) %>%
arrange(Gr1, Gr2, ID) %>%
select(names(df))
# A tibble: 20 x 5
# Groups: Gr1 [2]
# Gr1 Gr2 ID var1 var2
# <chr> <chr> <chr> <dbl> <dbl>
# 1 x x1 xa -0.56 -1.07
# 2 x x1 xb 0 0
# 3 x x1 xc 1.56 -1.03
# 4 x x1 xd 0.07 -0.73
# 5 x x1 xe 0 0
# 6 x x2 xa 1.72 -1.69
# 7 x x2 xb 0.46 0.84
# 8 x x2 xc 0 0
# 9 x x2 xd -0.69 -1.14
#10 x x2 xe -0.45 1.25
#11 y y1 ya 1.22 0.43
#12 y y1 yb 0.36 -0.3
#13 y y1 yc 0 0
#14 y y1 yd 0 0
#15 y y1 ye -0.56 0.82
#16 y y2 ya 1.79 0.69
#17 y y2 yb 0.5 0.55
#18 y y2 yc -1.97 -0.06
#19 y y2 yd 0.7 -0.31
#20 y y2 ye -0.47 -0.38