如何在考虑缺失组合的同时完成数据框缺失组合

How to complete data frame missing combinations while accounting for the missing ones

这是一个关于完成数据的简单问题。可能是重复的,但我没有找到答案。

考虑以下 df,它包含字母(ae)和数字(110)的组合,但不是全部有可能的组合。

library(dplyr, warn.conflicts = FALSE)
library(tidyr)

set.seed(123)

df <-
  expand_grid(lttrs = letters[1:5], num = 1:10) |>
  group_by(lttrs) |>
  slice_sample(n = 5) |>
  ungroup() |> 
  arrange(lttrs, num)

df |> 
  print(n = 25)
#> # A tibble: 25 × 2
#>    lttrs   num
#>    <chr> <int>
#>  1 a         2
#>  2 a         3
#>  3 a         6
#>  4 a         8
#>  5 a        10
#>  6 b         1
#>  7 b         4
#>  8 b         5
#>  9 b         6
#> 10 b         8
#> 11 c         1
#> 12 c         3
#> 13 c         5
#> 14 c         8
#> 15 c        10
#> 16 d         2
#> 17 d         3
#> 18 d         5
#> 19 d         9
#> 20 d        10
#> 21 e         3
#> 22 e         4
#> 23 e         6
#> 24 e         7
#> 25 e         9

reprex package (v2.0.1)

创建于 2022-05-29

我想完成1:10 的每封信,在它们丢失的地方,同时解决这种缺失。也就是说,我正在寻找以下所需的输出:

tibble::tribble(
  ~lttrs, ~orig_num, ~completed_num,
     "a",        NA,              1,
     "a",        2L,              2,
     "a",        3L,              3,
     "a",        NA,              4,
     "a",        NA,              5,
     "a",        6L,              6,
     "a",        NA,              7,
     "a",        8L,              8,
     "a",        NA,              9,
     "a",       10L,             10,
     "b",        1L,              1,
     "b",        NA,              2,
     "b",        NA,              3,
     "b",        4L,              4,
     "b",        5L,              5,
     "b",        6L,              6,
     "b",        NA,              7,
     "b",        8L,              8,
     "b",        NA,              9,
     "b",        NA,             10,
     "c",        1L,              1,
     "c",        NA,              2,
     "c",        3L,              3,
     "c",        NA,              4,
     "c",        5L,              5,
     "c",        NA,              6,
     "c",        NA,              7,
     "c",        8L,              8,
     "c",        NA,              9,
     "c",       10L,             10,
     "d",        NA,              1,
     "d",        2L,              2,
     "d",        3L,              3,
     "d",        NA,              4,
     "d",        5L,              5,
     "d",        NA,              6,
     "d",        NA,              7,
     "d",        NA,              8,
     "d",        9L,              9,
     "d",       10L,             10,
     "e",        NA,              1,
     "e",        NA,              2,
     "e",        3L,              3,
     "e",        4L,              4,
     "e",        NA,              5,
     "e",        6L,              6,
     "e",        7L,              7,
     "e",        NA,              8,
     "e",        9L,              9,
     "e",        NA,             10
  )
#> # A tibble: 50 × 3
#>    lttrs orig_num completed_num
#>    <chr>    <int>         <dbl>
#>  1 a           NA             1
#>  2 a            2             2
#>  3 a            3             3
#>  4 a           NA             4
#>  5 a           NA             5
#>  6 a            6             6
#>  7 a           NA             7
#>  8 a            8             8
#>  9 a           NA             9
#> 10 a           10            10
#> # … with 40 more rows

reprex package (v2.0.1)

创建于 2022-05-29

使用 tidyr 可能超级简单,但我不知道如何从 df 直接获得所需的输出。

好的,我好像想通了:

df |> 
  mutate(missing_num = num) |> 
  group_by(lttrs) |> 
  complete(num = full_seq(1:10, 1))
#> # A tibble: 50 × 3
#> # Groups:   lttrs [5]
#>    lttrs   num missing_num
#>    <chr> <dbl>       <int>
#>  1 a         1          NA
#>  2 a         2           2
#>  3 a         3           3
#>  4 a         4          NA
#>  5 a         5          NA
#>  6 a         6           6
#>  7 a         7          NA
#>  8 a         8           8
#>  9 a         9          NA
#> 10 a        10          10
#> # … with 40 more rows

这是一个 tidyverse 解决方案: 首先我们创建 num 的副本然后我们使用 completenesting:

library(dplyr)
library(tidyr)

df %>% 
  mutate(num_new = num) %>% 
  complete(lttrs, nesting(num_new)) %>% 
  data.frame()
 lttrs num_new num
1      a       1   1
2      a       2   2
3      a       3  NA
4      a       4   4
5      a       5   5
6      a       6  NA
7      a       7   7
8      a       8  NA
9      a       9  NA
10     a      10  NA
11     b       1   1
12     b       2   2
13     b       3   3
14     b       4  NA
15     b       5  NA
16     b       6  NA
17     b       7   7
18     b       8  NA
19     b       9   9
20     b      10  NA
21     c       1  NA
22     c       2  NA
23     c       3   3
24     c       4  NA
25     c       5   5
26     c       6   6
27     c       7   7
28     c       8  NA
29     c       9  NA
30     c      10  10
31     d       1  NA
32     d       2   2
33     d       3  NA
34     d       4   4
35     d       5   5
36     d       6  NA
37     d       7  NA
38     d       8   8
39     d       9   9
40     d      10  NA
41     e       1   1
42     e       2   2
43     e       3   3
44     e       4  NA
45     e       5  NA
46     e       6  NA
47     e       7  NA
48     e       8   8
49     e       9   9
50     e      10  NA