R:根据面板数据的组计算列的总和,其中一组没有数据

R: Calculate sum over a column based on groups for panel data where one group has no data

我有以下数据:

structure(list(Name = c("A", "A", "A", "A", "A", "A", "B", "B", 
"B", "B", "B", "B", "C", "C", "C", "C", "C", "C"), Date = c("01.09.2018", 
"02.09.2018", "03.09.2018", "05.11.2021", "06.11.2021", "07.11.2021", 
"01.09.2018", "02.09.2018", "03.09.2018", "05.11.2021", "06.11.2021", 
"07.11.2021", "01.09.2018", "02.09.2018", "03.09.2018", "05.11.2021", 
"06.11.2021", "07.11.2021"), Category = c(1L, 1L, NA, NA, 2L, 
2L, 3L, 3L, 3L, NA, NA, 4L, 4L, 2L, NA, 2L, 2L, NA), Size = c(34L, 
23L, 12L, 53L, 23L, 53L, 23L, 54L, 65L, 75L, 67L, 45L, 45L, 23L, 
23L, 12L, 12L, NA)), class = "data.frame", row.names = c(NA, 
-18L))

我想计算每个类别每天的大小总和。因此,例如,对于 2018 年 9 月 1 日,我希望 2018 年 8 月 1 日所有观测值的总大小属于同一类别。对我来说困难的是我也观察到类别是 NA。

到目前为止,我创建了以下代码:

Data <- Data %>%
  group_by(Category, Date) %>%
  dplyr:: mutate(Sum_Size = sum(Size))

问题是我现在还有观测值和没有类别的天数 (NA) 的总和。 R 将没有“真实”类别 (NA) 的类别视为自己的类别。但是,我希望 R 忽略“NA 类别”并且不计算每一天的大小总和。因此,例如,我们可以在适用的行中写上“不适用”。

谁能帮我写代码?

非常感谢!

编辑:OP 想要保留 Category == NA 的数据,所以也许这个解决方案?

data_noNA <- data %>%
  group_by(Category, Date) %>%
  dplyr::summarize(Sum_Size = sum(Size, na.rm = TRUE)) %>%
  filter(!is.na(Category)) %>%
  # add back in info from missing columns after summarize
  left_join(data, by = c("Category", "Date"))

data2 <- bind_rows(data_noNA, data %>% filter(is.na(Category))); data2
# A tibble: 18 x 5
# Groups:   Category [5]
   Category Date       Sum_Size Name   Size
      <int> <chr>         <int> <chr> <int>
 1        1 01.09.2018       34 A        34
 2        1 02.09.2018       23 A        23
 3        2 02.09.2018       23 C        23
 4        2 05.11.2021       12 C        12
 5        2 06.11.2021       35 A        23
 6        2 06.11.2021       35 C        12
 7        2 07.11.2021       53 A        53
 8        3 01.09.2018       23 B        23
 9        3 02.09.2018       54 B        54
10        3 03.09.2018       65 B        65
11        4 01.09.2018       45 C        45
12        4 07.11.2021       45 B        45
13       NA 03.09.2018       NA A        12
14       NA 05.11.2021       NA A        53
15       NA 05.11.2021       NA B        75
16       NA 06.11.2021       NA B        67
17       NA 03.09.2018       NA C        23
18       NA 07.11.2021       NA C        NA

是这样的吗?

library(tidyverse)
data <- structure(list(Name = c("A", "A", "A", "A", "A", "A", "B", "B", 
                        "B", "B", "B", "B", "C", "C", "C", "C", "C", "C"), Date = c("01.09.2018", 
                                                                                    "02.09.2018", "03.09.2018", "05.11.2021", "06.11.2021", "07.11.2021", 
                                                                                    "01.09.2018", "02.09.2018", "03.09.2018", "05.11.2021", "06.11.2021", 
                                                                                    "07.11.2021", "01.09.2018", "02.09.2018", "03.09.2018", "05.11.2021", 
                                                                                    "06.11.2021", "07.11.2021"), Category = c(1L, 1L, NA, NA, 2L, 
                                                                                                                              2L, 3L, 3L, 3L, NA, NA, 4L, 4L, 2L, NA, 2L, 2L, NA), Size = c(34L, 
                                                                                                                                                                                            23L, 12L, 53L, 23L, 53L, 23L, 54L, 65L, 75L, 67L, 45L, 45L, 23L, 
                                                                                                                                                                                            23L, 12L, 12L, NA)), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                     -18L))
data2 <- data %>%
  group_by(Category, Date) %>%
  dplyr::summarize(Sum_Size = sum(Size, na.rm = TRUE)) %>%
  filter(!is.na(Category)); data2
#> `summarise()` has grouped output by 'Category'. You can override using the
#> `.groups` argument.
#> # A tibble: 11 x 3
#> # Groups:   Category [4]
#>    Category Date       Sum_Size
#>       <int> <chr>         <int>
#>  1        1 01.09.2018       34
#>  2        1 02.09.2018       23
#>  3        2 02.09.2018       23
#>  4        2 05.11.2021       12
#>  5        2 06.11.2021       35
#>  6        2 07.11.2021       53
#>  7        3 01.09.2018       23
#>  8        3 02.09.2018       54
#>  9        3 03.09.2018       65
#> 10        4 01.09.2018       45
#> 11        4 07.11.2021       45

reprex package (v2.0.1)

于 2022-04-16 创建