使用 dplyr 按组计数,并保留计数为 0 的组合

Count by group using dplyr, but also count 0s

我正在为 2 个数据框创建计数表。它们看起来像这样:

# Nine rows: three Col1 values, each paired with the same three
# (Col2, Col3) pairs.
table1 <- data.table(
  Col1 = rep(c("Al", "Cu", "Pb"), each = 3L),
  Col2 = rep(c("F", "UF", "P"), times = 3L),
  Col3 = rep(c("C", "UC", "<"), times = 3L)
)

# Six rows: lacks several of table1's combinations and holds one
# (Cu, F, <) that table1 does not contain.
table2 <- data.table(
  Col1 = c("Al", "Al", "Cu", "Pb", "Pb", "Pb"),
  Col2 = c("F", "UF", "F", "F", "UF", "P"),
  Col3 = c("C", "UC", "<", "C", "UC", "<")
)

我会像这样为 table1 和 table2 创建计数表:

# Count the rows of table1 per (Col1, Col2, Col3) combination.
table1 %>%
  group_by(Col1, Col2, Col3) %>%
  tally()

# A tibble: 9 x 4
# Groups:   Col1, Col2 [9]
  Col1  Col2  Col3      n
  <chr> <chr> <chr> <int>
1 Al    F     C         1
2 Al    P     <         1
3 Al    UF    UC        1
4 Cu    F     C         1
5 Cu    P     <         1
6 Cu    UF    UC        1
7 Pb    F     C         1
8 Pb    P     <         1
9 Pb    UF    UC        1

# Count the rows of table2 per (Col1, Col2, Col3) combination; combinations
# that never occur in table2 produce no row at all.
table2 %>%
  group_by(Col1, Col2, Col3) %>%
  tally()
# A tibble: 6 x 4
# Groups:   Col1, Col2 [6]
  Col1  Col2  Col3      n
  <chr> <chr> <chr> <int>
1 Al    F     C         1
2 Al    UF    UC        1
3 Cu    F     <         1
4 Pb    F     C         1
5 Pb    P     <         1
6 Pb    UF    UC        1

但我希望 table2 的计数表也包含 table1 中出现的所有组合:对于 table2 中不存在的组合,应显示计数 0,而不是把它们完全删除,因为显示这些 0 很重要。有什么方法可以用 dplyr 或其他包做到这一点吗?

感谢您的帮助!

这个呢?

# Tag every table2 row with n = 1, attach those tags to table1's combinations
# (combinations without a match get NA), then total n per combination so that
# combinations missing from table2 end up with 0.
table1 %>%
  # Name the join keys explicitly instead of relying on a natural join.
  left_join(cbind(table2, n = 1), by = c("Col1", "Col2", "Col3")) %>%
  group_by(Col1, Col2, Col3) %>%
  mutate(n = sum(n, na.rm = TRUE)) %>%
  # Drop the grouping so later verbs are not silently applied per group.
  ungroup() %>%
  # A combination that occurs several times in table2 is fanned out by the
  # join; keep a single row per combination.
  distinct()

我们会看到

  Col1  Col2  Col3      n
  <chr> <chr> <chr> <dbl>
1 Al    F     C         1
2 Al    UF    UC        1
3 Al    P     <         0
4 Cu    F     C         0
5 Cu    UF    UC        0
6 Cu    P     <         0
7 Pb    F     C         1
8 Pb    UF    UC        1
9 Pb    P     <         1

你可以试试 tidyr 的 complete():

library(tidyverse)

# Count the combinations present in table2, then add a row with sum = 0 for
# every combination of table1 that table2 lacks. Combinations found only in
# table2 are kept as well.
table2 %>%
  count(Col1, Col2, Col3, name = "sum") %>%
  # distinct() with no column arguments de-duplicates on all columns; it
  # replaces the superseded scoped verb distinct_all().
  complete(distinct(table1), fill = list(sum = 0))
# A tibble: 10 x 4
   Col1  Col2  Col3    sum
   <chr> <chr> <chr> <dbl>
 1 Al    F     C         1
 2 Al    P     <         0
 3 Al    UF    UC        1
 4 Cu    F     C         0
 5 Cu    P     <         0
 6 Cu    UF    UC        0
 7 Pb    F     C         1
 8 Pb    P     <         1
 9 Pb    UF    UC        1
10 Cu    F     <         1

或者使用 full_join:

# Count the combinations present in table2, bring in every combination of
# table1 via a full join, and turn the resulting missing counts into 0.
table2 %>%
  count(Col1, Col2, Col3, name = "sum") %>%
  # Explicit keys instead of a natural join; distinct() replaces the
  # superseded distinct_all().
  full_join(distinct(table1), by = c("Col1", "Col2", "Col3")) %>%
  mutate(sum = replace_na(sum, 0))

1) 给 table2 添加一列 n = 1,给 table1 添加一列 n = 0,把两者按行合并,然后按组对 n 求和。

# Stack table2 (each row weighted 1) on top of table1 (each row weighted 0)
# and total the weights per combination: the total is the number of table2
# rows with that combination.
bind_rows(
  mutate(table2, n = 1L),
  mutate(table1, n = 0L)
) %>%
  group_by(Col1, Col2, Col3) %>%
  summarise(n = sum(n), .groups = "drop")

结果为:

# A tibble: 10 x 4
   Col1  Col2  Col3      n
   <chr> <chr> <chr> <int>
 1 Al    F     C         1
 2 Al    P     <         0
 3 Al    UF    UC        1
 4 Cu    F     <         1
 5 Cu    F     C         0
 6 Cu    P     <         0
 7 Cu    UF    UC        0
 8 Pb    F     C         1
 9 Pb    P     <         1
10 Pb    UF    UC        1

2) 这个变体给出了相同的结果。

# Stack both tables, recording in `id` which list element each row came from,
# then count per combination only the rows that came from the second element
# (table2).
bind_rows(list(table1, table2), .id = "id") %>%
  group_by(Col1, Col2, Col3) %>%
  summarise(n = sum(id == 2L), .groups = "drop")

3) 这是一个只使用 data.table 的解决方案。

# Stack both tables with an `.id` column holding the position of the source
# table in the list, then count per combination the rows whose source is the
# second table (table2).
rbindlist(list(table1, table2), idcol = ".id")[
  ,
  list(n = sum(.id == 2L)),
  by = list(Col1, Col2, Col3)
]

4) 这是一个 base R 解决方案。

# Weight table1 rows with 0 and table2 rows with 1, stack them, and total the
# weights for every (Col1, Col2, Col3) combination.
both <- rbind(
  transform(table1, n = 0),
  transform(table2, n = 1)
)
aggregate(n ~ Col1 + Col2 + Col3, data = both, FUN = sum)

5) 这个方案使用 SQL。

library(sqldf)

# The query refers to table1 and table2 by name. The CTE `both` stacks every
# table1 row tagged with n = 0 and every table2 row tagged with n = 1
# (UNION ALL keeps duplicates); the outer query then totals n per
# (Col1, Col2, Col3) combination, so combinations that appear only in table1
# get a total of 0.
sqldf("with both as (
    select *, 0 as n from table1
    union all
    select *, 1 as n from table2
  )
  select Col1, Col2, Col3, sum(n) as n 
    from both 
    group by Col1, Col2, Col3
")