使用 dplyr 按组计数,但也计数 0
Count by group using dplyr, but also count 0s
我正在为 2 个数据框分别生成计数表。这两个数据框如下所示:
# Packages used by the snippets: data.table provides data.table(), dplyr
# provides the %>% pipe and the grouping/counting verbs.
library(data.table)
library(dplyr)

# Reference table: the Col1/Col2/Col3 combinations that must all appear in
# the final count table.
table1 <- data.table(
  Col1 = c("Al", "Al", "Al", "Cu", "Cu", "Cu", "Pb", "Pb", "Pb"),
  Col2 = c("F", "UF", "P", "F", "UF", "P", "F", "UF", "P"),
  Col3 = c("C", "UC", "<", "C", "UC", "<", "C", "UC", "<")
)

# Table to be counted: it lacks some of the combinations found in table1.
table2 <- data.table(
  Col1 = c("Al", "Al", "Cu", "Pb", "Pb", "Pb"),
  Col2 = c("F", "UF", "F", "F", "UF", "P"),
  Col3 = c("C", "UC", "<", "C", "UC", "<")
)
我会像下面这样分别为 table1 和 table2 创建计数表:
# Number of rows for each Col1/Col2/Col3 combination in table1.
table1 %>%
  group_by(Col1, Col2, Col3) %>%
  tally()
# A tibble: 9 x 4
# Groups: Col1, Col2 [9]
Col1 Col2 Col3 n
<chr> <chr> <chr> <int>
1 Al F C 1
2 Al P < 1
3 Al UF UC 1
4 Cu F C 1
5 Cu P < 1
6 Cu UF UC 1
7 Pb F C 1
8 Pb P < 1
9 Pb UF UC 1
# Number of rows for each Col1/Col2/Col3 combination in table2; combinations
# that do not occur in table2 are simply absent from the result.
table2 %>%
  group_by(Col1, Col2, Col3) %>%
  tally()
# A tibble: 6 x 4
# Groups: Col1, Col2 [6]
Col1 Col2 Col3 n
<chr> <chr> <chr> <int>
1 Al F C 1
2 Al UF UC 1
3 Cu F < 1
4 Pb F C 1
5 Pb P < 1
6 Pb UF UC 1
但我希望 table2 的计数表能包含 table1 中出现的所有组合:在 table2 中不存在的组合应当计为 0,而不是被完全丢弃,因此把这些 0 显示出来很重要。有什么方法可以用 dplyr 或其他包做到这一点吗?
感谢您的帮助!
这个呢?
# Flag every table2 row with n = 1 and left-join it onto table1, so that each
# table1 combination is kept. Combinations without a match get n = NA, which
# sum(..., na.rm = TRUE) turns into 0.
table1 %>%
  left_join(
    mutate(table2, n = 1),
    # Explicit keys: no reliance on an implicit natural join (and no message).
    by = c("Col1", "Col2", "Col3")
  ) %>%
  group_by(Col1, Col2, Col3) %>%
  mutate(n = sum(n, na.rm = TRUE)) %>%
  # Drop the grouping so later steps are not silently computed per group.
  ungroup()
我们会看到
Col1 Col2 Col3 n
<chr> <chr> <chr> <dbl>
1 Al F C 1
2 Al UF UC 1
3 Al P < 0
4 Cu F C 0
5 Cu UF UC 0
6 Cu P < 0
7 Pb F C 1
8 Pb UF UC 1
9 Pb P < 1
你可以试试complete
library(tidyverse)

# Count the combinations present in table2 (count column named "sum"), then
# let complete() add the table1 combinations that are missing, with sum = 0.
table2 %>%
  count(Col1, Col2, Col3, name = "sum") %>%
  complete(distinct(table1), fill = list(sum = 0))
# A tibble: 10 x 4
Col1 Col2 Col3 sum
<chr> <chr> <chr> <dbl>
1 Al F C 1
2 Al P < 0
3 Al UF UC 1
4 Cu F C 0
5 Cu P < 0
6 Cu UF UC 0
7 Pb F C 1
8 Pb P < 1
9 Pb UF UC 1
10 Cu F < 1
或full_join
# Count the combinations present in table2, full-join the distinct table1
# combinations so none is lost, and turn the resulting NA counts into 0.
table2 %>%
  count(Col1, Col2, Col3, name = "sum") %>%
  # Explicit keys: no reliance on an implicit natural join (and no message).
  full_join(distinct(table1), by = c("Col1", "Col2", "Col3")) %>%
  # Integer 0L keeps the count column integer.
  mutate(sum = replace_na(sum, 0L))
1) 给 table2 附加一列 n = 1,给 table1 附加一列 n = 0,把两者按行合并,然后按组对 n 求和。
# Stack table2 (each row worth 1) on top of table1 (each row worth 0) and add
# up n per combination: table1-only combinations end up with n = 0.
bind_rows(
  mutate(table2, n = 1L),
  mutate(table1, n = 0L)
) %>%
  group_by(Col1, Col2, Col3) %>%
  summarise(n = sum(n), .groups = "drop")
结果如下:
# A tibble: 10 x 4
Col1 Col2 Col3 n
<chr> <chr> <chr> <int>
1 Al F C 1
2 Al P < 0
3 Al UF UC 1
4 Cu F < 1
5 Cu F C 0
6 Cu P < 0
7 Cu UF UC 0
8 Pb F C 1
9 Pb P < 1
10 Pb UF UC 1
2) 这个变体给出了相同的结果。
# Stack both tables with an "id" column holding the position of the source
# table ("1" = table1, "2" = table2), then count the table2 rows per group.
bind_rows(table1, table2, .id = "id") %>%
  group_by(Col1, Col2, Col3) %>%
  summarise(n = sum(id == "2"), .groups = "drop")
3) 这是一个只使用 data.table 的解决方案。
# Stack both tables, recording the source index in "src" (1 = table1,
# 2 = table2), then count the rows that came from table2 in each group.
rbindlist(list(table1, table2), idcol = "src")[
  , .(n = sum(src == 2L)),
  by = c("Col1", "Col2", "Col3")
]
4) 这是基本的 R 解决方案。
# Tag table1 rows with n = 0 and table2 rows with n = 1, stack them, and sum
# n for every Col1/Col2/Col3 combination.
both <- rbind(
  transform(table1, n = 0),
  transform(table2, n = 1)
)
aggregate(n ~ Col1 + Col2 + Col3, data = both, FUN = sum)
5) 这使用了 SQL.
# Same approach expressed in SQL and passed to sqldf(): the CTE tags table1
# rows with n = 0 and table2 rows with n = 1 and stacks them with UNION ALL;
# the outer query sums n for every Col1/Col2/Col3 combination.
library(sqldf)
sqldf("with both as (
select *, 0 as n from table1
union all
select *, 1 as n from table2
)
select Col1, Col2, Col3, sum(n) as n
from both
group by Col1, Col2, Col3
")
我正在为 2 个数据框分别生成计数表。这两个数据框如下所示:
# Packages used by the snippets: data.table provides data.table(), dplyr
# provides the %>% pipe and the grouping/counting verbs.
library(data.table)
library(dplyr)

# Reference table: the Col1/Col2/Col3 combinations that must all appear in
# the final count table.
table1 <- data.table(
  Col1 = c("Al", "Al", "Al", "Cu", "Cu", "Cu", "Pb", "Pb", "Pb"),
  Col2 = c("F", "UF", "P", "F", "UF", "P", "F", "UF", "P"),
  Col3 = c("C", "UC", "<", "C", "UC", "<", "C", "UC", "<")
)

# Table to be counted: it lacks some of the combinations found in table1.
table2 <- data.table(
  Col1 = c("Al", "Al", "Cu", "Pb", "Pb", "Pb"),
  Col2 = c("F", "UF", "F", "F", "UF", "P"),
  Col3 = c("C", "UC", "<", "C", "UC", "<")
)
我会像下面这样分别为 table1 和 table2 创建计数表:
# Number of rows for each Col1/Col2/Col3 combination in table1.
table1 %>%
  group_by(Col1, Col2, Col3) %>%
  tally()
# A tibble: 9 x 4
# Groups: Col1, Col2 [9]
Col1 Col2 Col3 n
<chr> <chr> <chr> <int>
1 Al F C 1
2 Al P < 1
3 Al UF UC 1
4 Cu F C 1
5 Cu P < 1
6 Cu UF UC 1
7 Pb F C 1
8 Pb P < 1
9 Pb UF UC 1
# Number of rows for each Col1/Col2/Col3 combination in table2; combinations
# that do not occur in table2 are simply absent from the result.
table2 %>%
  group_by(Col1, Col2, Col3) %>%
  tally()
# A tibble: 6 x 4
# Groups: Col1, Col2 [6]
Col1 Col2 Col3 n
<chr> <chr> <chr> <int>
1 Al F C 1
2 Al UF UC 1
3 Cu F < 1
4 Pb F C 1
5 Pb P < 1
6 Pb UF UC 1
但我希望 table2 的计数表能包含 table1 中出现的所有组合:在 table2 中不存在的组合应当计为 0,而不是被完全丢弃,因此把这些 0 显示出来很重要。有什么方法可以用 dplyr 或其他包做到这一点吗?
感谢您的帮助!
这个呢?
# Flag every table2 row with n = 1 and left-join it onto table1, so that each
# table1 combination is kept. Combinations without a match get n = NA, which
# sum(..., na.rm = TRUE) turns into 0.
table1 %>%
  left_join(
    mutate(table2, n = 1),
    # Explicit keys: no reliance on an implicit natural join (and no message).
    by = c("Col1", "Col2", "Col3")
  ) %>%
  group_by(Col1, Col2, Col3) %>%
  mutate(n = sum(n, na.rm = TRUE)) %>%
  # Drop the grouping so later steps are not silently computed per group.
  ungroup()
我们会看到
Col1 Col2 Col3 n
<chr> <chr> <chr> <dbl>
1 Al F C 1
2 Al UF UC 1
3 Al P < 0
4 Cu F C 0
5 Cu UF UC 0
6 Cu P < 0
7 Pb F C 1
8 Pb UF UC 1
9 Pb P < 1
你可以试试complete
library(tidyverse)

# Count the combinations present in table2 (count column named "sum"), then
# let complete() add the table1 combinations that are missing, with sum = 0.
table2 %>%
  count(Col1, Col2, Col3, name = "sum") %>%
  complete(distinct(table1), fill = list(sum = 0))
# A tibble: 10 x 4
Col1 Col2 Col3 sum
<chr> <chr> <chr> <dbl>
1 Al F C 1
2 Al P < 0
3 Al UF UC 1
4 Cu F C 0
5 Cu P < 0
6 Cu UF UC 0
7 Pb F C 1
8 Pb P < 1
9 Pb UF UC 1
10 Cu F < 1
或full_join
# Count the combinations present in table2, full-join the distinct table1
# combinations so none is lost, and turn the resulting NA counts into 0.
table2 %>%
  count(Col1, Col2, Col3, name = "sum") %>%
  # Explicit keys: no reliance on an implicit natural join (and no message).
  full_join(distinct(table1), by = c("Col1", "Col2", "Col3")) %>%
  # Integer 0L keeps the count column integer.
  mutate(sum = replace_na(sum, 0L))
1) 给 table2 附加一列 n = 1,给 table1 附加一列 n = 0,把两者按行合并,然后按组对 n 求和。
# Stack table2 (each row worth 1) on top of table1 (each row worth 0) and add
# up n per combination: table1-only combinations end up with n = 0.
bind_rows(
  mutate(table2, n = 1L),
  mutate(table1, n = 0L)
) %>%
  group_by(Col1, Col2, Col3) %>%
  summarise(n = sum(n), .groups = "drop")
结果如下:
# A tibble: 10 x 4
Col1 Col2 Col3 n
<chr> <chr> <chr> <int>
1 Al F C 1
2 Al P < 0
3 Al UF UC 1
4 Cu F < 1
5 Cu F C 0
6 Cu P < 0
7 Cu UF UC 0
8 Pb F C 1
9 Pb P < 1
10 Pb UF UC 1
2) 这个变体给出了相同的结果。
# Stack both tables with an "id" column holding the position of the source
# table ("1" = table1, "2" = table2), then count the table2 rows per group.
bind_rows(table1, table2, .id = "id") %>%
  group_by(Col1, Col2, Col3) %>%
  summarise(n = sum(id == "2"), .groups = "drop")
3) 这是一个只使用 data.table 的解决方案。
# Stack both tables, recording the source index in "src" (1 = table1,
# 2 = table2), then count the rows that came from table2 in each group.
rbindlist(list(table1, table2), idcol = "src")[
  , .(n = sum(src == 2L)),
  by = c("Col1", "Col2", "Col3")
]
4) 这是基本的 R 解决方案。
# Tag table1 rows with n = 0 and table2 rows with n = 1, stack them, and sum
# n for every Col1/Col2/Col3 combination.
both <- rbind(
  transform(table1, n = 0),
  transform(table2, n = 1)
)
aggregate(n ~ Col1 + Col2 + Col3, data = both, FUN = sum)
5) 这使用了 SQL.
# Same approach expressed in SQL and passed to sqldf(): the CTE tags table1
# rows with n = 0 and table2 rows with n = 1 and stacks them with UNION ALL;
# the outer query sums n for every Col1/Col2/Col3 combination.
library(sqldf)
sqldf("with both as (
select *, 0 as n from table1
union all
select *, 1 as n from table2
)
select Col1, Col2, Col3, sum(n) as n
from both
group by Col1, Col2, Col3
")