使用 dplyr 按年计算频率(条件计数)
count frequency by year with dplyr (conditional count)
我想按年份统计工具 A 的使用情况并保留零。
# Example data from the question: one row per (ID, Tool, Year) record.
ID <- rep(c(1, 2, 3, 4, 5), times = c(2, 3, 1, 1, 3))
Tool <- c("A", "B", "A", "B", "A", "A", "B", "A", "A", "A")
Year <- c(2000, 2001, 2001, 2001, 2002, 2002, 2001, 2000, 2001, 2002)
df <- data.frame(ID = ID, Tool = Tool, Year = Year)
library(tidyverse)
# Total tool-"A" uses per ID, then the number of IDs sharing each total.
df %>%
  group_by(ID) %>%
  summarise(toolA = sum(Tool == "A")) %>%
  count(toolA)
# A tibble: 4 x 2
toolA n
<int> <int>
1 0 1
2 1 2
3 2 1
4 3 1
我想添加年份列,这样我就可以得到如下所示的 table
tool A | Count | 2000 | 2001 | 2002 |
---|---|---|---|---|
0 | 1 | 0 | 0 | 0 |
1 | 2 | 1 | 0 | 1 |
2 | 1 | 0 | 1 | 1 |
3 | 1 | 1 | 1 | 1 |
年份列下的数字表示该年内的使用次数(而不是人数)。
你会怎么做?
也许这太复杂了,存在 better/easier 解决方案。
library(dplyr)
library(tidyr)
# How many IDs used tool "A" exactly 0, 1, 2, ... times in total.
dataA <- df %>%
  group_by(ID) %>%
  summarise(toolA = sum(Tool == "A")) %>%
  count(toolA)

# Per-year tool-"A" uses, aggregated over all IDs that share the same total.
df %>%
  # Number of tool-"A" uses for each ID in each year.
  group_by(ID, Year) %>%
  summarise(toolA = sum(Tool == "A"), .groups = "drop") %>%
  # One column per year; ID/year combinations without rows are filled with 0.
  pivot_wider(names_from = Year, values_from = toolA, values_fill = 0) %>%
  select(-ID) %>%
  # Only the year columns are left, so the row sum is that ID's total.
  mutate(toolA = rowSums(across(everything()))) %>%
  # Attach n (number of IDs with that total) from dataA.
  right_join(dataA, by = "toolA") %>%
  select(toolA, n, everything()) %>%
  arrange(toolA) %>%
  group_by(toolA, n) %>%
  # Name the columns explicitly: across() without .cols is deprecated.
  summarise(across(everything(), sum), .groups = "drop")
# toolA n `2000` `2001` `2002`
# <dbl> <int> <int> <int> <int>
#1 0 1 0 0 0
#2 1 2 1 0 1
#3 2 1 0 1 1
#4 3 1 1 1 1
我可能会用 `tidyverse` 尝试这种方法:按 `ID` 分组时,用 `Year` 创建一个列表列;像您所做的那样加入计数 `n` 之后,再用 `unnest_longer` 恢复年份。对于计数为零的情况,我额外添加了一列,名为 “None”。最后的 `pivot_wider` 会把数据重新转换为宽格式。
library(tidyverse)
# Per-year tool-"A" uses for each (total uses, number of IDs) combination,
# with an extra Year_None column flagging IDs that never used tool "A".
df %>%
  group_by(ID) %>%
  summarise(
    toolA = sum(Tool == "A"),
    # Years in which this ID used tool "A". A single NA stands in for
    # "never", so the ID keeps a row after unnest_longer() whether or not the
    # installed tidyr drops empty list elements.
    # NOTE(review): empty-element handling differs between tidyr releases.
    Years = list(
      if (any(Tool == "A", na.rm = TRUE)) Year[Tool == "A"] else NA_real_
    )
  ) %>%
  add_count(toolA) %>%
  unnest_longer(Years) %>%
  # Convert to character before filling: replace_na() will not put the string
  # "None" into a numeric column.
  mutate(
    Years = replace_na(as.character(Years), "None"),
    value = 1
  ) %>%
  pivot_wider(
    id_cols = c(toolA, n),
    names_from = Years,
    names_prefix = "Year_",
    values_from = value,
    # Several IDs can fall into the same (toolA, n, year) cell; add them up
    # instead of letting pivot_wider() produce list-columns.
    values_fn = sum,
    values_fill = 0
  ) %>%
  arrange(toolA)
输出
toolA n Year_2000 Year_2001 Year_2002 Year_None
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 0 1 0 0 0 1
2 1 2 1 0 1 0
3 2 1 0 1 1 0
4 3 1 1 1 1 0
这是另一种 `tidyverse` 方法。简单地说,我们先把数据框从长格式转换为宽格式,然后进行汇总。第一次汇总去掉了所有非 "A" 的记录;第二次汇总把结果表压缩到由 `toolA` 标识的各个唯一分组中,并生成 `count` 列。
library(dplyr)
library(tidyr)
# Spread a 0/1 "is tool A" flag across year columns, then collapse twice:
# first per ID, then per total number of tool-"A" uses.
df %>%
  # 1 for a tool-"A" row, 0 for any other tool.
  mutate(value = as.integer(Tool == "A")) %>%
  # One column per year; missing combinations are filled with 0.
  pivot_wider(names_from = Year, values_from = value, values_fill = 0L) %>%
  group_by(ID) %>%
  # Sum the year columns per ID; the Tool column is dropped here.
  summarize(across(-Tool, sum)) %>%
  # Group IDs by their total across all year columns.
  group_by(toolA = rowSums(across(-ID))) %>%
  summarize(count = n(), across(-c(ID, count), sum))
输出
# A tibble: 4 x 5
toolA count `2000` `2001` `2002`
<dbl> <int> <int> <int> <int>
1 0 1 0 0 0
2 1 2 1 0 1
3 2 1 0 1 1
4 3 1 1 1 1
我想按年份统计工具 A 的使用情况并保留零。
# Example data from the question: one row per (ID, Tool, Year) record.
ID <- rep(c(1, 2, 3, 4, 5), times = c(2, 3, 1, 1, 3))
Tool <- c("A", "B", "A", "B", "A", "A", "B", "A", "A", "A")
Year <- c(2000, 2001, 2001, 2001, 2002, 2002, 2001, 2000, 2001, 2002)
df <- data.frame(ID = ID, Tool = Tool, Year = Year)
library(tidyverse)
# Total tool-"A" uses per ID, then the number of IDs sharing each total.
df %>%
  group_by(ID) %>%
  summarise(toolA = sum(Tool == "A")) %>%
  count(toolA)
# A tibble: 4 x 2
toolA n
<int> <int>
1 0 1
2 1 2
3 2 1
4 3 1
我想添加年份列,这样我就可以得到如下所示的 table
tool A | Count | 2000 | 2001 | 2002 |
---|---|---|---|---|
0 | 1 | 0 | 0 | 0 |
1 | 2 | 1 | 0 | 1 |
2 | 1 | 0 | 1 | 1 |
3 | 1 | 1 | 1 | 1 |
年份列下的数字表示该年内的使用次数(而不是人数)。你会怎么做?
也许这太复杂了,存在 better/easier 解决方案。
library(dplyr)
library(tidyr)
# How many IDs used tool "A" exactly 0, 1, 2, ... times in total.
dataA <- df %>%
  group_by(ID) %>%
  summarise(toolA = sum(Tool == "A")) %>%
  count(toolA)

# Per-year tool-"A" uses, aggregated over all IDs that share the same total.
df %>%
  # Number of tool-"A" uses for each ID in each year.
  group_by(ID, Year) %>%
  summarise(toolA = sum(Tool == "A"), .groups = "drop") %>%
  # One column per year; ID/year combinations without rows are filled with 0.
  pivot_wider(names_from = Year, values_from = toolA, values_fill = 0) %>%
  select(-ID) %>%
  # Only the year columns are left, so the row sum is that ID's total.
  mutate(toolA = rowSums(across(everything()))) %>%
  # Attach n (number of IDs with that total) from dataA.
  right_join(dataA, by = "toolA") %>%
  select(toolA, n, everything()) %>%
  arrange(toolA) %>%
  group_by(toolA, n) %>%
  # Name the columns explicitly: across() without .cols is deprecated.
  summarise(across(everything(), sum), .groups = "drop")
# toolA n `2000` `2001` `2002`
# <dbl> <int> <int> <int> <int>
#1 0 1 0 0 0
#2 1 2 1 0 1
#3 2 1 0 1 1
#4 3 1 1 1 1
我可能会用 `tidyverse` 尝试这种方法:按 `ID` 分组时,用 `Year` 创建一个列表列;像您所做的那样加入计数 `n` 之后,再用 `unnest_longer` 恢复年份。对于计数为零的情况,我额外添加了一列,名为 “None”。最后的 `pivot_wider` 会把数据重新转换为宽格式。
library(tidyverse)
# Per-year tool-"A" uses for each (total uses, number of IDs) combination,
# with an extra Year_None column flagging IDs that never used tool "A".
df %>%
  group_by(ID) %>%
  summarise(
    toolA = sum(Tool == "A"),
    # Years in which this ID used tool "A". A single NA stands in for
    # "never", so the ID keeps a row after unnest_longer() whether or not the
    # installed tidyr drops empty list elements.
    # NOTE(review): empty-element handling differs between tidyr releases.
    Years = list(
      if (any(Tool == "A", na.rm = TRUE)) Year[Tool == "A"] else NA_real_
    )
  ) %>%
  add_count(toolA) %>%
  unnest_longer(Years) %>%
  # Convert to character before filling: replace_na() will not put the string
  # "None" into a numeric column.
  mutate(
    Years = replace_na(as.character(Years), "None"),
    value = 1
  ) %>%
  pivot_wider(
    id_cols = c(toolA, n),
    names_from = Years,
    names_prefix = "Year_",
    values_from = value,
    # Several IDs can fall into the same (toolA, n, year) cell; add them up
    # instead of letting pivot_wider() produce list-columns.
    values_fn = sum,
    values_fill = 0
  ) %>%
  arrange(toolA)
输出
toolA n Year_2000 Year_2001 Year_2002 Year_None
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 0 1 0 0 0 1
2 1 2 1 0 1 0
3 2 1 0 1 1 0
4 3 1 1 1 1 0
这是另一种 `tidyverse` 方法。简单地说,我们先把数据框从长格式转换为宽格式,然后进行汇总。第一次汇总去掉了所有非 "A" 的记录;第二次汇总把结果表压缩到由 `toolA` 标识的各个唯一分组中,并生成 `count` 列。
library(dplyr)
library(tidyr)
# Spread a 0/1 "is tool A" flag across year columns, then collapse twice:
# first per ID, then per total number of tool-"A" uses.
df %>%
  # 1 for a tool-"A" row, 0 for any other tool.
  mutate(value = as.integer(Tool == "A")) %>%
  # One column per year; missing combinations are filled with 0.
  pivot_wider(names_from = Year, values_from = value, values_fill = 0L) %>%
  group_by(ID) %>%
  # Sum the year columns per ID; the Tool column is dropped here.
  summarize(across(-Tool, sum)) %>%
  # Group IDs by their total across all year columns.
  group_by(toolA = rowSums(across(-ID))) %>%
  summarize(count = n(), across(-c(ID, count), sum))
输出
# A tibble: 4 x 5
toolA count `2000` `2001` `2002`
<dbl> <int> <int> <int> <int>
1 0 1 0 0 0
2 1 2 1 0 1
3 2 1 0 1 1
4 3 1 1 1 1