R/dplyr 函数:包含分组变量总计的频率表
R/dplyr function: Frequency table including totals of grouping variables
我正在处理来自家庭调查的数据,想要计算各个问题的回答频率表(每个受访者可能给出多个答案)。为了加快对调查数据的分析,我在尽量遵循 tidyverse 逻辑的同时编写了一些自定义函数。
我已经成功编写了一个函数,可以在使用分组变量时计算频率。
library(tibble)
library(dplyr)
# Example survey data used throughout: several rows can share the same `id`,
# nested in country > region > district, with one `question` answer per row.
my_df <- tibble(
  id = c(1, 1, 2, 2, 3, 4, 5, 7, 8, 8),
  country = rep(c("A", "B"), each = 5),
  region = c("ax", "ax", "ax", "ay", "ay", "bx", "bx", "by", "by", "by"),
  district = c(
    "ax1", "ax1", "ax2", "ay1", "ay2",
    "bx1", "bx1", "by1", "by1", "by1"
  ),
  question = c(
    "answer1", "answer2", "answer1", "answer2", "answer1",
    "answer1", "answer1", "answer2", "answer1", "answer2"
  )
)
#' Frequency of each answer within optional grouping levels
#'
#' For every combination of the grouping variables in `...` and the values of
#' `var`, returns the number of rows (`cases`), the number of distinct
#' `id_var` values in the group (`n`), and `cases / n * 100` (`freq_answer`).
#'
#' @param df A data frame.
#' @param var Unquoted column holding the answers to tabulate.
#' @param id_var Unquoted column identifying respondents; its distinct values
#'   are counted per group to form the denominator `n`.
#' @param ... Optional grouping variables, forwarded to `group_by()`. With no
#'   grouping variables the frequencies are computed over the whole of `df`.
#' @return An ungrouped data frame with the grouping columns, `var`, `cases`,
#'   `n` and `freq_answer`, sorted by the grouping columns and `var`.
freq <- function(df, var, id_var, ...) {
  df %>%
    group_by(...) %>%
    # Denominator: distinct respondents per group. Attaching it to every row
    # avoids the deprecated `by = character()` cross join and the implicit
    # natural join that the two-table version needed.
    mutate(n = n_distinct({{ id_var }})) %>%
    # `n` is constant within each group, so adding it as a grouping key only
    # carries it through summarise(); it does not split or reorder groups.
    group_by({{ var }}, n, .add = TRUE) %>%
    summarise(cases = dplyr::n(), .groups = "drop") %>%
    relocate(cases, .before = n) %>%
    mutate(freq_answer = cases / n * 100)
}
# Answer frequencies of `question` per country and region, counting
# distinct `id` values as the denominator.
freq(my_df, question, id, country, region)
# A tibble: 7 x 6
country region question cases n freq_answer
<chr> <chr> <chr> <int> <int> <dbl>
1 A ax answer1 2 2 100
2 A ax answer2 1 2 50
3 A ay answer1 1 2 50
4 A ay answer2 1 2 50
5 B bx answer1 2 2 100
6 B by answer1 1 2 50
7 B by answer2 2 2 100
出于报告目的,在某些情况下,我想计算每个分组级别的频率,并将结果合并到一个数据框中。我找到了一个解决方案,可以对固定数量的分组变量执行此操作,并且能够得到所需的结果。显然,如果分组变量的数量增加或减少,我就不得不另外编写相应的函数。
#' Frequency table with totals for exactly two grouping variables
#'
#' Stacks three `freq()` results: by `group1` and `group2`, by `group1` only,
#' and over the whole data. Grouping columns that were not used at a level
#' are filled with the label "Total".
#'
#' @param df A data frame.
#' @param var Unquoted answer column.
#' @param id_var Unquoted respondent identifier column.
#' @param group1,group2 Unquoted grouping columns (outer and inner level).
#' @return The row-bound results, most detailed level first.
freq_sum <- function(df, var, id_var, group1, group2) {
  # No grouping at all: both grouping columns become "Total".
  overall <- freq(df, {{ var }}, {{ id_var }})
  overall <- add_column(overall, {{ group1 }} := "Total", .before = 1)
  overall <- add_column(overall, {{ group2 }} := "Total", .after = 1)

  # Outer level only: the inner grouping column becomes "Total".
  by_first <- add_column(
    freq(df, {{ var }}, {{ id_var }}, {{ group1 }}),
    {{ group2 }} := "Total",
    .after = 1
  )

  # Full detail: both grouping variables.
  by_both <- freq(df, {{ var }}, {{ id_var }}, {{ group1 }}, {{ group2 }})

  rbind(by_both, by_first, overall)
}
# Frequencies per country/region plus country-level and overall totals.
freq_sum(my_df, question, id, country, region)
country region question cases n freq_answer
<chr> <chr> <chr> <int> <int> <dbl>
1 A ax answer1 2 2 100
2 A ax answer2 1 2 50
3 A ay answer1 1 2 50
4 A ay answer2 1 2 50
5 B bx answer1 2 2 100
6 B by answer1 1 2 50
7 B by answer2 2 2 100
8 A Total answer1 3 3 100
9 A Total answer2 2 3 66.7
10 B Total answer1 3 4 75
11 B Total answer2 2 4 50
12 Total Total answer1 6 7 85.7
13 Total Total answer2 4 7 57.1
我的问题:有没有人对如何让 freq_sum 函数更通用、更优雅有什么建议,
从而不需要事先指定分组变量的数量?
我对如何实现这一目标有一些初步想法,但我不确定如何实施它们或它们是否可行。
# Draft outline of a generalised freq_sum(): only the first two steps are
# code; the remaining steps are placeholders (comments plus an illustration
# of the intended list), not working R.
freq_sum <- function(df, var, id, ...) {
# Frequencies at the most detailed level (all grouping variables in `...`).
df0 <- df %>%
freq({{var}}, {{id}}, ...)
# Names of the grouping columns: every column up to `var`, minus `var` itself.
grouping_vars <- df0 %>%
select(1:{{var}}) %>%
select(-last_col()) %>%
names()
# From grouping_vars create a list with vectors that contain increasingly fewer grouping variables.
[1] "country" "region" "district"
[2] "country" "region"
[3] "country"
# Use the elements of the list as input in the freq() function.
# Add the missing grouping variables to the resulting data frames.
# Combine all dataframes in a single data frame.
}
如果有人遇到类似的问题:在下面两个问题的答案的帮助下,我找到了一个使用 for 循环的解决方案,它可以按预期工作,并且让我可以自由选择要汇总的分组变量的数量。
#' Frequency table with totals for any number of grouping variables
#'
#' Computes `freq()` at every nesting level of the grouping variables given
#' in `...` and stacks the results. Grouping columns not used at a level are
#' filled with the label "Total".
#'
#' Row order: full detail first, then the first grouping variable only, the
#' first two, and so on, and finally the overall total.
#'
#' @param df A data frame.
#' @param var Unquoted answer column.
#' @param id_var Unquoted respondent identifier column.
#' @param ... Unquoted grouping columns, outermost first. If none are given
#'   the plain `freq()` result is returned.
#' @return A data frame with the grouping columns, `var`, `cases`, `n` and
#'   `freq_answer`.
freq_sum <- function(df, var, id_var, ...) {
  var_names <- names(select(df, ...))
  n_vars <- length(var_names)

  detailed <- freq(df, {{ var }}, {{ id_var }}, ...)
  if (n_vars == 0L) {
    # Nothing to total over; a second, identical overall table would only
    # duplicate the rows.
    return(detailed)
  }

  total_labels <- as.list(setNames(rep("Total", n_vars), var_names))
  # Fill every grouping column missing from `res` with "Total".
  add_total_cols <- function(res) {
    absent <- total_labels[!names(total_labels) %in% names(res)]
    add_column(res, !!!absent)
  }

  # One slot per level: full detail, n_vars - 1 partial levels, overall total.
  pieces <- vector("list", n_vars + 1L)
  pieces[[1L]] <- detailed
  # seq_len() avoids the `1:length(x)-1` precedence trap, which evaluates as
  # (1:n) - 1 and starts the loop at 0.
  for (i in seq_len(n_vars - 1L)) {
    keep <- var_names[seq_len(i)]
    pieces[[i + 1L]] <- add_total_cols(
      freq(df, {{ var }}, {{ id_var }}, across(all_of(keep)))
    )
  }
  pieces[[n_vars + 1L]] <- add_total_cols(freq(df, {{ var }}, {{ id_var }}))

  # rbind() matches data-frame columns by name, so the appended "Total"
  # columns line up with the column order of the detailed table.
  do.call(rbind, pieces)
}
# Frequencies per country/region plus country-level and overall totals.
freq_sum(my_df, question, id, country, region)
# A tibble: 13 x 6
country region question cases n freq_answer
<chr> <chr> <chr> <int> <int> <dbl>
1 A ax answer1 2 2 100
2 A ax answer2 1 2 50
3 A ay answer1 1 2 50
4 A ay answer2 1 2 50
5 B bx answer1 2 2 100
6 B by answer1 1 2 50
7 B by answer2 2 2 100
8 A Total answer1 3 3 100
9 A Total answer2 2 3 66.7
10 B Total answer1 3 4 75
11 B Total answer2 2 4 50
12 Total Total answer1 6 7 85.7
13 Total Total answer2 4 7 57.1
我正在处理来自家庭调查的数据,想要计算各个问题的回答频率表(每个受访者可能给出多个答案)。为了加快对调查数据的分析,我在尽量遵循 tidyverse 逻辑的同时编写了一些自定义函数。
我已经成功编写了一个函数,可以在使用分组变量时计算频率。
library(tibble)
library(dplyr)
# Example survey data used throughout: several rows can share the same `id`,
# nested in country > region > district, with one `question` answer per row.
my_df <- tibble(
  id = c(1, 1, 2, 2, 3, 4, 5, 7, 8, 8),
  country = rep(c("A", "B"), each = 5),
  region = c("ax", "ax", "ax", "ay", "ay", "bx", "bx", "by", "by", "by"),
  district = c(
    "ax1", "ax1", "ax2", "ay1", "ay2",
    "bx1", "bx1", "by1", "by1", "by1"
  ),
  question = c(
    "answer1", "answer2", "answer1", "answer2", "answer1",
    "answer1", "answer1", "answer2", "answer1", "answer2"
  )
)
#' Frequency of each answer within optional grouping levels
#'
#' For every combination of the grouping variables in `...` and the values of
#' `var`, returns the number of rows (`cases`), the number of distinct
#' `id_var` values in the group (`n`), and `cases / n * 100` (`freq_answer`).
#'
#' @param df A data frame.
#' @param var Unquoted column holding the answers to tabulate.
#' @param id_var Unquoted column identifying respondents; its distinct values
#'   are counted per group to form the denominator `n`.
#' @param ... Optional grouping variables, forwarded to `group_by()`. With no
#'   grouping variables the frequencies are computed over the whole of `df`.
#' @return An ungrouped data frame with the grouping columns, `var`, `cases`,
#'   `n` and `freq_answer`, sorted by the grouping columns and `var`.
freq <- function(df, var, id_var, ...) {
  df %>%
    group_by(...) %>%
    # Denominator: distinct respondents per group. Attaching it to every row
    # avoids the deprecated `by = character()` cross join and the implicit
    # natural join that the two-table version needed.
    mutate(n = n_distinct({{ id_var }})) %>%
    # `n` is constant within each group, so adding it as a grouping key only
    # carries it through summarise(); it does not split or reorder groups.
    group_by({{ var }}, n, .add = TRUE) %>%
    summarise(cases = dplyr::n(), .groups = "drop") %>%
    relocate(cases, .before = n) %>%
    mutate(freq_answer = cases / n * 100)
}
# Answer frequencies of `question` per country and region, counting
# distinct `id` values as the denominator.
freq(my_df, question, id, country, region)
# A tibble: 7 x 6
country region question cases n freq_answer
<chr> <chr> <chr> <int> <int> <dbl>
1 A ax answer1 2 2 100
2 A ax answer2 1 2 50
3 A ay answer1 1 2 50
4 A ay answer2 1 2 50
5 B bx answer1 2 2 100
6 B by answer1 1 2 50
7 B by answer2 2 2 100
出于报告目的,在某些情况下,我想计算每个分组级别的频率,并将结果合并到一个数据框中。我找到了一个解决方案,可以对固定数量的分组变量执行此操作,并且能够得到所需的结果。显然,如果分组变量的数量增加或减少,我就不得不另外编写相应的函数。
#' Frequency table with totals for exactly two grouping variables
#'
#' Stacks three `freq()` results: by `group1` and `group2`, by `group1` only,
#' and over the whole data. Grouping columns that were not used at a level
#' are filled with the label "Total".
#'
#' @param df A data frame.
#' @param var Unquoted answer column.
#' @param id_var Unquoted respondent identifier column.
#' @param group1,group2 Unquoted grouping columns (outer and inner level).
#' @return The row-bound results, most detailed level first.
freq_sum <- function(df, var, id_var, group1, group2) {
  # No grouping at all: both grouping columns become "Total".
  overall <- freq(df, {{ var }}, {{ id_var }})
  overall <- add_column(overall, {{ group1 }} := "Total", .before = 1)
  overall <- add_column(overall, {{ group2 }} := "Total", .after = 1)

  # Outer level only: the inner grouping column becomes "Total".
  by_first <- add_column(
    freq(df, {{ var }}, {{ id_var }}, {{ group1 }}),
    {{ group2 }} := "Total",
    .after = 1
  )

  # Full detail: both grouping variables.
  by_both <- freq(df, {{ var }}, {{ id_var }}, {{ group1 }}, {{ group2 }})

  rbind(by_both, by_first, overall)
}
# Frequencies per country/region plus country-level and overall totals.
freq_sum(my_df, question, id, country, region)
country region question cases n freq_answer
<chr> <chr> <chr> <int> <int> <dbl>
1 A ax answer1 2 2 100
2 A ax answer2 1 2 50
3 A ay answer1 1 2 50
4 A ay answer2 1 2 50
5 B bx answer1 2 2 100
6 B by answer1 1 2 50
7 B by answer2 2 2 100
8 A Total answer1 3 3 100
9 A Total answer2 2 3 66.7
10 B Total answer1 3 4 75
11 B Total answer2 2 4 50
12 Total Total answer1 6 7 85.7
13 Total Total answer2 4 7 57.1
我的问题:有没有人对如何让 freq_sum 函数更通用、更优雅有什么建议,
从而不需要事先指定分组变量的数量?
我对如何实现这一目标有一些初步想法,但我不确定如何实施它们或它们是否可行。
# Draft outline of a generalised freq_sum(): only the first two steps are
# code; the remaining steps are placeholders (comments plus an illustration
# of the intended list), not working R.
freq_sum <- function(df, var, id, ...) {
# Frequencies at the most detailed level (all grouping variables in `...`).
df0 <- df %>%
freq({{var}}, {{id}}, ...)
# Names of the grouping columns: every column up to `var`, minus `var` itself.
grouping_vars <- df0 %>%
select(1:{{var}}) %>%
select(-last_col()) %>%
names()
# From grouping_vars create a list with vectors that contain increasingly fewer grouping variables.
[1] "country" "region" "district"
[2] "country" "region"
[3] "country"
# Use the elements of the list as input in the freq() function.
# Add the missing grouping variables to the resulting data frames.
# Combine all dataframes in a single data frame.
}
如果有人遇到类似的问题:在下面两个问题的答案的帮助下,我找到了一个使用 for 循环的解决方案,它可以按预期工作,并且让我可以自由选择要汇总的分组变量的数量。
#' Frequency table with totals for any number of grouping variables
#'
#' Computes `freq()` at every nesting level of the grouping variables given
#' in `...` and stacks the results. Grouping columns not used at a level are
#' filled with the label "Total".
#'
#' Row order: full detail first, then the first grouping variable only, the
#' first two, and so on, and finally the overall total.
#'
#' @param df A data frame.
#' @param var Unquoted answer column.
#' @param id_var Unquoted respondent identifier column.
#' @param ... Unquoted grouping columns, outermost first. If none are given
#'   the plain `freq()` result is returned.
#' @return A data frame with the grouping columns, `var`, `cases`, `n` and
#'   `freq_answer`.
freq_sum <- function(df, var, id_var, ...) {
  var_names <- names(select(df, ...))
  n_vars <- length(var_names)

  detailed <- freq(df, {{ var }}, {{ id_var }}, ...)
  if (n_vars == 0L) {
    # Nothing to total over; a second, identical overall table would only
    # duplicate the rows.
    return(detailed)
  }

  total_labels <- as.list(setNames(rep("Total", n_vars), var_names))
  # Fill every grouping column missing from `res` with "Total".
  add_total_cols <- function(res) {
    absent <- total_labels[!names(total_labels) %in% names(res)]
    add_column(res, !!!absent)
  }

  # One slot per level: full detail, n_vars - 1 partial levels, overall total.
  pieces <- vector("list", n_vars + 1L)
  pieces[[1L]] <- detailed
  # seq_len() avoids the `1:length(x)-1` precedence trap, which evaluates as
  # (1:n) - 1 and starts the loop at 0.
  for (i in seq_len(n_vars - 1L)) {
    keep <- var_names[seq_len(i)]
    pieces[[i + 1L]] <- add_total_cols(
      freq(df, {{ var }}, {{ id_var }}, across(all_of(keep)))
    )
  }
  pieces[[n_vars + 1L]] <- add_total_cols(freq(df, {{ var }}, {{ id_var }}))

  # rbind() matches data-frame columns by name, so the appended "Total"
  # columns line up with the column order of the detailed table.
  do.call(rbind, pieces)
}
# Frequencies per country/region plus country-level and overall totals.
freq_sum(my_df, question, id, country, region)
# A tibble: 13 x 6
country region question cases n freq_answer
<chr> <chr> <chr> <int> <int> <dbl>
1 A ax answer1 2 2 100
2 A ax answer2 1 2 50
3 A ay answer1 1 2 50
4 A ay answer2 1 2 50
5 B bx answer1 2 2 100
6 B by answer1 1 2 50
7 B by answer2 2 2 100
8 A Total answer1 3 3 100
9 A Total answer2 2 3 66.7
10 B Total answer1 3 4 75
11 B Total answer2 2 4 50
12 Total Total answer1 6 7 85.7
13 Total Total answer2 4 7 57.1