r table 列出多个具有频率的分类变量
r table listing multiple categorical variables with frequencies
library("tidyverse")
library("papaja")
# Reproducible example data as a structure() literal (presumably dput() output):
# a 21-row tibble with four columns --
#   investment_type      factor with levels "angel", "pre_seed", "seed"
#   gender_d             numeric 0/1 dummy
#   state_code_org       factor of two-letter state codes
#   first_time_founder_d numeric 0/1 dummy containing one NA
df <- structure(list(investment_type = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L), .Label = c("angel", "pre_seed", "seed"), class = "factor"),
gender_d = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1), state_code_org = structure(c(3L, 22L,
3L, 15L, 3L, 4L, 3L, 3L, 22L, 3L, 29L, 25L, 8L, 29L, 10L,
6L, 22L, 4L, 17L, 23L, 17L), .Label = c("AL", "AR", "CA",
"CO", "CT", "DC", "DE", "FL", "GA", "IL", "KS", "LA", "MA",
"MD", "MN", "MO", "NC", "NE", "NH", "NJ", "NV", "NY", "OH",
"OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "VT", "WA",
"WI", "WY"), class = "factor"), first_time_founder_d = c(0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, NA, 1, 0, 0, 1,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-21L))
# Recode the four categorical variables to display labels, drop incomplete
# rows, and turn the per-column summary() table into a long data frame with
# columns Var1, Var2 and Freq.
# The former mutate_at(..., factor) step was removed: it is superseded in
# dplyr and its result was overwritten by the recodes below anyway.
df <- df %>%
  select(
    investment_type,
    state_code_org,
    gender_d,
    first_time_founder_d
  ) %>%
  mutate(
    gender_d = factor(ifelse(gender_d == 1, "Male", "Female")),
    first_time_founder_d = factor(
      ifelse(first_time_founder_d == 1, "Yes", "No")
    ),
    # factor() on the recoded strings keeps only the labels that occur,
    # so unused investment types do not show up in the summary.
    investment_type = factor(
      ifelse(
        investment_type == "angel",
        "Angel",
        ifelse(investment_type == "pre_seed", "Pre-Seed", "Seed")
      )
    )
  ) %>%
  # NA inputs stay NA through ifelse(); those rows are removed here.
  drop_na() %>%
  summary() %>%
  as.data.frame()
# Clean up columns: drop the empty Var1 column, rename the rest, and replace
# the raw column names with display labels.
# summary() can pad its column headers with spaces, so the names are trimmed
# before being compared; exact `==` on the untrimmed header would silently
# send a padded name to the fallback label.
df <- df %>%
  select(-Var1) %>%
  rename(Variable = Var2, N = Freq) %>%
  mutate(
    Variable = trimws(as.character(Variable)),
    Variable = factor(
      case_when(
        Variable == "investment_type" ~ "Investment Type",
        Variable == "state_code_org" ~ "State",
        Variable == "gender_d" ~ "Gender",
        TRUE ~ "First-Time Founder"
      )
    )
  ) %>%
  # The summary table has NA cells for variables with fewer levels.
  drop_na()
# Each summary cell has the form "<level>:<count>". Split it into a Level
# column and an N column, strip the padding whitespace from all three text
# columns, and store the count as an integer.
df <- df %>%
  separate(col = N, into = c("Level", "N"), sep = ":") %>%
  mutate(
    Variable = trimws(Variable),
    Level = trimws(Level),
    N = as.integer(trimws(N))
  )
# Sort by variable and then by descending count; the result stays grouped
# by Variable.
df <- df %>%
  arrange(Variable, desc(N)) %>%
  group_by(Variable)

# Render the summary as an APA-style table.
apa_table(
  df,
  # stub_indents = list("1", "2"),
  caption = "Summary of categorical variables.",
  note = "Missing data is not shown."
)
这是我现在得到的。
我愿意使用任何软件包——这恰好使用了 papaja。但它需要在 rmarkdown 中使用 PDF 输出并符合 APA 风格。
我希望表格把重复的变量名折叠起来,使同一个变量名不会重复出现多次,并把 State 分组中的 "(Other)" 行移到该分组的底部。例如下面这样(示例使用的是另一个数据集):
您可以尝试 gt
包(尚未在 CRAN 上)。
# devtools::install_github("rstudio/gt")
library(gt)
# Add a percentage column (rounded to whole percent) and render with gt.
# mutate() respects any dplyr grouping on df, so sum(N) is taken per group
# when df is grouped.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  tab_header(title = "Summary of categorical variables.") %>%
  tab_source_note(source_note = md("*Missing data is not shown.*"))
这是 HTML 渲染结果。它使用 dplyr
的分组(group_by)来确定表格的行分组。
代码仓库和 https://gt.rstudio.com 都说明它目前支持 HTML 输出,LaTeX 和 RTF 输出计划在未来支持,不过 LaTeX 输出现在已经部分可用。
# Same table, converted to LaTeX with as_latex(). The header is left
# commented out and only the source note is attached.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  # tab_header(
  #   title = "Summary of categorical variables.", subtitle = ""
  # ) %>%
  tab_source_note(source_note = md("*Missing data is not shown.*")) %>%
  as_latex()
tab_header
与 LaTeX 输出一起使用时存在一个已知错误 (https://github.com/rstudio/gt/issues/463),而且 tab_source_note
的排版看起来也有些偏移。
我调整了一些内容后得到了下面的结果,不过我想这并不是你想要的。(这说明只要副标题(subtitle)中含有任何非空白字符,tab_header
就能正常工作;而 " "
——即任意数量的空格——则不行。)
# LaTeX variant that puts the note into the header's subtitle instead of a
# source note.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  tab_header(
    title = "Summary of categorical variables.",
    subtitle = md("*Missing data is not shown.*")
  ) %>%
  as_latex()
我认为这是一个简单的解决方案:
# Move every "(Other)" row to the bottom of its variable's block. The row
# order is computed rather than hard-coded, so it keeps working when the
# data (and therefore the row positions) change. order() is stable, so all
# other rows keep their current relative order.
variable_block <- match(df$Variable, unique(df$Variable))
df <- df[order(variable_block, df$Level == "(Other)"), ]

# Show each variable label only on the first row of its block.
df$Variable[duplicated(df$Variable)] <- ""

apa_table(
  df,
  align = "llr", # right-align last column
  caption = "Summary of categorical variables.",
  note = "Missing data is not shown."
)
呈现如下:
这是使用 apa_table()
的另一种方法。
首先以更简单的方式总结您的数据:
library("dplyr")
library("tidyr")
# Reproducible example data as a structure() literal (presumably dput() output):
# a 21-row tibble with four columns --
#   investment_type      factor with levels "angel", "pre_seed", "seed"
#   gender_d             numeric 0/1 dummy
#   state_code_org       factor of two-letter state codes
#   first_time_founder_d numeric 0/1 dummy containing one NA
df <- structure(list(investment_type = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L), .Label = c("angel", "pre_seed", "seed"), class = "factor"),
gender_d = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1), state_code_org = structure(c(3L, 22L,
3L, 15L, 3L, 4L, 3L, 3L, 22L, 3L, 29L, 25L, 8L, 29L, 10L,
6L, 22L, 4L, 17L, 23L, 17L), .Label = c("AL", "AR", "CA",
"CO", "CT", "DC", "DE", "FL", "GA", "IL", "KS", "LA", "MA",
"MD", "MN", "MO", "NC", "NE", "NH", "NJ", "NV", "NY", "OH",
"OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "VT", "WA",
"WI", "WY"), class = "factor"), first_time_founder_d = c(0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, NA, 1, 0, 0, 1,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-21L))
# Count how often each level of each categorical variable occurs and add the
# within-variable percentage. The result has one row per (variable, level)
# with columns name, Variable (the level), N and %, grouped by name.
factor_level_count <- df %>%
  mutate(
    gender_d = factor(
      gender_d,
      levels = c(0, 1),
      labels = c("Female", "Male")
    ),
    first_time_founder_d = factor(
      first_time_founder_d,
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    investment_type = factor(
      investment_type,
      levels = c("angel", "pre_seed", "seed"),
      labels = c("Angel", "Pre-Seed", "Seed")
    )
  ) %>%
  na.exclude() %>%
  pivot_longer(cols = everything()) %>%
  count(name, value) %>%
  mutate(
    # Display labels for the table (spelling of two labels corrected).
    name = factor(
      name,
      levels = c(
        "first_time_founder_d", "gender_d", "investment_type", "state_code_org"
      ),
      labels = c("First-Time Founder", "Gender", "Investment Type", "State")
    )
  ) %>%
  group_by(name) %>%
  # Namespaced because papaja is not attached yet at this point of the
  # snippet.
  mutate(percent = papaja::printnum(n / sum(n) * 100, digits = 1)) %>%
  rename(Variable = value, N = n, "%" = percent)
现在您可以拆分 data.frame
并将它们重新组合成命名列表以获得存根缩进。
# One data frame per variable, named by the variable label; the first
# column (the split column) is dropped from each piece.
factor_level_count_list <- split(factor_level_count, f = factor_level_count$name, drop = TRUE) %>%
  lapply(function(x) x[, -1]) # Removes split-column
library("papaja")

# With merge_method = "indent" each list element is rendered as one heading
# row followed by its level rows. A midrule goes after every group except
# the last, so the positions are derived from the group sizes instead of
# being hard-coded.
rows_per_group <- vapply(factor_level_count_list, nrow, integer(1)) + 1L
group_breaks <- unname(head(cumsum(rows_per_group), -1))

apa_table(
  factor_level_count_list
  , align = "llr" # Right-align last column
  , caption = "Summary of categorical variables."
  , note = "Missing data is not shown."
  , merge_method = "indent" # Table style to use for merging list elements
  , midrules = group_breaks
)
library("tidyverse")
library("papaja")
# Reproducible example data as a structure() literal (presumably dput() output):
# a 21-row tibble with four columns --
#   investment_type      factor with levels "angel", "pre_seed", "seed"
#   gender_d             numeric 0/1 dummy
#   state_code_org       factor of two-letter state codes
#   first_time_founder_d numeric 0/1 dummy containing one NA
df <- structure(list(investment_type = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L), .Label = c("angel", "pre_seed", "seed"), class = "factor"),
gender_d = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1), state_code_org = structure(c(3L, 22L,
3L, 15L, 3L, 4L, 3L, 3L, 22L, 3L, 29L, 25L, 8L, 29L, 10L,
6L, 22L, 4L, 17L, 23L, 17L), .Label = c("AL", "AR", "CA",
"CO", "CT", "DC", "DE", "FL", "GA", "IL", "KS", "LA", "MA",
"MD", "MN", "MO", "NC", "NE", "NH", "NJ", "NV", "NY", "OH",
"OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "VT", "WA",
"WI", "WY"), class = "factor"), first_time_founder_d = c(0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, NA, 1, 0, 0, 1,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-21L))
# Recode the four categorical variables to display labels, drop incomplete
# rows, and turn the per-column summary() table into a long data frame with
# columns Var1, Var2 and Freq.
# The former mutate_at(..., factor) step was removed: it is superseded in
# dplyr and its result was overwritten by the recodes below anyway.
df <- df %>%
  select(
    investment_type,
    state_code_org,
    gender_d,
    first_time_founder_d
  ) %>%
  mutate(
    gender_d = factor(ifelse(gender_d == 1, "Male", "Female")),
    first_time_founder_d = factor(
      ifelse(first_time_founder_d == 1, "Yes", "No")
    ),
    # factor() on the recoded strings keeps only the labels that occur,
    # so unused investment types do not show up in the summary.
    investment_type = factor(
      ifelse(
        investment_type == "angel",
        "Angel",
        ifelse(investment_type == "pre_seed", "Pre-Seed", "Seed")
      )
    )
  ) %>%
  # NA inputs stay NA through ifelse(); those rows are removed here.
  drop_na() %>%
  summary() %>%
  as.data.frame()
# Clean up columns: drop the empty Var1 column, rename the rest, and replace
# the raw column names with display labels.
# summary() can pad its column headers with spaces, so the names are trimmed
# before being compared; exact `==` on the untrimmed header would silently
# send a padded name to the fallback label.
df <- df %>%
  select(-Var1) %>%
  rename(Variable = Var2, N = Freq) %>%
  mutate(
    Variable = trimws(as.character(Variable)),
    Variable = factor(
      case_when(
        Variable == "investment_type" ~ "Investment Type",
        Variable == "state_code_org" ~ "State",
        Variable == "gender_d" ~ "Gender",
        TRUE ~ "First-Time Founder"
      )
    )
  ) %>%
  # The summary table has NA cells for variables with fewer levels.
  drop_na()
# Each summary cell has the form "<level>:<count>". Split it into a Level
# column and an N column, strip the padding whitespace from all three text
# columns, and store the count as an integer.
df <- df %>%
  separate(col = N, into = c("Level", "N"), sep = ":") %>%
  mutate(
    Variable = trimws(Variable),
    Level = trimws(Level),
    N = as.integer(trimws(N))
  )
# Sort by variable and then by descending count; the result stays grouped
# by Variable.
df <- df %>%
  arrange(Variable, desc(N)) %>%
  group_by(Variable)

# Render the summary as an APA-style table.
apa_table(
  df,
  # stub_indents = list("1", "2"),
  caption = "Summary of categorical variables.",
  note = "Missing data is not shown."
)
这是我现在得到的。
我愿意使用任何软件包——这恰好使用了 papaja。但它需要在 rmarkdown 中使用 PDF 输出并符合 APA 风格。
我希望表格把重复的变量名折叠起来,使同一个变量名不会重复出现多次,并把 State 分组中的 "(Other)" 行移到该分组的底部。例如下面这样(示例使用的是另一个数据集):
您可以尝试 gt
包(尚未在 CRAN 上)。
# devtools::install_github("rstudio/gt")
library(gt)
# Add a percentage column (rounded to whole percent) and render with gt.
# mutate() respects any dplyr grouping on df, so sum(N) is taken per group
# when df is grouped.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  tab_header(title = "Summary of categorical variables.") %>%
  tab_source_note(source_note = md("*Missing data is not shown.*"))
这是 HTML 渲染结果。它使用 dplyr
的分组(group_by)来确定表格的行分组。
代码仓库和 https://gt.rstudio.com 都说明它目前支持 HTML 输出,LaTeX 和 RTF 输出计划在未来支持,不过 LaTeX 输出现在已经部分可用。
# Same table, converted to LaTeX with as_latex(). The header is left
# commented out and only the source note is attached.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  # tab_header(
  #   title = "Summary of categorical variables.", subtitle = ""
  # ) %>%
  tab_source_note(source_note = md("*Missing data is not shown.*")) %>%
  as_latex()
tab_header
与 LaTeX 输出一起使用时存在一个已知错误 (https://github.com/rstudio/gt/issues/463),而且 tab_source_note
的排版看起来也有些偏移。
我调整了一些内容后得到了下面的结果,不过我想这并不是你想要的。(这说明只要副标题(subtitle)中含有任何非空白字符,tab_header
就能正常工作;而 " "
——即任意数量的空格——则不行。)
# LaTeX variant that puts the note into the header's subtitle instead of a
# source note.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  tab_header(
    title = "Summary of categorical variables.",
    subtitle = md("*Missing data is not shown.*")
  ) %>%
  as_latex()
我认为这是一个简单的解决方案:
# Move every "(Other)" row to the bottom of its variable's block. The row
# order is computed rather than hard-coded, so it keeps working when the
# data (and therefore the row positions) change. order() is stable, so all
# other rows keep their current relative order.
variable_block <- match(df$Variable, unique(df$Variable))
df <- df[order(variable_block, df$Level == "(Other)"), ]

# Show each variable label only on the first row of its block.
df$Variable[duplicated(df$Variable)] <- ""

apa_table(
  df,
  align = "llr", # right-align last column
  caption = "Summary of categorical variables.",
  note = "Missing data is not shown."
)
呈现如下:
这是使用 apa_table()
的另一种方法。
首先以更简单的方式总结您的数据:
library("dplyr")
library("tidyr")
# Reproducible example data as a structure() literal (presumably dput() output):
# a 21-row tibble with four columns --
#   investment_type      factor with levels "angel", "pre_seed", "seed"
#   gender_d             numeric 0/1 dummy
#   state_code_org       factor of two-letter state codes
#   first_time_founder_d numeric 0/1 dummy containing one NA
df <- structure(list(investment_type = structure(c(3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L), .Label = c("angel", "pre_seed", "seed"), class = "factor"),
gender_d = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 1), state_code_org = structure(c(3L, 22L,
3L, 15L, 3L, 4L, 3L, 3L, 22L, 3L, 29L, 25L, 8L, 29L, 10L,
6L, 22L, 4L, 17L, 23L, 17L), .Label = c("AL", "AR", "CA",
"CO", "CT", "DC", "DE", "FL", "GA", "IL", "KS", "LA", "MA",
"MD", "MN", "MO", "NC", "NE", "NH", "NJ", "NV", "NY", "OH",
"OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "VT", "WA",
"WI", "WY"), class = "factor"), first_time_founder_d = c(0,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, NA, 1, 0, 0, 1,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-21L))
# Count how often each level of each categorical variable occurs and add the
# within-variable percentage. The result has one row per (variable, level)
# with columns name, Variable (the level), N and %, grouped by name.
factor_level_count <- df %>%
  mutate(
    gender_d = factor(
      gender_d,
      levels = c(0, 1),
      labels = c("Female", "Male")
    ),
    first_time_founder_d = factor(
      first_time_founder_d,
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    investment_type = factor(
      investment_type,
      levels = c("angel", "pre_seed", "seed"),
      labels = c("Angel", "Pre-Seed", "Seed")
    )
  ) %>%
  na.exclude() %>%
  pivot_longer(cols = everything()) %>%
  count(name, value) %>%
  mutate(
    # Display labels for the table (spelling of two labels corrected).
    name = factor(
      name,
      levels = c(
        "first_time_founder_d", "gender_d", "investment_type", "state_code_org"
      ),
      labels = c("First-Time Founder", "Gender", "Investment Type", "State")
    )
  ) %>%
  group_by(name) %>%
  # Namespaced because papaja is not attached yet at this point of the
  # snippet.
  mutate(percent = papaja::printnum(n / sum(n) * 100, digits = 1)) %>%
  rename(Variable = value, N = n, "%" = percent)
现在您可以拆分 data.frame
并将它们重新组合成命名列表以获得存根缩进。
# One data frame per variable, named by the variable label; the first
# column (the split column) is dropped from each piece.
factor_level_count_list <- split(factor_level_count, f = factor_level_count$name, drop = TRUE) %>%
  lapply(function(x) x[, -1]) # Removes split-column
library("papaja")

# With merge_method = "indent" each list element is rendered as one heading
# row followed by its level rows. A midrule goes after every group except
# the last, so the positions are derived from the group sizes instead of
# being hard-coded.
rows_per_group <- vapply(factor_level_count_list, nrow, integer(1)) + 1L
group_breaks <- unname(head(cumsum(rows_per_group), -1))

apa_table(
  factor_level_count_list
  , align = "llr" # Right-align last column
  , caption = "Summary of categorical variables."
  , note = "Missing data is not shown."
  , merge_method = "indent" # Table style to use for merging list elements
  , midrules = group_breaks
)