r table 列出多个具有频率的分类变量

r table listing multiple categorical variables with frequencies

library("tidyverse")
library("papaja")

# Example data (21 rows, 4 columns) reproduced from dput() output.
# The two factor columns are stored as integer codes indexing into `levels`;
# gender_d and first_time_founder_d are 0/1 dummies (one value is missing).
df <- structure(
  list(
    investment_type = structure(
      c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
        3L, 3L, 3L, 3L),
      levels = c("angel", "pre_seed", "seed"),
      class = "factor"
    ),
    gender_d = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
                 1),
    state_code_org = structure(
      c(3L, 22L, 3L, 15L, 3L, 4L, 3L, 3L, 22L, 3L, 29L, 25L, 8L, 29L, 10L,
        6L, 22L, 4L, 17L, 23L, 17L),
      levels = c(
        "AL", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "IL", "KS",
        "LA", "MA", "MD", "MN", "MO", "NC", "NE", "NH", "NJ", "NV", "NY",
        "OH", "OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "VT", "WA",
        "WI", "WY"
      ),
      class = "factor"
    ),
    first_time_founder_d = c(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, NA,
                             1, 0, 0, 1, 0)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -21L)
)

# Recode the variables into labelled factors, drop incomplete rows and let
# summary() tabulate every factor. as.data.frame() then turns the summary
# table into a long data frame (columns Var1, Var2, Freq) whose Freq column
# holds "level:count" strings that are parsed further below.
#
# The former mutate_at(..., factor) step was removed: mutate_at() is
# superseded, and both columns it converted are overwritten by the recoding
# below, whose `== 1` comparison gives the same result on the raw 0/1 values.
df <- df %>%
  select(
    investment_type,
    state_code_org,
    gender_d,
    first_time_founder_d
  ) %>%
  mutate(
    # ifelse() keeps NA as NA, so missing values are still removed by
    # drop_na() afterwards.
    gender_d = factor(ifelse(gender_d == 1, "Male", "Female")),
    first_time_founder_d = factor(
      ifelse(first_time_founder_d == 1, "Yes", "No")
    ),
    investment_type = factor(
      ifelse(
        investment_type == "angel",
        "Angel",
        ifelse(investment_type == "pre_seed", "Pre-Seed", "Seed")
      )
    )
  ) %>%
  drop_na() %>%
  summary() %>%
  as.data.frame()

# Tidy the as.data.frame() output: discard Var1, give the remaining columns
# readable names and map the raw column names to display labels.
df <- df %>%
  select(-Var1) %>%
  rename(Variable = Var2, N = Freq) %>%
  mutate(
    Variable = factor(
      ifelse(
        Variable == "investment_type",
        "Investment Type",
        ifelse(
          Variable == "state_code_org",
          "State",
          # Pattern match instead of == here; presumably summary() pads this
          # column name with whitespace -- TODO confirm.
          ifelse(
            str_detect(Variable, "gender_d"),
            "Gender",
            "First-Time Founder"
          )
        )
      )
    )
  ) %>%
  # Presumably removes the NA filler cells of variables that have fewer
  # levels than the longest summary column -- verify against the output.
  drop_na()

# Split each "level:count" string into its two parts, strip the surrounding
# whitespace, convert the counts to integers and sort by variable and
# descending count. The result stays grouped by Variable.
df <- df %>%
  separate(col = N, into = c("Level", "N"), sep = ":") %>%
  mutate(
    Variable = trimws(Variable),
    Level = trimws(Level),
    N = as.integer(trimws(N))
  ) %>%
  group_by(Variable) %>%
  arrange(Variable, desc(N))

# Render the summary as an APA-style table.
# (A stub_indents = list("1", "2") argument was tried here and is disabled.)
apa_table(
  df,
  note = "Missing data is not shown.",
  caption = "Summary of categorical variables."
)

这是我现在得到的。

我愿意使用任何软件包——这恰好使用了 papaja。但它需要在 rmarkdown 中使用 PDF 输出并符合 APA 风格。

我希望表格能合并变量名,使同一个变量名不会重复出现多次,并把 State 变量的 "(Other)" 行移到 State 分组的底部。例如下面这样(示例使用的是另一个数据集):

您可以尝试 gt 包(尚未在 CRAN 上)。

# devtools::install_github("rstudio/gt")
library(gt)
# Add a percentage column and render the summary with gt, with a title and a
# source note. `df` is expected to still be grouped by Variable, so sum(N) is
# taken per group.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  tab_header(title = "Summary of categorical variables.") %>%
  tab_source_note(source_note = md("*Missing data is not shown.*"))

这是 HTML 渲染结果。gt 使用 dplyr 的分组来确定表格的行分组。

gt 的代码仓库和 https://gt.rstudio.com 都说明它目前支持 HTML 输出,LaTeX 和 RTF 输出计划在未来支持;不过 LaTeX 输出现在已经部分可用。

# Same table converted to LaTeX. The header call is disabled in this variant
# and kept below for reference.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  # tab_header(title = "Summary of categorical variables.", subtitle = "") %>%
  tab_source_note(source_note = md("*Missing data is not shown.*")) %>%
  as_latex()

tab_header 与 LaTeX 输出一起使用时存在一个错误 (https://github.com/rstudio/gt/issues/463),而且 tab_source_note 的排版看起来也可能有些错位。

我调整了一些内容后得到了下面的结果,不过我想这并不是你想要的。(这说明只要副标题 subtitle 中含有任何非空格字符,tab_header 就能正常工作;但如果副标题是 " ",即只含任意数量的空格,则不起作用。)

# LaTeX variant that carries the note in the header's subtitle instead of a
# source note.
df %>%
  mutate(`%` = scales::percent(N / sum(N), accuracy = 1)) %>%
  gt() %>%
  tab_header(
    subtitle = md("*Missing data is not shown.*"),
    title = "Summary of categorical variables."
  ) %>%
  as_latex()

我认为这是一个简单的解决方案:

# Move every "(Other)" row to the bottom of its own variable group. The row
# position is derived from the data rather than hard-coded, so this keeps
# working when the number of levels per variable changes. order() leaves ties
# in their original order, so all other rows keep their current sorting and
# the groups stay in order of first appearance.
df <- df[
  order(match(df$Variable, unique(df$Variable)), df$Level == "(Other)"),
]

# Blank out repeated labels so each variable name is printed only once.
df$Variable[duplicated(df$Variable)] <- ""

# Render the collapsed summary; the alignment string right-aligns the last
# column.
apa_table(
  df,
  caption = "Summary of categorical variables.",
  note = "Missing data is not shown.",
  align = "llr"
)

呈现如下:

这是使用 apa_table() 的另一种方法。

首先以更简单的方式总结您的数据:

library("dplyr")
library("tidyr")

# Example data (21 rows, 4 columns) reproduced from dput() output.
# The two factor columns are stored as integer codes indexing into `levels`;
# gender_d and first_time_founder_d are 0/1 dummies (one value is missing).
df <- structure(
  list(
    investment_type = structure(
      c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
        3L, 3L, 3L, 3L),
      levels = c("angel", "pre_seed", "seed"),
      class = "factor"
    ),
    gender_d = c(1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
                 1),
    state_code_org = structure(
      c(3L, 22L, 3L, 15L, 3L, 4L, 3L, 3L, 22L, 3L, 29L, 25L, 8L, 29L, 10L,
        6L, 22L, 4L, 17L, 23L, 17L),
      levels = c(
        "AL", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "IL", "KS",
        "LA", "MA", "MD", "MN", "MO", "NC", "NE", "NH", "NJ", "NV", "NY",
        "OH", "OR", "PA", "RI", "SC", "TN", "TX", "UT", "VA", "VT", "WA",
        "WI", "WY"
      ),
      class = "factor"
    ),
    first_time_founder_d = c(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, NA,
                             1, 0, 0, 1, 0)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -21L)
)

# Count how often each level of every variable occurs and add the share of
# each level within its variable. The result has the columns name (display
# label of the variable), Variable (the level), N and %, and stays grouped by
# name.
factor_level_count <- df %>%
  mutate(
    gender_d = factor(
      gender_d,
      levels = c(0, 1),
      labels = c("Female", "Male")
    ),
    first_time_founder_d = factor(
      first_time_founder_d,
      levels = c(0, 1),
      labels = c("No", "Yes")
    ),
    investment_type = factor(
      investment_type,
      levels = c("angel", "pre_seed", "seed"),
      labels = c("Angel", "Pre-Seed", "Seed")
    )
  ) %>%
  # Rows with any missing value are left out of all counts.
  na.exclude() %>%
  pivot_longer(cols = everything()) %>%
  count(name, value) %>%
  mutate(
    # Display labels; spelling corrected ("First-Time", "Investment") so they
    # match the labels used in the first version of the table.
    name = factor(
      name,
      levels = c(
        "first_time_founder_d", "gender_d", "investment_type",
        "state_code_org"
      ),
      labels = c("First-Time Founder", "Gender", "Investment Type", "State")
    )
  ) %>%
  group_by(name) %>%
  # Namespace-qualified because papaja is only attached further down in this
  # snippet.
  mutate(percent = papaja::printnum(n / sum(n) * 100, digits = 1)) %>%
  rename(Variable = value, N = n, "%" = percent)

现在可以按变量拆分 data.frame,并把各部分重新组合成一个命名列表,从而得到带缩进的行标题(stub indent)。

# One data frame per variable, collected in a list named after the variable
# labels; the first column (the one used for splitting) is dropped from each
# piece.
factor_level_count_list <- lapply(
  split(factor_level_count, f = factor_level_count$name, drop = TRUE),
  function(level_rows) level_rows[, -1]
)

library("papaja")

# Render the named list as one table. The list elements are merged with the
# "indent" table style, the last column is right-aligned, and midrules are
# requested at rows 3, 6 and 9.
apa_table(
  factor_level_count_list,
  align = "llr",
  caption = "Summary of categorical variables.",
  note = "Missing data is not shown.",
  merge_method = "indent",
  midrules = c(3, 6, 9)
)