
Plot labels of the mean of the dependent variable on a stacked bar plot by two categorical variables

我正在使用来自 Glassdoor 的性别薪酬差距数据,可从此处获取(原链接在此文本中缺失)。


我正在尝试在 5 个不同绩效评级的堆叠条形图中包含响应变量 totalSalary 均值的标签。


  # NOTE(review): the opening `ggplot(<data>, aes(...)) +` call that this chain
  # continues is not present in this excerpt; restore it before running.
  # Bars show the mean of the y aesthetic per group (stat = "summary", fun = "mean").
  geom_bar(stat = "summary", fun = "mean", width = 0.9, color = "black") +
  theme_bw() +
  labs(x = "Job Title", y = "Mean Total Salary", fill = "Gender") +
  theme(axis.title = element_text(size = 10, color = "blue"),
        axis.text = element_text(size = 8),
        legend.position = "top") +
  # Disabled attempt at adding the labels, kept for reference:
  # geom_col() +
  # geom_text(aes(label = totalSalary), position = position_stack(vjust = 0.5), color = "white") +
  scale_fill_manual(values = c("#FF66CC", "blue")) +
  # NOTE(review): `comma` is unqualified; presumably scales::comma, which
  # requires the scales package to be attached - confirm.
  scale_y_continuous(labels = comma) +
  # Swap the axes so the x aesthetic is drawn along the vertical axis.
  coord_flip() +
  # One panel per value of perfEval.
  facet_wrap( ~ perfEval)





尽管如此,针对如何定位标签这一技术问题,下面给出一种方法。棘手的部分是找到每一段堆叠条的中心位置。


# Packages used by this script: dplyr supplies `%>%` and the data verbs,
# ggplot2 the plotting functions used further down.
library(dplyr)
library(ggplot2)

# Raw data. Columns used below: gender, jobTitle, perfEval, basePay, bonus.
df <- readr::read_csv("~/data.csv")

# Mean total compensation (base pay + bonus) for every
# gender / job title / performance rating combination, plus a short
# text label in thousands (e.g. "118k").
df_summary <- df %>%
  group_by(gender, jobTitle, perfEval) %>%
  summarize(totalcomp = mean(basePay + bonus),
            totalcomp_label = paste0(round(totalcomp * 1e-3, 0), "k")) %>%
  # summarize() only peels off the last grouping level; drop the rest so the
  # grouping cannot interfere with the pivoting and joining below.
  ungroup()

# df_summary plus one extra column, `labelpos`: the position along the value
# axis at which each segment's label should be centred.
df_plot <- df_summary %>%
  left_join(
    # the messy part to find appropriate label positions - there may be a
    # solution with fewer pivoting steps
    df_summary %>%
      # one row per bar (jobTitle x perfEval) with a Female and a Male column;
      # a gender missing from a bar contributes a segment of length 0
      tidyr::pivot_wider(id_cols = c(jobTitle, perfEval),
                         values_from = "totalcomp", names_from = "gender",
                         values_fill = 0) %>%
      # Centre of each segment, assuming the Male segment is drawn next to the
      # axis and the Female segment is stacked beyond it (ggplot2's default
      # stacking for these two levels - re-check if the levels are reordered).
      dplyr::mutate(labelpos_M = Male / 2, labelpos_F = Male + Female / 2) %>%
      # back to one row per gender so the result can be joined to df_summary
      tidyr::pivot_longer(c(Female, Male), names_to = "gender") %>%
      dplyr::mutate(
        labelpos = case_when(gender == "Male" ~ labelpos_M,
                             gender == "Female" ~ labelpos_F,
                             TRUE ~ NA_real_)
      ) %>%
      dplyr::select(jobTitle, perfEval, gender, labelpos),
    by = c("jobTitle", "perfEval", "gender")
  )

# A tibble: 98 x 6
#   gender jobTitle       perfEval totalcomp totalcomp_label labelpos
#   <chr>  <chr>             <dbl>     <dbl> <chr>              <dbl>
# 1 Female Data Scientist        1   118479. 118k             164089.
# 2 Female Data Scientist        2   105040. 105k             140556.
# 3 Female Data Scientist        3   100275. 100k             149580.
# 4 Female Data Scientist        4    87633. 88k              127996.
# 5 Female Data Scientist        5   101449. 101k             142046.

# Horizontal stacked bars of mean total compensation per job title, filled by
# gender, one panel per performance rating, with a text label on each segment.
df_plot %>%
  ggplot() +
  geom_col(aes(y = jobTitle, x = totalcomp, fill = gender), width = 0.9, color = "black") +
  theme_bw() +
  # Compensation is mapped to x and job title to y (there is no coord_flip()
  # in this version), so the axis titles must follow that mapping.
  labs(x = "Mean Total Salary", y = "Job Title", fill = "Gender") +
  theme(axis.title = element_text(size = 10, color = "blue"),
        axis.text = element_text(size = 8),
        legend.position = "top") +
  # Named values tie each colour to its level instead of relying on level order.
  scale_fill_manual(values = c(Female = "#FF66CC", Male = "blue")) +
  scale_x_continuous(labels = scales::comma) +
  facet_wrap( ~ perfEval) +
  # positioning the labels: labelpos is the precomputed centre of each segment
  geom_text(aes(x = labelpos, y = jobTitle, label = totalcomp_label),
            color = "white")