after_stat 中 tapply 的空因子水平导致大杂烩

Emtpy factor level with tapply in after_stat causes hodgepodge

我想用每个 x 轴组的百分比标签绘制一个图。 这在没有空组的情况下工作正常:

# library
library(ggplot2)
library(reshape2)

# example data from reshape2
str(tips)
#> 'data.frame':    244 obs. of  7 variables:
#>  $ total_bill: num  17 10.3 21 23.7 24.6 ...
#>  $ tip       : num  1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
#>  $ sex       : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
#>  $ smoker    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ day       : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
#>  $ time      : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ size      : int  2 3 3 2 4 4 2 4 2 2 ...

# function to count percentage per day
comp_pct <- function(count, day) {
  count / tapply(count, day, sum)[day]
}

# correct plot

ggplot(tips, aes(x = day, group = sex)) +
  geom_bar(aes(y = ..prop.., fill = factor(..group..)), stat = "count") +
  geom_text(aes(
    label = after_stat(scales::percent(comp_pct(count, x))),
    y = ..prop..
  ), stat = "count", position = position_stack(vjust = 0.5)) +
  labs(y = "Percent", fill = "sex") +
  scale_x_discrete(drop=FALSE) +
  scale_y_continuous(labels = scales::percent)

然而,在添加一个空关卡后,带有 after_stat 的标签不再像预期的那样工作。我不确定这是否是由 comp_pcttapply() 的输出顺序引起的。但是,我无法解决它。


# additional empty level
tips -> tips1

tips1$day <- factor(tips$day, levels=c("NewDay",levels(tips$day)))
levels(tips1$day)
#> [1] "NewDay" "Fri"    "Sat"    "Sun"    "Thur"

# bars OK, labels not OK
ggplot(tips1, aes(x = day, group = sex)) +
  geom_bar(aes(y = ..prop.., fill = factor(..group..)), stat = "count") +
  geom_text(aes(
    label = after_stat(scales::percent(comp_pct(count, x))),
    y = ..prop..
  ), stat = "count", position = position_stack(vjust = 0.5)) +
  labs(y = "Percent", fill = "sex") +
  scale_x_discrete(drop=FALSE) +
  scale_y_continuous(labels = scales::percent)
#> Warning: Removed 2 rows containing missing values (geom_text).

reprex package (v2.0.1)

于 2022-04-02 创建

pre-calculate 绘制您想要绘制的内容比试图纠缠 ggplot 的汇总函数之间的交互要容易得多。 Pre-calculating 您的统计数据还可以让您更好地了解您的流程并更好地控制潜在输出(即,如果您也想将数据保存在电子表格中)。我们可以使用 tidyverse 包(其中 ggplot 是一个)来计算绘图内容,这大大简化了绘图:

library(reshape2)
library(tidyverse)

tips1 <- tips

# add new level to day factor
tips1$day <- factor(tips1$day, levels = c('NewDay', levels(tips1$day)))

# calculate what is to be plotted
tips_summary <- tips1 %>% 
  count(day, sex) %>% 
  group_by(day) %>% 
  mutate(prop_day = n / sum(n)) %>% 
  group_by(sex) %>% 
  mutate(prop_sex = n / sum(n))

# plot; scale_x_discrete(drop = F) is necessary to plot the empty NewDay level 
ggplot(tips_summary, aes(x = day, y = prop_sex, fill = sex)) +
  geom_col(position = 'stack') +
  geom_text(aes(label = scales::percent(prop_day)), position = position_stack(vjust = 0.5)) +
  scale_x_discrete(drop = F) +
  scale_y_continuous(labels = scales::percent)

问题是使用 count / tapply(count, day, sum)[day] 按位置提取计算的百分比(抱歉。我的错。(;) 虽然这对原始数据集很好用,但在更一般的情况下不起作用,即为第四个条形图标签返回 NA:

print(day)
#> [1] 2 3 4 5 2 3 4 5
#>         3         4         5      <NA>         3         4         5      <NA> 
print(count / tapply(count, day, sum)[day])
#> 0.1034483 0.3684211 0.2903226        NA 0.1149425 0.7763158 0.9354839        NA 

为了解决这个问题,我们必须将 day 转换为字符以按名称提取百分比值:

library(ggplot2)
library(reshape2)

# additional empty level
tips -> tips1

# function to count percentage per day
comp_pct <- function(count, day) {
  count / tapply(count, day, sum)[as.character(day)]
}

tips1$day <- factor(tips$day, levels = c("NewDay", levels(tips$day)))

ggplot(tips1, aes(x = day, group = sex)) +
  geom_bar(aes(y = ..prop.., fill = factor(..group..)), stat = "count") +
  geom_text(aes(
    label = after_stat(scales::percent(comp_pct(count, x))),
    y = ..prop..
  ), stat = "count", position = position_stack(vjust = 0.5)) +
  labs(y = "Percent", fill = "sex") +
  scale_x_discrete(drop = FALSE) +
  scale_y_continuous(labels = scales::percent)