ggplot2:使用不同比例的刻面时,箱线图宽度不正确

ggplot2: incorrect boxplot width when facetting with facets of different scales

我需要多面箱线图。绘图的 x 轴是一个定量变量,我想在绘图上反映此信息。各个方面的横坐标比例差异很大。

我的问题是方框的宽度对于大比例尺的小平面来说非常小。

一个可能的解释是方框的宽度对于所有方面都是相同的,而理想情况下它应该由每个方面的 xlims 单独确定。

如果有两个输入,我将不胜感激:

提前致谢!

备注:将横坐标转换为分类变量可能是一种解决方案,但并不完美,因为它会导致一些信息丢失。

最小工作示例:

library(tidyverse)

c(1:4,7) %>% 
  c(.,10*.) %>% # Create abscissa on two different scales
  lapply(FUN = function(x) {tibble(x = x, y = rnorm(50), idx = ifelse(test = x<8, yes = 'A', no = 'B'))}) %>% # Create sample (y) and label (idx)
  bind_rows() %>% 
  ggplot(aes(x = x, y = y, group = x)) + 
  geom_boxplot() + 
  facet_wrap(~idx, scales = 'free') 

结果:

一个麻烦的解决方案是从头开始重新绘制箱线图,但这不是很令人满意:

draw_boxplot = function(locations, width, ymin, lower, middle, upper, ymax, idx){

  local_df = tibble(locations = locations, width = width, ymin = ymin, lower = lower, middle = middle, upper = upper, ymax = ymax, idx = idx)

  ggplot(data = local_df) + 
    geom_rect(aes(xmin = locations - width/2, xmax = locations + width/2, ymin = lower, ymax = upper), fill = 'white', colour = 'black') + 
    geom_segment(aes(x = locations - width/2, xend = locations + width/2, y = middle, yend = middle), size = 0.8) + 
    geom_segment(aes(x = locations, xend = locations, y = upper, yend = ymax)) + 
    geom_segment(aes(x = locations, xend = locations, y = lower, yend = ymin)) + 
    facet_wrap(~idx, scales = 'free_x')
}

make_boxplot = function(to_plot){
  to_plot %>% 
    cmp_boxplot %>% 
    (function(x){
      draw_boxplot(locations = x$x, width = x$width, ymin = x$y0, lower = x$y25, middle = x$y50, upper = x$y75, ymax = x$y100, idx = x$idx)
    })

}


cmp_boxplot = function(to_plot){
  to_plot %>% 
    group_by(idx) %>% 
    mutate(width = 0.6*(max(x) - min(x))/length(unique(x))) %>% #hand specified width
    group_by(x) %>% 
    mutate(y0 = min(y),
           y25 = quantile(y, 0.25),
           y50 = median(y),
           y75 = quantile(y, 0.75),
           y100 = max(y)) %>% 
    select(-y) %>% 
    unique()
}

c(1:4,7) %>% 
  c(.,10*.) %>% 
  lapply(FUN = function(x) {tibble(x = x, y = rnorm(50), idx = ifelse(test = x<8, yes = 'A', no = 'B'))}) %>% 
  bind_rows() %>% 
  make_boxplot

结果:

由于 geom_boxplot 不允许将 width 作为审美变化,因此您必须自己编写。幸运的是它并不太复杂。

bp_custom <- function(vals, type) {

  bp = boxplot.stats(vals)

  if(type == "whiskers") {
    y    = bp$stats[1]
    yend = bp$stats[5]
    return(data.frame(y = y, yend = yend))
  }

  if(type == "box") {
    ymin = bp$stats[2]
    ymax = bp$stats[4]
    return(data.frame(ymin = ymin, ymax = ymax))
  }

  if(type == "median") {
    y    = median(vals)
    yend = median(vals)
    return(data.frame(y = y, yend = yend))
  }

  if(type == "outliers") {
    y = bp$out
    return(data.frame(y = y))
  } else {
    return(warning("Type must be one of 'whiskers', 'box', 'median', or 'outliers'."))
  }
}

此函数执行所有计算和 returns 适合在 stat_summary 中使用的数据帧。然后我们在几个不同的层中调用它来构建箱线图的各个部分。请注意,您需要计算每组刻面的箱线图的宽度,在下面使用管道中的 dplyr 完成。我计算了宽度,使得 x 的范围根据唯一 x 值的数量分成相等的段,然后每个框大约该段宽度的 1/2。您的数据可能需要不同的调整。

library(dplyr)

c(1:4,7) %>% 
  c(.,10*.) %>% # Create abscissa on two different scales
  lapply(FUN = function(x) {
    tibble(x = x, y = rnorm(50), idx = ifelse(test = x<8, yes = 'A', no = 'B'))
  }) %>% 
  bind_rows() %>%                                
  group_by(idx) %>%                                              # NOTE THIS LINE
  mutate(width = 0.25*diff(range(x))/length(unique(x))) %>%      # NOTE THIS LINE
  ggplot(aes(x = x, y = y, group = x)) +
  stat_summary(fun.data = bp_custom, fun.args = "whiskers",
               geom = "segment", aes(xend = x)) + 
  stat_summary(fun.data = bp_custom, fun.args = "box", 
               geom = "rect", aes(xmin = x - width, xmax = x + width), 
               fill = "white", color = "black") + 
  stat_summary(fun.data = bp_custom, fun.args = "median", 
               geom = "segment", aes(x = x - width, xend = x + width), size = 1.5) + 
  stat_summary(fun.data = bp_custom, fun.args = "outliers", 
               geom = "point") + 
  facet_wrap(~idx, scales = 'free') 

至于将其报告为错误(实际上是一个所需的功能),我认为这是一个不常见的用例,他们不会优先考虑它。如果您将此代码包装成自定义 geom(基于 here)并提交拉取请求,您可能会更幸运。