R 用户定义函数中的整洁评估语法

Question

我想定义一个泛型函数

func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  xvar <- enquo(xvar)
  yvar <- enquo(yvar)
  groupvar <- enquo(groupvar)

  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if( "yield" %in% names(tmp) )
  {
    tmp <- tmp %>%
      mutate_at(vars(!!yvar), ~ifelse(round(yield, 0) < 85, NA, .))
  }
  
  # Compute IQR for each year
  tmp_iqr <- tmp %>%
    group_by(!!groupvar) %>%
    summarise(iqr=IQR(!!yvar, na.rm = TRUE))
  
  p <- ggplot(data = tmp %>% mutate_at(vars(!!yvar), ~ifelse(tmp_iqr[which(tmp_iqr[[!!groupvar]] %in% (!!xvar)),]$iqr == 0, . + runif(1, -0.01, 0.01), . )), aes(x = !!xvar, y = !!yvar))
  p <- p + stat_boxplot(aes(group = !!groupvar), na.rm = TRUE, coef = 10000)   # Trick (large unrealistic coef value) so whiskers end at min(y) & max(y)
  p <- p + geom_boxplot(na.rm = TRUE, outlier.shape = NA)

  return(p)
}

即使 IQR 为 0，它也能够绘制扩展到 min/max 的箱线图须。我试图通过向有罪数据添加微小的随机数（低于显着性水平）来实现这一点，以避免 IQR= 0.

但是，我一定是漏掉了关于quosure的语法，因为运行这个函数

func_boxplot2(data, date, days, date)

与数据集

structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 
7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
10L, 10L, 10L, 10L, 10L), .Label = c("2010", "2011", "2012", 
"2013", "2014", "2015", "2016", "2017", "2018", "2019"), class = c("ordered", 
"factor")), station = c("41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", 
"41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", 
"41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", 
"41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", 
"41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1"
), days = c(16, 15, 45, 26, 14, 14, 32, 7, 87, 42, 24, 23, 25, 
25, 55, 29, 29, 16, 11, 14, 58, 21, 19, 10, 10, 14, 33, 18, 10, 
7, 9, 10, 19, 7, 8, 7, 1, 5, 15, 8, 1, 4, 5, 6, 14, 6, 5, 5, 
3, 5, 19, 8, 4, 5, 3, 4, 16, 3, 1, 3), yield = c(98.4817351598173, 
49.4520547945205, 95.8561643835616, 97.6712328767123, 98.2648401826484, 
95.1598173515982, 97.8767123287671, 27.9109589041096, 98.310502283105, 
98.972602739726, 97.203196347032, 96.2100456621005, 98.7818761384335, 
96.7554644808743, 97.4954462659381, 98.8046448087432, 98.747723132969, 
98.3037340619308, 99.0525114155251, 96.1986301369863, 97.1004566210046, 
96.4954337899543, 96.3698630136986, 98.2077625570776, 96.62100456621, 
98.3675799086758, 95.6963470319635, 96.8835616438356, 93.5844748858447, 
87.8196347031963, 91.2328767123288, 92.5570776255708, 81.5182648401827, 
82.7739726027397, 90.1826484018265, 87.1461187214612, 87.2153916211293, 
92.9986338797814, 94.6948998178506, 85.5760473588342, 92.3611111111111, 
96.2204007285975, 86.3698630136986, 86.4269406392694, 87.796803652968, 
93.2762557077626, 96.6438356164384, 95.6164383561644, 71.3812785388128, 
93.7442922374429, 96.3698630136986, 97.2602739726027, 95.7876712328767, 
94.7146118721461, 87.6141552511416, 43.0821917808219, 88.6872146118722, 
92.6826484018265, 90.365296803653, 86.541095890411), environ = structure(c(5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 
3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 
3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L), .Label = c("Urbain avec très forte influence du trafic", 
"Urbain avec forte influence du trafic", "Urbain avec influence modérée du trafic", 
"Urbain avec faible influence du trafic", "Urbain avec très faible influence du trafic", 
"Industriel avec influence modérée du trafic"), class = "factor")), row.names = c(NA, 
-60L), class = c("tbl_df", "tbl", "data.frame"))

给我以下错误

 Error: Problem with `mutate()` input `days`.
x Must extract column with a single valid subscript.
x Subscript `date` has size 60 but must be size 1.
ℹ Input `days` is `(structure(function (..., .x = ..1, .y = ..2, . = ..1) ...`.

请问我的语法有什么问题？

非常感谢，

A.

========更新==========

使用建议的更新函数

func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  tmp <-
    tmp %>%
    group_by({{groupvar}}) %>%
    mutate(
      across({{yvar}}, function (x) {
        ifelse(
          IQR({{yvar}}, na.rm = TRUE) == 0,
          x + runif(1, -0.01,0.01),
          x
        )
      })
    )

  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    stat_boxplot(aes(group = {{groupvar}}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}

结果如下图

如我的评论所述，tmp 的处理似乎为同一年的所有行输出今年的第一个值 yvar，这解释了情节。事实上，评论这个块给出了下图

Answer 1

ifelse 中的条件长度不正确。您可以将 data ggplot 参数更改为此。

data =
  tmp %>%
    group_by(!!groupvar) %>%
    mutate_at(
        vars(!!yvar),
        if (IQR(., na.rm = TRUE) == 0) {
          . + runif(1, -0.01,0.01)
        } else {
          .
        }
    )

您对 quosure 和 !! 的使用是正确的，但是您应该使用更新的 {{ operator。

这是更新后的函数

func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  tmp <-
    tmp %>%
    group_by({{groupvar}}) %>%
    mutate(
      across({{yvar}}, function (x) {
        if (IQR(x, na.rm = TRUE) == 0) {
          x + runif(length(x), -0.01, 0.01)
        } else {
          x
        }
      })
    )

  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    stat_boxplot(aes(group = {{groupvar}}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}

func_boxplot2(data, date, days, date)

Answer 2

我认为您的问题得到了很好的回答。这是解决问题的一种完全不同的方法，可以回答您可能应该问的问题：如何修改用于生成箱线图的统计信息，而不是修改数据。

通常统计数据是 StatBoxplot，ggproto 对象的重要部分是方法 compute_group。在绘图中使用时，它 returns 包含列

的单行数据框

[1] "ymin"        "lower"       "middle"      "upper"      
[5] "ymax"        "outliers"    "notchupper"  "notchlower" 
[9] "x"           "relvarwidth" "flipped_aes"

这些大多有某种明显的意义；唯一不明显的是 outliers，它是一个列表模式列，包含一个包含异常值的数字向量。

因此，要完全摆脱异常值绘制，您可以创建一个继承统计数据，它与 StatBoxplot 类似，但会修改 compute_group:

的结果

NoOutlierStatBoxplot <- 
  ggproto("NoOutlierStatBoxplot", ggplot2::StatBoxplot,
          compute_group = function(..., self) {
            res <- ggproto_parent(StatBoxplot, self)$compute_group(...)
            res$ymin <- min(c(res$ymin, res$outliers[[1]]))
            res$ymax <- max(c(res$ymax, res$outliers[[1]]))
            res$outliers <- list(numeric())
            res
          })

（这与您所做的并不完全相同：它仍然在移除异常值后计算上四分位数和下四分位数。如果这对您很重要，您可能需要进行更广泛的修改。）

通过此修改，您可以删除 func_boxplot2 中的大量代码，包括删除 stat_boxplot():

func_boxplot3 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA, 
                 aes(group = {{groupvar}}), 
                 stat = NoOutlierStatBoxplot)
}

func_boxplot3(mydf, date, days, date)

R 用户定义函数中的整洁评估语法

Tidy evaluation syntax in R user-defined function

r

function

dplyr

tidyeval

quosure