在函数中指定变量的准则

Guidelines for specifying variables in a function

请考虑下面创建的数据集 dat1:

# Reproducible example data: 200 rows with three grouping columns (Region,
# State, Loc), an ID column and five standard-normal response columns.
# State, Loc and ID are shorter than 200 and are recycled by data.frame().
set.seed(123)
dat1 <- data.frame(
  Region = rep(c("r1", "r2"), each = 100),
  State  = rep(c("NY", "MA", "FL", "GA"), each = 10),
  Loc    = rep(letters[1:8], each = 5),
  ID     = rep(1:10, each = 2),
  var1   = rnorm(200),
  var2   = rnorm(200),
  var3   = rnorm(200),
  var4   = rnorm(200),
  var5   = rnorm(200)
)

dat1 包含 5 个变量的测量值,观测(ID)可以按 3 个分组变量进行分组:Loc、State 和 Region。我必须对每个响应变量/分组变量组合执行多项任务,所以我一直在编写函数来简化工作,并保持分析整洁。我使用 rstatix 包完成其中多项操作。下面的函数会对我指定的数据进行 Kruskal-Wallis 检验,计算效应量 efsz,并把结果放在单个数据框 res 中返回:

library(rstatix)
# Kruskal-Wallis test plus effect size for one response/grouping pair.
#
# dat      : data frame holding the columns named below.
# groupvar : name of the grouping column, given as a string.
# var      : name of the response column, given as a string.
#
# Side effect: the combined result is stored in the global variable `res`
# (via `<<-`); the same value is also the function's invisible return value.
KruskTest <- function(dat, groupvar, var) {
  test_out <- dat %>% kruskal_test(get(var) ~ get(groupvar))
  size_out <- dat %>% kruskal_effsize(get(var) ~ get(groupvar))

  # Keep the whole test result and columns 3 to 5 of the effect-size result.
  combined <- cbind(test_out, size_out[, 3:5])
  # Overwrite the first cell (row 1, column 1) with the response name;
  # presumably that cell is the response label, which would otherwise
  # reflect the get() call used in the formula -- TODO confirm.
  combined[1, 1] <- var
  combined$groupvar <- groupvar

  # Move the appended `groupvar` column (position 10) to the front and
  # publish the result globally.
  res <<- combined[, c(10, 1:9)]
}
KruskTest(dat = dat1, groupvar = "Region", var = "var1")

现在我可以使用该函数遍历每个响应变量并在单个数据框中获取分组变量的结果(示例显示 Region),这正是我需要的:

# Run KruskTest() for every response column (columns 5 to 9 of dat1) against
# "Region" and stack the per-variable results into one data frame `a`.
# names() already returns a character vector, so no paste() is needed.
vars <- names(dat1)[5:9]

# KruskTest() publishes each result in the global variable `res`. Collect
# those results in a list and bind them once, instead of growing `a` with
# rbind() on every iteration.
a <- do.call(
  rbind,
  lapply(vars, function(v) {
    KruskTest(dat = dat1, groupvar = "Region", var = v)
    res
  })
)

这对 Kruskal-Wallis 检验非常有用。现在我想编写一个非常相似的函数来进行 Dunn 检验,但看看会发生什么:

# Attempted wrapper around rstatix::dunn_test() for one response/grouping
# pair, using a Bonferroni p-value adjustment.
#
# dat      : data frame holding the columns named below.
# groupvar : name of the grouping column, given as a string.
# var      : name of the response column, given as a string.
#
# Side effect: the result is assigned to the global variable `res` via `<<-`.
#
# NOTE: this block is kept exactly as asked in the question. As written, the
# call fails with the error quoted right after it (the column
# `get(groupvar)` does not exist), i.e. the get()-based formula is not
# accepted here.
dunn <- function(dat, groupvar, var){
  res <<- dat%>%rstatix::dunn_test(get(var) ~ get(groupvar), p.adjust.method = "bonferroni")
}
dunn(dat=dat1, groupvar="Region", var = "var1")

#> Error: Can't extract columns that don't exist.
#> x The column `get(groupvar)` doesn't exist.

在用户自定义函数之外,可以用完全相同的方式为 dunn_test() 和 kruskal_test() 指定数据。那么在函数内部为这两个函数指定变量有什么区别?为什么第一个可以运行而第二个不行?

考虑到 @Gregor 关于不要写入全局环境的评论,并顺便清理其他一些粗糙之处,我提出了下面的改进建议。不过 Gregor 说得对,您最大的问题不过是一个打字错误。

library(rstatix)
library(purrr)

# rewritten to avoid writing to environment

# Kruskal-Wallis test plus effect size for one response/grouping pair,
# returned as a value (nothing is written to the calling environment).
#
# dat      : data frame holding the columns named below.
# groupvar : name of the grouping column, given as a string.
# var      : name of the response column, given as a string.
#
# Returns the test result combined with columns 3 to 5 of the effect-size
# result, with a `groupvar` column placed first.
NewKruskTest <- function(dat, groupvar, var) {
  # Both rstatix calls take the same `var ~ groupvar` formula; build it once.
  model <- as.formula(paste(var, "~", groupvar))
  test_part <- kruskal_test(dat, model)
  size_part <- kruskal_effsize(dat, model)

  combined <- cbind(test_part, size_part[, 3:5])
  combined$groupvar <- groupvar

  # Assumes the combined result has nine columns, so the appended
  # `groupvar` column sits at position 10; move it to the front.
  combined[, c(10, 1:9)]
}

# works on a single if you want to test
# NewKruskTest(dat = dat1, groupvar = "Region", var = "var1") 

# names() already yields a character vector, so paste() is unnecessary.
vars <- names(dat1)[5:9]

# NewKruskTest() also works inside the original for loop, but
# `purrr::map_dfr()` is arguably cleaner: it calls the function once per
# response column and row-binds the results.
map_dfr(
  vars,
  function(v) NewKruskTest(dat = dat1, groupvar = "Region", var = v)
)
#>   groupvar  .y.   n statistic df      p         method      effsize method.1
#> 1   Region var1 200 3.0520896  1 0.0806 Kruskal-Wallis  0.010364089  eta2[H]
#> 2   Region var2 200 0.5961552  1 0.4400 Kruskal-Wallis -0.002039620  eta2[H]
#> 3   Region var3 200 1.6330090  1 0.2010 Kruskal-Wallis  0.003197015  eta2[H]
#> 4   Region var4 200 3.4031343  1 0.0651 Kruskal-Wallis  0.012137042  eta2[H]
#> 5   Region var5 200 0.7230090  1 0.3950 Kruskal-Wallis -0.001398945  eta2[H]
#>   magnitude
#> 1     small
#> 2     small
#> 3     small
#> 4     small
#> 5     small

# NewDunn rewritten

# Dunn's test (Bonferroni-adjusted p-values) for one response/grouping pair,
# returned as a value.
#
# dat      : data frame holding the columns named below.
# groupvar : name of the grouping column, given as a string.
# var      : name of the response column, given as a string.
#
# Returns the dunn_test() result with a `groupvar` column placed first.
NewDunn <- function(dat, groupvar, var) {
  # Build a plain `var ~ groupvar` formula from the two column names.
  model <- as.formula(paste(var, "~", groupvar))
  pairwise <- rstatix::dunn_test(dat, model, p.adjust.method = "bonferroni")
  pairwise$groupvar <- groupvar

  # Assumes dunn_test() returned nine columns, so the appended `groupvar`
  # column sits at position 10; move it to the front.
  pairwise[, c(10, 1:9)]
}

# works on a single if you want to test
# NewDunn(dat=dat1, groupvar ="Region", var = "var1")

# Run NewDunn() once per response column and row-bind the results.
map_dfr(
  vars,
  function(v) NewDunn(dat = dat1, groupvar = "Region", var = v)
)
#> # A tibble: 5 x 10
#>   groupvar .y.   group1 group2    n1    n2 statistic      p  p.adj p.adj.signif
#>   <chr>    <chr> <chr>  <chr>  <int> <int>     <dbl>  <dbl>  <dbl> <chr>       
#> 1 Region   var1  r1     r2       100   100    -1.75  0.0806 0.0806 ns          
#> 2 Region   var2  r1     r2       100   100    -0.772 0.440  0.440  ns          
#> 3 Region   var3  r1     r2       100   100    -1.28  0.201  0.201  ns          
#> 4 Region   var4  r1     r2       100   100     1.84  0.0651 0.0651 ns          
#> 5 Region   var5  r1     r2       100   100    -0.850 0.395  0.395  ns

使用您提供的数据:


# Example data from the question: 200 rows, three grouping columns, an ID
# column and five standard-normal response columns. The shorter grouping
# patterns are repeated explicitly to 200 rows (the same values that
# data.frame() would produce by recycling).
set.seed(123)
dat1 <- data.frame(
  Region = rep(c("r1", "r2"), each = 100),
  State = rep(c("NY", "MA", "FL", "GA"), each = 10, length.out = 200),
  Loc = rep(letters[1:8], each = 5, length.out = 200),
  ID = rep(1:10, each = 2, length.out = 200),
  var1 = rnorm(200),
  var2 = rnorm(200),
  var3 = rnorm(200),
  var4 = rnorm(200),
  var5 = rnorm(200)
)