如何将 data.table 的列传递给函数

How to send data.table columns to a function

我有以下数据和代码:

# Sample data used throughout: a 50-row data.table with one numeric column
# (`vnum1`) and two factor columns, `vfac1` (levels "1".."4") and `vch1`
# (levels "A".."E"). The literal looks like dput() output.
mydt = structure(list(vnum1 = c(0.517551446921093, -0.997822163825322, 
3.40784990301597, -0.20990292802279, 0.171252718589118, -0.624084617915488, 
0.0979152932727754, -0.673949942523713, 0.689937370719125, -0.356403906786312, 
-0.565253563082689, -0.725285109477077, -0.343253827285705, -0.515803106223986, 
2.21193745540815, 0.179392018244011, 0.695885203438304, -0.869946981188651, 
0.170084087339536, 0.864392658315656, 0.801471783050381, 0.753880989575548, 
-0.572671791856263, -0.238511443188091, -1.1837711276515, 1.13728246296508, 
0.702244681081861, -0.851470541269798, 0.0471820411719659, 0.547952252697306, 
0.527539936397851, 0.247070882010565, -0.562100684713534, -1.05183021003772, 
0.934263969812236, -0.603673312084538, -2.00612207642211, 0.2312103046843, 
-0.214991379754579, 0.282701708464789, 0.289934023279607, 0.567328033965404, 
-0.359157137438815, 0.648221129776207, 0.857904763904759, 0.289415512264559, 
1.06555885899638, 0.333119386976963, -1.46070627726311, 0.0552050036156248
), vfac1 = structure(c(2L, 1L, 1L, 2L, 1L, 3L, 2L, 3L, 1L, 4L, 
4L, 3L, 1L, 3L, 1L, 4L, 4L, 4L, 1L, 2L, 2L, 4L, 2L, 4L, 1L, 3L, 
4L, 1L, 2L, 2L, 2L, 1L, 3L, 4L, 1L, 2L, 1L, 3L, 1L, 4L, 2L, 3L, 
2L, 1L, 2L, 2L, 2L, 3L, 4L, 2L), .Label = c("1", "2", "3", "4"
), class = "factor"), vch1 = structure(c(3L, 4L, 5L, 4L, 1L, 
5L, 5L, 3L, 3L, 4L, 1L, 4L, 3L, 5L, 1L, 3L, 4L, 5L, 1L, 3L, 5L, 
2L, 5L, 5L, 1L, 2L, 5L, 5L, 1L, 3L, 4L, 1L, 2L, 2L, 5L, 1L, 4L, 
2L, 1L, 5L, 4L, 4L, 3L, 2L, 5L, 4L, 3L, 2L, 3L, 2L), .Label = c("A", 
"B", "C", "D", "E"), class = "factor")), .Names = c("vnum1", 
"vfac1", "vch1"), class = c("data.table", "data.frame"), row.names = c(NA, 
50L))


# Mean and standard deviation of vnum1 for every (vfac1, vch1) combination.
mydt[, .(mean = mean(vnum1), sd = sd(vnum1)), by = .(vfac1, vch1)]
    vfac1 vch1        mean         sd
 1:     2    C  0.52725962 0.54536269
 2:     1    D -1.50197212 0.71297571
 3:     1    E  1.16354778 2.13889714
 4:     2    D  0.22424664 0.31039463
 5:     1    A  0.23359711 1.10743823
 6:     3    E -0.56994386 0.07656659
 7:     2    E  0.29615501 0.67455339
 8:     3    C -0.67394994         NA
 9:     1    C  0.17334177 0.73057650
10:     4    D  0.16974065 0.74408077
11:     4    A -0.56525356         NA
12:     3    D -0.07897854 0.91401552
13:     4    C -0.64065713 1.15972463
14:     4    E -0.03087801 0.67895741
15:     4    B -0.14897461 1.27683063
16:     3    B  0.28487787 0.69502367
17:     2    A -0.27824564 0.46022423
18:     1    B  0.64822113         NA
19:     2    B  0.05520500         NA

我想创建如下函数,向其传入列名即可得到上述结果。但是,下面这个函数不起作用:

# Non-working attempt from the question, kept as-is for reference.
# R does not accept string literals as formal argument names, so this
# definition fails to parse. In the body the literal strings (not columns)
# are handed to mean()/sd(), and the global `mydt` is used instead of the
# `ddt` parameter.
myfn = function(ddt, 'vnum1', 'vfac1', 'vch1'){
        mydt[,list(mean=mean('vnum1'), sd=sd('vnum1')),list('vfac1', 'vch1')]
}

如何发送列名(或列向量本身)以便从函数中获取结果?感谢您的帮助。

你可以试试

 # Mean and sd of the column named by the string `v1`, grouped by two
 # columns that are passed as bare (unquoted) names in `v2` and `v3`.
 myfn <- function(dt, v1, v2, v3) {
   dt[,
      .(
        # as.name() turns the string into a symbol; eval() resolves it to
        # the column inside j.
        mean = mean(eval(as.name(v1))),
        sd = sd(eval(as.name(v1)))
      ),
      # substitute() captures the unevaluated arguments and deparse()
      # converts them to the column-name strings used for grouping.
      by = c(deparse(substitute(v2)), deparse(substitute(v3)))]
 }
 myfn(mydt, 'vnum1', vfac1, vch1)
 #   vfac1 vch1        mean         sd
 #1:     2    C  0.52725962 0.54536269
 #2:     1    D -1.50197212 0.71297571
 #3:     1    E  1.16354778 2.13889714
 #4:     2    D  0.22424664 0.31039463
 #5:     1    A  0.23359711 1.10743823
 #6:     3    E -0.56994386 0.07656659
 #7:     2    E  0.29615501 0.67455339
 #8:     3    C -0.67394994         NA
 #9:     1    C  0.17334177 0.73057650
#10:     4    D  0.16974065 0.74408077
#11:     4    A -0.56525356         NA
#12:     3    D -0.07897854 0.91401552
#13:     4    C -0.64065713 1.15972463
#14:     4    E -0.03087801 0.67895741
#15:     4    B -0.14897461 1.27683063
#16:     3    B  0.28487787 0.69502367
#17:     2    A -0.27824564 0.46022423
#18:     1    B  0.64822113         NA
#19:     2    B  0.05520500         NA

在更改列名时也有效

 setnames(mydt, names(mydt), letters[1:3])
 head(myfn(mydt, 'a', b, c),2)
 #   b c       mean        sd
 #1: 2 C  0.5272596 0.5453627
 #2: 1 D -1.5019721 0.7129757

或者,您可以去掉 deparse(substitute(...)),把变量以带引号的字符串形式传入

 # Mean and sd of column `v1`, grouped by columns `v2` and `v3`.
 # All three arguments are column names given as character strings.
 myfn <- function(dt, v1, v2, v3) {
   dt[,
      .(
        # Convert the string to a symbol and evaluate it as a column in j.
        mean = mean(eval(as.name(v1))),
        sd = sd(eval(as.name(v1)))
      ),
      by = c(v2, v3)]
 }
myfn(mydt, 'vnum1', 'vfac1', 'vch1')

这是另一个变体,可以直接接受不带引号的列名

 # Mean and sd of column `v1`, grouped by `v2` and `v3`; all three columns
 # are passed as bare (unquoted) names.
 myfn <- function(dt, v1, v2, v3) {
   # match.call() holds the unevaluated argument expressions as supplied.
   call_args <- as.list(match.call())
   group_cols <- c(
     deparse(call_args[["v2"]]),
     deparse(call_args[["v3"]])
   )
   # Evaluating the captured symbol inside j resolves it to the column.
   dt[,
      list(
        mean = mean(eval(call_args[["v1"]])),
        sd = sd(eval(call_args[["v1"]]))
      ),
      by = group_cols]
 }

  head(myfn(mydt, vnum1, vfac1, vch1),2)
  #  vfac1 vch1       mean        sd
  #1:     2    C  0.5272596 0.5453627
  #2:     1    D -1.5019721 0.7129757

我认为可以借助 .SD 来简化,并使写法更符合 data.table 的惯用风格。另外,by 语句中也不需要做求值处理,因为(与 j 语句不同)data.table 不会到全局环境中去查找用于分组的变量。就这么简单

# Mean and sd of column `v1`, grouped by columns `v2` and `v3`.
# All three arguments are column names given as character strings.
myfn <- function(dt, v1, v2, v3) {
  # .SDcols limits .SD to the single column that is actually used, so
  # data.table does not build a per-group subset of every other column.
  # It also makes an unknown column name fail with an explicit error.
  dt[,
     .(mean = mean(.SD[[v1]]), sd = sd(.SD[[v1]])),
     by = c(v2, v3),
     .SDcols = v1]
}
myfn(mydt, "vnum1", "vfac1", "vch1")
##    vfac1 vch1       mean         sd
## 1:     2    C  0.5272596 0.54536269
## 2:     1    D -1.5019721 0.71297571
## 3:     1    E  1.1635478 2.13889714
## 4:     2    D  0.2242466 0.31039463
## 5:     1    A  0.2335971 1.10743823
## 6:     3    E -0.5699439 0.07656659
...

换用其他列名进行测试

setnames(mydt, letters[1:3])
head(myfn(mydt, "a", "b", "c"), 2)
#    b c       mean        sd
# 1: 2 C  0.5272596 0.5453627
# 2: 1 D -1.5019721 0.7129757

或者,您也可以使用 get,如

# Mean and sd of column `v1`, grouped by columns `v2` and `v3`.
# All three arguments are column names given as character strings.
myfn <- function(dt, v1, v2, v3) {
  dt[,
     # get() looks the column up by its name inside j.
     list(mean = mean(get(v1)), sd = sd(get(v1))),
     by = c(v2, v3)]
}

不过,就效率而言,在 j 语句中使用 @akrun 的 eval(as.name()) 组合应该是最快的,至少在 Arun/Matt 对 .SD 做出优化之前是如此