使用 (d)plyr 创建一系列新列

Create a series of new columns with (d)plyr

举例来说,我有一个包含列 a 的数据框,我想为 i.

的多个值创建列 a^i
> dat <- data.frame(a=1:5)
> dat
    a
1   1
2   2
3   3
4   4
5   5

例如,我想要 i=2:5 的输出:

  a power_2 power_3 power_4 power_5
1 1       1       1       1       1
2 2       4       8      16      32
3 3       9      27      81     243
4 4      16      64     256    1024
5 5      25     125     625    3125

目前我使用 data.table 得到如下输出:

DT <- data.table(dat)
exponents <- 2:5
DT[, paste0("power_",exponents):=lapply(exponents, function(p) a^p)]

如何处理plyr/dplyr?当然,我可以通过为每个 i 输入 power_i=a^i 来执行以下操作,但这不是我想要的。

mutate(dat, power_2=a^2, power_3=a^3, ...)

答案后的结论

已经提出了几个答案,并由@docendo discimus 进行了比较。我只是添加与 data.table 的比较。

library(data.table)
library(dplyr)
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
DT <-  data.table(dat)

library(microbenchmark)

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  times = 30,
  unit = "relative"
)
Unit: relative
       expr       min        lq      mean    median        uq       max neval cld
 data.table  1.022945  1.039674  1.108558  1.026319  1.083644  2.370180    30  a 
     Henrik  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30  a 
      dd.do  1.149195  1.160735  1.167672  1.158141  1.150280  1.268279    30  a 
      dd.bc 14.350034 13.982658 13.737964 13.632361 13.606221 15.866711    30   b

更新基准 有两个 base 解决方案,Henrik2 和 josh(来自他的评论),这是最快的:

set.seed(2015)
dat <- data.frame(a = sample(1000))

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`)),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
  times = 30,
  unit = "relative"
)

# Unit: relative
#       expr       min        lq      mean    median        uq       max neval  cld
# data.table  1.991613  2.029778  1.982169  1.990417  1.946677  1.694030    30  bc 
#     Henrik  2.026345  2.017179  1.996419  2.003189  2.030176  1.733583    30  bc 
#    Henrik2  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30 a   
#      dd.do  2.356886  2.375713  2.322452  2.348053  2.304826  2.101494    30   c 
#      dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116    30    d
#       josh  1.725750  1.699887  1.641290  1.625331  1.637823  1.330598    30  b

这是一个使用 do 的选项:

i <- 2:5
n <- c(names(dat), paste0("power_", i))
dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n)
#  a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125

另一种选择,使用bind_cols

dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n)
#  a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125

评论后编辑:

@Henrik 的解决方案比我的快:

set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))

library(microbenchmark)

microbenchmark(
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  times = 30,
  unit = "relative"
  )
Unit: relative
   expr       min        lq    median        uq       max neval
 Henrik  1.000000  1.000000  1.000000  1.000000  1.000000    30
  dd.do  1.138506  1.179104  1.173298  1.149581  2.660237    30
  dd.bc 18.862923 18.702178 18.058984 17.537727 16.426538    30

一种可能是在do中使用outer,然后将名称设置为setNames

i <- 2:5
dat %>%
  do(data.frame(., outer(.$a, i, `^`))) %>%
  setNames(., c("a", paste0("power_", i)))

#   a power_2 power_3 power_4 power_5
# 1 1       1       1       1       1
# 2 2       4       8      16      32
# 3 3       9      27      81     243
# 4 4      16      64     256    1024
# 5 5      25     125     625    3125

如果你先命名 'power vector' "i",你可以调用 cbind 而不是 dodata.frame,我认为没有立即需要dplyr 在这种特殊情况下起作用。

cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`))
#   a power_2 power_3 power_4 power_5
# 1 1       1       1       1       1
# 2 2       4       8      16      32
# 3 3       9      27      81     243
# 4 4      16      64     256    1024
# 5 5      25     125     625    3125

base,非 do 代码对于较大的样本数据来说速度更快。我还添加了@Josh O'Brien 的 base 解决方案。

set.seed(2015)
dat <- data.frame(a = sample(1000))

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`)),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
  times = 30,
  unit = "relative"
)

# Unit: relative
#       expr       min        lq      mean    median        uq       max neval  cld
# data.table  1.991613  2.029778  1.982169  1.990417  1.946677  1.694030    30  bc 
#     Henrik  2.026345  2.017179  1.996419  2.003189  2.030176  1.733583    30  bc 
#    Henrik2  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30 a   
#      dd.do  2.356886  2.375713  2.322452  2.348053  2.304826  2.101494    30   c 
#      dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116    30    d
#       josh  1.725750  1.699887  1.641290  1.625331  1.637823  1.330598    30  b

也许这也有帮助

nm1 <- paste('power', 2:5, sep="_")
lst <- setNames(as.list(2:5), nm1)
dat1 <- setNames(as.data.frame(replicate(4, 1:5)),c('a', nm1) )
mutate_each_(dat1, funs(.^lst$.), nm1)
#    a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125