使用 (d)plyr 创建一系列新列
Create a series of new columns with (d)plyr
举例来说,我有一个包含列 a
的数据框,我想为 i
.
的多个值创建列 a^i
> dat <- data.frame(a=1:5)
> dat
a
1 1
2 2
3 3
4 4
5 5
例如,我想要 i=2:5
的输出:
a power_2 power_3 power_4 power_5
1 1 1 1 1 1
2 2 4 8 16 32
3 3 9 27 81 243
4 4 16 64 256 1024
5 5 25 125 625 3125
目前我使用 data.table
得到如下输出:
DT <- data.table(dat)
exponents <- 2:5
DT[, paste0("power_",exponents):=lapply(exponents, function(p) a^p)]
如何处理plyr
/dplyr
?当然,我可以通过为每个 i
输入 power_i=a^i
来执行以下操作,但这不是我想要的。
mutate(dat, power_2=a^2, power_3=a^3, ...)
答案后的结论
已经提出了几个答案,并由@docendo discimus 进行了比较。我只是添加与 data.table
的比较。
library(data.table)
library(dplyr)
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
DT <- data.table(dat)
library(microbenchmark)
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
times = 30,
unit = "relative"
)
Unit: relative
expr min lq mean median uq max neval cld
data.table 1.022945 1.039674 1.108558 1.026319 1.083644 2.370180 30 a
Henrik 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
dd.do 1.149195 1.160735 1.167672 1.158141 1.150280 1.268279 30 a
dd.bc 14.350034 13.982658 13.737964 13.632361 13.606221 15.866711 30 b
更新基准 有两个 base
解决方案,Henrik2 和 josh(来自他的评论),这是最快的:
set.seed(2015)
dat <- data.frame(a = sample(1000))
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`)),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
times = 30,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# data.table 1.991613 2.029778 1.982169 1.990417 1.946677 1.694030 30 bc
# Henrik 2.026345 2.017179 1.996419 2.003189 2.030176 1.733583 30 bc
# Henrik2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
# dd.do 2.356886 2.375713 2.322452 2.348053 2.304826 2.101494 30 c
# dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116 30 d
# josh 1.725750 1.699887 1.641290 1.625331 1.637823 1.330598 30 b
这是一个使用 do
的选项:
i <- 2:5
n <- c(names(dat), paste0("power_", i))
dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
另一种选择,使用bind_cols
:
dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
评论后编辑:
@Henrik 的解决方案比我的快:
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
library(microbenchmark)
microbenchmark(
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
times = 30,
unit = "relative"
)
Unit: relative
expr min lq median uq max neval
Henrik 1.000000 1.000000 1.000000 1.000000 1.000000 30
dd.do 1.138506 1.179104 1.173298 1.149581 2.660237 30
dd.bc 18.862923 18.702178 18.058984 17.537727 16.426538 30
一种可能是在do
中使用outer
,然后将名称设置为setNames
i <- 2:5
dat %>%
do(data.frame(., outer(.$a, i, `^`))) %>%
setNames(., c("a", paste0("power_", i)))
# a power_2 power_3 power_4 power_5
# 1 1 1 1 1 1
# 2 2 4 8 16 32
# 3 3 9 27 81 243
# 4 4 16 64 256 1024
# 5 5 25 125 625 3125
如果你先命名 'power vector' "i",你可以调用 cbind
而不是 do
和 data.frame
,我认为没有立即需要dplyr
在这种特殊情况下起作用。
cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`))
# a power_2 power_3 power_4 power_5
# 1 1 1 1 1 1
# 2 2 4 8 16 32
# 3 3 9 27 81 243
# 4 4 16 64 256 1024
# 5 5 25 125 625 3125
base
,非 do
代码对于较大的样本数据来说速度更快。我还添加了@Josh O'Brien 的 base
解决方案。
set.seed(2015)
dat <- data.frame(a = sample(1000))
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`)),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
times = 30,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# data.table 1.991613 2.029778 1.982169 1.990417 1.946677 1.694030 30 bc
# Henrik 2.026345 2.017179 1.996419 2.003189 2.030176 1.733583 30 bc
# Henrik2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
# dd.do 2.356886 2.375713 2.322452 2.348053 2.304826 2.101494 30 c
# dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116 30 d
# josh 1.725750 1.699887 1.641290 1.625331 1.637823 1.330598 30 b
也许这也有帮助
nm1 <- paste('power', 2:5, sep="_")
lst <- setNames(as.list(2:5), nm1)
dat1 <- setNames(as.data.frame(replicate(4, 1:5)),c('a', nm1) )
mutate_each_(dat1, funs(.^lst$.), nm1)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
举例来说,我有一个包含列 a
的数据框,我想为 i
.
a^i
> dat <- data.frame(a=1:5)
> dat
a
1 1
2 2
3 3
4 4
5 5
例如,我想要 i=2:5
的输出:
a power_2 power_3 power_4 power_5
1 1 1 1 1 1
2 2 4 8 16 32
3 3 9 27 81 243
4 4 16 64 256 1024
5 5 25 125 625 3125
目前我使用 data.table
得到如下输出:
DT <- data.table(dat)
exponents <- 2:5
DT[, paste0("power_",exponents):=lapply(exponents, function(p) a^p)]
如何处理plyr
/dplyr
?当然,我可以通过为每个 i
输入 power_i=a^i
来执行以下操作,但这不是我想要的。
mutate(dat, power_2=a^2, power_3=a^3, ...)
答案后的结论
已经提出了几个答案,并由@docendo discimus 进行了比较。我只是添加与 data.table
的比较。
library(data.table)
library(dplyr)
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
DT <- data.table(dat)
library(microbenchmark)
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
times = 30,
unit = "relative"
)
Unit: relative
expr min lq mean median uq max neval cld
data.table 1.022945 1.039674 1.108558 1.026319 1.083644 2.370180 30 a
Henrik 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
dd.do 1.149195 1.160735 1.167672 1.158141 1.150280 1.268279 30 a
dd.bc 14.350034 13.982658 13.737964 13.632361 13.606221 15.866711 30 b
更新基准 有两个 base
解决方案,Henrik2 和 josh(来自他的评论),这是最快的:
set.seed(2015)
dat <- data.frame(a = sample(1000))
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`)),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
times = 30,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# data.table 1.991613 2.029778 1.982169 1.990417 1.946677 1.694030 30 bc
# Henrik 2.026345 2.017179 1.996419 2.003189 2.030176 1.733583 30 bc
# Henrik2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
# dd.do 2.356886 2.375713 2.322452 2.348053 2.304826 2.101494 30 c
# dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116 30 d
# josh 1.725750 1.699887 1.641290 1.625331 1.637823 1.330598 30 b
这是一个使用 do
的选项:
i <- 2:5
n <- c(names(dat), paste0("power_", i))
dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
另一种选择,使用bind_cols
:
dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
评论后编辑:
@Henrik 的解决方案比我的快:
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
library(microbenchmark)
microbenchmark(
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
times = 30,
unit = "relative"
)
Unit: relative
expr min lq median uq max neval
Henrik 1.000000 1.000000 1.000000 1.000000 1.000000 30
dd.do 1.138506 1.179104 1.173298 1.149581 2.660237 30
dd.bc 18.862923 18.702178 18.058984 17.537727 16.426538 30
一种可能是在do
中使用outer
,然后将名称设置为setNames
i <- 2:5
dat %>%
do(data.frame(., outer(.$a, i, `^`))) %>%
setNames(., c("a", paste0("power_", i)))
# a power_2 power_3 power_4 power_5
# 1 1 1 1 1 1
# 2 2 4 8 16 32
# 3 3 9 27 81 243
# 4 4 16 64 256 1024
# 5 5 25 125 625 3125
如果你先命名 'power vector' "i",你可以调用 cbind
而不是 do
和 data.frame
,我认为没有立即需要dplyr
在这种特殊情况下起作用。
cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`))
# a power_2 power_3 power_4 power_5
# 1 1 1 1 1 1
# 2 2 4 8 16 32
# 3 3 9 27 81 243
# 4 4 16 64 256 1024
# 5 5 25 125 625 3125
base
,非 do
代码对于较大的样本数据来说速度更快。我还添加了@Josh O'Brien 的 base
解决方案。
set.seed(2015)
dat <- data.frame(a = sample(1000))
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`)),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
times = 30,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# data.table 1.991613 2.029778 1.982169 1.990417 1.946677 1.694030 30 bc
# Henrik 2.026345 2.017179 1.996419 2.003189 2.030176 1.733583 30 bc
# Henrik2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
# dd.do 2.356886 2.375713 2.322452 2.348053 2.304826 2.101494 30 c
# dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116 30 d
# josh 1.725750 1.699887 1.641290 1.625331 1.637823 1.330598 30 b
也许这也有帮助
nm1 <- paste('power', 2:5, sep="_")
lst <- setNames(as.list(2:5), nm1)
dat1 <- setNames(as.data.frame(replicate(4, 1:5)),c('a', nm1) )
mutate_each_(dat1, funs(.^lst$.), nm1)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125