用户编写函数的快速循环
fast looping for user written functions
我写了两个函数
a) 首先创建模拟数据并估计模型
b) 第二个重复此过程多次,并从多次模拟中获得平均统计数据。
我想做的第三步是在不同的样本量中重复这个过程。我知道如何使用 for 循环执行此操作,但需要很长时间。有没有人对如何提高循环速度有建议?
特别是,我会对使用并行处理或评估替代循环包感兴趣,例如 purrr
。
这是一个例子:
# create a the first function simulates data and estimates the model
genmodel <- function (n,meanx,meany){
df <- as.data.frame(list(mean_x=rnorm(n=n, mean=meanx, sd=1)))
df <- df %>% mutate(mean_y=rnorm(n=n, mean=meany, sd=1))
model<- lm_robust(mean_y ~ mean_x, data=df,
se_type = "stata")
pval<- as.data.frame(list(p=summary(model)$coefficients)) %>% t()
pval <- as.data.frame(pval) %>% rownames_to_column()
return(pval)
}
# example
> genmodel(n=100,meanx=2,meany=1)
rowname (Intercept) mean_x
1 p.Estimate 9.984653e-01 -0.05115484
2 p.Std..Error 2.027905e-01 0.10273142
3 p.t.value 4.923630e+00 -0.49794738
4 p.Pr...t.. 3.441203e-06 0.61963671
5 p.CI.Lower 5.960341e-01 -0.25502201
6 p.CI.Upper 1.400896e+00 0.15271232
7 p.DF 9.800000e+01 98.00000000
生成第二个函数,该函数将第一个函数迭代多次并对估计的统计数据进行平均
average_model <- function(nrep=100, # number of simulations
n,
mean_x,
mean_y
){
tmpres<- lapply(1:nrep, function(x) genmodel(n=n,meanx=mean_x,meany=mean_y))
tmpres <- do.call(rbind, tmpres)
vec<- names(tmpres[2:ncol(tmpres)])
tmpres <- unique(setDT(tmpres)[,paste("avg",(vec),sep = "_"):=map(.SD,~ mean(.x)),by=rowname,.SDcols=(vec)
][,nobs:=n] %>% select(rowname,`avg_(Intercept)`,avg_mean_x,nobs))
}
# example
tst<-average_model(nrep=50,n=100,mean_x=2,mean_y=1)
rowname avg_(Intercept) avg_mean_x nobs
1: p.Estimate 1.06002378 -0.03100749 100
2: p.Std..Error 0.22368299 0.09921118 100
3: p.t.value 4.83878275 -0.31190506 100
4: p.Pr...t.. 0.00206157 0.45198433 100
5: p.CI.Lower 0.61613217 -0.22788884 100
6: p.CI.Upper 1.50391540 0.16587386 100
7: p.DF 98.00000000 98.00000000 100
现在我的 objective 是针对不同的样本大小迭代此 average_model
函数,并创建一个包含所有信息的唯一数据框。这可以使用 for
循环
轻松完成
for (i in seq(from=100,to=500,by=30)){
tmpres <- average_model(nrep=50,n=i,mean_x=2,mean_y=1)
results <- rbind(results, tmpres) # sequentially paste results
head(results)
rowname avg_(Intercept) avg_mean_x nobs
1: p.Estimate 1.001296821 0.000989775 100
2: p.Std..Error 0.224800002 0.099078646 100
3: p.t.value 4.530076894 0.027428073 100
4: p.Pr...t.. 0.001934362 0.504152193 100
5: p.CI.Lower 0.555188534 -0.195628574 100
6: p.CI.Upper 1.447405108 0.197608124 100
# it can also be done using `apply`, but both approach are quite slow
tmpres<- lapply(seq(from=100,to=500,by=30), function(x) average_model(nrep=50,n=x,mean_x=2,mean_y=1)
tmpres <- do.call(rbind, tmpres)
这个 for 循环的问题是它非常慢。
有没有办法使用并行处理来做到这一点?减少 运行 时间的其他建议?
这可以更直接地完成:
expand_grid(
nreps = 50,
n = seq.default(100, 500, by = 30),
mean_x = 2, mean_y = 1
) %>%
rowid_to_column("n_idx") %>%
uncount(nreps, .remove = FALSE) %>%
rowid_to_column("nreps_idx") %>%
rowwise() %>%
mutate(
lm_robust =
estimatr::lm_robust(
y ~ X,
data =
tibble(y = rnorm(n, mean = mean_y, sd = 1),
X = rnorm(n, mean = mean_x, sd = 1)),
se_type = "stata"
) %>%
coefficients() %>%
set_names(str_c("coef_", names(.))) %>%
list()
) %>%
unnest_wider(lm_robust) %>%
group_by(nreps_idx) %>%
summarise(
n = unique(n),
across(starts_with("coef"), mean),
)
这导致
# A tibble: 700 × 4
nreps_idx n `coef_(Intercept)` coef_X
<int> <dbl> <dbl> <dbl>
1 1 100 1.34 -0.183
2 2 100 0.845 0.0188
3 3 100 0.949 0.0341
4 4 100 1.20 -0.0705
5 5 100 0.731 0.0419
6 6 100 0.809 0.0564
7 7 100 0.920 0.0558
8 8 100 1.22 -0.0673
9 9 100 1.22 -0.171
10 10 100 1.26 -0.127
# … with 690 more rows
计算速度相当快。
现在,我没有在您的代码中包含所有参数,因为老实说,取它们的平均值没有意义,但如果您也想要它们,那么...
expand_grid(
nreps = 50,
n = seq.default(100, 500, by = 30),
mean_x = 2, mean_y = 1
) %>%
rowid_to_column("n_idx") %>%
uncount(nreps, .remove = FALSE) %>%
rowid_to_column("nreps_idx") %>%
rowwise() %>%
mutate(
lm_robust =
estimatr::lm_robust(
y ~ X,
data =
tibble(y = rnorm(n, mean = mean_y, sd = 1),
X = rnorm(n, mean = mean_x, sd = 1)),
se_type = "stata"
) %>%
# SECOND APPROACH
summary() %>%
`[[`("coefficients") %>%
as_tibble(rownames = "rowname") %>%
pivot_wider(names_from = "rowname",
values_from = everything()) %>%
# FIRST APPROACH
# coefficients() %>%
# set_names(str_c("coef_", names(.))) %>%
list()
) %>%
unnest_wider(lm_robust) %>%
print() %>%
group_by(nreps_idx) %>%
summarise(
n = unique(n),
across(starts_with("Estimate"), mean),
# insert statements here to summarise the other gathered stuff
)
但这让事情变得不必要的复杂。
这种“所有 data.table
”方法的速度大约是原来的两倍,但仍然令人失望。
基本思想是 assemble 将所有数据集合并为一个大 data.table
,然后使用 data.table
分组依据循环遍历模型。
library(data.table)
library(estimatr)
library(tictoc)
##
tic()
mf <- data.table(nrep=1:50, meanx=2, meany=1)
mf <- mf[, .(n=seq(100, 500, 30)), by=.(nrep, meanx, meany)]
data <- mf[, .(mean_x=rnorm(n, meanx), mean_y=rnorm(n, meany)), by=.(n, nrep, meanx, meany)]
result <- data[, as.data.table(t(summary(lm(mean_y~mean_x, .SD, se_type = 'stata'))$coefficients), keep.rownames = TRUE)
, by=.(n, nrep, meanx, meany)][, nrep:=NULL]
result <- result[, lapply(.SD, mean), by=.(n, meanx, meany, rn)]
toc()
## 2.58 sec elapsed
所以这在我的机器上需要 2.3 - 2.6 秒,而你的代码运行大约需要 4.0 - 4.1 秒。大约 80% 的时间花在 运行 lm_robust(...)
上。如果我将它换成 base R 中的 lm(...)
,它会在大约 1 秒内运行。
我写了两个函数
a) 首先创建模拟数据并估计模型
b) 第二个重复此过程多次,并从多次模拟中获得平均统计数据。
我想做的第三步是在不同的样本量中重复这个过程。我知道如何使用 for 循环执行此操作,但需要很长时间。有没有人对如何提高循环速度有建议?
特别是,我会对使用并行处理或评估替代循环包感兴趣,例如 purrr
。
这是一个例子:
# create a the first function simulates data and estimates the model
genmodel <- function (n,meanx,meany){
df <- as.data.frame(list(mean_x=rnorm(n=n, mean=meanx, sd=1)))
df <- df %>% mutate(mean_y=rnorm(n=n, mean=meany, sd=1))
model<- lm_robust(mean_y ~ mean_x, data=df,
se_type = "stata")
pval<- as.data.frame(list(p=summary(model)$coefficients)) %>% t()
pval <- as.data.frame(pval) %>% rownames_to_column()
return(pval)
}
# example
> genmodel(n=100,meanx=2,meany=1)
rowname (Intercept) mean_x
1 p.Estimate 9.984653e-01 -0.05115484
2 p.Std..Error 2.027905e-01 0.10273142
3 p.t.value 4.923630e+00 -0.49794738
4 p.Pr...t.. 3.441203e-06 0.61963671
5 p.CI.Lower 5.960341e-01 -0.25502201
6 p.CI.Upper 1.400896e+00 0.15271232
7 p.DF 9.800000e+01 98.00000000
生成第二个函数,该函数将第一个函数迭代多次并对估计的统计数据进行平均
average_model <- function(nrep=100, # number of simulations
n,
mean_x,
mean_y
){
tmpres<- lapply(1:nrep, function(x) genmodel(n=n,meanx=mean_x,meany=mean_y))
tmpres <- do.call(rbind, tmpres)
vec<- names(tmpres[2:ncol(tmpres)])
tmpres <- unique(setDT(tmpres)[,paste("avg",(vec),sep = "_"):=map(.SD,~ mean(.x)),by=rowname,.SDcols=(vec)
][,nobs:=n] %>% select(rowname,`avg_(Intercept)`,avg_mean_x,nobs))
}
# example
tst<-average_model(nrep=50,n=100,mean_x=2,mean_y=1)
rowname avg_(Intercept) avg_mean_x nobs
1: p.Estimate 1.06002378 -0.03100749 100
2: p.Std..Error 0.22368299 0.09921118 100
3: p.t.value 4.83878275 -0.31190506 100
4: p.Pr...t.. 0.00206157 0.45198433 100
5: p.CI.Lower 0.61613217 -0.22788884 100
6: p.CI.Upper 1.50391540 0.16587386 100
7: p.DF 98.00000000 98.00000000 100
现在我的 objective 是针对不同的样本大小迭代此 average_model
函数,并创建一个包含所有信息的唯一数据框。这可以使用 for
循环
for (i in seq(from=100,to=500,by=30)){
tmpres <- average_model(nrep=50,n=i,mean_x=2,mean_y=1)
results <- rbind(results, tmpres) # sequentially paste results
head(results)
rowname avg_(Intercept) avg_mean_x nobs
1: p.Estimate 1.001296821 0.000989775 100
2: p.Std..Error 0.224800002 0.099078646 100
3: p.t.value 4.530076894 0.027428073 100
4: p.Pr...t.. 0.001934362 0.504152193 100
5: p.CI.Lower 0.555188534 -0.195628574 100
6: p.CI.Upper 1.447405108 0.197608124 100
# it can also be done using `apply`, but both approach are quite slow
tmpres<- lapply(seq(from=100,to=500,by=30), function(x) average_model(nrep=50,n=x,mean_x=2,mean_y=1)
tmpres <- do.call(rbind, tmpres)
这个 for 循环的问题是它非常慢。
有没有办法使用并行处理来做到这一点?减少 运行 时间的其他建议?
这可以更直接地完成:
expand_grid(
nreps = 50,
n = seq.default(100, 500, by = 30),
mean_x = 2, mean_y = 1
) %>%
rowid_to_column("n_idx") %>%
uncount(nreps, .remove = FALSE) %>%
rowid_to_column("nreps_idx") %>%
rowwise() %>%
mutate(
lm_robust =
estimatr::lm_robust(
y ~ X,
data =
tibble(y = rnorm(n, mean = mean_y, sd = 1),
X = rnorm(n, mean = mean_x, sd = 1)),
se_type = "stata"
) %>%
coefficients() %>%
set_names(str_c("coef_", names(.))) %>%
list()
) %>%
unnest_wider(lm_robust) %>%
group_by(nreps_idx) %>%
summarise(
n = unique(n),
across(starts_with("coef"), mean),
)
这导致
# A tibble: 700 × 4
nreps_idx n `coef_(Intercept)` coef_X
<int> <dbl> <dbl> <dbl>
1 1 100 1.34 -0.183
2 2 100 0.845 0.0188
3 3 100 0.949 0.0341
4 4 100 1.20 -0.0705
5 5 100 0.731 0.0419
6 6 100 0.809 0.0564
7 7 100 0.920 0.0558
8 8 100 1.22 -0.0673
9 9 100 1.22 -0.171
10 10 100 1.26 -0.127
# … with 690 more rows
计算速度相当快。
现在,我没有在您的代码中包含所有参数,因为老实说,取它们的平均值没有意义,但如果您也想要它们,那么...
expand_grid(
nreps = 50,
n = seq.default(100, 500, by = 30),
mean_x = 2, mean_y = 1
) %>%
rowid_to_column("n_idx") %>%
uncount(nreps, .remove = FALSE) %>%
rowid_to_column("nreps_idx") %>%
rowwise() %>%
mutate(
lm_robust =
estimatr::lm_robust(
y ~ X,
data =
tibble(y = rnorm(n, mean = mean_y, sd = 1),
X = rnorm(n, mean = mean_x, sd = 1)),
se_type = "stata"
) %>%
# SECOND APPROACH
summary() %>%
`[[`("coefficients") %>%
as_tibble(rownames = "rowname") %>%
pivot_wider(names_from = "rowname",
values_from = everything()) %>%
# FIRST APPROACH
# coefficients() %>%
# set_names(str_c("coef_", names(.))) %>%
list()
) %>%
unnest_wider(lm_robust) %>%
print() %>%
group_by(nreps_idx) %>%
summarise(
n = unique(n),
across(starts_with("Estimate"), mean),
# insert statements here to summarise the other gathered stuff
)
但这让事情变得不必要的复杂。
这种“所有 data.table
”方法的速度大约是原来的两倍,但仍然令人失望。
基本思想是 assemble 将所有数据集合并为一个大 data.table
,然后使用 data.table
分组依据循环遍历模型。
library(data.table)
library(estimatr)
library(tictoc)
##
tic()
mf <- data.table(nrep=1:50, meanx=2, meany=1)
mf <- mf[, .(n=seq(100, 500, 30)), by=.(nrep, meanx, meany)]
data <- mf[, .(mean_x=rnorm(n, meanx), mean_y=rnorm(n, meany)), by=.(n, nrep, meanx, meany)]
result <- data[, as.data.table(t(summary(lm(mean_y~mean_x, .SD, se_type = 'stata'))$coefficients), keep.rownames = TRUE)
, by=.(n, nrep, meanx, meany)][, nrep:=NULL]
result <- result[, lapply(.SD, mean), by=.(n, meanx, meany, rn)]
toc()
## 2.58 sec elapsed
所以这在我的机器上需要 2.3 - 2.6 秒,而你的代码运行大约需要 4.0 - 4.1 秒。大约 80% 的时间花在 运行 lm_robust(...)
上。如果我将它换成 base R 中的 lm(...)
,它会在大约 1 秒内运行。