更快的回归模型方差分析
Faster anova of regression models
我有以下玩具数据-
data<-data.frame(y=rnorm(1000),x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000), x4=rnorm(1000))
根据这些数据,我创建了两个模型如下 -
fit1 = lm(y ~ x1 + x3, data)
fit2 = lm(y ~ x2 + x3 + x4, data)
最后,我使用 anova
比较这些模型
anova(fit1, fit2)
因为我 运行 这些测试是在大约 100 万个独立的数据集上进行的,所以我想提高性能并且 不想 使用 lm
和 anova
函数。
为了加快计算速度,我使用 RcppArmadillo::fastLM
而不是 lm
,但是没有可用的 anova
函数来比较模型。有没有更快的函数(相比lm
),也有对应的anova
函数?
我们将不胜感激任何改进性能的建议。
以下是下面使用的各种 lm
版本的性能结果。对于线性回归,fastLM
是一个明显的赢家。 @Jay 的 anova2
函数比 anova
函数快得多,并且适用于 fastLM。 @Donald 的函数 f
也比 anova
函数快并且适用于 fastLM。
microbenchmark::microbenchmark(
base_lm_1 = fit1 <- lm(y ~ x1 + x3, data),
base_lm_2 = fit2 <-lm(y ~ x2 + x3 + x4, data),
lm.fit_1 = lm.fit1 <- with(data, .lm.fit(y = y, x = cbind(1, x1, x3))),
lm.fit_2 = lm.fit2 <- with(data, .lm.fit(y = y, x = cbind(1, x2, x3, x4))),
lm.fit_3 = lm.fit3 <- lm_fun(y ~ x1 + x3, data),
lm.fit_4 = lm.fit4 <- lm_fun(y ~ x2 + x3 + x4, data),
fastLm1 = fastLM1 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x1, x3))),
fastLm2 = fastLM2 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x2, x3, x4))),
anova_base = anova(fit1, fit2),
Jay_fastLM = anova2(fastLM1, fastLM1),
Jay_lm.fit = anova2(lm.fit3, lm.fit4),
Donald = f(fastLM1, fastLM1),
times = 100L,
control=list(warmup=100L)
)
Unit: microseconds
expr min lq mean median uq max neval cld
base_lm_1 1472.480 1499.2560 1817.2659 1532.3840 1582.2615 28370.870 100 e
base_lm_2 1657.745 1706.5505 1796.3631 1744.3945 1825.7435 4761.020 100 e
lm.fit_1 94.212 106.9020 112.3093 111.2235 116.7010 147.192 100 a
lm.fit_2 124.220 129.8080 134.4455 132.9830 138.2510 156.166 100 a
lm.fit_3 853.906 873.9035 902.5856 889.9715 917.9375 1028.415 100 cd
lm.fit_4 991.238 1006.7015 1213.7061 1021.5325 1045.8980 19379.399 100 d
fastLm1 368.289 390.7805 434.1467 422.0855 476.9085 584.761 100 a c
fastLm2 416.645 441.8660 481.0027 462.8850 514.0755 617.619 100 a c
anova_base 2021.982 2099.8755 2322.2707 2190.3340 2246.7800 15345.093 100 f
Jay_fastLM 202.026 218.2580 229.6244 226.3405 238.9490 303.964 100 ab
Jay_lm.fit 200.028 216.0805 234.0143 229.7580 246.1870 292.268 100 ab
Donald 549.425 582.8105 612.6990 605.4400 625.5340 1079.989 100 bc
我们可以破解 stats:::anova.lmlist
,以便它适用于 .lm.fit
(注意圆点)和 RcppArmadillo::fastLm
生成的列表。请检查stats:::anova.lmlist
,如果我没有删除你需要的东西!
anova2 <- function(object, ...) {
objects <- list(object, ...)
ns <- vapply(objects, function(x) length(x$residuals), 0L)
stopifnot(!any(ns != ns[1L]))
resdf <- vapply(objects, df.residual, 0L)
resdev <- vapply(objects, function(x) crossprod(residuals(x)), 0)
bigmodel <- order(resdf)[1L]
dfs <- c(NA, -diff(resdf))[bigmodel]
ssq <- c(NA, -diff(resdev))[bigmodel]
df.scale <- resdf[bigmodel]
scale <- resdev[bigmodel]/resdf[bigmodel]
fstat <- (ssq/dfs)/scale
p <- pf(fstat, abs(dfs), df.scale, lower.tail=FALSE)
return(c(F=fstat, p=p))
}
这些包装器使 .lm.fit
和 RcppArmadillo::fastLm
更方便一些:
lm_fun <- function(fo, dat) {
X <- model.matrix(fo, dat)
fit <- .lm.fit(X, dat[[all.vars(fo)[1]]])
fit$df.residual <- dim(X)[1] - dim(X)[2]
return(fit)
}
fastLm_fun <- function(fo, dat) {
fit <- RcppArmadillo::fastLm(model.matrix(fo, dat), dat[[all.vars(fo)[1]]])
return(fit)
}
使用它
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
# F p
# 0.3609728 0.5481032
## Or use `fastLm_fun` here, but it's slower.
比较
anova(lm(y ~ x1 + x3, data), lm(y ~ x2 + x3 + x4, data))
# Analysis of Variance Table
#
# Model 1: y ~ x1 + x3
# Model 2: y ~ x2 + x3 + x4
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 997 1003.7
# 2 996 1003.4 1 0.36365 0.361 0.5481
lm_fun
(优于 fastLm_fun
)与 anova2
相结合似乎比传统方法快 60%:
microbenchmark::microbenchmark(
conventional={
fit1 <- lm(y ~ x1 + x3, data)
fit2 <- lm(y ~ x2 + x3 + x4, data)
anova(fit1, fit2)
},
anova2={ ## using `.lm.fit`
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
},
anova2_Fast={ ## using `RcppArmadillo::fastLm`
fit1 <- fastLm_fun(y ~ x1 + x3, data)
fit2 <- fastLm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
},
anova_Donald={
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova_Donald(fit1, fit2)
},
times=1000L,
control=list(warmup=100L)
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# conventional 2.885718 2.967053 3.205947 3.018668 3.090954 40.15720 1000 d
# anova2 1.180683 1.218121 1.285131 1.233335 1.267833 23.81955 1000 a
# anova2_Fast 1.961897 2.012756 2.179458 2.037854 2.087893 26.65279 1000 c
# anova_Donald 1.368699 1.409198 1.561751 1.430562 1.472148 33.12247 1000 b
数据:
set.seed(42)
data <- data.frame(y=rnorm(1000), x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000),
x4=rnorm(1000))
我有以下玩具数据-
data<-data.frame(y=rnorm(1000),x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000), x4=rnorm(1000))
根据这些数据,我创建了两个模型如下 -
fit1 = lm(y ~ x1 + x3, data)
fit2 = lm(y ~ x2 + x3 + x4, data)
最后,我使用 anova
anova(fit1, fit2)
因为我 运行 这些测试是在大约 100 万个独立的数据集上进行的,所以我想提高性能并且 不想 使用 lm
和 anova
函数。
为了加快计算速度,我使用 RcppArmadillo::fastLM
而不是 lm
,但是没有可用的 anova
函数来比较模型。有没有更快的函数(相比lm
),也有对应的anova
函数?
我们将不胜感激任何改进性能的建议。
以下是下面使用的各种 lm
版本的性能结果。对于线性回归,fastLM
是一个明显的赢家。 @Jay 的 anova2
函数比 anova
函数快得多,并且适用于 fastLM。 @Donald 的函数 f
也比 anova
函数快并且适用于 fastLM。
microbenchmark::microbenchmark(
base_lm_1 = fit1 <- lm(y ~ x1 + x3, data),
base_lm_2 = fit2 <-lm(y ~ x2 + x3 + x4, data),
lm.fit_1 = lm.fit1 <- with(data, .lm.fit(y = y, x = cbind(1, x1, x3))),
lm.fit_2 = lm.fit2 <- with(data, .lm.fit(y = y, x = cbind(1, x2, x3, x4))),
lm.fit_3 = lm.fit3 <- lm_fun(y ~ x1 + x3, data),
lm.fit_4 = lm.fit4 <- lm_fun(y ~ x2 + x3 + x4, data),
fastLm1 = fastLM1 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x1, x3))),
fastLm2 = fastLM2 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x2, x3, x4))),
anova_base = anova(fit1, fit2),
Jay_fastLM = anova2(fastLM1, fastLM1),
Jay_lm.fit = anova2(lm.fit3, lm.fit4),
Donald = f(fastLM1, fastLM1),
times = 100L,
control=list(warmup=100L)
)
Unit: microseconds
expr min lq mean median uq max neval cld
base_lm_1 1472.480 1499.2560 1817.2659 1532.3840 1582.2615 28370.870 100 e
base_lm_2 1657.745 1706.5505 1796.3631 1744.3945 1825.7435 4761.020 100 e
lm.fit_1 94.212 106.9020 112.3093 111.2235 116.7010 147.192 100 a
lm.fit_2 124.220 129.8080 134.4455 132.9830 138.2510 156.166 100 a
lm.fit_3 853.906 873.9035 902.5856 889.9715 917.9375 1028.415 100 cd
lm.fit_4 991.238 1006.7015 1213.7061 1021.5325 1045.8980 19379.399 100 d
fastLm1 368.289 390.7805 434.1467 422.0855 476.9085 584.761 100 a c
fastLm2 416.645 441.8660 481.0027 462.8850 514.0755 617.619 100 a c
anova_base 2021.982 2099.8755 2322.2707 2190.3340 2246.7800 15345.093 100 f
Jay_fastLM 202.026 218.2580 229.6244 226.3405 238.9490 303.964 100 ab
Jay_lm.fit 200.028 216.0805 234.0143 229.7580 246.1870 292.268 100 ab
Donald 549.425 582.8105 612.6990 605.4400 625.5340 1079.989 100 bc
我们可以破解 stats:::anova.lmlist
,以便它适用于 .lm.fit
(注意圆点)和 RcppArmadillo::fastLm
生成的列表。请检查stats:::anova.lmlist
,如果我没有删除你需要的东西!
anova2 <- function(object, ...) {
objects <- list(object, ...)
ns <- vapply(objects, function(x) length(x$residuals), 0L)
stopifnot(!any(ns != ns[1L]))
resdf <- vapply(objects, df.residual, 0L)
resdev <- vapply(objects, function(x) crossprod(residuals(x)), 0)
bigmodel <- order(resdf)[1L]
dfs <- c(NA, -diff(resdf))[bigmodel]
ssq <- c(NA, -diff(resdev))[bigmodel]
df.scale <- resdf[bigmodel]
scale <- resdev[bigmodel]/resdf[bigmodel]
fstat <- (ssq/dfs)/scale
p <- pf(fstat, abs(dfs), df.scale, lower.tail=FALSE)
return(c(F=fstat, p=p))
}
这些包装器使 .lm.fit
和 RcppArmadillo::fastLm
更方便一些:
lm_fun <- function(fo, dat) {
X <- model.matrix(fo, dat)
fit <- .lm.fit(X, dat[[all.vars(fo)[1]]])
fit$df.residual <- dim(X)[1] - dim(X)[2]
return(fit)
}
fastLm_fun <- function(fo, dat) {
fit <- RcppArmadillo::fastLm(model.matrix(fo, dat), dat[[all.vars(fo)[1]]])
return(fit)
}
使用它
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
# F p
# 0.3609728 0.5481032
## Or use `fastLm_fun` here, but it's slower.
比较
anova(lm(y ~ x1 + x3, data), lm(y ~ x2 + x3 + x4, data))
# Analysis of Variance Table
#
# Model 1: y ~ x1 + x3
# Model 2: y ~ x2 + x3 + x4
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 997 1003.7
# 2 996 1003.4 1 0.36365 0.361 0.5481
lm_fun
(优于 fastLm_fun
)与 anova2
相结合似乎比传统方法快 60%:
microbenchmark::microbenchmark(
conventional={
fit1 <- lm(y ~ x1 + x3, data)
fit2 <- lm(y ~ x2 + x3 + x4, data)
anova(fit1, fit2)
},
anova2={ ## using `.lm.fit`
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
},
anova2_Fast={ ## using `RcppArmadillo::fastLm`
fit1 <- fastLm_fun(y ~ x1 + x3, data)
fit2 <- fastLm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
},
anova_Donald={
fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova_Donald(fit1, fit2)
},
times=1000L,
control=list(warmup=100L)
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# conventional 2.885718 2.967053 3.205947 3.018668 3.090954 40.15720 1000 d
# anova2 1.180683 1.218121 1.285131 1.233335 1.267833 23.81955 1000 a
# anova2_Fast 1.961897 2.012756 2.179458 2.037854 2.087893 26.65279 1000 c
# anova_Donald 1.368699 1.409198 1.561751 1.430562 1.472148 33.12247 1000 b
数据:
set.seed(42)
data <- data.frame(y=rnorm(1000), x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000),
x4=rnorm(1000))