更快的回归模型方差分析

Faster anova of regression models

我有以下玩具数据-

data<-data.frame(y=rnorm(1000),x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000), x4=rnorm(1000))

根据这些数据,我创建了两个模型如下 -

fit1 = lm(y ~ x1 + x3, data)
fit2 = lm(y ~ x2 + x3 + x4, data)

最后,我使用 anova

比较这些模型
anova(fit1, fit2)

因为我 运行 这些测试是在大约 100 万个独立的数据集上进行的,所以我想提高性能并且 不想 使用 lmanova 函数。

为了加快计算速度,我使用 RcppArmadillo::fastLM 而不是 lm,但是没有可用的 anova 函数来比较模型。有没有更快的函数(相比lm),也有对应的anova函数?

我们将不胜感激任何改进性能的建议。

以下是下面使用的各种 lm 版本的性能结果。对于线性回归,fastLM 是一个明显的赢家。 @Jay 的 anova2 函数比 anova 函数快得多,并且适用于 fastLM。 @Donald 的函数 f 也比 anova 函数快并且适用于 fastLM。

microbenchmark::microbenchmark(
  base_lm_1 = fit1 <- lm(y ~ x1 + x3, data),
  base_lm_2 = fit2 <-lm(y ~ x2 + x3 + x4, data),
  lm.fit_1 = lm.fit1 <- with(data, .lm.fit(y = y, x = cbind(1, x1, x3))),
  lm.fit_2 = lm.fit2 <- with(data, .lm.fit(y = y, x = cbind(1, x2, x3, x4))),
  lm.fit_3 = lm.fit3 <- lm_fun(y ~ x1 + x3, data),
  lm.fit_4 = lm.fit4 <- lm_fun(y ~ x2 + x3 + x4, data),
  fastLm1 = fastLM1 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x1, x3))),
  fastLm2 = fastLM2 <- with(data, RcppArmadillo::fastLm(y = y, X = cbind(1, x2, x3, x4))),
  anova_base = anova(fit1, fit2),
  Jay_fastLM = anova2(fastLM1, fastLM1),
  Jay_lm.fit = anova2(lm.fit3, lm.fit4),
  Donald = f(fastLM1, fastLM1),
  times = 100L,
  control=list(warmup=100L)
)
Unit: microseconds
       expr      min        lq      mean    median        uq       max neval    cld
  base_lm_1 1472.480 1499.2560 1817.2659 1532.3840 1582.2615 28370.870   100     e 
  base_lm_2 1657.745 1706.5505 1796.3631 1744.3945 1825.7435  4761.020   100     e 
   lm.fit_1   94.212  106.9020  112.3093  111.2235  116.7010   147.192   100 a     
   lm.fit_2  124.220  129.8080  134.4455  132.9830  138.2510   156.166   100 a     
   lm.fit_3  853.906  873.9035  902.5856  889.9715  917.9375  1028.415   100   cd  
   lm.fit_4  991.238 1006.7015 1213.7061 1021.5325 1045.8980 19379.399   100    d  
    fastLm1  368.289  390.7805  434.1467  422.0855  476.9085   584.761   100 a c   
    fastLm2  416.645  441.8660  481.0027  462.8850  514.0755   617.619   100 a c   
 anova_base 2021.982 2099.8755 2322.2707 2190.3340 2246.7800 15345.093   100      f
 Jay_fastLM  202.026  218.2580  229.6244  226.3405  238.9490   303.964   100 ab    
 Jay_lm.fit  200.028  216.0805  234.0143  229.7580  246.1870   292.268   100 ab    
     Donald  549.425  582.8105  612.6990  605.4400  625.5340  1079.989   100  bc 

我们可以破解 stats:::anova.lmlist,以便它适用于 .lm.fit(注意圆点)和 RcppArmadillo::fastLm 生成的列表。请检查stats:::anova.lmlist,如果我没有删除你需要的东西!

anova2 <- function(object, ...) {
  objects <- list(object, ...)
  ns <- vapply(objects, function(x) length(x$residuals), 0L)
  stopifnot(!any(ns != ns[1L])) 
  resdf <- vapply(objects, df.residual, 0L)
  resdev <- vapply(objects, function(x) crossprod(residuals(x)), 0)
  bigmodel <- order(resdf)[1L]
  dfs <- c(NA, -diff(resdf))[bigmodel]
  ssq <- c(NA, -diff(resdev))[bigmodel]
  df.scale <- resdf[bigmodel]
  scale <- resdev[bigmodel]/resdf[bigmodel]
  fstat <- (ssq/dfs)/scale
  p <- pf(fstat, abs(dfs), df.scale, lower.tail=FALSE)
  return(c(F=fstat, p=p))
}

这些包装器使 .lm.fitRcppArmadillo::fastLm 更方便一些:

lm_fun <- function(fo, dat) {
  X <- model.matrix(fo, dat)
  fit <- .lm.fit(X, dat[[all.vars(fo)[1]]])
  fit$df.residual <- dim(X)[1] - dim(X)[2]
  return(fit)
}

fastLm_fun <- function(fo, dat) {
  fit <- RcppArmadillo::fastLm(model.matrix(fo, dat), dat[[all.vars(fo)[1]]])
  return(fit)
}

使用它

fit1 <- lm_fun(y ~ x1 + x3, data)
fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
anova2(fit1, fit2)
#         F         p 
# 0.3609728 0.5481032 

## Or use `fastLm_fun` here, but it's slower.

比较

anova(lm(y ~ x1 + x3, data), lm(y ~ x2 + x3 + x4, data))
# Analysis of Variance Table
# 
# Model 1: y ~ x1 + x3
# Model 2: y ~ x2 + x3 + x4
#   Res.Df    RSS Df Sum of Sq     F Pr(>F)
# 1    997 1003.7                          
# 2    996 1003.4  1   0.36365 0.361 0.5481

lm_fun(优于 fastLm_fun)与 anova2 相结合似乎比传统方法快 60%:

microbenchmark::microbenchmark(
  conventional={
    fit1 <- lm(y ~ x1 + x3, data)
    fit2 <- lm(y ~ x2 + x3 + x4, data)
    anova(fit1, fit2)
  },
  anova2={  ## using `.lm.fit`
    fit1 <- lm_fun(y ~ x1 + x3, data)
    fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
    anova2(fit1, fit2)
  },
  anova2_Fast={  ## using `RcppArmadillo::fastLm`
    fit1 <- fastLm_fun(y ~ x1 + x3, data)
    fit2 <- fastLm_fun(y ~ x2 + x3 + x4, data)
    anova2(fit1, fit2)
  },
  anova_Donald={  
    fit1 <- lm_fun(y ~ x1 + x3, data)
    fit2 <- lm_fun(y ~ x2 + x3 + x4, data)
    anova_Donald(fit1, fit2)
  }, 
  times=1000L,
  control=list(warmup=100L)
)
# Unit: milliseconds
#         expr      min       lq     mean   median       uq      max neval  cld
# conventional 2.885718 2.967053 3.205947 3.018668 3.090954 40.15720  1000    d
#       anova2 1.180683 1.218121 1.285131 1.233335 1.267833 23.81955  1000 a   
#  anova2_Fast 1.961897 2.012756 2.179458 2.037854 2.087893 26.65279  1000   c 
# anova_Donald 1.368699 1.409198 1.561751 1.430562 1.472148 33.12247  1000  b  

数据:

set.seed(42)
data <- data.frame(y=rnorm(1000), x1=rnorm(1000), x2=rnorm(1000), x3=rnorm(1000),
                   x4=rnorm(1000))