创建递归样本外估计以计算 R 中的 RMSE 的有效方法

Efficient way to create recursive out-of-sample estimates in order to calculate RMSE in R

我正在尝试评估不同 OLS 模型的样本外预测性能。最简单的时间序列回归如下所示: Y_t = b0 + b1 * Y_t-30 + e_t

模型的拟合周期假设为 50,然后我使用 dynlm 包运行模型

dynlm(as.zoo(Y) ~ L(as.zoo(Y), 30), start = "1996-01-01", end = timelist[i])

在我当前的代码中,我只是让索引 i 一直运行到最后,并保存相应模型的 RMSE。但是这个 RMSE 并不是一步超前预测的样本外 RMSE。由于我目前的代码已经很慢,而且它甚至没有完全实现我想要的功能,我想请教大家:有没有什么软件包可以帮助我实现目标?

总而言之,我想做以下事情:

1) 在某个拟合周期之后运行递归回归(扩展窗口,而非滚动窗口)

2) 创建领先一步的样本外预测

3) 计算这些预测与实际观测值的均方根误差,以评估模型性能

到目前为止,我尝试使用巨大的 for 循环和 dynlm 包来执行此操作,但结果并不令人满意。 非常感谢任何输入,因为我一直在寻找解决方案已有一段时间了。我会在取得一些进展后立即更新我的示例代码。

# minimal working example
# load packages with library() (errors loudly if missing), not require()
library(xts)
library(zoo)
library(dynlm)
timelist <- seq.Date(from = as.Date("1996-01-01"), to = as.Date("1998-01-01"), by = "days")
set.seed(123)
Y <- xts(rnorm(n = length(timelist)), order.by = timelist)
X <- xts(rnorm(n = length(timelist), mean = 10), order.by = timelist)
# rmse container, preallocated with one row per date
# (timelist is a plain Date vector, so length(timelist) suffices;
# the original's length(index(timelist)) was redundant)
rmse.container.full <- data.frame(matrix(NA, ncol = 3, nrow = length(timelist)))
colnames(rmse.container.full) <- c("Date", "i", "rmse.m1")
rmse.container.full$Date <- timelist
# fitting period: expanding window starting after 50 observations
for (i in 50:length(timelist)) {
  # m1: regress Y on X lagged 30 periods, window from start through timelist[i]
  model1 <- dynlm(as.zoo(Y) ~ L(as.zoo(X), 30), start = "1996-01-01", end = timelist[i])
  rmse.container.full[i, 2] <- i
  # NOTE: summary()$sigma is the IN-SAMPLE residual standard error, not an
  # out-of-sample one-step-ahead RMSE — this is exactly the asker's problem
  rmse.container.full[i, 3] <- summary(model1)$sigma # RSME mod1 etc
  print(i)
}

好吧,作为提出问题的人,我想贡献一下我是如何解决我的问题的:

因为我只需要领先一步的预测,所以我可以丢弃其他所有东西,这使得代码运行速度快得多(每个模型从 12 分钟缩短到 10 秒)。

我自己创建了完整的数据帧(包括滞后)并使用 lm 而不是 dynlm。 下面的代码给了我想要的结果(我手动检查了前几个观察结果,它似乎有效)。代码改编自这里:Recursive regression in R

      # One-step-ahead recursive (expanding-window) forecasts:
      # for each window end i, fit on rows 1..i and predict row i + 1.
      mod1.predictions <- lapply(seq(1400, nrow(df.full) - 1), function(i) {
        fit <- lm(Y ~ X, data = df.full[seq_len(i), ])
        predict(fit, newdata = df.full[i + 1, ])
      })

为了计算 RMSE,我使用了这个函数

# rmse: root mean squared error between predicted values `sim` and
# observed values `obs`; pairs containing NA are dropped (na.rm = TRUE)
rmse <- function(sim, obs) {
  sqrt(mean((sim - obs)^2, na.rm = TRUE))
}

感谢 CrossValidated 的提示,帮助很大。

更新

你可以用下面我写的rollRegres包来做out-of-sample-forecasts(比之前的方案快)

# simulate data from the linear model y = 10 - X + noise
set.seed(101)
n <- 1000L
X <- rnorm(n)
y <- 10 - X + rnorm(n)
dat <- data.frame(y = y, X)

# define wrapper to get out-of-sample predicted values
library(rollRegres)
# Wrapper: expanding-window one-step-ahead forecasts from
# rollRegres::roll_regres, dropping the NA burn-in of the first
# min_window_size observations.
wrapper <- function(formula, data, min_window_size){
  # BUG FIX: the original passed the global `frm` instead of the `formula`
  # argument, silently ignoring whatever formula the caller supplied
  out <- roll_regres(
    formula = formula, data = data, width = min_window_size,
    do_downdates = FALSE,
    do_compute = "1_step_forecasts")$one_step_forecasts

  out[!is.na(out)]
}

# assign function to compare with
func <- function(formula, data, min_window_size){
  # Reference implementation: for each expanding window 1..i
  # (i = min_window_size, ..., nrow(data) - 1), fit lm() and
  # forecast observation i + 1.
  # sapply is kept deliberately: it returns the same named numeric
  # vector (names from predict()) that callers compare against.
  window_ends <- seq(min_window_size, nrow(data) - 1L)
  sapply(window_ends, function(i) {
    fit <- lm(formula, data = data[seq_len(i), ])
    predict(fit, newdata = data[i + 1L, ])
  })
}

# show that the two implementations give the same forecasts
# (check.attributes = FALSE: only names differ, values must match)
frm <- y ~ X
r1 <- wrapper(frm, dat, 25L)
r2 <- func   (frm, dat, 25L)
all.equal(r1, r2, check.attributes = FALSE)
#R> [1] TRUE

# show run time (expanding-window forecasts over 1000 observations)
microbenchmark::microbenchmark(
  func = func(frm, dat, 25L), roll_regres = wrapper(frm, dat, 25L),
  times = 5)
#R> Unit: microseconds
#R>        expr         min          lq        mean      median          uq         max neval
#R>        func 1027213.048 1028723.765 1050103.171 1034833.792 1038513.793 1121231.455     5
#R> roll_regres     560.198     569.284    1073.778     610.766     636.445    2992.198     5

注意 ~560 / 1028700 的相对计算时间。然后您就可以使用这些预测值来计算 RMSE。

旧答案

您可以使用下述文献中的 Fortran 函数(由 biglm 包提供)大大减少计算时间:

Miller, A. J. (1992). Algorithm AS 274: Least squares routines to supplement those of Gentleman. Journal of the Royal Statistical Society. Series C (Applied Statistics), 41(2), 458-478.

您可以使用此代码

# simulate data: y depends linearly on 10 regressors plus noise
set.seed(101)
n <- 1000L
X <- matrix(rnorm(10L * n), n, 10L)
y <- drop(10 + X %*% runif(10L)) + rnorm(n)
dat <- data.frame(y = y, X)

# Wrapper around biglm's AS 274 Fortran routines: expanding-window
# one-step-ahead forecasts. INCLUD adds one observation to a running QR
# decomposition; REGCF back-solves for the current coefficients.
biglm_wrapper <- function(formula, data, min_window_size){
  mf <- model.frame(formula, data)
  X <- model.matrix(terms(mf), mf)
  # BUG FIX: the original had `y - model.response(mf)` — a subtraction
  # whose result was discarded; it only "worked" by accidentally reading
  # a global `y`. It must be an assignment.
  y <- model.response(mf)

  n <- nrow(X)
  p <- ncol(X)
  storage.mode(X) <- "double"
  storage.mode(y) <- "double"
  w <- 1
  # running QR state, updated one observation at a time by INCLUD
  qr <- list(
    d = numeric(p), rbar = numeric(choose(p, 2)), 
    thetab = numeric(p), sserr = 0, checked = FALSE, tol = numeric(p))
  nrbar <- length(qr$rbar)
  beta. <- numeric(p)

  # one forecast per window end i = min_window_size, ..., n - 1.
  # BUG FIX: the original preallocated numeric(n - min_window_size - 2)
  # and relied on R silently growing the vector; the correct length is
  # n - min_window_size.
  out <- numeric(n - min_window_size)
  for(i in 1:(n - 1)){
    row <- X[i, ] # will be over written
    # fold observation i into the running QR decomposition
    qr[c("d", "rbar", "thetab", "sserr")] <- .Fortran(
      "INCLUD", np = p, nrbar = nrbar, weight = w, xrow = row, yelem = y[i], 
      d = qr$d, rbar = qr$rbar, thetab = qr$thetab, sserr = qr$sserr, ier = i - 0L, 
      PACKAGE = "biglm")[
        c("d", "rbar", "thetab", "sserr")]

    if(i >= min_window_size){
      # back-solve for the coefficients of window 1..i ...
      coef. <- .Fortran(
        "REGCF", np = p, nrbar = nrbar, d = qr$d, rbar = qr$rbar,
        thetab = qr$thetab, tol = qr$tol, beta = beta., nreq = p, ier = i,
        PACKAGE = "biglm")[["beta"]]
      # ... and forecast observation i + 1
      out[i - min_window_size + 1] <- coef. %*% X[i + 1, ]
    }
  }

  out
}

# Reference implementation to compare against: expanding-window
# one-step-ahead lm() forecasts (fit on rows 1..i, predict row i + 1).
func <- function(formula, data, min_window_size){
  last_fit_row <- nrow(data) - 1L
  sapply(seq(min_window_size, last_fit_row), function(i) {
    fit <- lm(formula, data = data[seq_len(i), ])
    predict(fit, newdata = data[i + 1L, ])
  })
}

# show that the two implementations give the same forecasts
# (check.attributes = FALSE: only names differ, values must match)
frm <- y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10
r1 <- biglm_wrapper(frm, dat, 25)
r2 <- func(frm, dat, 25)
all.equal(r1, r2, check.attributes = FALSE)
#R> [1] TRUE

# show run time
# BUG FIX: the original called the undefined `f2`; the comparison
# function defined above is named `func`
microbenchmark::microbenchmark(
  r1 = biglm_wrapper(frm, dat, 25), r2 = func(frm, dat, 25), 
  times = 5)
#R> Unit: milliseconds
#R>  expr         min         lq       mean     median         uq        max neval cld
#R>    r1    9.976505   10.00653   11.85052   10.53157   13.36377   15.37424     5  a 
#R>    r2 1095.944410 1098.29661 1122.17101 1098.58264 1113.48833 1204.54306     5   b