在多列数据上使用 rollapply 和 lm

Question

我有一个类似于下面的数据框，共有 500 列：

Probes <- data.frame(Days=seq(0.01, 4.91, 0.01), B1=5:495,B2=-100:390, B3=10:500,B4=-200:290)

我想计算滚动 window 线性回归，其中我的 window 大小是 12 个数据点，每个顺序回归由 6 个数据点分隔。对于每个回归，"Days" 将始终是模型的 x 分量，而 y 将是每个其他列（B1，然后是 B2、B3 等）。然后我想将 co-efficients 保存为具有现有列标题（B1、B2 等）的数据框。

我认为我的代码很接近，但不是很有效。我使用了动物园图书馆的 rollapply。

slopedata<-rollapply(zoo(Probes), width=12, function(Probes) { 
 coef(lm(formula=y~Probes$Days, data = Probes))[2]
 }, by = 6, by.column=TRUE, align="right")

如果可能的话，我还想将 "xmins" 保存到一个向量中以添加到数据帧中。这意味着每次回归中使用的最小 x 值（基本上是 "Days" 列中的每 6 个数字。）感谢您的帮助。

Answer 1

试试这个：

# here are the xmin values you wanted
xmins <- Probes$Days[seq(1,nrow(Probes),6)]

# here we build a function that will run regressions across the columns
# y1 vs x, y2 vs x, y3 vs x...
# you enter the window and by (12/6) in order to limit the interval being
# regressed. this is later called in do.call
runreg <- function(Probes,m,window=12,by=6){

  # beg,end are used to specify the interval
  beg <- seq(1,nrow(Probes),by)[m]
  end <- beg+window-1

  # this is used to go through all the columns
  N <- ncol(Probes)-1
  tmp <- numeric(N)
  # go through each column and store the coefficients in tmp
  for(i in 1:N){
     y <- Probes[[i+1]][beg:end]
     x <- Probes$Days[beg:end]
     tmp[i] <- coef(lm(y~x))[2][[1]]
  }
  # put all our column regressions into a dataframe
  res <- rbind('coeff'=tmp)
  colnames(res) <- colnames(Probes)[-1]

  return(res)
}

# now that we've built the function to do the column regressions
# we just need to go through all the window-ed regressions (row regressions)
res <- do.call(rbind,lapply(1:length(xmins),function(m) runreg(Probes,m)))

# these rownames are the index of the xmin values
rownames(res) <- seq(1,nrow(Probes),6)
res <- data.frame(res,xmins)

Answer 2

1)定义一个zoo对象z，其数据包含Probes，索引取自Probes的第一列，即Days.注意 lm 允许 y 成为一个矩阵，定义一个计算回归系数的 coefs 函数。最后 rollapply 超过 z。请注意，返回对象的索引给出了 xmin.

library(zoo)

z <- zoo(Probes, Probes[[1]])

coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1])))))
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")

给予：

> head(rz)
     B11 B12  B21 B22 B31 B32  B41 B42
0.01   4 100 -101 100   9 100 -201 100
0.07   4 100 -101 100   9 100 -201 100
0.13   4 100 -101 100   9 100 -201 100
0.19   4 100 -101 100   9 100 -201 100
0.25   4 100 -101 100   9 100 -201 100
0.31   4 100 -101 100   9 100 -201 100

请注意，如果您需要 rz 的数据框表示，则可以使用 DF <- fortify.zoo(rz)。

2) 另一种有点类似的方法是 rollaplly 在行号上：

library(zoo)
y <- as.matrix(Probes[-1])
Days <- Probes$Days
n <- nrow(Probes)
coefs <- function(ix) c(unlist(as.data.frame(coef(lm(y ~ Days, subset = ix)))), 
      xmins = Days[ix][1])
r <- rollapply(1:n, 12, by = 6, coefs)

Answer 3

你也可以使用rollRegres包如下

# setup data
Probes <- data.frame(
  # I changed the days to be intergers
  Days=seq(1L, 491L, 1L),
  B1=5:495, B2=-100:390, B3=10:500 , B4=-200:290)

# setup grp argument
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)

# estimate coefs. width argument is realtive in grp units
library(rollRegres)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
  roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)

# only keep the complete.cases and the unique values
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out)
#R      B10 B11  B20 B21 B30 B31  B40 B41
#R [1,]   4 100 -101 100   9 100 -201 100
#R [2,]   4 100 -101 100   9 100 -201 100
#R [3,]   4 100 -101 100   9 100 -201 100
#R [4,]   4 100 -101 100   9 100 -201 100
#R [5,]   4 100 -101 100   9 100 -201 100
#R [6,]   4 100 -101 100   9 100 -201 100

该解决方案比 zoo 解决方案

快得多

library(zoo) coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1]))))) microbenchmark::microbenchmark(   rollapply = {
    z <- zoo(Probes, Probes[[1]])
    rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")   },   roll_regres = {
    grp_arg <- as.integer((Probes$Days - 1L) %/% 6)

    X <- cbind(1, Probes$Days / 100)
    Ys <- as.matrix(Probes[, 2:5])
    out <- lapply(1:ncol(Ys), function(i)
      roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
    out <- do.call(cbind, out)

    colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
    out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
    out <- out[complete.cases(out), ]
    head(out)   } )
#R Unit: microseconds
#R        expr       min        lq      mean     median        uq       max neval
#R   rollapply 53392.614 56330.492 59793.106 58363.2825 60902.938 119206.76   100
#R roll_regres   865.186   920.297  1074.161   983.9015  1047.705   5071.41   100

目前您需要从 Github 安装包，因为版本 0.1.0 中的验证错误。因此，运行

devtools::install_github("boennecd/rollRegres", upgrade_dependencies = FALSE,
                         build_vignettes = TRUE)

在多列数据上使用 rollapply 和 lm

Using rollapply and lm over multiple columns of data

r

lm

zoo

rollapply