大矩阵的线性回归
Linear regression with big matrices
我想用大矩阵执行线性回归。
这是我目前尝试过的方法:
library(bigmemory)
library(biganalytics)
library(bigalgebra)
nrows <- 1000000
X <- as.big.matrix( replicate(100, rnorm(nrows)) )
y <- rnorm(nrows)
biglm.big.matrix(y ~ X)
# Error in CreateNextDataFrameGenerator(formula, data, chunksize, fc, getNextChunkFunc, :
argument "data" is missing, with no default
biglm.big.matrix(y ~ X, data = cbind(y, X))
# Error in bigmemory:::mmap(vars, colnames(data)) :
Couldn't find a match to one of the arguments.
biglm.big.matrix(y ~ X, data = cbind(y = y, X = X))
# Error in bigmemory:::mmap(vars, colnames(data)) :
Couldn't find a match to one of the arguments.
我该如何解决这个问题?
这里,X
是一个有 100 列的(大)矩阵。由于 biglm.big.matrix()
需要 data=
参数,因此看起来您不能像使用 lm()
。另请注意,当您 cbind()
和 big.matrix
时,如 cbind(y, X)
,结果是 list
!!
您似乎需要 y
和 X
作为一个 big.matrix
的一部分,那么您将需要自己手动构建模型公式:
library(bigmemory)
library(biganalytics)
library(bigalgebra)
# Construct an empty big.matrix with the correct number of dimensions and
# with column names
nrows <- 1000000
dat <- big.matrix(nrow=nrows, ncol=101,
dimnames=list(
NULL, # no rownames
c("y", paste0("X", 1:ncol(X))) # colnames: y, X1, X2, ..., X100
))
# fill with y and X:
dat[,1] <- rnorm(nrows)
dat[,2:101] <- replicate(100, rnorm(nrows))
# construct the model formula as a character vector using paste:
# (Or you need to type y ~ X1 + X2 + ... + X100 manually in biglm.big.matrix()!)
f <- paste("y ~", paste(colnames(dat)[-1], collapse=" + "))
# run the model
res <- biglm.big.matrix(as.formula(f), data=dat)
summary(res)
您可以使用 package {bigstatsr} 轻松实现此功能(免责声明:我是作者)。
真值
nrows <- 1000000
X <- replicate(100, rnorm(nrows))
y <- rnorm(nrows)
system.time(
true <- lm(y ~ X)
) # 11.3 sec
与{bigstatsr}
library(bigstatsr)
system.time({
X2 <- as_FBM(X)
X2$add_columns(1)
X2[, ncol(X2)] <- 1
inv_XtX <- solve(big_crossprodSelf(X2)[])
Xty <- big_cprodVec(X2, y)
betas <- inv_XtX %*% Xty
RSS <- drop(crossprod(y - big_prodVec(X2, betas)))
df <- nrow(X2) - ncol(X2)
std_err <- sqrt(RSS * diag(inv_XtX) / df)
}) # 1.6 sec
验证
head(summary(true)$coefficients)
# Intercept at the end
head(betas) ## Estimate
head(std_err)
head(t_stat <- betas / std_err) ## t value
head(pval <- 2 * pt(abs(t_stat), df = df, lower.tail = FALSE))
我想用大矩阵执行线性回归。
这是我目前尝试过的方法:
library(bigmemory)
library(biganalytics)
library(bigalgebra)
nrows <- 1000000
X <- as.big.matrix( replicate(100, rnorm(nrows)) )
y <- rnorm(nrows)
biglm.big.matrix(y ~ X)
# Error in CreateNextDataFrameGenerator(formula, data, chunksize, fc, getNextChunkFunc, :
argument "data" is missing, with no default
biglm.big.matrix(y ~ X, data = cbind(y, X))
# Error in bigmemory:::mmap(vars, colnames(data)) :
Couldn't find a match to one of the arguments.
biglm.big.matrix(y ~ X, data = cbind(y = y, X = X))
# Error in bigmemory:::mmap(vars, colnames(data)) :
Couldn't find a match to one of the arguments.
我该如何解决这个问题?
这里,X
是一个有 100 列的(大)矩阵。由于 biglm.big.matrix()
需要 data=
参数,因此看起来您不能像使用 lm()
。另请注意,当您 cbind()
和 big.matrix
时,如 cbind(y, X)
,结果是 list
!!
您似乎需要 y
和 X
作为一个 big.matrix
的一部分,那么您将需要自己手动构建模型公式:
library(bigmemory)
library(biganalytics)
library(bigalgebra)
# Construct an empty big.matrix with the correct number of dimensions and
# with column names
nrows <- 1000000
dat <- big.matrix(nrow=nrows, ncol=101,
dimnames=list(
NULL, # no rownames
c("y", paste0("X", 1:ncol(X))) # colnames: y, X1, X2, ..., X100
))
# fill with y and X:
dat[,1] <- rnorm(nrows)
dat[,2:101] <- replicate(100, rnorm(nrows))
# construct the model formula as a character vector using paste:
# (Or you need to type y ~ X1 + X2 + ... + X100 manually in biglm.big.matrix()!)
f <- paste("y ~", paste(colnames(dat)[-1], collapse=" + "))
# run the model
res <- biglm.big.matrix(as.formula(f), data=dat)
summary(res)
您可以使用 package {bigstatsr} 轻松实现此功能(免责声明:我是作者)。
真值
nrows <- 1000000
X <- replicate(100, rnorm(nrows))
y <- rnorm(nrows)
system.time(
true <- lm(y ~ X)
) # 11.3 sec
与{bigstatsr}
library(bigstatsr)
system.time({
X2 <- as_FBM(X)
X2$add_columns(1)
X2[, ncol(X2)] <- 1
inv_XtX <- solve(big_crossprodSelf(X2)[])
Xty <- big_cprodVec(X2, y)
betas <- inv_XtX %*% Xty
RSS <- drop(crossprod(y - big_prodVec(X2, betas)))
df <- nrow(X2) - ncol(X2)
std_err <- sqrt(RSS * diag(inv_XtX) / df)
}) # 1.6 sec
验证
head(summary(true)$coefficients)
# Intercept at the end
head(betas) ## Estimate
head(std_err)
head(t_stat <- betas / std_err) ## t value
head(pval <- 2 * pt(abs(t_stat), df = df, lower.tail = FALSE))