Reproduce lasso simulation example 1
I want to reproduce the results of Example 1 on page 280 of the original lasso paper (Tibshirani, 1996).
The model is y = X*beta + sigma*epsilon, where epsilon is N(0, 1).
The simulation consists of 50 datasets of 20/20/200 observations for the training/validation/test sets.
True beta = (3, 1.5, 0, 0, 2, 0, 0, 0), sigma = 3, and the pairwise correlation between x_i and x_j is set to corr(i, j) = 0.5^|i-j|.
I use the training/validation/test split approach to estimate the test MSE. Before running the simulation replications, I tried computing a few test MSE estimates by hand to check that I was on the right track, but the estimates I get (between 9 and 15) are much larger than the one reported in the original paper (median 2.43). I attach the code used to compute the test MSE.
Any suggestions?
library(MASS)
library(glmnet)
simfun <- function(trainn = 20, validationn = 20, testn = 200, corr = 0.5, sigma = 3, beta) {
  n <- trainn + validationn + testn
  p <- length(beta)
  Covmatrix <- outer(1:p, 1:p, function(x, y) corr^abs(x - y))
  X <- mvrnorm(n, rep(0, p), Covmatrix) # MASS
  X <- X[sample(n), ]
  y <- X %*% beta + rnorm(n, mean = 0, sd = sigma)
  trainx <- X[1:trainn, ]
  validationx <- X[(trainn + 1):(trainn + validationn), ]
  testx <- X[(trainn + validationn + 1):n, ]
  trainy <- y[1:trainn, ]
  validationy <- y[(trainn + 1):(trainn + validationn), ]
  testy <- y[(trainn + validationn + 1):n, ]
  list(trainx = trainx, validationx = validationx, testx = testx,
       trainy = trainy, validationy = validationy, testy = testy)
}
beta <- c(3,1.5,0,0,2,0,0,0)
data <- simfun(20,20,200,corr=0.5,sigma=3,beta)
trainx <- data$trainx
trainy <- data$trainy
validationx <- data$validationx
validationy <- data$validationy
testx <- data$testx
testy <- data$testy
# training: find betas for all the lambdas
betas <- coef(glmnet(trainx,trainy,alpha=1))
# validation: compute the validation error for each lambda and find the minimum
J.val <- colMeans((validationy-cbind(1,validationx)%*%betas)^2)
beta.opt <- betas[, which.min(J.val)]
# test
test.mse <- mean((testy-cbind(1,testx)%*%beta.opt)^2)
test.mse
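For reference, this is how I planned to repeat the split once the single run looked right; a minimal sketch reusing simfun and the selection steps above (one_run is just a hypothetical wrapper name), since the paper reports the median over 50 simulated datasets:
one_run <- function() {
  data <- simfun(20, 20, 200, corr = 0.5, sigma = 3, beta = beta)
  # training: fit the lasso path
  betas <- coef(glmnet(data$trainx, data$trainy, alpha = 1))
  # validation: pick the lambda with the smallest validation error
  J.val <- colMeans((data$validationy - cbind(1, data$validationx) %*% betas)^2)
  beta.opt <- betas[, which.min(J.val)]
  # test: estimate the test MSE
  mean((data$testy - cbind(1, data$testx) %*% beta.opt)^2)
}
median(replicate(50, one_run()))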
This is a simulation study, so I don't think you have to use the train/validation approach; a single split varies simply because of its randomness. You can implement the expected test error by its definition:
- generate multiple training datasets according to your construction
- generate an independent test set
- fit the model to each training set
- compute the error on the test set
- take the average
set.seed(1)
simpfun <- function(n_train = 20, n_test = 10, simul = 50, corr = .5, sigma = 3, beta = c(3, 1.5, 0, 0, 2, 0, 0, 0), lam_grid = 10^seq(-3, 5)) {
  require(foreach)
  require(tidyverse)
  # true model
  p <- length(beta)
  Covmatrix <- outer(1:p, 1:p, function(x, y) corr^abs(x - y))
  X <- foreach(i = 1:simul, .combine = rbind) %do% {
    MASS::mvrnorm(n_train, rep(0, p), Covmatrix)
  }
  eps <- rnorm(simul * n_train, mean = 0, sd = sigma) # one draw per row, not recycled across simulations
  y <- X %*% beta + eps # generate true model
  # generate test set
  test <- MASS::mvrnorm(n_test, rep(0, p), Covmatrix)
  te_y <- test %*% beta + rnorm(n_test, mean = 0, sd = sigma) # test y
  simul_id <- gl(simul, k = n_train) # simulation id of each training row
  # expected test error: fit the lasso to each training set separately
  train <-
    y %>%
    as_tibble() %>%
    mutate(m_id = simul_id) %>%
    group_by(m_id) %>% # for each simulation
    do(yhat = {
      rows <- simul_id == .$m_id[1] # this simulation's training rows
      predict(glmnet::cv.glmnet(X[rows, ], .$V1, alpha = 1, lambda = lam_grid),
              newx = test, s = "lambda.min") # lambda minimizing CV error
    })
  MSE <- # (y0 - fhat0)^2
    sapply(train$yhat, function(x) {
      mean((x - te_y)^2)
    })
  mean(MSE) # average over the simul training sets
}
simpfun()
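Note the defaults above are only for illustration. A hedged usage sketch closer to the paper's setup as described in the question (50 datasets, 200 test observations; the finer lambda grid here is my own assumption):
set.seed(1)
simpfun(n_train = 20, n_test = 200, simul = 50,
        lam_grid = 10^seq(1, -2, length.out = 50))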
Edit: to tune the parameter, first a helper that picks lambda on a validation set:
find_lambda <- function(x, y, lambda, x_val, y_val) {
  # fit the lasso path on one training set
  fit <- glmnet::glmnet(x, y, alpha = 1, lambda = lambda)
  # tuning parameter: validation mse for each lambda on the fitted path
  tuning <- predict(fit, newx = x_val)
  mse <- apply(tuning, 2, function(yhat) mean((y_val - yhat)^2))
  list(mse_min = min(mse), lam_choose = fit$lambda[which.min(mse)]) # minimize mse
}
Using this function, the validation step can be added:
simpfun <- function(n_train = 20, n_val = 20, n_test = 10, simul = 50, corr = .5, sigma = 3, beta = c(3, 1.5, 0, 0, 2, 0, 0, 0), lam_grid = 10^seq(10, -1, length.out = 100)) {
  require(foreach)
  require(tidyverse)
  # true model
  p <- length(beta)
  Covmatrix <- outer(1:p, 1:p, function(x, y) corr^abs(x - y))
  X <- foreach(i = 1:simul, .combine = rbind) %do% {
    MASS::mvrnorm(n_train, rep(0, p), Covmatrix)
  }
  eps <- rnorm(simul * n_train, mean = 0, sd = sigma) # one draw per row, not recycled across simulations
  y <- X %*% beta + eps # generate true model
  # generate validation set
  val <- MASS::mvrnorm(n_val, rep(0, p), Covmatrix)
  val_y <- val %*% beta + rnorm(n_val, mean = 0, sd = sigma) # validation y
  # generate test set
  test <- MASS::mvrnorm(n_test, rep(0, p), Covmatrix)
  te_y <- test %*% beta + rnorm(n_test, mean = 0, sd = sigma) # test y
  simul_id <- gl(simul, k = n_train) # simulation id of each training row
  Y <-
    y %>%
    as_tibble() %>%
    mutate(m_id = simul_id) %>%
    group_by(m_id) %>% # for each simulation: repeat
    rename(y = V1)
  # Tuning parameter: pick lambda on the validation set, separately per simulation
  Tuning <-
    Y %>%
    do(
      lam_choose = find_lambda(
        x = X[simul_id == .$m_id[1], ], y = .$y,
        lambda = lam_grid, x_val = val, y_val = val_y
      )$lam_choose
    )
  # expected test error: refit on each training set and score it on the test set
  test_mse <- sapply(seq_len(simul), function(i) {
    rows <- simul_id == levels(simul_id)[i]
    fit <- glmnet::glmnet(X[rows, ], y[rows, ], alpha = 1, lambda = lam_grid)
    mean((predict(fit, newx = test, s = Tuning$lam_choose[[i]]) - te_y)^2)
  })
  mean(test_mse)
}
simpfun()
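And again, a hedged usage sketch matching the paper's 20/20/200 split over 50 datasets (the seed is arbitrary):
set.seed(1)
simpfun(n_train = 20, n_val = 20, n_test = 200, simul = 50)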