Loop - LASSO 从模拟数据中提取系数
Loop - LASSO Coefficient extraction from simulation data
我试图从模拟数据的 LASSO 实验中找出哪些系数被正确和不正确地收缩为 0。我可以让它在一次迭代中工作,但我不知道如何正确地循环它以便我可以提取 100 次迭代的相关信息。
这是我当前的代码
library(MASS)
library(glmnet)
# Asker's single-run attempt: simulate one data set, fit a cross-validated
# LASSO, and count zero / non-zero coefficients in two row ranges.
# N is the intended number of repetitions; it is not used anywhere below.
N=100
# n is the number of noise draws used for Y; p is the number of predictors.
n=200
p=200
set.seed(123)
# Empty data frames meant to collect results; both are overwritten by plain
# counts further down, so nothing is accumulated across runs.
f.non<-data.frame(NULL)
f.disc<-data.frame(NULL)
# Design matrix: 200 draws from a p-variate normal with zero mean and
# identity covariance (the row count is the literal 200, not `n`).
X= mvrnorm(200, rep(0,p), diag(1,p,p))
# Response: sum of the first ten columns of X plus normal noise scaled by 3.5.
Y=rowSums(X[,1:10])+3.5*rnorm(n)
# Cross-validated fit with alpha = 1 (LASSO penalty) and no intercept.
lasso.model<-cv.glmnet(X,Y,alpha=1,intercept=FALSE)
# Coefficients evaluated at the lambda.1se value stored in the fitted object.
lasso.coef<-coef(lasso.model,s=lasso.model$lambda.1se)
# Rows 2:11 are treated as the ten predictors used to build Y (row 1 is
# presumably the intercept entry - confirm against the glmnet docs).
# f.non counts how many of them are exactly zero.
f.non<- sum(lasso.coef[2:11,] == 0)
# f.disc counts the non-zero entries among rows 12:201 (the remaining rows).
f.disc <- sum(lasso.coef[12:201,] != 0)
# NOTE(review): unmatched closing brace - no `for (...) {` opens a loop above,
# so the snippet does not parse as written.
}
以上是我的尝试:先创建一个空数据框,再在循环中逐次保存结果,但我无法让循环正常运行。
因此,如果我理解正确(但我不确定我是否理解正确),您是想根据 lasso.coef 对象中的系数是否等于零,
分别提取这些系数的值(以及名称?)。
你可以像这样在 for 循环中重复 100 次(如果你想收集变量的名称):
# Repeat the simulation N times. For every run keep two data frames:
#   f.non[[i]]  - predictors whose LASSO coefficient is exactly zero
#   f.disc[[i]] - predictors whose LASSO coefficient is non-zero
# Each data frame has the columns `coef` (value) and `name` (predictor label).
N <- 100
n <- 200
p <- 200
# Preallocate one slot per run instead of growing the containers from NULL.
f.non <- vector("list", N)
f.disc <- vector("list", N)
for (i in seq_len(N)) {
  # n draws from a p-variate normal with zero mean and identity covariance.
  X <- mvrnorm(n, rep(0, p), diag(1, p, p))
  # Only the first ten columns of X enter the response; the rest is noise.
  Y <- rowSums(X[, 1:10]) + 3.5 * rnorm(n)
  lasso.model <- cv.glmnet(X, Y, alpha = 1, intercept = FALSE)
  lasso.coef <- coef(lasso.model, s = lasso.model$lambda.1se)
  # Row 1 of the coefficient matrix is the intercept entry, so rows
  # 2..(p + 1) belong to predictors 1..p. Label them by their column
  # position in X (V1..Vp) rather than by their row index.
  lasso_coefs_df <- data.frame(
    coef = lasso.coef[2:(p + 1)],
    name = paste0("V", seq_len(p))
  )
  is_zero <- lasso_coefs_df$coef == 0
  f.non[[i]] <- lasso_coefs_df[is_zero, ]
  f.disc[[i]] <- lasso_coefs_df[!is_zero, ]
}
或者,与您原来的尝试更接近:如果您只需要统计每个系数行范围(第 2–11 行、第 12–201 行)中符合条件的系数个数:
# Repeat the simulation N times and record two counts per run:
#   f.non[i]  - how many of the ten predictors used to build Y received a
#               zero coefficient
#   f.disc[i] - how many of the remaining predictors received a non-zero
#               coefficient
N <- 100
n <- 200
p <- 200
# Preallocate the result vectors; sum() over a logical vector is an integer.
f.non <- integer(N)
f.disc <- integer(N)
for (i in seq_len(N)) {
  # n draws from a p-variate normal with zero mean and identity covariance.
  X <- mvrnorm(n, rep(0, p), diag(1, p, p))
  # Only the first ten columns of X enter the response; the rest is noise.
  Y <- rowSums(X[, 1:10]) + 3.5 * rnorm(n)
  lasso.model <- cv.glmnet(X, Y, alpha = 1, intercept = FALSE)
  lasso.coef <- coef(lasso.model, s = lasso.model$lambda.1se)
  # Row 1 of the coefficient matrix is the intercept entry, so predictor j
  # sits in row j + 1: rows 2:11 are predictors 1..10, rows 12:(p + 1) the
  # remaining ones.
  f.non[i] <- sum(lasso.coef[2:11, ] == 0)
  f.disc[i] <- sum(lasso.coef[12:(p + 1), ] != 0)
}
我试图从模拟数据的 LASSO 实验中找出哪些系数被正确和不正确地收缩为 0。我可以让它在一次迭代中工作,但我不知道如何正确地循环它以便我可以提取 100 次迭代的相关信息。 这是我当前的代码
library(MASS)
library(glmnet)
# Asker's single-run attempt: simulate one data set, fit a cross-validated
# LASSO, and count zero / non-zero coefficients in two row ranges.
# N is the intended number of repetitions; it is not used anywhere below.
N=100
# n is the number of noise draws used for Y; p is the number of predictors.
n=200
p=200
set.seed(123)
# Empty data frames meant to collect results; both are overwritten by plain
# counts further down, so nothing is accumulated across runs.
f.non<-data.frame(NULL)
f.disc<-data.frame(NULL)
# Design matrix: 200 draws from a p-variate normal with zero mean and
# identity covariance (the row count is the literal 200, not `n`).
X= mvrnorm(200, rep(0,p), diag(1,p,p))
# Response: sum of the first ten columns of X plus normal noise scaled by 3.5.
Y=rowSums(X[,1:10])+3.5*rnorm(n)
# Cross-validated fit with alpha = 1 (LASSO penalty) and no intercept.
lasso.model<-cv.glmnet(X,Y,alpha=1,intercept=FALSE)
# Coefficients evaluated at the lambda.1se value stored in the fitted object.
lasso.coef<-coef(lasso.model,s=lasso.model$lambda.1se)
# Rows 2:11 are treated as the ten predictors used to build Y (row 1 is
# presumably the intercept entry - confirm against the glmnet docs).
# f.non counts how many of them are exactly zero.
f.non<- sum(lasso.coef[2:11,] == 0)
# f.disc counts the non-zero entries among rows 12:201 (the remaining rows).
f.disc <- sum(lasso.coef[12:201,] != 0)
# NOTE(review): unmatched closing brace - no `for (...) {` opens a loop above,
# so the snippet does not parse as written.
}
以上是我的尝试:先创建一个空数据框,再在循环中逐次保存结果,但我无法让循环正常运行。
因此,如果我理解正确(但我不确定我是否理解正确),您是想根据 lasso.coef 对象中的系数是否等于零,
分别提取这些系数的值(以及名称?)。
你可以像这样在 for 循环中重复 100 次(如果你想收集变量的名称):
# Repeat the simulation N times. For every run keep two data frames:
#   f.non[[i]]  - predictors whose LASSO coefficient is exactly zero
#   f.disc[[i]] - predictors whose LASSO coefficient is non-zero
# Each data frame has the columns `coef` (value) and `name` (predictor label).
N <- 100
n <- 200
p <- 200
# Preallocate one slot per run instead of growing the containers from NULL.
f.non <- vector("list", N)
f.disc <- vector("list", N)
for (i in seq_len(N)) {
  # n draws from a p-variate normal with zero mean and identity covariance.
  X <- mvrnorm(n, rep(0, p), diag(1, p, p))
  # Only the first ten columns of X enter the response; the rest is noise.
  Y <- rowSums(X[, 1:10]) + 3.5 * rnorm(n)
  lasso.model <- cv.glmnet(X, Y, alpha = 1, intercept = FALSE)
  lasso.coef <- coef(lasso.model, s = lasso.model$lambda.1se)
  # Row 1 of the coefficient matrix is the intercept entry, so rows
  # 2..(p + 1) belong to predictors 1..p. Label them by their column
  # position in X (V1..Vp) rather than by their row index.
  lasso_coefs_df <- data.frame(
    coef = lasso.coef[2:(p + 1)],
    name = paste0("V", seq_len(p))
  )
  is_zero <- lasso_coefs_df$coef == 0
  f.non[[i]] <- lasso_coefs_df[is_zero, ]
  f.disc[[i]] <- lasso_coefs_df[!is_zero, ]
}
或者,与您原来的尝试更接近:如果您只需要统计每个系数行范围(第 2–11 行、第 12–201 行)中符合条件的系数个数:
# Repeat the simulation N times and record two counts per run:
#   f.non[i]  - how many of the ten predictors used to build Y received a
#               zero coefficient
#   f.disc[i] - how many of the remaining predictors received a non-zero
#               coefficient
N <- 100
n <- 200
p <- 200
# Preallocate the result vectors; sum() over a logical vector is an integer.
f.non <- integer(N)
f.disc <- integer(N)
for (i in seq_len(N)) {
  # n draws from a p-variate normal with zero mean and identity covariance.
  X <- mvrnorm(n, rep(0, p), diag(1, p, p))
  # Only the first ten columns of X enter the response; the rest is noise.
  Y <- rowSums(X[, 1:10]) + 3.5 * rnorm(n)
  lasso.model <- cv.glmnet(X, Y, alpha = 1, intercept = FALSE)
  lasso.coef <- coef(lasso.model, s = lasso.model$lambda.1se)
  # Row 1 of the coefficient matrix is the intercept entry, so predictor j
  # sits in row j + 1: rows 2:11 are predictors 1..10, rows 12:(p + 1) the
  # remaining ones.
  f.non[i] <- sum(lasso.coef[2:11, ] == 0)
  f.disc[i] <- sum(lasso.coef[12:(p + 1), ] != 0)
}