使用 R 在多个数据集上自动化机器学习过程

Automate Machine Learning process with R on multiple datasets

我有多个长度不同的数据集。我想用相关性函数删除相关系数达到 98% 及以上的变量。如何使用循环同时对多个数据集应用该相关性筛选,并将选出的变量存储在新的数据框中?

另外,如何使用循环同时对多个数据集进行套索(lasso)回归?谢谢

    # Example datasets: 10 rows each, with a different number of columns.
    H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
    C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
    R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
    E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))

    # Correlation
    library("caret")
    library("dplyr")
    # X10 is left out of the correlation screen, so it can never be flagged.
    data.cor <- cor(subset(H, select = -c(X10)))
    # Positions (within data.cor) of the columns findCorrelation() suggests
    # removing at the 0.98 cutoff.
    high.cor <- findCorrelation(data.cor, cutoff = 0.98)
    # Resolve the positions against data.cor itself: indexing H directly is
    # only correct while the excluded column happens to be the last one.
    remove <- colnames(data.cor)[high.cor]

    # Logical mask keeps every column when nothing was flagged.
    myvars <- names(H) %in% remove
    var_selected <- H[!myvars]
    new_data_H <- var_selected

下面是实现此操作的(几种方法中的)一种方法:

# Correlation
library(caret)
library(dplyr)

set.seed(99)

# Example datasets: 10 rows each, with a different number of columns.
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))

# Combine the input datasets into a list
inputs <- list(H, C, R, E)

# Cutoff handed to findCorrelation().
# NOTE(review): the question asks for 0.98; 0.40 is presumably used here so
# that the random demo data actually triggers removals - confirm before reuse.
cor_cutoff <- 0.40

# Return `df` without the columns findCorrelation() flags at `cutoff`.
# The result is always a data frame.
drop_correlated <- function(df, cutoff) {
  high_cor <- findCorrelation(cor(df), cutoff = cutoff)
  if (length(high_cor) == 0) {
    # Nothing flagged: df[, -integer(0)] would select zero columns.
    return(df)
  }
  # drop = FALSE keeps a data frame even when a single column survives.
  df[, -high_cor, drop = FALSE]
}

# Process each dataset, one at a time; outputs[[k]] is the filtered
# version of inputs[[k]].
outputs <- lapply(inputs, drop_correlated, cutoff = cor_cutoff)

# This is the first dataset processed
outputs[[1]]
# Second...
outputs[[2]]
# Third...
outputs[[3]]

编辑:整合您的套索程序

library(glmnet)
library(caret)

set.seed(99)

## Define data (independent variables)
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))
inputs <- list(H, C, R, E)

## Define targets (dependent variables); targets[[k]] pairs with inputs[[k]]
Y_H <- data.frame(label_1 = replicate(1, sample(20:35, 10, replace = TRUE)))
Y_C <- data.frame(label_2 = replicate(1, sample(15:65, 10, replace = TRUE)))
Y_R <- data.frame(label_3 = replicate(1, sample(25:45, 10, replace = TRUE)))
Y_E <- data.frame(label_4 = replicate(1, sample(21:80, 10, replace = TRUE)))
targets <- list(Y_H, Y_C, Y_R, Y_E)

## Remove correlated independent variables
cor_cutoff <- 0.40

# Return `df` without the columns findCorrelation() flags at `cutoff`.
# The result is always a data frame.
drop_correlated <- function(df, cutoff) {
  high_cor <- findCorrelation(cor(df), cutoff = cutoff)
  if (length(high_cor) == 0) {
    # Nothing flagged: df[, -integer(0)] would select zero columns.
    return(df)
  }
  # drop = FALSE keeps a data frame even when a single column survives.
  df[, -high_cor, drop = FALSE]
}

outputs <- lapply(inputs, drop_correlated, cutoff = cor_cutoff)

## Do lasso regression: one cross-validated fit per (dataset, target) pair
stopifnot(length(outputs) == length(targets))

lasso_cv <- vector("list", length(outputs))
lasso_model <- vector("list", length(outputs))

for (i in seq_along(outputs)) {
  # Dataset i is only ever fitted against its own target i.
  # Note: glmnet needs at least two predictor columns in x.
  x <- as.matrix(outputs[[i]])
  y <- as.matrix(targets[[i]])

  lasso_cv[[i]] <- cv.glmnet(
    x, y,
    standardize = TRUE, type.measure = "mse", alpha = 1, nfolds = 3
  )

  # Refit at the lambda that minimised the cross-validated error.
  lasso_model[[i]] <- glmnet(
    x, y,
    lambda = lasso_cv[[i]]$lambda.min, standardize = TRUE, alpha = 1
  )
}
  • 为每个数据框创建目标变量
  • 将所有数据框合并到一个列表中
  • 将所有目标变量合并到一个列表中
  • 注意:每个目标变量对应一个数据框
  • 相关性:删除高度相关的变量
  • 对列表中的所有数据集执行套索回归

创建数据框

library("caret")   # findCorrelation()
library("glmnet")  # cv.glmnet(), glmnet()

set.seed(99)

# Predictor datasets: 10 rows each, with a different number of columns.
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))

# One single-column target per dataset. Each target has exactly one column,
# so its name can be set directly instead of matching the generated name.
Y_H <- data.frame(replicate(1, sample(20:35, 10, replace = TRUE)))
Y_H
names(Y_H) <- "label_1"

Y_C <- data.frame(replicate(1, sample(15:65, 10, replace = TRUE)))
names(Y_C) <- "label_2"

Y_R <- data.frame(replicate(1, sample(25:45, 10, replace = TRUE)))
names(Y_R) <- "label_3"

Y_E <- data.frame(replicate(1, sample(21:80, 10, replace = TRUE)))
names(Y_E) <- "label_4"

# targets[[k]] is the response for inputs[[k]].
inputs <- list(H, C, R, E)
targets <- list(Y_H, Y_C, Y_R, Y_E)
stopifnot(length(inputs) == length(targets))

# Correlation filter: outputs[[k]] is inputs[[k]] without the columns
# flagged by findCorrelation().
outputs <- vector("list", length(inputs))

for (k in seq_along(inputs)) {
  df <- inputs[[k]]
  data.cor <- cor(df)
  high.cor <- findCorrelation(data.cor, cutoff = 0.40)
  if (length(high.cor) > 0) {
    # Guarded because df[, -integer(0)] would select zero columns;
    # drop = FALSE keeps a data frame when only one column is left.
    df <- df[, -high.cor, drop = FALSE]
  }
  outputs[[k]] <- df
}

# Lasso: loop over positions (not over the data frames themselves) so that
# dataset i is fitted against its own target i.
lasso_cv <- vector("list", length(outputs))
lasso_model <- vector("list", length(outputs))

for (i in seq_along(outputs)) {
  # Note: glmnet needs at least two predictor columns in x.
  x <- as.matrix(outputs[[i]])
  y <- as.matrix(targets[[i]])

  # `[[i]]` stores the whole fitted object in slot i.
  lasso_cv[[i]] <- cv.glmnet(
    x, y,
    standardize = TRUE, type.measure = "mse", alpha = 1, nfolds = 3
  )

  # Refit at the lambda that minimised the cross-validated error.
  lasso_model[[i]] <- glmnet(
    x, y,
    lambda = lasso_cv[[i]]$lambda.min, alpha = 1, standardize = TRUE
  )
}