在 R 中使用 foreach 和 doParallel 代替 for 循环
Foreach and doparallel instead of for loop in R
我需要通过多线程来加速 for 循环,并想为此使用 foreach 和 doParallel 这两个包。我以前用过它们,但只用于每次迭代只需返回一个结果表的情况。我不知道如何用它们同时导出多个结果表(此处的结果都是表格)。我的实际问题要复杂得多,需要导出许多结果集。为简单起见,这里使用 iris(鸢尾花)数据集作为示例。
library(randomForest)
library(caret)
# Repeat a random 50/50 split of iris (partitioned on Species with
# caret::createDataPartition) 50 times. For every split a random forest is
# fitted on the training half and evaluated on the test half.
#
# Results:
#   results_class   - one row per (iteration, class) with the per-class
#                     statistics from confusionMatrix()$byClass
#   results_overall - one row per iteration with the overall statistics from
#                     confusionMatrix()$overall
#
# Per-iteration results are collected in preallocated lists and bound once at
# the end. Growing a data frame with rbind() inside the loop copies it on
# every iteration, and binding an unnamed vector onto an empty data.frame()
# loses the metric names.
n_iter <- 50L
class_list <- vector("list", n_iter)
overall_list <- vector("list", n_iter)

for (i in seq_len(n_iter)) {
  trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
  irisTrain <- iris[trainIndex, ]
  irisTest <- iris[-trainIndex, ]

  model <- randomForest(
    x = irisTrain[, c(1:4)], y = irisTrain[, 5], importance = TRUE,
    replace = TRUE, mtry = 4, ntree = 500, na.action = na.omit,
    do.trace = 100, type = "classification"
  )

  pred_test <- predict(model, irisTest[, c(1:4)])
  con.mat_test <- confusionMatrix(pred_test, irisTest[, 5], mode = "everything")

  # byClass carries the class label in its row names ("Class: setosa", ...).
  # Move it into a real column so it survives the final rbind() unchanged.
  by_class <- con.mat_test[["byClass"]]
  class_list[[i]] <- data.frame(
    iteration = i,
    class = sub("^Class: ", "", rownames(by_class)),
    by_class,
    row.names = NULL
  )

  # overall is a named numeric vector; as.list() turns each metric into its
  # own named column.
  overall_list[[i]] <- data.frame(
    iteration = i,
    as.list(con.mat_test[["overall"]])
  )
}

results_class <- do.call(rbind, class_list)
results_overall <- do.call(rbind, overall_list)
据我所知,在 foreach 循环内部修改循环外部的变量并不容易(甚至不可能),那么把每次迭代的多个结果存储在一个嵌套的 tibble 中怎么样?
library(randomForest)
library(caret)
library(foreach)
library(doParallel)
# Set up parallel computing: one worker per logical core. detectCores() may
# return NA when the core count is unknown, so fall back to a single worker.
n_cores <- detectCores(logical = TRUE)
if (is.na(n_cores)) {
  n_cores <- 1L
}
cl <- makeCluster(n_cores)
registerDoParallel(cl)

# Each iteration draws a random 50/50 split of iris, fits a random forest on
# the training half, evaluates it on the test half and returns a one-row
# tibble. The two result tables are stored as list columns:
#   class   - per-class statistics (3 x 12) with the class label in `class`
#   overall - overall statistics (7 x 2) with the metric name in `metric`
# .combine = rbind stacks the one-row tibbles into a single 50-row tibble.
#
# The loop runs inside tryCatch(..., finally = ) so the worker processes are
# shut down even when an iteration fails.
res <- tryCatch(
  foreach(
    i = 1:50,
    .packages = c("caret", "randomForest"),
    .combine = rbind
  ) %dopar% {
    trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
    irisTrain <- iris[trainIndex, ]
    irisTest <- iris[-trainIndex, ]

    model <- randomForest(
      x = irisTrain[, c(1:4)], y = irisTrain[, 5], importance = TRUE,
      replace = TRUE, mtry = 4, ntree = 500, na.action = na.omit,
      do.trace = 100, type = "classification"
    )

    pred_test <- predict(model, irisTest[, c(1:4)])
    con.mat_test <- confusionMatrix(pred_test, irisTest[, 5], mode = "everything")

    # Per-class statistics: row names look like "Class: setosa"; strip the
    # prefix to get the bare class label.
    class_stats <- data.frame(con.mat_test[["byClass"]])
    class_stats$class <- sub("^Class: ", "", rownames(class_stats))

    # Overall statistics: row names are metric names ("Accuracy", "Kappa",
    # ...), which have no "Class: " prefix, so they are kept verbatim.
    overall_stats <- data.frame(value = con.mat_test[["overall"]])
    overall_stats$metric <- rownames(overall_stats)

    # Save output data frames in a tibble as list columns
    tibble::tibble(
      iteration = i,
      class = list(class_stats),
      overall = list(overall_stats)
    )
  },
  finally = {
    # Stop the cluster and restore the sequential backend
    stopCluster(cl)
    registerDoSEQ()
  }
)
输出结果如下:
> print(res)
# A tibble: 50 x 3
iteration class overall
<int> <list> <list>
1 1 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
2 2 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
3 3 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
4 4 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
5 5 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
6 6 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
7 7 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
8 8 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
9 9 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
10 10 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
# ... with 40 more rows
我需要通过多线程来加速 for 循环,并想为此使用 foreach 和 doParallel 这两个包。我以前用过它们,但只用于每次迭代只需返回一个结果表的情况。我不知道如何用它们同时导出多个结果表(此处的结果都是表格)。我的实际问题要复杂得多,需要导出许多结果集。为简单起见,这里使用 iris(鸢尾花)数据集作为示例。
library(randomForest)
library(caret)
# Repeat a random 50/50 split of iris (partitioned on Species with
# caret::createDataPartition) 50 times. For every split a random forest is
# fitted on the training half and evaluated on the test half.
#
# Results:
#   results_class   - one row per (iteration, class) with the per-class
#                     statistics from confusionMatrix()$byClass
#   results_overall - one row per iteration with the overall statistics from
#                     confusionMatrix()$overall
#
# Per-iteration results are collected in preallocated lists and bound once at
# the end. Growing a data frame with rbind() inside the loop copies it on
# every iteration, and binding an unnamed vector onto an empty data.frame()
# loses the metric names.
n_iter <- 50L
class_list <- vector("list", n_iter)
overall_list <- vector("list", n_iter)

for (i in seq_len(n_iter)) {
  trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
  irisTrain <- iris[trainIndex, ]
  irisTest <- iris[-trainIndex, ]

  model <- randomForest(
    x = irisTrain[, c(1:4)], y = irisTrain[, 5], importance = TRUE,
    replace = TRUE, mtry = 4, ntree = 500, na.action = na.omit,
    do.trace = 100, type = "classification"
  )

  pred_test <- predict(model, irisTest[, c(1:4)])
  con.mat_test <- confusionMatrix(pred_test, irisTest[, 5], mode = "everything")

  # byClass carries the class label in its row names ("Class: setosa", ...).
  # Move it into a real column so it survives the final rbind() unchanged.
  by_class <- con.mat_test[["byClass"]]
  class_list[[i]] <- data.frame(
    iteration = i,
    class = sub("^Class: ", "", rownames(by_class)),
    by_class,
    row.names = NULL
  )

  # overall is a named numeric vector; as.list() turns each metric into its
  # own named column.
  overall_list[[i]] <- data.frame(
    iteration = i,
    as.list(con.mat_test[["overall"]])
  )
}

results_class <- do.call(rbind, class_list)
results_overall <- do.call(rbind, overall_list)
据我所知,在 foreach 循环内部修改循环外部的变量并不容易(甚至不可能),那么把每次迭代的多个结果存储在一个嵌套的 tibble 中怎么样?
library(randomForest)
library(caret)
library(foreach)
library(doParallel)
# Set up parallel computing: one worker per logical core. detectCores() may
# return NA when the core count is unknown, so fall back to a single worker.
n_cores <- detectCores(logical = TRUE)
if (is.na(n_cores)) {
  n_cores <- 1L
}
cl <- makeCluster(n_cores)
registerDoParallel(cl)

# Each iteration draws a random 50/50 split of iris, fits a random forest on
# the training half, evaluates it on the test half and returns a one-row
# tibble. The two result tables are stored as list columns:
#   class   - per-class statistics (3 x 12) with the class label in `class`
#   overall - overall statistics (7 x 2) with the metric name in `metric`
# .combine = rbind stacks the one-row tibbles into a single 50-row tibble.
#
# The loop runs inside tryCatch(..., finally = ) so the worker processes are
# shut down even when an iteration fails.
res <- tryCatch(
  foreach(
    i = 1:50,
    .packages = c("caret", "randomForest"),
    .combine = rbind
  ) %dopar% {
    trainIndex <- caret::createDataPartition(iris$Species, p = 0.5, list = FALSE)
    irisTrain <- iris[trainIndex, ]
    irisTest <- iris[-trainIndex, ]

    model <- randomForest(
      x = irisTrain[, c(1:4)], y = irisTrain[, 5], importance = TRUE,
      replace = TRUE, mtry = 4, ntree = 500, na.action = na.omit,
      do.trace = 100, type = "classification"
    )

    pred_test <- predict(model, irisTest[, c(1:4)])
    con.mat_test <- confusionMatrix(pred_test, irisTest[, 5], mode = "everything")

    # Per-class statistics: row names look like "Class: setosa"; strip the
    # prefix to get the bare class label.
    class_stats <- data.frame(con.mat_test[["byClass"]])
    class_stats$class <- sub("^Class: ", "", rownames(class_stats))

    # Overall statistics: row names are metric names ("Accuracy", "Kappa",
    # ...), which have no "Class: " prefix, so they are kept verbatim.
    overall_stats <- data.frame(value = con.mat_test[["overall"]])
    overall_stats$metric <- rownames(overall_stats)

    # Save output data frames in a tibble as list columns
    tibble::tibble(
      iteration = i,
      class = list(class_stats),
      overall = list(overall_stats)
    )
  },
  finally = {
    # Stop the cluster and restore the sequential backend
    stopCluster(cl)
    registerDoSEQ()
  }
)
输出结果如下:
> print(res)
# A tibble: 50 x 3
iteration class overall
<int> <list> <list>
1 1 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
2 2 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
3 3 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
4 4 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
5 5 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
6 6 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
7 7 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
8 8 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
9 9 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
10 10 <df[,12] [3 x 12]> <df[,2] [7 x 2]>
# ... with 40 more rows