R 中用于通过循环保存数据的并行计算
Parallel computation in R for saving data over loops
我尝试对下面这段简单的代码使用并行计算,以便在多个循环中保存 openxlsx 的输出,但没有成功。
有人能帮我把这段代码改成并行版本吗?这段代码处理的是真实规模的数据(超过 5000 万条观测值,运行一次需要 13 小时)。哪怕只能缩短 2 小时,对我来说也意义重大。
library(dplyr)
library(readxl)
library(openxlsx)
library(foreach)
library(doParallel)
# Read the source workbook, then build one data set per TYPE value.
rawdata <- readxl::read_xlsx(path = "~/Desktop/Book1.xlsx")
TYPE1 <- dplyr::filter(rawdata, TYPE == "A")
TYPE2 <- dplyr::filter(rawdata, TYPE == "B")

# Break each data set into a named list keyed by the Name column.
Split.TYPE1 <- split(x = TYPE1, f = TYPE1$Name)
Split.TYPE2 <- split(x = TYPE2, f = TYPE2$Name)
#--------------------------------- Save the TYPE A reports------------------------------------------------------------------------------
###################################(the foreach lines are coded)
# Write one workbook per Name for the TYPE A data (sequential version).
# Failed parallel attempt, kept for reference; `1:names(...)` is not a valid
# sequence, which is one reason it could not run:
# foreach(nm=1:names(Split.TYPE1), .combine=cbind) %dopar% {
for (nm in names(Split.TYPE1)) {
  file <- paste0(nm, ".xlsx")
  d1 <- as.data.frame(Split.TYPE1[[nm]])
  # createWorkbook()'s first argument is `creator`, not a file name, so it is
  # called without arguments; the path is only needed by saveWorkbook().
  wb <- createWorkbook()
  addWorksheet(wb, "test", gridLines = TRUE)
  writeData(wb, sheet = "test", x = d1)
  saveWorkbook(wb, file, overwrite = TRUE)
}
# #------------------------------ Save the TYPE B in a folder ----------------------------------
# Write the TYPE B data as one folder per Name, each holding one workbook per
# Surname (sequential version).
for (dn in names(Split.TYPE2)) {
  # The folder may be left over from an earlier run; only create it when it
  # is missing so reruns do not emit an "already exists" warning.
  if (!dir.exists(dn)) {
    dir.create(dn)
  }
  sub_Split.TYPE2 <- split(Split.TYPE2[[dn]], Split.TYPE2[[dn]]$Surname)
  for (fn in names(sub_Split.TYPE2)) {
    file <- file.path(dn, paste0(fn, ".xlsx"))
    d1 <- as.data.frame(sub_Split.TYPE2[[fn]])
    # createWorkbook()'s first argument is `creator`, not a file name, so it
    # is called without arguments; the path is only needed by saveWorkbook().
    wb <- createWorkbook()
    addWorksheet(wb, "test", gridLines = TRUE)
    writeData(wb, sheet = "test", x = d1)
    saveWorkbook(wb, file, overwrite = TRUE)
  }
}
数据:
Name Surname TYPE
John Greer A
David bear A
Rose beer B
Tara tea B
Sam Mac B
Alan Glass B
Brad Newman A
Kristen Goodman A
Jessica Goodwin A
Heather Poker B
因为我没有你的数据,所以我做了一些小的虚拟样本。
我使用的包:
library(tidyverse)
library(openxlsx)
library(foreach)
library(doParallel)
这部分来自您,没有任何改变。
# NOTE(review): `rawdata` is not created in this snippet; it is expected to
# exist already with TYPE and Name columns.
# Keep only the rows whose TYPE is "A" (TYPE1) or "B" (TYPE2).
TYPE1 <- rawdata %>% filter(TYPE == "A")
TYPE2 <- rawdata %>% filter(TYPE == "B")
# Split each subset into a named list with one element per Name value.
Split.TYPE1 <- split(TYPE1, TYPE1$Name)
Split.TYPE2 <- split(TYPE2, TYPE2$Name)
定义并行后端。我在这里使用 6 个内核。
# Start a cluster of 6 workers and register it as the backend that %dopar%
# dispatches to.
cl <- parallel::makeCluster(spec = 6)
doParallel::registerDoParallel(cl = cl)
这是您的第一个循环。不要忘记添加 .packages = "openxlsx",这样可以确保每个工作进程(worker)也会加载该包。
我稍微改动了代码,因为 nm in names(Split.TYPE1) 这种写法不能直接用于 foreach。
也许有更简单的解决方案,但我不知道。
# Parallel version of the first loop: one workbook per Name in Split.TYPE1.
# - seq_along() instead of 1:length(): an empty list yields zero iterations
#   rather than the indices c(1, 0).
# - No .combine: the loop is run only for its side effect (writing files), so
#   each task returns NULL and the collected result is discarded.
invisible(
  foreach(nm = seq_along(Split.TYPE1), .packages = "openxlsx") %dopar% {
    file <- paste0(names(Split.TYPE1)[nm], ".xlsx")
    d1 <- as.data.frame(Split.TYPE1[[nm]])
    # createWorkbook()'s first argument is `creator`, not a file name, so it
    # is called without arguments; the path is only needed by saveWorkbook().
    wb <- createWorkbook()
    addWorksheet(wb, "test", gridLines = TRUE)
    writeData(wb, sheet = "test", x = d1)
    saveWorkbook(wb, file, overwrite = TRUE)
    NULL
  }
)
第二个循环。这种写法我以前只用过一次,当时效果很好。这就是创建嵌套 foreach 循环的方法(使用 %:% 运算符)。更多信息请参见 foreach 包关于嵌套循环的说明文档。
# Parallel version of the second loop: one folder per Name in Split.TYPE2,
# holding one workbook per Surname.

# Split every Name group by Surname once, up front. The inner loop length and
# the data it indexes then come from the same list, so they cannot disagree
# (e.g. when Surname contains NA, which unique() counts but split() drops),
# and the split is not recomputed inside every task.
Sub.Split.TYPE2 <- lapply(Split.TYPE2, function(d) split(d, d$Surname))

invisible(
  foreach(dn = seq_along(Sub.Split.TYPE2)) %:%
    foreach(
      fn = seq_along(Sub.Split.TYPE2[[dn]]),
      .packages = "openxlsx"
    ) %dopar% {
      dnn <- names(Sub.Split.TYPE2)[dn]
      # Several tasks can target the same folder, so it may already exist;
      # suppress the "already exists" warning instead of raising it per file.
      dir.create(dnn, showWarnings = FALSE)
      file <- file.path(
        dnn,
        paste0(names(Sub.Split.TYPE2[[dn]])[fn], ".xlsx")
      )
      d1 <- as.data.frame(Sub.Split.TYPE2[[dn]][[fn]])
      # createWorkbook()'s first argument is `creator`, not a file name, so
      # it is called without arguments; saveWorkbook() receives the path.
      wb <- createWorkbook()
      addWorksheet(wb, "test", gridLines = TRUE)
      writeData(wb, sheet = "test", x = d1)
      saveWorkbook(wb, file, overwrite = TRUE)
      # The loop is run for its side effect only; return nothing to collect.
      NULL
    }
)
最后,停止并行后端。
# Shut down the worker processes started for the parallel backend.
parallel::stopCluster(cl = cl)
使用您的数据,我得到以下嵌套循环的 folder/file 结构:
- Alan
- Glass.xlsx
- Heather
- Poker.xlsx
- Rose
- beer.xlsx
- Sam
- Mac.xlsx
- Tara
- tea.xlsx
我尝试对下面这段简单的代码使用并行计算,以便在多个循环中保存 openxlsx 的输出,但没有成功。
有人能帮我把这段代码改成并行版本吗?这段代码处理的是真实规模的数据(超过 5000 万条观测值,运行一次需要 13 小时)。哪怕只能缩短 2 小时,对我来说也意义重大。
library(dplyr)
library(readxl)
library(openxlsx)
library(foreach)
library(doParallel)
# Read the source workbook, then build one data set per TYPE value.
rawdata <- readxl::read_xlsx(path = "~/Desktop/Book1.xlsx")
TYPE1 <- dplyr::filter(rawdata, TYPE == "A")
TYPE2 <- dplyr::filter(rawdata, TYPE == "B")

# Break each data set into a named list keyed by the Name column.
Split.TYPE1 <- split(x = TYPE1, f = TYPE1$Name)
Split.TYPE2 <- split(x = TYPE2, f = TYPE2$Name)
#--------------------------------- Save the TYPE A reports------------------------------------------------------------------------------
###################################(the foreach lines are coded)
# Write one workbook per Name for the TYPE A data (sequential version).
# Failed parallel attempt, kept for reference; `1:names(...)` is not a valid
# sequence, which is one reason it could not run:
# foreach(nm=1:names(Split.TYPE1), .combine=cbind) %dopar% {
for (nm in names(Split.TYPE1)) {
  file <- paste0(nm, ".xlsx")
  d1 <- as.data.frame(Split.TYPE1[[nm]])
  # createWorkbook()'s first argument is `creator`, not a file name, so it is
  # called without arguments; the path is only needed by saveWorkbook().
  wb <- createWorkbook()
  addWorksheet(wb, "test", gridLines = TRUE)
  writeData(wb, sheet = "test", x = d1)
  saveWorkbook(wb, file, overwrite = TRUE)
}
# #------------------------------ Save the TYPE B in a folder ----------------------------------
# Write the TYPE B data as one folder per Name, each holding one workbook per
# Surname (sequential version).
for (dn in names(Split.TYPE2)) {
  # The folder may be left over from an earlier run; only create it when it
  # is missing so reruns do not emit an "already exists" warning.
  if (!dir.exists(dn)) {
    dir.create(dn)
  }
  sub_Split.TYPE2 <- split(Split.TYPE2[[dn]], Split.TYPE2[[dn]]$Surname)
  for (fn in names(sub_Split.TYPE2)) {
    file <- file.path(dn, paste0(fn, ".xlsx"))
    d1 <- as.data.frame(sub_Split.TYPE2[[fn]])
    # createWorkbook()'s first argument is `creator`, not a file name, so it
    # is called without arguments; the path is only needed by saveWorkbook().
    wb <- createWorkbook()
    addWorksheet(wb, "test", gridLines = TRUE)
    writeData(wb, sheet = "test", x = d1)
    saveWorkbook(wb, file, overwrite = TRUE)
  }
}
数据:
Name Surname TYPE
John Greer A
David bear A
Rose beer B
Tara tea B
Sam Mac B
Alan Glass B
Brad Newman A
Kristen Goodman A
Jessica Goodwin A
Heather Poker B
因为我没有你的数据,所以我做了一些小的虚拟样本。
我使用的包:
library(tidyverse)
library(openxlsx)
library(foreach)
library(doParallel)
这部分来自您,没有任何改变。
# NOTE(review): `rawdata` is not created in this snippet; it is expected to
# exist already with TYPE and Name columns.
# Keep only the rows whose TYPE is "A" (TYPE1) or "B" (TYPE2).
TYPE1 <- rawdata %>% filter(TYPE == "A")
TYPE2 <- rawdata %>% filter(TYPE == "B")
# Split each subset into a named list with one element per Name value.
Split.TYPE1 <- split(TYPE1, TYPE1$Name)
Split.TYPE2 <- split(TYPE2, TYPE2$Name)
定义并行后端。我在这里使用 6 个内核。
# Start a cluster of 6 workers and register it as the backend that %dopar%
# dispatches to.
cl <- parallel::makeCluster(spec = 6)
doParallel::registerDoParallel(cl = cl)
这是您的第一个循环。不要忘记添加 .packages = "openxlsx",这样可以确保每个工作进程(worker)也会加载该包。
我稍微改动了代码,因为 nm in names(Split.TYPE1) 这种写法不能直接用于 foreach。
也许有更简单的解决方案,但我不知道。
# Parallel version of the first loop: one workbook per Name in Split.TYPE1.
# - seq_along() instead of 1:length(): an empty list yields zero iterations
#   rather than the indices c(1, 0).
# - No .combine: the loop is run only for its side effect (writing files), so
#   each task returns NULL and the collected result is discarded.
invisible(
  foreach(nm = seq_along(Split.TYPE1), .packages = "openxlsx") %dopar% {
    file <- paste0(names(Split.TYPE1)[nm], ".xlsx")
    d1 <- as.data.frame(Split.TYPE1[[nm]])
    # createWorkbook()'s first argument is `creator`, not a file name, so it
    # is called without arguments; the path is only needed by saveWorkbook().
    wb <- createWorkbook()
    addWorksheet(wb, "test", gridLines = TRUE)
    writeData(wb, sheet = "test", x = d1)
    saveWorkbook(wb, file, overwrite = TRUE)
    NULL
  }
)
第二个循环。这种写法我以前只用过一次,当时效果很好。这就是创建嵌套 foreach 循环的方法(使用 %:% 运算符)。更多信息请参见 foreach 包关于嵌套循环的说明文档。
# Parallel version of the second loop: one folder per Name in Split.TYPE2,
# holding one workbook per Surname.

# Split every Name group by Surname once, up front. The inner loop length and
# the data it indexes then come from the same list, so they cannot disagree
# (e.g. when Surname contains NA, which unique() counts but split() drops),
# and the split is not recomputed inside every task.
Sub.Split.TYPE2 <- lapply(Split.TYPE2, function(d) split(d, d$Surname))

invisible(
  foreach(dn = seq_along(Sub.Split.TYPE2)) %:%
    foreach(
      fn = seq_along(Sub.Split.TYPE2[[dn]]),
      .packages = "openxlsx"
    ) %dopar% {
      dnn <- names(Sub.Split.TYPE2)[dn]
      # Several tasks can target the same folder, so it may already exist;
      # suppress the "already exists" warning instead of raising it per file.
      dir.create(dnn, showWarnings = FALSE)
      file <- file.path(
        dnn,
        paste0(names(Sub.Split.TYPE2[[dn]])[fn], ".xlsx")
      )
      d1 <- as.data.frame(Sub.Split.TYPE2[[dn]][[fn]])
      # createWorkbook()'s first argument is `creator`, not a file name, so
      # it is called without arguments; saveWorkbook() receives the path.
      wb <- createWorkbook()
      addWorksheet(wb, "test", gridLines = TRUE)
      writeData(wb, sheet = "test", x = d1)
      saveWorkbook(wb, file, overwrite = TRUE)
      # The loop is run for its side effect only; return nothing to collect.
      NULL
    }
)
最后,停止并行后端。
# Shut down the worker processes started for the parallel backend.
parallel::stopCluster(cl = cl)
使用您的数据,我得到以下嵌套循环的 folder/file 结构:
- Alan
- Glass.xlsx
- Heather
- Poker.xlsx
- Rose
- beer.xlsx
- Sam
- Mac.xlsx
- Tara
- tea.xlsx