R Parallel processing error `Error in checkForRemoteErrors(val) : 6 nodes produced errors; first error: subscript out of bounds`
R Parallel processing error `Error in checkForRemoteErrors(val) : 6 nodes produced errors; first error: subscript out of bounds`
我正在学习并行处理,以此来处理一些庞大的数据集。
我预定义了一些变量如下:
CV <- function(mean, sd) {(sd / mean) * 100}
distThreshold <- 5 # Distance threshold
CVThreshold <- 20 # CV threshold
LocalCV <- list()
Num.CV <- list()
然后加载parallel
库,分配基变量和库到簇:
library(parallel)
clust_cores <- makeCluster(detectCores(logical = T) )
clusterExport(clust_cores, c("i","YieldData2rd","CV", "distThreshold", "CVThreshold"))
clusterEvalQ(clust_cores, library(sp))
然后将簇参数clust_cores
传递给parSapply
:
for (i in seq(YieldData2rd)) {
LocalCV[[i]] = parSapply(clust_cores, X = 1:length(YieldData2rd[[i]]),
FUN = function(pt) {
d = spDistsN1(YieldData2rd[[i]], YieldData2rd[[i]][pt,])
ret = CV(mean = mean(YieldData2rd[[i]][d < distThreshold, ]$yield),
sd = sd(YieldData2rd[[i]][d < distThreshold, ]$yield))
return(ret)
}) # calculate CV in the local neighbour
}
stopCluster(clust_cores)
然后除了warning messages:
1: closing unused connection (<-localhost:11688)
,我还得到了Error in checkForRemoteErrors(val) : 6 nodes produced errors; first error: subscript out of bounds
。
请告诉我如何解决这个问题。
对于可重现的示例,我创建了一个大型列表对象,它在没有并行处理组件的原始 for
循环中运行良好。
library('rgdal')
Yield1 <- data.frame(yield=rnorm(460, mean = 10), x1=rnorm(460, mean = 1843235), x2=rnorm(460,mean = 5802532))
Yield2 <- data.frame(yield=rnorm(408, mean = 10), x1=rnorm(408, mean = 1843235), x2=rnorm(408, mean = 5802532))
Yield3 <- data.frame(yield=rnorm(369, mean = 10), x1=rnorm(369, mean = 1843235), x2=rnorm(369, mean = 5802532))
coordinates(Yield1) <- c('x1', 'x2')
coordinates(Yield2) <- c('x1', 'x2')
coordinates(Yield3) <- c('x1', 'x2')
YieldData2rd <- list(Yield1, Yield2, Yield3)
感谢@Omry Atia 的评论,我开始研究 foreach
包并进行了第一次尝试。
library(foreach)
library(doParallel)
#setup parallel backend to use many processors
cores=detectCores()
clust_cores <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(clust_cores)
LocalCV = foreach(i = seq(YieldData2rd), .combine=list, .multicombine=TRUE) %dopar% {
LocalCV[[i]] = sapply(X = 1:length(YieldData2rd[[i]]),
FUN = function(pt) {
d = spDistsN1(YieldData2rd[[i]], YieldData2rd[[i]][pt,])
ret = CV(mean = mean(YieldData2rd[[i]][d < distThreshold, ]$yield),
sd = sd(YieldData2rd[[i]][d < distThreshold, ]$yield))
return(ret)
}) # calculate CV in the local neighbour
}
stopCluster(clust_cores)
它会打印出整个内容,而不会将 LocalCV
放在 foreach
的前面。
它将在一些巨大的数据集上尝试新代码,看看它能有多快。
参考:
我正在学习并行处理,以此来处理一些庞大的数据集。
我预定义了一些变量如下:
CV <- function(mean, sd) {(sd / mean) * 100}
distThreshold <- 5 # Distance threshold
CVThreshold <- 20 # CV threshold
LocalCV <- list()
Num.CV <- list()
然后加载parallel
库,分配基变量和库到簇:
library(parallel)
clust_cores <- makeCluster(detectCores(logical = T) )
clusterExport(clust_cores, c("i","YieldData2rd","CV", "distThreshold", "CVThreshold"))
clusterEvalQ(clust_cores, library(sp))
然后将簇参数clust_cores
传递给parSapply
:
for (i in seq(YieldData2rd)) {
LocalCV[[i]] = parSapply(clust_cores, X = 1:length(YieldData2rd[[i]]),
FUN = function(pt) {
d = spDistsN1(YieldData2rd[[i]], YieldData2rd[[i]][pt,])
ret = CV(mean = mean(YieldData2rd[[i]][d < distThreshold, ]$yield),
sd = sd(YieldData2rd[[i]][d < distThreshold, ]$yield))
return(ret)
}) # calculate CV in the local neighbour
}
stopCluster(clust_cores)
然后除了warning messages:
1: closing unused connection (<-localhost:11688)
,我还得到了Error in checkForRemoteErrors(val) : 6 nodes produced errors; first error: subscript out of bounds
。
请告诉我如何解决这个问题。
对于可重现的示例,我创建了一个大型列表对象,它在没有并行处理组件的原始 for
循环中运行良好。
library('rgdal')
Yield1 <- data.frame(yield=rnorm(460, mean = 10), x1=rnorm(460, mean = 1843235), x2=rnorm(460,mean = 5802532))
Yield2 <- data.frame(yield=rnorm(408, mean = 10), x1=rnorm(408, mean = 1843235), x2=rnorm(408, mean = 5802532))
Yield3 <- data.frame(yield=rnorm(369, mean = 10), x1=rnorm(369, mean = 1843235), x2=rnorm(369, mean = 5802532))
coordinates(Yield1) <- c('x1', 'x2')
coordinates(Yield2) <- c('x1', 'x2')
coordinates(Yield3) <- c('x1', 'x2')
YieldData2rd <- list(Yield1, Yield2, Yield3)
感谢@Omry Atia 的评论,我开始研究 foreach
包并进行了第一次尝试。
library(foreach)
library(doParallel)
#setup parallel backend to use many processors
cores=detectCores()
clust_cores <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(clust_cores)
LocalCV = foreach(i = seq(YieldData2rd), .combine=list, .multicombine=TRUE) %dopar% {
LocalCV[[i]] = sapply(X = 1:length(YieldData2rd[[i]]),
FUN = function(pt) {
d = spDistsN1(YieldData2rd[[i]], YieldData2rd[[i]][pt,])
ret = CV(mean = mean(YieldData2rd[[i]][d < distThreshold, ]$yield),
sd = sd(YieldData2rd[[i]][d < distThreshold, ]$yield))
return(ret)
}) # calculate CV in the local neighbour
}
stopCluster(clust_cores)
它会打印出整个内容,而不会将 LocalCV
放在 foreach
的前面。
它将在一些巨大的数据集上尝试新代码,看看它能有多快。
参考: