加快报告生成中的双循环。也许有清单？

Question

您好，我有一个运行速度非常快的模拟。我遇到的问题是加快模拟生成的数据的报告速度。

#Load in relevant libraries
library(splitstackshape)
library(foreach)
library(doParallel)

#sample data for simulation

set.seed(100)
input <- data.frame(JobNum = seq(1:200)
    ,HangsPerWeek = sample(1:50, 200,replace=T)
    ,DS.CT = sample(c(38,41,43),200,replace=T)
    ,C1.CT = sample(c(40,41,42),200,replace=T)
    ,C2.CT = sample(c(36,41),200,replace=T)
    ,C3.CT = sample(c(38,39,40),200,replace=T)
    ,C4.CT = sample(c(40,27),200,replace=T)
    ,C5D5.CT = sample(c(20,21,22),200,replace=T)
    ,C6D6.CT = sample(c(20,21,22),200,replace=T)
    ,C5D7.CT = sample(c(20,21,22),200,replace=T)
    ,C6D8.CT = sample(c(9,22,23),200,replace=T)
    ,C7CD.CT = sample(c(40,41),200,replace=T))

input$JobNum<-as.character(input$JobNum)

#expand input file to have a single row per part
partsList<-expandRows(input, "HangsPerWeek")

#Set up cluster using all but one core on machine 
#this runs the simulation in parallel
cores=detectCores()
cl <- makeCluster(cores[1]-1)
registerDoParallel(cl)

#Initialize variables
partsOrder <- list()
numSim <- 10

#start simulation
SimResults <- foreach(j=1:numSim) %dopar%{

RobotSimulation <- function(){

#randomize the dataset of parts and record the order
set.seed(100)
parts <- partsList[sample(nrow(partsList)),]
partsOrder <- list(parts$JobNum)

#choose a random sample of parts to populate the conveyor belts
#this random assignment will be constant across all iterations of the simulation
set.seed(101)
LineParts <- partsList[sample(nrow(partsList),234,replace=FALSE),]

#pass parts through system one at a time and record cycle times at each dip
LineParts_dfList <- lapply(seq(nrow(parts)), function(i){      
    #Index line
    LinePartsTemp <- parts[1,]
    LinePartsTemp[2:nrow(LineParts),] <- LineParts[1:nrow(LineParts)-1,]

    #put new part into system
    LinePartsTemp[1,] <- parts[i,]

    #update the list of parts on the line
    LineParts <<- LinePartsTemp      
})

otherstations_veclist <- 
  list(
    DS = vapply(LineParts_dfList, function(df) df[1,'DS.CT'], numeric(1)),
    D1 = vapply(LineParts_dfList, function(df) df[10,'C1.CT'], numeric(1)),
    D2 = vapply(LineParts_dfList, function(df) df[26,'C2.CT'], numeric(1)),
    D3 = vapply(LineParts_dfList, function(df) df[42,'C3.CT'], numeric(1)),
    D4 = vapply(LineParts_dfList, function(df) df[57,'C4.CT'], numeric(1)),
    D5 = vapply(LineParts_dfList, function(df) df[85,'C5D5.CT'], numeric(1)),
    D6 = vapply(LineParts_dfList, function(df) df[120,'C6D6.CT'], numeric(1)),
    D7 = vapply(LineParts_dfList, function(df) df[167,'C5D7.CT'], numeric(1)),
    D8 = vapply(LineParts_dfList, function(df) df[210,'C6D8.CT'], numeric(1)),
    D9 = vapply(LineParts_dfList, function(df) df[216,'C7CD.CT'], numeric(1))
  )

jobstations_veclist <- 
  list(
    DS = vapply(LineParts_dfList, function(df) df[1,'JobNum'], character(1)),
    D1 = vapply(LineParts_dfList, function(df) df[10,'JobNum'], character(1)),
    D2 = vapply(LineParts_dfList, function(df) df[26,'JobNum'], character(1)),
    D3 = vapply(LineParts_dfList, function(df) df[42,'JobNum'], character(1)),
    D4 = vapply(LineParts_dfList, function(df) df[57,'JobNum'], character(1)),
    D5 = vapply(LineParts_dfList, function(df) df[85,'JobNum'], character(1)),
    D6 = vapply(LineParts_dfList, function(df) df[120,'JobNum'], character(1)),
    D7 = vapply(LineParts_dfList, function(df) df[167,'JobNum'], character(1)),
    D8 = vapply(LineParts_dfList, function(df) df[210,'JobNum'], character(1)),
    D9 = vapply(LineParts_dfList, function(df) df[216,'JobNum'], character(1))
  )

#record results
result <- list(partsOrder = partsOrder, CT = otherstations_veclist, JobNum = jobstations_veclist)
return(result)
}

RobotSimulation()

}

#stop using all cores
stopCluster(cl)

在此运行之后，我创建了 4 个不同的图形，其中包含一个双循环以完成模拟的每次迭代。有没有办法继续使用 R 中的列表来加速计算？还是 for 循环是唯一的选择？

#For every time we add a new part to the line for every simulation 
#find the felt cycle time and the bottlenecks

ProblemJob <- c()
FeltCT <- c()
BottleNeck <- c()
CTs <- c()

for(s in 1:numSim){
  for(p in 1:dim(partsList)[1]){
    CT <- c(SimResults[[s]][[2]][[1]][[p]],
      SimResults[[s]][[2]][[2]][[p]],
      SimResults[[s]][[2]][[3]][[p]],
      SimResults[[s]][[2]][[4]][[p]],
      SimResults[[s]][[2]][[5]][[p]],
      max(SimResults[[s]][[2]][[6]][[p]],SimResults[[s]][[2]][[7]][[p]])+max(SimResults[[s]][[2]][[8]][[p]], SimResults[[s]][[2]][[9]][[p]]),
      SimResults[[s]][[2]][[10]][[p]]
    )
    FeltCT <- append(FeltCT,max(CT))
    BottleNeck <- append(BottleNeck,which(CT==max(CT)))
    CTs <- append(CTs,CT[which(CT==max(CT))])
    ProblemJob <- append(ProblemJob,if(which(CT==max(CT))==1){paste('DS',SimResults[[s]][[3]][[1]][[p]],sep=' ')}
    else if(which(CT==max(CT))==2){paste('R1',SimResults[[s]][[3]][[2]][[p]],sep=' ')}
    else if(which(CT==max(CT))==3){paste('R2',SimResults[[s]][[3]][[3]][[p]],sep=' ')}
    else if(which(CT==max(CT))==4){paste('R3',SimResults[[s]][[3]][[4]][[p]],sep=' ')}
    else if(which(CT==max(CT))==5){paste('R4',SimResults[[s]][[3]][[5]][[p]],sep=' ')}
    else if(which(CT==max(CT))==6){c(
                    if(SimResults[[s]][[2]][[6]][[p]] >= SimResults[[s]][[2]][[7]][[p]]){paste('R5D5',SimResults[[s]][[3]][[6]][[p]],sep=' ')}else{paste('R6D6',SimResults[[s]][[3]][[7]][[p]],sep=' ')}
                    ,if(SimResults[[s]][[2]][[8]][[p]] >= SimResults[[s]][[2]][[9]][[p]]){paste('R5D7',SimResults[[s]][[3]][[8]][[p]],sep=' ')}else{paste('R6D8',SimResults[[s]][[3]][[9]][[p]],sep=' ')}
                    ,paste(if(SimResults[[s]][[2]][[6]][[p]] >= SimResults[[s]][[2]][[7]][[p]]){SimResults[[s]][[3]][[6]][[p]]}else{SimResults[[s]][[3]][[7]][[p]]},
                        if(SimResults[[s]][[2]][[8]][[p]] >= SimResults[[s]][[2]][[9]][[p]]){SimResults[[s]][[3]][[8]][[p]]}else{SimResults[[s]][[3]][[9]][[p]]},sep='/')
                    )}
    else if(which(CT==max(CT))==7){paste('R7',SimResults[[s]][[3]][[10]][[p]],sep=' ')}
      )
  }
}

BottleNeckPercent <- 100*tabulate(BottleNeck)/length(BottleNeck)
RobotAvg<-aggregate(CTs~BottleNeck,FUN=mean)
base <- data.frame(BottleNeck=seq(1:7),CTs=rep(0,7))
RobotAvg <- merge(base,RobotAvg,by='BottleNeck',all=TRUE)

par(mfrow=c(2,2))
bp <- barplot(RobotAvg$CTs.y,
    names.arg=c('DS','R1','R2','R3','R4','R5/R6','R7'),
    col="lightblue",
    main="Average Cycle Time per Robot",
    xlab="Robot",ylab="Seconds")
text(bp,RobotAvg$CTs.y,round(RobotAvg$CTs.y),pos=1)

hist(FeltCT,col='yellow',main=paste('Avg Cycle Time:',round(mean(FeltCT),1),sep=' '))

barplot(head(table(ProblemJob)[order(-table(ProblemJob))],20),las=2,col='red',main='Top 20 Problem Jobs and Location')
bp2<-barplot(BottleNeckPercent
    ,col='green'
    ,names=c('DS','R1','R2','R3','R4','R5/R6','R7')
    ,main='% Cause of Bottleneck'
    ,xlab='Robot'
    ,ylab='%')

text(bp2,BottleNeckPercent,paste(round(BottleNeckPercent,2),'%',sep=''),pos=1)

结果如下所示：

Answer 1

好的，所以我认为我最初的想法是实现你想要的东西不会太难，而且在我去吃晚饭之前这将是一个快速的想法。不是那么多，我仍然对解决方案不满意，但主要是 ProblemJob 变量让它变得痛苦。我在笔记本电脑上计时的解决方案为我的解决方案提供了大约 1.5 - 1.7 秒，而你当前的解决方案为 36-39 秒。我相信它可以变得更有效率，但我现在需要吃饭。除了 ProblemJob 之外，我的答案与您的答案完全相同，正如在 OP 的评论中所讨论的那样。事不宜迟：

让你慢下来的其中一件事是在双循环中到处附加向量。删除它会产生巨大的差异。问题是对于一些您事先不知道有多少结果需要预分配的变量。但是，您在此处使用列表上的 lapply 和 purrr::pmap 函数进行了大量计算。还有一些地方你可以多次计算同一个东西。

我们可以一次性预先计算所有 CT 迭代，这让我们在嵌套迭代结构中少了一件事情，并创建了可能用于 ProblemJob 的所有标签：

library(purrr)
simlist = transpose(SimResults)[[2]] %>% lapply(.,function(x) do.call(cbind,x))
labels = transpose(SimResults)[[3]] %>% lapply(.,function(x) do.call(cbind,x))
CT_list = lapply(simlist, function(x) cbind(x[,1:5], pmax(x[,6],x[,7]) + pmax(x[,8],x[,9]),x[,10,drop = FALSE]))
bool1 = lapply(simlist, function(x) x[,6] > x[,7])
bool2 = lapply(simlist, function(x) x[,8] > x[,9])
special_labels = pmap(list(labels,bool1,bool2), function(x,y,z){
  paste(ifelse(y,x[,6],x[,7]), ifelse(z, x[,8],x[,9]),sep = "/")
})  

labels = lapply(labels, function(x) {
  x = t(x)
  x[] = paste(c("DS","R1","R2","R3","R4","R5D5", "R6D6","R5D7","R6D8","R7"), x)
  t(x)
  }
)

由于您反复将第二个和第三个组件从 SimResults 列表中拉出，因此使用 purrr:transpose 一次完成此操作是有意义的，然后 lapply 跨越结果列表再次给出更好的形状。 pmax 这里是 Base R 的一部分，是并行最大值，本质上是一个矢量化 max 函数。

由于比较运算符是矢量化的，我们可以沿着列表执行此操作以创建正在检查 ProblemJob 的布尔条件。根据此处的原始代码， simlist 的组件是循环中 s 的 10 个值，每个组件的行是循环中 p 的 5000+ 个值，并且10 列是您在每次迭代中计算的 CT 的值。 ifelse 是 if(...){} else {}

的向量化版本

预先计算了我们现在需要映射到列表的所有内容，以创建所有输出，这是您的 ProblemJob 变量导致问题的地方，因为我想不出一个巧妙的方法空腹删除嵌套lapply

out_list = pmap(list(CT_list, special_labels, labels, bool1, bool2), function(x,sl,z,b1,b2){
  lapply(1:nrow(x), function(i){
    y = x[i,]
    m = max(y)
    ix = which(y == m)
    # only need to do something special when ix contains a 6
    if(6 %in% ix){
     temp1 = ifelse(b1[i],z[i,6], z[i,7])  
     temp2 = ifelse(b2[i], z[i,8],z[i,9])
     lab = c(temp1,temp2,sl[i], z[i,ix[ix != 6]])
    }else{
      lab = z[i,ix]
    }
    list(FeltCT = m, BottleNeck = ix, CTs = y[ix], ProblemJob = lab)
  }) %>% transpose %>% simplify_all()
}) %>% transpose %>% simplify_all()

我们在此处使用 purrr 中的 transpose %>% simplify_all() 实质上将相同命名的组件加入到列表的结果列表中。您可以通过从 out_list

中提取结果来检查 3 个变量是否获得相同的解决方案

all(out_list$FeltCT == FeltCT)
all(out_list$BottleNeck == BottleNeck)
all(out_list$CTs == CTs)

对在 pmap 中使用 lapply 并不完全满意，但希望它能有所帮助，它肯定比原来的要快得多。我现在需要晚餐。

加快报告生成中的双循环。也许有清单？

Speed up double for loop in report generation. maybe with lists?

simulation

r