并行处理 - 组合结果
Parallel processing - combining results
我使用并行处理运行了以下任务:
# Load dependencies with library() so that a missing package fails loudly
# (require() only returns FALSE and lets the script continue).
library(data.table)
library(parallel)
library(foreach)
library(doParallel)

# Create the computing cluster, leaving one core free for the main session.
# detectCores() may return 1 or NA, so always fall back to at least one worker.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# The cluster object already fixes the number of workers, so `cores` is not
# passed here.
registerDoParallel(cl)

# Dummy data: one text value per row.
df <- data.table(text = c("apple pie", "dolphin", "orange juice"))
df
text
1: apple pie
2: dolphin
3: orange juice
# Target patterns: each is an alternation of whole words. "\\b" is the regex
# word-boundary anchor; a single-backslash "\b" in an R string literal is a
# backspace character and would never match ordinary text.
x <- paste0("\\b", c("apple", "poo"), "\\b", collapse = "|")
y <- paste0("\\b", c("orange", "kiwi"), "\\b", collapse = "|")
z <- list(x, y)
z
#> [[1]]
#> [1] "\\bapple\\b|\\bpoo\\b"
#>
#> [[2]]
#> [1] "\\borange\\b|\\bkiwi\\b"
# Initialise the match counter; each iteration below adds 0/1 per row to it.
df[, flag_matched := 0]

# Flag rows in parallel: one iteration per pattern in `z`.
# Each iteration evaluates to the updated table, and `.combine = rbind` stacks
# one table per pattern, so `a` ends up with nrow(df) * length(z) rows.
a <- foreach(
  i = seq_along(z),
  .packages = c("data.table"),
  .combine = rbind
) %dopar% {
  df[, flag_matched := flag_matched +
       as.numeric(grepl(z[[i]], text, perl = TRUE))]
}

# Shut the cluster down so the worker processes are released.
stopCluster(cl)
但是,我目前将 rbind 作为函数 foreach 的 .combine 参数,因此结果的行数 = nrow(df) × 循环次数:
> a
text flag_matched
1: apple pie 1
2: dolphin 0
3: orange juice 0
4: apple pie 0
5: dolphin 0
6: orange juice 1
然后我可以用 df[, .(sum(flag_matched)), text] 进行汇总。但是,还有其他合并结果的方法吗?
你可以这样做:
library(data.table)
library(doParallel)

# Create the computing cluster, leaving one core free for the main session.
# detectCores() may return 1 or NA, so always fall back to at least one worker.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
registerDoParallel(cl)

# Dummy data: one text value per row.
df <- data.table(text = c("apple pie", "dolphin", "orange juice"))
df
# Target patterns: each is an alternation of whole words. "\\b" is the regex
# word-boundary anchor; a single-backslash "\b" in an R string literal is a
# backspace character and would never match ordinary text.
x <- paste0("\\b", c("apple", "poo"), "\\b", collapse = "|")
y <- paste0("\\b", c("orange", "kiwi"), "\\b", collapse = "|")
z <- list(x, y)
z
# Evaluate every pattern in parallel. Each iteration yields one logical
# vector (one element per row of df); foreach collects them in the list `a`.
a <- foreach(pattern = z) %dopar%
  grepl(pattern, df$text, perl = TRUE)

# Element-wise sum of the logical vectors: number of patterns matched per row.
df$flag_matched <- Reduce(f = `+`, x = a)

# Shut the cluster down so the worker processes are released.
stopCluster(cl)
我使用并行处理运行了以下任务:
# Load dependencies with library() so that a missing package fails loudly
# (require() only returns FALSE and lets the script continue).
library(data.table)
library(parallel)
library(foreach)
library(doParallel)

# Create the computing cluster, leaving one core free for the main session.
# detectCores() may return 1 or NA, so always fall back to at least one worker.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# The cluster object already fixes the number of workers, so `cores` is not
# passed here.
registerDoParallel(cl)

# Dummy data: one text value per row.
df <- data.table(text = c("apple pie", "dolphin", "orange juice"))
df
text
1: apple pie
2: dolphin
3: orange juice
# Target patterns: each is an alternation of whole words. "\\b" is the regex
# word-boundary anchor; a single-backslash "\b" in an R string literal is a
# backspace character and would never match ordinary text.
x <- paste0("\\b", c("apple", "poo"), "\\b", collapse = "|")
y <- paste0("\\b", c("orange", "kiwi"), "\\b", collapse = "|")
z <- list(x, y)
z
#> [[1]]
#> [1] "\\bapple\\b|\\bpoo\\b"
#>
#> [[2]]
#> [1] "\\borange\\b|\\bkiwi\\b"
# Initialise the match counter; each iteration below adds 0/1 per row to it.
df[, flag_matched := 0]

# Flag rows in parallel: one iteration per pattern in `z`.
# Each iteration evaluates to the updated table, and `.combine = rbind` stacks
# one table per pattern, so `a` ends up with nrow(df) * length(z) rows.
a <- foreach(
  i = seq_along(z),
  .packages = c("data.table"),
  .combine = rbind
) %dopar% {
  df[, flag_matched := flag_matched +
       as.numeric(grepl(z[[i]], text, perl = TRUE))]
}

# Shut the cluster down so the worker processes are released.
stopCluster(cl)
但是,我目前将 rbind 作为函数 foreach 的 .combine 参数,因此结果的行数 = nrow(df) × 循环次数:
> a
text flag_matched
1: apple pie 1
2: dolphin 0
3: orange juice 0
4: apple pie 0
5: dolphin 0
6: orange juice 1
然后我可以用 df[, .(sum(flag_matched)), text] 进行汇总。但是,还有其他合并结果的方法吗?
你可以这样做:
library(data.table)
library(doParallel)

# Create the computing cluster, leaving one core free for the main session.
# detectCores() may return 1 or NA, so always fall back to at least one worker.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
registerDoParallel(cl)

# Dummy data: one text value per row.
df <- data.table(text = c("apple pie", "dolphin", "orange juice"))
df
# Target patterns: each is an alternation of whole words. "\\b" is the regex
# word-boundary anchor; a single-backslash "\b" in an R string literal is a
# backspace character and would never match ordinary text.
x <- paste0("\\b", c("apple", "poo"), "\\b", collapse = "|")
y <- paste0("\\b", c("orange", "kiwi"), "\\b", collapse = "|")
z <- list(x, y)
z
# Evaluate every pattern in parallel. Each iteration yields one logical
# vector (one element per row of df); foreach collects them in the list `a`.
a <- foreach(pattern = z) %dopar%
  grepl(pattern, df$text, perl = TRUE)

# Element-wise sum of the logical vectors: number of patterns matched per row.
df$flag_matched <- Reduce(f = `+`, x = a)

# Shut the cluster down so the worker processes are released.
stopCluster(cl)