R:使用 foreach、parallel 和 doParallel 将列表中的每个元素与下面的所有其他元素进行比较
R: Compare each element with all the other elements below in a list using foreach, parallel and doParallel
目标:
我正在尝试使用 stringdist 包中的 stringsim 函数,
基于 Levenshtein 距离将列表中的每个元素与其下方的所有其他元素进行比较,以查找相似的文本。
障碍:
问题在于时间和空间复杂度较高,运行需要很长时间。
对于一个 5 元素的数组,最终需要 10 次比较/迭代 (4+3+2+1),即 $n(n-1)/2$ 次:
计算器和理论可以在这里找到link
尝试:
我将使用普通的 for 循环进行重现。
# Compare every element of `fruits` with each element below it and collect the
# Levenshtein-based similarity of every pair: n * (n - 1) / 2 comparisons.
fruits <- fruit[1:5] # 5 elements from fruit
n <- length(fruits) # n set to 5 (base R has no `len()`)
# Empty tibble that accumulates the compared strings and their score
score_df <- tibble(x = character(0), y = character(0), score = numeric(0))
cnt <- 0 # Counts how many comparisons ran
for (i in seq_len(n - 1)) {
  print(i)
  print('----')
  # Every index strictly below i; parenthesised so the range does not rely on
  # `:` binding tighter than `+`.
  for (j in (i + 1):n) {
    cnt <- cnt + 1
    print(j)
    initial_term <- fruits[i] # First element of the pair
    compared_term <- fruits[j] # Element beneath it
    # stringsim() returns a similarity score (1 = identical strings), here
    # derived from the Levenshtein distance.
    score <- stringsim(initial_term, compared_term, method = 'lv')
    term <- tibble(x = initial_term, y = compared_term, score = score)
    score_df <- bind_rows(score_df, term) # Append the pair to the results
  }
  print('====')
}
print(paste('operations count: ', cnt)) # Print the iterations count
您可以看到比较的 10 个元素的结果显示正确:
> as_tibble(fruits)
# A tibble: 5 x 1
value
<chr>
1 apple
2 apricot
3 avocado
4 banana
5 bell pepper
> score_df
# A tibble: 10 x 3
x y score
<chr> <chr> <dbl>
1 apple apricot 0.286
2 apple avocado 0.143
3 apple banana 0.167
4 apple bell pepper 0.273
5 apricot avocado 0.143
6 apricot banana 0
7 apricot bell pepper 0.0909
8 avocado banana 0.143
9 avocado bell pepper 0
10 banana bell pepper 0.0909
要求:
我终于把这个普通循环转换成了并行循环。以下是在数据集 stringr::fruit
上运行的示例。
我需要帮助来优化下面的循环(如果可以优化的话),以便能在约 6k 到 7k 行的数据上运行它;我用下面的代码尝试时,RStudio 崩溃了。
我的处理器如下
PS> Get-WmiObject -Class Win32_Processor -ComputerName. | Select-Object -Property Name,NumberOfCores,NumberOfEnabledCore,NumberOfLogicalProcessors,Description
Name : Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
NumberOfCores : 6
NumberOfEnabledCore : 6
NumberOfLogicalProcessors : 12
Description : Intel64 Family 6 Model 158 Stepping 10
PS>
library(foreach)
library(parallel)
library(doParallel)

# Parallel version of the pairwise comparison: every element of `fruits` is
# compared with each element below it, one pair per foreach task.
fruits <- fruit
n <- length(fruits)

numCores <- detectCores() # 12 on the author's machine
registerDoParallel(numCores - 1) # Use all detected cores but one as workers

# The nested foreach yields one single-row tibble per (i, j) pair; the rows are
# stacked with rbind. `(i + 1):n` is parenthesised so the range does not rely on
# `:` binding tighter than `+`.
score_df <- foreach(i = seq_len(n - 1), .combine = 'rbind') %:%
  foreach(j = (i + 1):n, .packages = c("stringdist", "tibble", "dplyr"), .combine = 'rbind') %dopar% {
    initial_term <- fruits[i]
    compared_term <- fruits[j]
    # Similarity score derived from the Levenshtein distance
    score <- stringsim(initial_term, compared_term, method = 'lv')
    # tibble() replaces the deprecated data_frame(); this is the task's value
    tibble(x = initial_term, y = compared_term, score = score)
  }
stopImplicitCluster()
结果是正确的预期数量(3160 行)
> score_df
# A tibble: 3,160 x 3
x y score
<chr> <chr> <dbl>
1 apple apricot 0.286
2 apple avocado 0.143
3 apple banana 0.167
4 apple bell pepper 0.273
5 apple bilberry 0.125
6 apple blackberry 0.200
7 apple blackcurrant 0.0833
8 apple blood orange 0.0833
9 apple blueberry 0.111
10 apple boysenberry 0.0909
# ... with 3,150 more rows
参考文献:
并行
https://nceas.github.io/oss-lessons/parallel-computing-in-r/parallel-computing-in-r.html
foreach
https://cran.r-project.org/web/packages/foreach/vignettes/foreach.html
嵌套 foreach
https://cran.r-project.org/web/packages/foreach/vignettes/nested.html
这里有一些想法:
- 使用字符串的字符向量,而不是更慢的
data.frame
;
- 让内层循环返回一个命名的数值向量,而不是同样慢得多的
data.frame
;
- 内循环不需要创建两个变量,将要比较的字符串直接传给
stringsim
。
这将返回一个矩阵,而不是 data.frame。并且矩阵具有更快的元素访问时间。
代码会变成
library(tidyverse)
library(parallel)
library(foreach)
library(doParallel)

# Use all detected cores but one as parallel workers.
ncores <- detectCores()
registerDoParallel(ncores - 1L)

# Work on a plain character vector instead of the data.frame column.
fruit <- fruits[["value"]]
n <- nrow(fruits)
# At least one pair is required; this also keeps `(i + 1):n` from counting down.
stopifnot(n >= 2L)

# Each task returns a named numeric vector (index of the first string, index of
# the string below it, similarity score); rbind stacks them into a matrix.
# Only stringdist is loaded on the workers because the body calls nothing else.
score_df <- foreach(i = seq_len(n - 1L), .combine = 'rbind') %:%
  foreach(j = (i + 1):n, .packages = "stringdist", .combine = 'rbind') %dopar% {
    score <- stringsim(fruit[i], fruit[j], method = 'lv')
    c(initial = i, compared = j, score = score)
  }
stopImplicitCluster()
score_df
# initial compared score
#result.1 1 2 0.28571429
#result.2 1 3 0.14285714
#result.3 1 4 0.16666667
#result.4 1 5 0.27272727
#result.1 2 3 0.14285714
#result.2 2 4 0.00000000
#result.3 2 5 0.09090909
#result.1 3 4 0.14285714
#result.2 3 5 0.00000000
#result.4 4 5 0.09090909
class(score_df)
#[1] "matrix" "array"
备注
您应该明确地创建一个集群。我没有,因为这取决于您未说明的操作系统。
编辑
函数 stringsim
是向量化的,因此不需要嵌套循环:内层循环可以交给该函数本身来处理。
# Use all detected cores but one as parallel workers.
ncores <- detectCores()
registerDoParallel(ncores - 1L)

# One task per first string: a single vectorised stringsim() call compares it
# with every string below it. Each task returns a three-column matrix and rbind
# stacks those matrices.
score_df2 <- foreach(
  i = 1:(n - 1),
  .combine = "rbind",
  .packages = "stringdist"
) %dopar% {
  below <- (i + 1):n # indices of the strings beneath element i
  cbind(
    initial = i,
    compared = below,
    score = stringdist::stringsim(fruit[i], fruit[below], method = 'lv')
  )
}
stopImplicitCluster()
score_df2
# initial compared score
# [1,] 1 2 0.28571429
# [2,] 1 3 0.14285714
# [3,] 1 4 0.16666667
# [4,] 1 5 0.27272727
# [5,] 2 3 0.14285714
# [6,] 2 4 0.00000000
# [7,] 2 5 0.09090909
# [8,] 3 4 0.14285714
# [9,] 3 5 0.00000000
#[10,] 4 5 0.09090909
数据
# Sample data: a one-field header followed by five two-field rows. Because the
# header has one field fewer than the rows, read.table() uses the first column
# as row names. The quotes keep 'bell pepper' together as a single field.
txt <- paste(
  "value",
  "1 apple",
  "2 apricot",
  "3 avocado",
  "4 banana",
  "5 'bell pepper'",
  sep = "\n"
)
# Parse the text through an in-memory connection, then release the connection.
tc <- textConnection(txt)
fruits <- read.table(tc, header = TRUE)
close(tc)
目标:
我正在尝试使用 stringdist 包中的 stringsim 函数,
基于 Levenshtein 距离将列表中的每个元素与其下方的所有其他元素进行比较,以查找相似的文本。
障碍:
问题在于时间和空间复杂度较高,运行需要很长时间。对于一个 5 元素的数组,最终需要 10 次比较/迭代 (4+3+2+1),即 $n(n-1)/2$ 次:
计算器和相关理论可以在这里找到link
尝试:
我将使用普通的 for 循环进行重现。
# Compare every element of `fruits` with each element below it and collect the
# Levenshtein-based similarity of every pair: n * (n - 1) / 2 comparisons.
fruits <- fruit[1:5] # 5 elements from fruit
n <- length(fruits) # n set to 5 (base R has no `len()`)
# Empty tibble that accumulates the compared strings and their score
score_df <- tibble(x = character(0), y = character(0), score = numeric(0))
cnt <- 0 # Counts how many comparisons ran
for (i in seq_len(n - 1)) {
  print(i)
  print('----')
  # Every index strictly below i; parenthesised so the range does not rely on
  # `:` binding tighter than `+`.
  for (j in (i + 1):n) {
    cnt <- cnt + 1
    print(j)
    initial_term <- fruits[i] # First element of the pair
    compared_term <- fruits[j] # Element beneath it
    # stringsim() returns a similarity score (1 = identical strings), here
    # derived from the Levenshtein distance.
    score <- stringsim(initial_term, compared_term, method = 'lv')
    term <- tibble(x = initial_term, y = compared_term, score = score)
    score_df <- bind_rows(score_df, term) # Append the pair to the results
  }
  print('====')
}
print(paste('operations count: ', cnt)) # Print the iterations count
您可以看到比较的 10 个元素的结果显示正确:
> as_tibble(fruits)
# A tibble: 5 x 1
value
<chr>
1 apple
2 apricot
3 avocado
4 banana
5 bell pepper
> score_df
# A tibble: 10 x 3
x y score
<chr> <chr> <dbl>
1 apple apricot 0.286
2 apple avocado 0.143
3 apple banana 0.167
4 apple bell pepper 0.273
5 apricot avocado 0.143
6 apricot banana 0
7 apricot bell pepper 0.0909
8 avocado banana 0.143
9 avocado bell pepper 0
10 banana bell pepper 0.0909
要求:
我终于把这个普通循环转换成了并行循环。以下是使用数据集 stringr::fruit 的示例。
我需要帮助来优化下面的循环(如果可以优化的话),以便能在约 6k 到 7k 行的数据上运行它;我用下面的代码尝试时,RStudio 崩溃了。
我的处理器如下
PS> Get-WmiObject -Class Win32_Processor -ComputerName. | Select-Object -Property Name,NumberOfCores,NumberOfEnabledCore,NumberOfLogicalProcessors,Description
Name : Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
NumberOfCores : 6
NumberOfEnabledCore : 6
NumberOfLogicalProcessors : 12
Description : Intel64 Family 6 Model 158 Stepping 10
PS>
library(foreach)
library(parallel)
library(doParallel)

# Parallel version of the pairwise comparison: every element of `fruits` is
# compared with each element below it, one pair per foreach task.
fruits <- fruit
n <- length(fruits)

numCores <- detectCores() # 12 on the author's machine
registerDoParallel(numCores - 1) # Use all detected cores but one as workers

# The nested foreach yields one single-row tibble per (i, j) pair; the rows are
# stacked with rbind. `(i + 1):n` is parenthesised so the range does not rely on
# `:` binding tighter than `+`.
score_df <- foreach(i = seq_len(n - 1), .combine = 'rbind') %:%
  foreach(j = (i + 1):n, .packages = c("stringdist", "tibble", "dplyr"), .combine = 'rbind') %dopar% {
    initial_term <- fruits[i]
    compared_term <- fruits[j]
    # Similarity score derived from the Levenshtein distance
    score <- stringsim(initial_term, compared_term, method = 'lv')
    # tibble() replaces the deprecated data_frame(); this is the task's value
    tibble(x = initial_term, y = compared_term, score = score)
  }
stopImplicitCluster()
结果是正确的预期数量(3160 行)
> score_df
# A tibble: 3,160 x 3
x y score
<chr> <chr> <dbl>
1 apple apricot 0.286
2 apple avocado 0.143
3 apple banana 0.167
4 apple bell pepper 0.273
5 apple bilberry 0.125
6 apple blackberry 0.200
7 apple blackcurrant 0.0833
8 apple blood orange 0.0833
9 apple blueberry 0.111
10 apple boysenberry 0.0909
# ... with 3,150 more rows
参考文献:
并行
https://nceas.github.io/oss-lessons/parallel-computing-in-r/parallel-computing-in-r.html
foreach
https://cran.r-project.org/web/packages/foreach/vignettes/foreach.html
嵌套 foreach
https://cran.r-project.org/web/packages/foreach/vignettes/nested.html
这里有一些想法:
- 使用字符串的字符向量,而不是更慢的
data.frame
; - 让内层循环返回一个命名的数值向量,而不是同样慢得多的
data.frame
; - 内循环不需要创建两个变量,将要比较的字符串直接传给
stringsim
。
这将返回一个矩阵,而不是 data.frame。并且矩阵具有更快的元素访问时间。
代码会变成
library(tidyverse)
library(parallel)
library(foreach)
library(doParallel)

# Use all detected cores but one as parallel workers.
ncores <- detectCores()
registerDoParallel(ncores - 1L)

# Work on a plain character vector instead of the data.frame column.
fruit <- fruits[["value"]]
n <- nrow(fruits)
# At least one pair is required; this also keeps `(i + 1):n` from counting down.
stopifnot(n >= 2L)

# Each task returns a named numeric vector (index of the first string, index of
# the string below it, similarity score); rbind stacks them into a matrix.
# Only stringdist is loaded on the workers because the body calls nothing else.
score_df <- foreach(i = seq_len(n - 1L), .combine = 'rbind') %:%
  foreach(j = (i + 1):n, .packages = "stringdist", .combine = 'rbind') %dopar% {
    score <- stringsim(fruit[i], fruit[j], method = 'lv')
    c(initial = i, compared = j, score = score)
  }
stopImplicitCluster()
score_df
# initial compared score
#result.1 1 2 0.28571429
#result.2 1 3 0.14285714
#result.3 1 4 0.16666667
#result.4 1 5 0.27272727
#result.1 2 3 0.14285714
#result.2 2 4 0.00000000
#result.3 2 5 0.09090909
#result.1 3 4 0.14285714
#result.2 3 5 0.00000000
#result.4 4 5 0.09090909
class(score_df)
#[1] "matrix" "array"
备注
您应该明确地创建一个集群。我没有,因为这取决于您未说明的操作系统。
编辑
函数 stringsim
是向量化的,因此不需要嵌套循环:内层循环可以交给该函数本身来处理。
# Use all detected cores but one as parallel workers.
ncores <- detectCores()
registerDoParallel(ncores - 1L)

# One task per first string: a single vectorised stringsim() call compares it
# with every string below it. Each task returns a three-column matrix and rbind
# stacks those matrices.
score_df2 <- foreach(
  i = 1:(n - 1),
  .combine = "rbind",
  .packages = "stringdist"
) %dopar% {
  below <- (i + 1):n # indices of the strings beneath element i
  cbind(
    initial = i,
    compared = below,
    score = stringdist::stringsim(fruit[i], fruit[below], method = 'lv')
  )
}
stopImplicitCluster()
score_df2
# initial compared score
# [1,] 1 2 0.28571429
# [2,] 1 3 0.14285714
# [3,] 1 4 0.16666667
# [4,] 1 5 0.27272727
# [5,] 2 3 0.14285714
# [6,] 2 4 0.00000000
# [7,] 2 5 0.09090909
# [8,] 3 4 0.14285714
# [9,] 3 5 0.00000000
#[10,] 4 5 0.09090909
数据
# Sample data: a one-field header followed by five two-field rows. Because the
# header has one field fewer than the rows, read.table() uses the first column
# as row names. The quotes keep 'bell pepper' together as a single field.
txt <- paste(
  "value",
  "1 apple",
  "2 apricot",
  "3 avocado",
  "4 banana",
  "5 'bell pepper'",
  sep = "\n"
)
# Parse the text through an in-memory connection, then release the connection.
tc <- textConnection(txt)
fruits <- read.table(tc, header = TRUE)
close(tc)