R函数超慢
R function super slow
这是一个使用 R 的自对齐矩阵的非常简单的生物信息学实现。它使用滑动 window 运算符 frag1
在字符串序列上循环两次并与每个 fra2
相同的序列。
下面的代码非常慢,不确定如何使用标准 R 语法加快速度。在 python 中这会非常快,但在 R 中需要 1 分钟。我已经通过同时分配 i,j
和 j,i
将计算量减少了一半。
任何加速想法?
sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
if(!exists('BLOSUM50')){
library(Biostrings)
data(BLOSUM50)
#BLOSUM50['A','N']
}
windowSize<-24;
matrixSize<-nchar(sequence) - windowSize;
defaultValue = -10000000000;
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize);
for(i in 1:matrixSize){
frag1 = substr(sequence,i,i+windowSize);
for(j in 1:matrixSize){
frag2 = substr(sequence,j,j+windowSize);
totalScore = 0;
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize;
}
scoreMatrix[i,j] = totalScore;
scoreMatrix[j,i] = totalScore;
}
}
}
return(scoreMatrix);
你在我不太新的笔记本电脑(2014 年的 Lenovo Yoga 2,R3.4)上的原始代码 运行 在 17 秒内完成。经过不那么大的优化后,这个时间减少到 2 秒。我只是在计算开始时将 sequence
转换为矢量。之后,我将 BLOSUM50
中的名称索引更改为数字索引索引。结果是 0.5 秒的执行时间。请参阅下面的代码和基准。
fun = function(sequence){
windowSize<-24
matrixSize<-nchar(sequence) - windowSize
defaultValue = -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
for(i in 1:matrixSize){
frag1 = substr(sequence,i,i+windowSize)
for(j in 1:matrixSize){
frag2 = substr(sequence,j,j+windowSize)
totalScore = 0
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize
}
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
fun2 = function(sequence){
windowSize<-24
sequence = unlist(strsplit(sequence, split = ""))
matrixSize<-length(sequence) - windowSize
defaultValue = -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
for(i in 1:matrixSize){
frag1 = sequence[i:(i+windowSize)]
for(j in 1:matrixSize){
frag2 = sequence[j:(j+windowSize)]
totalScore = 0
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[frag1[x],frag2[x]] / windowSize
}
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
fun3 = function(sequence){
windowSize = 24
sequence = unlist(strsplit(sequence, split = ""))
matrixSize = length(sequence) - windowSize
scoreMatrix = matrix(NA, nrow = matrixSize, ncol = matrixSize)
sequence_index = match(sequence, colnames(BLOSUM50))
for(i in seq_len(matrixSize)){
frag1 = sequence_index[i:(i+windowSize - 1)]
for(j in seq_len(matrixSize)){
frag2 = sequence_index[j:(j+windowSize - 1)]
if(is.na(scoreMatrix[i,j])){
totalScore = sum(BLOSUM50[(frag2 - 1)*NROW(BLOSUM50) + frag1])/windowSize
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
if(!exists('BLOSUM50')){
library(Biostrings)
data(BLOSUM50)
#BLOSUM50['A','N']
}
sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
library(microbenchmark)
microbenchmark(
original = fun(sequence),
fun2 = fun2(sequence),
fun3 = fun3(sequence),
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# original 16395.2108 16660.3295 17533.8563 16755.9680 17594.3596 20263.4137 5
# fun2 1992.7731 2010.4031 2027.7953 2015.9592 2034.9022 2084.9390 5
# fun3 472.0641 481.9267 496.2656 498.3259 506.6357 522.3755 5
这是一个使用 R 的自对齐矩阵的非常简单的生物信息学实现。它使用滑动 window 运算符 frag1
在字符串序列上循环两次并与每个 fra2
相同的序列。
下面的代码非常慢,不确定如何使用标准 R 语法加快速度。在 python 中这会非常快,但在 R 中需要 1 分钟。我已经通过同时分配 i,j
和 j,i
将计算量减少了一半。
任何加速想法?
sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
if(!exists('BLOSUM50')){
library(Biostrings)
data(BLOSUM50)
#BLOSUM50['A','N']
}
windowSize<-24;
matrixSize<-nchar(sequence) - windowSize;
defaultValue = -10000000000;
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize);
for(i in 1:matrixSize){
frag1 = substr(sequence,i,i+windowSize);
for(j in 1:matrixSize){
frag2 = substr(sequence,j,j+windowSize);
totalScore = 0;
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize;
}
scoreMatrix[i,j] = totalScore;
scoreMatrix[j,i] = totalScore;
}
}
}
return(scoreMatrix);
你在我不太新的笔记本电脑(2014 年的 Lenovo Yoga 2,R3.4)上的原始代码 运行 在 17 秒内完成。经过不那么大的优化后,这个时间减少到 2 秒。我只是在计算开始时将 sequence
转换为矢量。之后,我将 BLOSUM50
中的名称索引更改为数字索引索引。结果是 0.5 秒的执行时间。请参阅下面的代码和基准。
fun = function(sequence){
windowSize<-24
matrixSize<-nchar(sequence) - windowSize
defaultValue = -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
for(i in 1:matrixSize){
frag1 = substr(sequence,i,i+windowSize)
for(j in 1:matrixSize){
frag2 = substr(sequence,j,j+windowSize)
totalScore = 0
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[substr(frag1,x,x),substr(frag2,x,x)] / windowSize
}
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
fun2 = function(sequence){
windowSize<-24
sequence = unlist(strsplit(sequence, split = ""))
matrixSize<-length(sequence) - windowSize
defaultValue = -10000000000
scoreMatrix <- matrix(defaultValue, nrow = matrixSize, ncol = matrixSize)
for(i in 1:matrixSize){
frag1 = sequence[i:(i+windowSize)]
for(j in 1:matrixSize){
frag2 = sequence[j:(j+windowSize)]
totalScore = 0
if(scoreMatrix[i,j] == defaultValue){
for(x in 1:windowSize){
totalScore = totalScore + BLOSUM50[frag1[x],frag2[x]] / windowSize
}
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
fun3 = function(sequence){
windowSize = 24
sequence = unlist(strsplit(sequence, split = ""))
matrixSize = length(sequence) - windowSize
scoreMatrix = matrix(NA, nrow = matrixSize, ncol = matrixSize)
sequence_index = match(sequence, colnames(BLOSUM50))
for(i in seq_len(matrixSize)){
frag1 = sequence_index[i:(i+windowSize - 1)]
for(j in seq_len(matrixSize)){
frag2 = sequence_index[j:(j+windowSize - 1)]
if(is.na(scoreMatrix[i,j])){
totalScore = sum(BLOSUM50[(frag2 - 1)*NROW(BLOSUM50) + frag1])/windowSize
scoreMatrix[i,j] = totalScore
scoreMatrix[j,i] = totalScore
}
}
}
scoreMatrix
}
if(!exists('BLOSUM50')){
library(Biostrings)
data(BLOSUM50)
#BLOSUM50['A','N']
}
sequence = 'MNLDIHCEQLSDARWTELLPLLQQYEVVRLDDCGLTEEHCKDIGSALRANPSLTELCLRTNELGDAGVHLVLQGLQSPTCKIQKLSLQNCSLTEAGCGVLPSTLRSLPTLRELHLSDNPLGDAGLRLLCEGLLDPQCHLEKLQLEYCRLTAASCEPLASVLRATRALKELTVSNNDIGEAGARVLGQGLADSACQLETLRLENCGLTPANCKDLCGIVASQASLRELDLGSNGLGDAGIAELCPGLLSPASRLKTLWLWECDITASGCRDL'
library(microbenchmark)
microbenchmark(
original = fun(sequence),
fun2 = fun2(sequence),
fun3 = fun3(sequence),
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# original 16395.2108 16660.3295 17533.8563 16755.9680 17594.3596 20263.4137 5
# fun2 1992.7731 2010.4031 2027.7953 2015.9592 2034.9022 2084.9390 5
# fun3 472.0641 481.9267 496.2656 498.3259 506.6357 522.3755 5