使用 OpenMP 并行化的嵌套循环运行缓慢
With OpenMP parallelized nested loops run slow
我有一个 Fortran 程序的一部分,其中包含一些我想与 OpenMP 并行化的嵌套循环。
! Applies a 4x4 operator (mat) acting on neighboring sites i,i+1 of a
! 2**N-dimensional state vector, viewing the vector as dima x dimb x dimc.
integer :: nstates , N, i, dima, dimb, dimc, a_row, b_row, b_col, c_row, row, col
double complex, dimension(4,4):: mat
double complex, dimension(:), allocatable :: vecin,vecout
nstates = 2
N = 24
! Vectors of length nstates**N = 2**24 (~16M complex elements each).
allocate(vecin(nstates**N), vecout(nstates**N))
vecin = ...some data
vecout = 0
mat = reshape([...some data...],[4,4])
! dimb: size of the two-site block mat acts on (nstates**2 = 4); it is
! set once here and only read below.
dimb=nstates**2
! Parallelized over i; dima/dimc and all index variables are written in
! every iteration, so each thread needs a private copy of them.
!$OMP PARALLEL DO PRIVATE(dima,dimc,row,col,a_row,b_row,c_row,b_col)
do i=1,N-1
! dima/dimc: sizes of the identity factors to the left/right of site i.
dima=nstates**(i-1)
dimc=nstates**(N-i-1)
do a_row = 1, dima
do b_row = 1,dimb
do c_row = 1,dimc
! Linear output index for this (a_row, b_row, c_row) combination.
row = ((a_row-1)*dimb + b_row - 1)*dimc + c_row
do b_col = 1,dimb
col = ((a_row-1)*dimb + b_col - 1)*dimc + c_row
! For every i the row index sweeps the whole range 1..nstates**N, so
! different threads (different i) update the same vecout elements:
! without a reduction, ATOMIC is required for correctness.  But it
! serializes every single innermost update, which is why this runs
! far slower than the serial version.
!$OMP ATOMIC
vecout(row) = vecout(row) + vecin(col)*mat(b_row,b_col)
end do
end do
end do
end do
end do
!$OMP END PARALLEL DO
程序运行,我得到的结果也是正确的,就是慢得令人难以置信。比没有 OpenMP 慢得多。我对 OpenMP 了解不多。我在使用 PRIVATE 或 OMP ATOMIC 时做错了什么吗?对于如何提高我的代码性能的每条建议,我将不胜感激。
如果您的数组太大并且自动归约导致堆栈溢出,您可以使用可分配的临时数组自行实施归约。
正如 Francois Jacq 所指出的,您还有一个由 dima 和 dimc 引起的竞争条件:它们在并行循环内部被赋值,因此应该是私有的。
! Manual array reduction: every thread accumulates its contributions
! into its own private copy of tmp, and the partial sums are combined
! once per thread under a critical section.  This avoids both the
! per-element OMP ATOMIC cost and the stack overflow a REDUCTION
! clause can trigger for very large arrays.
double complex, dimension(:), allocatable :: tmp
! dimb is assigned once BEFORE the region and only read inside, so it
! must remain SHARED; dimc is assigned inside the i loop by every
! thread and therefore must be PRIVATE.  (The original clause had
! these swapped: PRIVATE dimb was undefined inside the region, and
! shared dimc was a race condition.)
!$OMP PARALLEL PRIVATE(dima,dimc,row,col,a_row,b_row,c_row,b_col,tmp)
allocate(tmp(size(vecout)))
tmp = 0
!$OMP DO
do i=1,N-1
dima=nstates**(i-1)
dimc=nstates**(N-i-1)
do a_row = 1, dima
do b_row = 1,dimb
do c_row = 1,dimc
row = ((a_row-1)*dimb + b_row - 1)*dimc + c_row
do b_col = 1,dimb
col = ((a_row-1)*dimb + b_col - 1)*dimc + c_row
! tmp is private to this thread: no synchronization needed here.
tmp(row) = tmp(row) + vecin(col)*mat(b_row,b_col)
end do
end do
end do
end do
end do
!$OMP END DO
! Merge the per-thread partial sums.  CRITICAL serializes only one
! whole-array addition per thread instead of every element update.
!$OMP CRITICAL
vecout = vecout + tmp
!$OMP END CRITICAL
! Release this thread's private buffer before leaving the region.
deallocate(tmp)
!$OMP END PARALLEL
你能试试像这样的东西吗:
! Restructured so that b_col is a serial outer loop: within one
! parallel region (fixed i and b_col) every (a_row, b_row, c_row)
! combination maps to a distinct row, so no two threads ever write
! the same vecout element and no ATOMIC/CRITICAL is needed.
do b_col=1,dimb
do i=1,N-1
! dima/dimc are assigned here, serially, before the parallel region
! starts, so the worker threads can safely share them read-only.
dima=nstates**(i-1)
dimc=nstates**(N-i-1)
! COLLAPSE(3) fuses the three loops into one iteration space, keeping
! all threads busy even when dima is small (i close to 1).
!$OMP PARALLEL DO COLLAPSE(3) PRIVATE(row,col,a_row,b_row,c_row)
do a_row = 1, dima
do b_row = 1,dimb
do c_row = 1,dimc
row = ((a_row-1)*dimb + b_row - 1)*dimc + c_row
col = ((a_row-1)*dimb + b_col - 1)*dimc + c_row
vecout(row) = vecout(row) + vecin(col)*mat(b_row,b_col)
enddo
enddo
enddo
enddo
enddo
优点是并行循环现在不会引起写冲突:在同一个并行区域内,所有的 row 索引都互不相同。
我有一个 Fortran 程序的一部分,其中包含一些我想与 OpenMP 并行化的嵌套循环。
! Applies a 4x4 operator (mat) acting on neighboring sites i,i+1 of a
! 2**N-dimensional state vector, viewing the vector as dima x dimb x dimc.
integer :: nstates , N, i, dima, dimb, dimc, a_row, b_row, b_col, c_row, row, col
double complex, dimension(4,4):: mat
double complex, dimension(:), allocatable :: vecin,vecout
nstates = 2
N = 24
! Vectors of length nstates**N = 2**24 (~16M complex elements each).
allocate(vecin(nstates**N), vecout(nstates**N))
vecin = ...some data
vecout = 0
mat = reshape([...some data...],[4,4])
! dimb: size of the two-site block mat acts on (nstates**2 = 4); it is
! set once here and only read below.
dimb=nstates**2
! Parallelized over i; dima/dimc and all index variables are written in
! every iteration, so each thread needs a private copy of them.
!$OMP PARALLEL DO PRIVATE(dima,dimc,row,col,a_row,b_row,c_row,b_col)
do i=1,N-1
! dima/dimc: sizes of the identity factors to the left/right of site i.
dima=nstates**(i-1)
dimc=nstates**(N-i-1)
do a_row = 1, dima
do b_row = 1,dimb
do c_row = 1,dimc
! Linear output index for this (a_row, b_row, c_row) combination.
row = ((a_row-1)*dimb + b_row - 1)*dimc + c_row
do b_col = 1,dimb
col = ((a_row-1)*dimb + b_col - 1)*dimc + c_row
! For every i the row index sweeps the whole range 1..nstates**N, so
! different threads (different i) update the same vecout elements:
! without a reduction, ATOMIC is required for correctness.  But it
! serializes every single innermost update, which is why this runs
! far slower than the serial version.
!$OMP ATOMIC
vecout(row) = vecout(row) + vecin(col)*mat(b_row,b_col)
end do
end do
end do
end do
end do
!$OMP END PARALLEL DO
程序运行,我得到的结果也是正确的,就是慢得令人难以置信。比没有 OpenMP 慢得多。我对 OpenMP 了解不多。我在使用 PRIVATE 或 OMP ATOMIC 时做错了什么吗?对于如何提高我的代码性能的每条建议,我将不胜感激。
如果您的数组太大并且自动归约导致堆栈溢出,您可以使用可分配的临时数组自行实施归约。
正如 Francois Jacq 所指出的,您还有一个由 dima 和 dimc 引起的竞争条件:它们在并行循环内部被赋值,因此应该是私有的。
! Manual array reduction: every thread accumulates its contributions
! into its own private copy of tmp, and the partial sums are combined
! once per thread under a critical section.  This avoids both the
! per-element OMP ATOMIC cost and the stack overflow a REDUCTION
! clause can trigger for very large arrays.
double complex, dimension(:), allocatable :: tmp
! dimb is assigned once BEFORE the region and only read inside, so it
! must remain SHARED; dimc is assigned inside the i loop by every
! thread and therefore must be PRIVATE.  (The original clause had
! these swapped: PRIVATE dimb was undefined inside the region, and
! shared dimc was a race condition.)
!$OMP PARALLEL PRIVATE(dima,dimc,row,col,a_row,b_row,c_row,b_col,tmp)
allocate(tmp(size(vecout)))
tmp = 0
!$OMP DO
do i=1,N-1
dima=nstates**(i-1)
dimc=nstates**(N-i-1)
do a_row = 1, dima
do b_row = 1,dimb
do c_row = 1,dimc
row = ((a_row-1)*dimb + b_row - 1)*dimc + c_row
do b_col = 1,dimb
col = ((a_row-1)*dimb + b_col - 1)*dimc + c_row
! tmp is private to this thread: no synchronization needed here.
tmp(row) = tmp(row) + vecin(col)*mat(b_row,b_col)
end do
end do
end do
end do
end do
!$OMP END DO
! Merge the per-thread partial sums.  CRITICAL serializes only one
! whole-array addition per thread instead of every element update.
!$OMP CRITICAL
vecout = vecout + tmp
!$OMP END CRITICAL
! Release this thread's private buffer before leaving the region.
deallocate(tmp)
!$OMP END PARALLEL
你能试试像这样的东西吗:
! Restructured so that b_col is a serial outer loop: within one
! parallel region (fixed i and b_col) every (a_row, b_row, c_row)
! combination maps to a distinct row, so no two threads ever write
! the same vecout element and no ATOMIC/CRITICAL is needed.
do b_col=1,dimb
do i=1,N-1
! dima/dimc are assigned here, serially, before the parallel region
! starts, so the worker threads can safely share them read-only.
dima=nstates**(i-1)
dimc=nstates**(N-i-1)
! COLLAPSE(3) fuses the three loops into one iteration space, keeping
! all threads busy even when dima is small (i close to 1).
!$OMP PARALLEL DO COLLAPSE(3) PRIVATE(row,col,a_row,b_row,c_row)
do a_row = 1, dima
do b_row = 1,dimb
do c_row = 1,dimc
row = ((a_row-1)*dimb + b_row - 1)*dimc + c_row
col = ((a_row-1)*dimb + b_col - 1)*dimc + c_row
vecout(row) = vecout(row) + vecin(col)*mat(b_row,b_col)
enddo
enddo
enddo
enddo
enddo
优点是并行循环现在不会引起写冲突:在同一个并行区域内,所有的 row 索引都互不相同。