如何优化这个有很多循环的 Fortran 子程序?
How to optimize this Fortran subroutine with many loops?
我正在处理当数组变大时效率非常低的子程序,例如,NN=1000,KK=200,MM = 200。但是,我想不出优化它的想法。
program main
implicit none
integer :: NN, KK, MM
integer, allocatable, dimension(:,:) :: id
complex*16, allocatable, dimension(:) :: phase
complex*16 :: phase_base(3)
real*8, allocatable, dimension(:,:) :: wave_base
complex*16, allocatable, dimension(:,:) :: wave
integer :: i, j, k, n
NN = 1000
KK = 200
MM = 200
allocate(id(MM,3))
allocate(phase(KK))
allocate(wave_base(KK, NN*(NN+1)/2 ))
allocate(wave(NN, NN))
id(:,:) = 2
phase_base(:) = (1.0d0,1.0d0)
wave_base(:,:) = 1.0d0
phase(:) = (1.0d0,1.0d0)
call noise_wave(NN, KK, MM, id, phase, phase_base, wave_base, wave)
deallocate(id)
deallocate(phase)
deallocate(wave_base)
deallocate(wave)
end program main
subroutine noise_wave(NN, KK, MM, id, phase_1, phase_base, wave_base, wave)
implicit none
integer, intent(in) :: NN, KK, MM
integer, intent(in), dimension(MM, 3) :: id
complex*16, intent(in) :: phase_1(KK)
complex*16, intent(in) :: phase_base(3)
real*8, intent(in) :: wave_base(KK, NN*(NN+1)/2 )
complex*16, intent(out) :: wave(NN, NN)
integer :: i, j, k, p, n
integer :: x, y, z
real :: start, finish
complex*16 :: phase_2, phase_2_conjg
do p = 1, MM
x = id(p, 1)
y = id(p, 2)
z = id(p, 3)
phase_2 = (phase_base(1) ** x) * (phase_base(2) ** y) * (phase_base(3) ** z)
phase_2_conjg = conjg(phase_2)
n = 0
do j = 1, NN
do i = 1, j ! upper triangle
n = n + 1
do k = 1, KK
wave(i,j) = wave(i,j) + wave_base(k,n) * phase_1(k) * phase_2_conjg
enddo
wave(j,i) = conjg(wave(i,j) )
enddo
enddo
enddo
end subroutin
有人可以给我一些提示吗? (我已经完成了建议的优化。另外,根据Ian的建议,我添加了一个小测试,所以你可以直接测试它。)
如果将循环嵌套更改为
,您可能会获得可衡量的加速
do p = 1, MM
x = id(p, 1)
y = id(p, 2)
z = id(p, 3)
phase = (phase_base(1) ** x) * (phase_base(2) ** y) * (phase_base(3) ** z)
conjg_phase = conjg(phase) ! new variable, calculate here, use below
n = 0
do j = 1, NN
do i = 1, j
n = n + 1
do k = 1, KK
wave(i,j) = wave(i,j) + wave_base(k,n) * conjg_phase
enddo
enddo
wave(j,i) = conjg(wave(i,j) )
enddo
enddo
(如果我理解代码,它可能仍然是正确的!)。如果重复次数足够多,即使像我从循环嵌套底部提取出来的那些小计算也是一种拖累。并且执行速度也可能受益于将这些值移入和移出缓存的频率降低。
可能值得(稍微)交换 id
的维度,然后读取 id(1:3,p)
可能比当前版本更适合缓存。
如果执行速度仍然不合您的口味,是时候学习 OpenMP 了(如果您还不知道)。
这是我遵循上述好主意的解决方案。在 OpenMP 之前仍有一些提高效率的空间。比如子程序中的第k个循环可以通过sum函数消除。
program main
implicit none
integer :: NN, KK, MM
integer, allocatable, dimension(:,:) :: id
complex*16, allocatable, dimension(:) :: phase
complex*16 :: phase_base(3)
real*8, allocatable, dimension(:,:) :: wave_base
complex*16, allocatable, dimension(:,:) :: wave
integer :: i, j, k, n
NN = 1000
KK = 200
MM = 200
allocate(id(MM,3))
allocate(phase(KK))
allocate(wave_base(KK, NN*(NN+1)/2 ))
allocate(wave(NN, NN))
id(:,:) = 2
phase_base(:) = (1.0d0,1.0d0)
wave_base(:,:) = 1.0d0
phase(:) = (1.0d0,1.0d0)
call noise_wave(NN, KK, MM, id, phase, phase_base, wave_base, wave)
deallocate(id)
deallocate(phase)
deallocate(wave_base)
deallocate(wave)
end program main
subroutine noise_wave(NN, KK, MM, id, phase_1, phase_base, wave_base, wave)
implicit none
integer, intent(in) :: NN, KK, MM
integer, intent(in), dimension(MM, 3) :: id
complex*16, intent(in) :: phase_1(KK)
complex*16, intent(in) :: phase_base(3)
real*8, intent(in) :: wave_base(KK, NN*(NN+1)/2 )
complex*16, intent(out):: wave(NN, NN)
integer :: i, j, k, p, n
integer :: x, y, z
real :: start, finish
complex*16 :: phase_2, phase_2_conjg
complex*16 :: wave_tmp(NN*(NN+1)/2)
complex*16 :: wave_tmp_2(NN*(NN+1)/2)
do k = 1, KK
wave_tmp(:) = wave_tmp(:) + wave_base(k,:) * phase_1(k)
enddo
do p = 1, MM
phase_2 = product(phase_base(:)**id(p,:) )
phase_2_conjg = conjg(phase_2)
wave_tmp2(:) = wave_tmp2(:) + wave_tmp(n) * phase_2_conjg
enddo
n = 0
do j = 1, NN
do i = 1, j
n = n + 1
wave(i,j) = wave_tmp2(n)
wave(j,i) = conjg(wave_tmp2(n) )
enddo
enddo
end subroutine
我正在处理当数组变大时效率非常低的子程序,例如,NN=1000,KK=200,MM = 200。但是,我想不出优化它的想法。
program main
implicit none
integer :: NN, KK, MM
integer, allocatable, dimension(:,:) :: id
complex*16, allocatable, dimension(:) :: phase
complex*16 :: phase_base(3)
real*8, allocatable, dimension(:,:) :: wave_base
complex*16, allocatable, dimension(:,:) :: wave
integer :: i, j, k, n
NN = 1000
KK = 200
MM = 200
allocate(id(MM,3))
allocate(phase(KK))
allocate(wave_base(KK, NN*(NN+1)/2 ))
allocate(wave(NN, NN))
id(:,:) = 2
phase_base(:) = (1.0d0,1.0d0)
wave_base(:,:) = 1.0d0
phase(:) = (1.0d0,1.0d0)
call noise_wave(NN, KK, MM, id, phase, phase_base, wave_base, wave)
deallocate(id)
deallocate(phase)
deallocate(wave_base)
deallocate(wave)
end program main
subroutine noise_wave(NN, KK, MM, id, phase_1, phase_base, wave_base, wave)
implicit none
integer, intent(in) :: NN, KK, MM
integer, intent(in), dimension(MM, 3) :: id
complex*16, intent(in) :: phase_1(KK)
complex*16, intent(in) :: phase_base(3)
real*8, intent(in) :: wave_base(KK, NN*(NN+1)/2 )
complex*16, intent(out) :: wave(NN, NN)
integer :: i, j, k, p, n
integer :: x, y, z
real :: start, finish
complex*16 :: phase_2, phase_2_conjg
do p = 1, MM
x = id(p, 1)
y = id(p, 2)
z = id(p, 3)
phase_2 = (phase_base(1) ** x) * (phase_base(2) ** y) * (phase_base(3) ** z)
phase_2_conjg = conjg(phase_2)
n = 0
do j = 1, NN
do i = 1, j ! upper triangle
n = n + 1
do k = 1, KK
wave(i,j) = wave(i,j) + wave_base(k,n) * phase_1(k) * phase_2_conjg
enddo
wave(j,i) = conjg(wave(i,j) )
enddo
enddo
enddo
end subroutin
有人可以给我一些提示吗? (我已经完成了建议的优化。另外,根据Ian的建议,我添加了一个小测试,所以你可以直接测试它。)
如果将循环嵌套更改为
,您可能会获得可衡量的加速 do p = 1, MM
x = id(p, 1)
y = id(p, 2)
z = id(p, 3)
phase = (phase_base(1) ** x) * (phase_base(2) ** y) * (phase_base(3) ** z)
conjg_phase = conjg(phase) ! new variable, calculate here, use below
n = 0
do j = 1, NN
do i = 1, j
n = n + 1
do k = 1, KK
wave(i,j) = wave(i,j) + wave_base(k,n) * conjg_phase
enddo
enddo
wave(j,i) = conjg(wave(i,j) )
enddo
enddo
(如果我理解代码,它可能仍然是正确的!)。如果重复次数足够多,即使像我从循环嵌套底部提取出来的那些小计算也是一种拖累。并且执行速度也可能受益于将这些值移入和移出缓存的频率降低。
可能值得(稍微)交换 id
的维度,然后读取 id(1:3,p)
可能比当前版本更适合缓存。
如果执行速度仍然不合您的口味,是时候学习 OpenMP 了(如果您还不知道)。
这是我遵循上述好主意的解决方案。在 OpenMP 之前仍有一些提高效率的空间。比如子程序中的第k个循环可以通过sum函数消除。
program main
implicit none
integer :: NN, KK, MM
integer, allocatable, dimension(:,:) :: id
complex*16, allocatable, dimension(:) :: phase
complex*16 :: phase_base(3)
real*8, allocatable, dimension(:,:) :: wave_base
complex*16, allocatable, dimension(:,:) :: wave
integer :: i, j, k, n
NN = 1000
KK = 200
MM = 200
allocate(id(MM,3))
allocate(phase(KK))
allocate(wave_base(KK, NN*(NN+1)/2 ))
allocate(wave(NN, NN))
id(:,:) = 2
phase_base(:) = (1.0d0,1.0d0)
wave_base(:,:) = 1.0d0
phase(:) = (1.0d0,1.0d0)
call noise_wave(NN, KK, MM, id, phase, phase_base, wave_base, wave)
deallocate(id)
deallocate(phase)
deallocate(wave_base)
deallocate(wave)
end program main
subroutine noise_wave(NN, KK, MM, id, phase_1, phase_base, wave_base, wave)
implicit none
integer, intent(in) :: NN, KK, MM
integer, intent(in), dimension(MM, 3) :: id
complex*16, intent(in) :: phase_1(KK)
complex*16, intent(in) :: phase_base(3)
real*8, intent(in) :: wave_base(KK, NN*(NN+1)/2 )
complex*16, intent(out):: wave(NN, NN)
integer :: i, j, k, p, n
integer :: x, y, z
real :: start, finish
complex*16 :: phase_2, phase_2_conjg
complex*16 :: wave_tmp(NN*(NN+1)/2)
complex*16 :: wave_tmp_2(NN*(NN+1)/2)
do k = 1, KK
wave_tmp(:) = wave_tmp(:) + wave_base(k,:) * phase_1(k)
enddo
do p = 1, MM
phase_2 = product(phase_base(:)**id(p,:) )
phase_2_conjg = conjg(phase_2)
wave_tmp2(:) = wave_tmp2(:) + wave_tmp(n) * phase_2_conjg
enddo
n = 0
do j = 1, NN
do i = 1, j
n = n + 1
wave(i,j) = wave_tmp2(n)
wave(j,i) = conjg(wave_tmp2(n) )
enddo
enddo
end subroutine