带条件复制的 SIMD 指令
SIMD instructions with condition copy
我有一个类似下面这样的热点循环。这里如果能用上某种向量收集(gather)操作就好了……有什么建议能让编译器更好地处理这段代码吗?
! Conditional compaction: for every ii in 1..N with diff(ii) <= M, the
! value i0 + ii - 1 is written to rbuf at position irb.
do ii = 1, N
if (diff(ii) .le. M ) then
i = i0 + ii - 1
rbuf( irb ) = i
! irb advances only when the condition holds, so the write position of
! one iteration depends on the outcome of the earlier iterations.
irb = irb + 1
end if
end do
使用 ifort 16.0.2 时,我得到的优化报告(opt report)如下:
LOOP BEGIN at code.f(197,13)
remark #25084: Preprocess Loopnests: Moving Out Store [ code.f(203,13) ]
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed FLOW dependence between irb line 201 and irb line 200
remark #15346: vector dependence: assumed ANTI dependence between irb line 200 and irb line 201
remark #15346: vector dependence: assumed ANTI dependence between irb line 200 and irb line 201
remark #15346: vector dependence: assumed FLOW dependence between irb line 201 and irb line 200
remark #25439: unrolled with remainder by 2
remark #25015: Estimate of max trip count of loop=1600
LOOP END
这是小测试程序
! Small reproducer for a conditional-store loop: for every entry of diff
! that satisfies diff(ii) <= M, the value i0 + ii - 1 is appended to rbuf.
! The program prints all 2*N slots of rbuf so the result can be checked.
program vect
  implicit none
  ! ii  : loop counter
  ! i   : value stored into rbuf (previously relied on implicit typing)
  ! i0  : base offset added to the loop counter
  ! irb : next write position in rbuf
  integer :: ii, i, i0, irb
  integer, parameter :: N=32
  integer, parameter :: M=8
  integer, dimension(N) :: diff
  integer, dimension(2*N) :: rbuf
  rbuf = 0
  !only some values of diff will meet condition
  !could be random
  do ii=1, N
    diff(ii) = ii
  end do
  !from an outer loop
  i0=1003
  !this is code for filling up a buffer for an expensive vectorized
  !subroutine with full vectors, irb < 2*N
  irb=3
  do ii = 1, N
    if (diff(ii) <= M ) then
      i = i0 + ii - 1
      rbuf( irb ) = i
      ! the write position moves only when an element was stored
      irb = irb + 1
    end if
  end do
  !check
  do ii = 1, 2*N
    write(*,*) ii, rbuf(ii)
  end do
end program vect
针对某些目标架构,我可以借助编译指示(directive)让编译器对该循环进行向量化:
! Same conditional-store loop as before, now preceded by the IVDEP
! directive, which asks the compiler to ignore assumed (unproven) vector
! dependences in the loop that follows.
!CDIR$ IVDEP
do ii = 1, N
if (diff(ii) .le. M ) then
i = i0 + ii - 1
rbuf( irb ) = i
! irb advances only for elements that pass the test
irb = irb + 1
end if
end do
配合编译选项 -xMIC-AVX512
或 -mmic,
编译器就会为这些体系结构生成向量指令。例如:
vpcompressd %zmm0, -4+vect_$RBUF.0.1(,%rax,4){%k1} #29.15 c1
对于 AVX2,我认为正如 @Peter Cordes 在评论中所说,人们通常只能求助于 intrinsics 或汇编;不过得知编译器在上述架构上能够自行处理这种情况,还是很令人高兴的。
我有一个类似下面这样的热点循环。这里如果能用上某种向量收集(gather)操作就好了……有什么建议能让编译器更好地处理这段代码吗?
! Conditional compaction: for every ii in 1..N with diff(ii) <= M, the
! value i0 + ii - 1 is written to rbuf at position irb.
do ii = 1, N
if (diff(ii) .le. M ) then
i = i0 + ii - 1
rbuf( irb ) = i
! irb advances only when the condition holds, so the write position of
! one iteration depends on the outcome of the earlier iterations.
irb = irb + 1
end if
end do
使用 ifort 16.0.2 时,我得到的优化报告(opt report)如下:
LOOP BEGIN at code.f(197,13)
remark #25084: Preprocess Loopnests: Moving Out Store [ code.f(203,13) ]
remark #15344: loop was not vectorized: vector dependence prevents vectorization
remark #15346: vector dependence: assumed FLOW dependence between irb line 201 and irb line 200
remark #15346: vector dependence: assumed ANTI dependence between irb line 200 and irb line 201
remark #15346: vector dependence: assumed ANTI dependence between irb line 200 and irb line 201
remark #15346: vector dependence: assumed FLOW dependence between irb line 201 and irb line 200
remark #25439: unrolled with remainder by 2
remark #25015: Estimate of max trip count of loop=1600
LOOP END
这是小测试程序
! Small reproducer for a conditional-store loop: for every entry of diff
! that satisfies diff(ii) <= M, the value i0 + ii - 1 is appended to rbuf.
! The program prints all 2*N slots of rbuf so the result can be checked.
program vect
  implicit none
  ! ii  : loop counter
  ! i   : value stored into rbuf (previously relied on implicit typing)
  ! i0  : base offset added to the loop counter
  ! irb : next write position in rbuf
  integer :: ii, i, i0, irb
  integer, parameter :: N=32
  integer, parameter :: M=8
  integer, dimension(N) :: diff
  integer, dimension(2*N) :: rbuf
  rbuf = 0
  !only some values of diff will meet condition
  !could be random
  do ii=1, N
    diff(ii) = ii
  end do
  !from an outer loop
  i0=1003
  !this is code for filling up a buffer for an expensive vectorized
  !subroutine with full vectors, irb < 2*N
  irb=3
  do ii = 1, N
    if (diff(ii) <= M ) then
      i = i0 + ii - 1
      rbuf( irb ) = i
      ! the write position moves only when an element was stored
      irb = irb + 1
    end if
  end do
  !check
  do ii = 1, 2*N
    write(*,*) ii, rbuf(ii)
  end do
end program vect
针对某些目标架构,我可以借助编译指示(directive)让编译器对该循环进行向量化:
! Same conditional-store loop as before, now preceded by the IVDEP
! directive, which asks the compiler to ignore assumed (unproven) vector
! dependences in the loop that follows.
!CDIR$ IVDEP
do ii = 1, N
if (diff(ii) .le. M ) then
i = i0 + ii - 1
rbuf( irb ) = i
! irb advances only for elements that pass the test
irb = irb + 1
end if
end do
配合编译选项 -xMIC-AVX512
或 -mmic,
编译器就会为这些体系结构生成向量指令。例如:
vpcompressd %zmm0, -4+vect_$RBUF.0.1(,%rax,4){%k1} #29.15 c1
对于 AVX2,我认为正如 @Peter Cordes 在评论中所说,人们通常只能求助于 intrinsics 或汇编;不过得知编译器在上述架构上能够自行处理这种情况,还是很令人高兴的。