OpenMP:增加线程数时出现明显的竞争条件
OpenMP: apparent race condition when increasing number of threads
我有这段代码(删除了相当长且在并行区域之外的变量的初始化)。我正在本地机器(4 个物理内核,8 个线程)上对其进行测试,并将速度和结果与其串行版本进行比较。当我 运行 具有超过 4 个线程的代码时,它似乎有时会在某些竞争条件下发生,并且最终输出(在并行区域之后写入磁盘的变量 T)在这两种情况下是不同的。当我 运行 使用 4 个或更少的线程时,一切都很好,两个代码 运行 具有相同的迭代次数并给出最终结果。
从文档中可以看出,每个 OMP DO 块的末尾都有一个隐式同步(除非您指定 nowait)。
program test
integer :: nx=500,ny=500
integer :: i,j,iteration
double precision, allocatable, dimension(:,:) :: T, T_old
double precision :: dx,dy,dt
double precision :: error,change,delta,errtol
allocate(T(0:nx+1,0:ny+1))
allocate(T_old(0:nx+1,0:ny+1))
! initialisation of T, T_old, dt, dx, dy and errtol
error=1.0d0
iteration=0
!$OMP PARALLEL SHARED(error,iteration,change) private(i,j,delta)
do while (error.gt.errtol.and.error.lt.10.0d0)
change=0.0d0
!$OMP DO schedule(static) reduction(max:change)
do j=1,ny
do i=1,nx
delta=dt*( (T_old(i+1,j)-2.0d0*T_old(i,j)+T_old(i-1,j))/dx**2 + &
(T_old(i,j+1)-2.0d0*T_old(i,j)+T_old(i,j-1))/dy**2 )
T(i,j)=T_old(i,j)+delta
change=max(delta,change)
enddo
enddo
!$OMP END DO
! implicit barrier (implies FLUSH) at end of parallel do region (unless you specify nowait clause)
!$OMP SINGLE
error=change
! just one thread updates iteration
iteration=iteration+1
! write(*,*) iteration, error
!$OMP END SINGLE
!$OMP DO schedule(static)
! update T_old
do j=1,ny
do i=1,nx
T_old(i,j)=T(i,j)
enddo
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! write T to disk
deallocate(T,T_old)
end program test
编辑:正确的代码,参见@Gilles 评论:
program test
integer :: nx=500,ny=500
integer :: i,j,iteration
double precision, allocatable, dimension(:,:) :: T, T_old
double precision :: dx,dy,dt
double precision :: error,change,delta,errtol
allocate(T(0:nx+1,0:ny+1))
allocate(T_old(0:nx+1,0:ny+1))
! initialisation of T, T_old, dt, dx, dy and errtol
error=1.0d0
iteration=0
change=0.0d0
!$OMP PARALLEL SHARED(error,iteration,change) private(i,j,delta)
do while (error.gt.errtol.and.error.lt.10.0d0)
!$OMP DO schedule(static) reduction(max:change)
do j=1,ny
do i=1,nx
delta=dt*( (T_old(i+1,j)-2.0d0*T_old(i,j)+T_old(i-1,j))/dx**2 + &
(T_old(i,j+1)-2.0d0*T_old(i,j)+T_old(i,j-1))/dy**2 )
T(i,j)=T_old(i,j)+delta
change=max(delta,change)
enddo
enddo
!$OMP END DO
! implicit barrier (implies FLUSH) at end of parallel do region (unless you specify nowait clause)
!$OMP SINGLE
error=change
change=0.0d0
! just one thread updates iteration
iteration=iteration+1
! write(*,*) iteration, error
!$OMP END SINGLE
!$OMP DO schedule(static)
! update T_old
do j=1,ny
do i=1,nx
T_old(i,j)=T(i,j)
enddo
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! write T to disk
deallocate(T,T_old)
end program test
DO WHILE 循环中变量 change
重新初始化的竞争条件已被移除。通过在并行区域之外初始化 change
并使用 !$OMP SINGLE
指令保护其在并行区域中的更新来解决。
program test
integer :: nx=500,ny=500
integer :: i,j,iteration
double precision, allocatable, dimension(:,:) :: T, T_old
double precision :: dx,dy,dt
double precision :: error,change,delta,errtol
allocate(T(0:nx+1,0:ny+1))
allocate(T_old(0:nx+1,0:ny+1))
! initialisation of T, T_old, dt, dx, dy and errtol
error=1.0d0
iteration=0
change=0.0d0
!$OMP PARALLEL SHARED(error,iteration,change) private(i,j,delta)
do while (error.gt.errtol.and.error.lt.10.0d0)
!$OMP DO schedule(static) reduction(max:change)
do j=1,ny
do i=1,nx
delta=dt*( (T_old(i+1,j)-2.0d0*T_old(i,j)+T_old(i-1,j))/dx**2 + &
(T_old(i,j+1)-2.0d0*T_old(i,j)+T_old(i,j-1))/dy**2 )
T(i,j)=T_old(i,j)+delta
change=max(delta,change)
enddo
enddo
!$OMP END DO
! implicit barrier (implies FLUSH) at end of parallel do region (unless you specify nowait clause)
!$OMP SINGLE
error=change
change=0.0d0
! just one thread updates iteration
iteration=iteration+1
! write(*,*) iteration, error
!$OMP END SINGLE
!$OMP DO schedule(static)
! update T_old
do j=1,ny
do i=1,nx
T_old(i,j)=T(i,j)
enddo
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! write T to disk
deallocate(T,T_old)
end program test
我有这段代码(删除了相当长且在并行区域之外的变量的初始化)。我正在本地机器(4 个物理内核,8 个线程)上对其进行测试,并将速度和结果与其串行版本进行比较。当我 运行 具有超过 4 个线程的代码时,它似乎有时会在某些竞争条件下发生,并且最终输出(在并行区域之后写入磁盘的变量 T)在这两种情况下是不同的。当我 运行 使用 4 个或更少的线程时,一切都很好,两个代码 运行 具有相同的迭代次数并给出最终结果。 从文档中可以看出,每个 OMP DO 块的末尾都有一个隐式同步(除非您指定 nowait)。
program test
integer :: nx=500,ny=500
integer :: i,j,iteration
double precision, allocatable, dimension(:,:) :: T, T_old
double precision :: dx,dy,dt
double precision :: error,change,delta,errtol
allocate(T(0:nx+1,0:ny+1))
allocate(T_old(0:nx+1,0:ny+1))
! initialisation of T, T_old, dt, dx, dy and errtol
error=1.0d0
iteration=0
!$OMP PARALLEL SHARED(error,iteration,change) private(i,j,delta)
do while (error.gt.errtol.and.error.lt.10.0d0)
change=0.0d0
!$OMP DO schedule(static) reduction(max:change)
do j=1,ny
do i=1,nx
delta=dt*( (T_old(i+1,j)-2.0d0*T_old(i,j)+T_old(i-1,j))/dx**2 + &
(T_old(i,j+1)-2.0d0*T_old(i,j)+T_old(i,j-1))/dy**2 )
T(i,j)=T_old(i,j)+delta
change=max(delta,change)
enddo
enddo
!$OMP END DO
! implicit barrier (implies FLUSH) at end of parallel do region (unless you specify nowait clause)
!$OMP SINGLE
error=change
! just one thread updates iteration
iteration=iteration+1
! write(*,*) iteration, error
!$OMP END SINGLE
!$OMP DO schedule(static)
! update T_old
do j=1,ny
do i=1,nx
T_old(i,j)=T(i,j)
enddo
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! write T to disk
deallocate(T,T_old)
end program test
编辑:正确的代码,参见@Gilles 评论:
program test
integer :: nx=500,ny=500
integer :: i,j,iteration
double precision, allocatable, dimension(:,:) :: T, T_old
double precision :: dx,dy,dt
double precision :: error,change,delta,errtol
allocate(T(0:nx+1,0:ny+1))
allocate(T_old(0:nx+1,0:ny+1))
! initialisation of T, T_old, dt, dx, dy and errtol
error=1.0d0
iteration=0
change=0.0d0
!$OMP PARALLEL SHARED(error,iteration,change) private(i,j,delta)
do while (error.gt.errtol.and.error.lt.10.0d0)
!$OMP DO schedule(static) reduction(max:change)
do j=1,ny
do i=1,nx
delta=dt*( (T_old(i+1,j)-2.0d0*T_old(i,j)+T_old(i-1,j))/dx**2 + &
(T_old(i,j+1)-2.0d0*T_old(i,j)+T_old(i,j-1))/dy**2 )
T(i,j)=T_old(i,j)+delta
change=max(delta,change)
enddo
enddo
!$OMP END DO
! implicit barrier (implies FLUSH) at end of parallel do region (unless you specify nowait clause)
!$OMP SINGLE
error=change
change=0.0d0
! just one thread updates iteration
iteration=iteration+1
! write(*,*) iteration, error
!$OMP END SINGLE
!$OMP DO schedule(static)
! update T_old
do j=1,ny
do i=1,nx
T_old(i,j)=T(i,j)
enddo
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! write T to disk
deallocate(T,T_old)
end program test
DO WHILE 循环中变量 change
重新初始化的竞争条件已被移除。通过在并行区域之外初始化 change
并使用 !$OMP SINGLE
指令保护其在并行区域中的更新来解决。
program test
integer :: nx=500,ny=500
integer :: i,j,iteration
double precision, allocatable, dimension(:,:) :: T, T_old
double precision :: dx,dy,dt
double precision :: error,change,delta,errtol
allocate(T(0:nx+1,0:ny+1))
allocate(T_old(0:nx+1,0:ny+1))
! initialisation of T, T_old, dt, dx, dy and errtol
error=1.0d0
iteration=0
change=0.0d0
!$OMP PARALLEL SHARED(error,iteration,change) private(i,j,delta)
do while (error.gt.errtol.and.error.lt.10.0d0)
!$OMP DO schedule(static) reduction(max:change)
do j=1,ny
do i=1,nx
delta=dt*( (T_old(i+1,j)-2.0d0*T_old(i,j)+T_old(i-1,j))/dx**2 + &
(T_old(i,j+1)-2.0d0*T_old(i,j)+T_old(i,j-1))/dy**2 )
T(i,j)=T_old(i,j)+delta
change=max(delta,change)
enddo
enddo
!$OMP END DO
! implicit barrier (implies FLUSH) at end of parallel do region (unless you specify nowait clause)
!$OMP SINGLE
error=change
change=0.0d0
! just one thread updates iteration
iteration=iteration+1
! write(*,*) iteration, error
!$OMP END SINGLE
!$OMP DO schedule(static)
! update T_old
do j=1,ny
do i=1,nx
T_old(i,j)=T(i,j)
enddo
enddo
!$OMP END DO
enddo
!$OMP END PARALLEL
! write T to disk
deallocate(T,T_old)
end program test