OpenMP with BLAS
I tried to extend the code from the answer linked above to include a cross check and OpenMP.
Program reshape_for_blas
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Real( wp ), Dimension( :, : ), Allocatable :: a
Real( wp ), Dimension( :, :, : ), Allocatable :: b
Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
Real( wp ), Dimension( :, : ), Allocatable :: d
Real( wp ), Dimension( :, : ), Allocatable :: e
Integer :: na, nb, nc, nd, ne
Integer :: la, lb, lc, ld
Integer( li ) :: start, finish, rate, numthreads
numthreads = 2
call omp_set_num_threads(numthreads)
Write( *, * ) 'na, nb, nc, nd ?'
Read( *, * ) na, nb, nc, nd
ne = nc * nd
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc, 1:nd ) )
Allocate( c1( 1:na, 1:nc, 1:nd ) )
Allocate( c2( 1:na, 1:nc, 1:nd ) )
Allocate( c3( 1:na, 1:nc, 1:nd ) )
Allocate( c4( 1:na, 1:nc, 1:nd ) )
Allocate( c5( 1:na, 1:nc, 1:nd ) )
Allocate( d ( 1:nb, 1:ne ) )
Allocate( e ( 1:na, 1:ne ) )
! Set up some data
Call Random_number( a )
Call Random_number( b )
! With reshapes
Call System_clock( start, rate )
!write (*,*) 'clock', start, rate
d = Reshape( b, Shape( d ) )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
d, Size( d, Dim = 1 ), &
0.0_wp, e, Size( e, Dim = 1 ) )
c1 = Reshape( e, Shape( c1 ) )
Call System_clock( finish, rate )
!write (*,*) 'clock', finish, rate
Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )
! Direct
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c2, Size( c2, Dim = 1 ) )
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method ', Real( finish - start, wp ) / rate
Call System_clock( start, rate )
!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method omp', Real( finish - start, wp ) / rate
!naive
Call System_clock( start, rate )
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = c3(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate
!naive omp
Call System_clock( start, rate )
!$omp parallel
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
!$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate
do la = 1, na
do lc = 1, nc
do ld = 1, nd
if ( dabs(c3(la,lc,ld) - c1(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c2(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c4(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c5(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
endif
enddo
enddo
enddo
End Program reshape_for_blas
I have two questions:
- There is no obvious speedup from either the BLAS calls or the naive loops. For example, compiling with
gfortran -std=f2008 -Wall -Wextra -fcheck=all reshape.f90 -lblas -fopenmp
and then entering 30 30 30 60 as input, I get
30 30 30 60
Time for reshaping method 2.9443999999999998E-003
Difference between result matrices 12.380937791257775
Time for straight method 1.0016000000000001E-003
Time for straight method omp 2.4878000000000001E-003
Time for loop 6.6072500000000006E-002
Time for loop omp 0.100242600000000002
- When the dimensions get larger, e.g. 60 60 60 60 as input, the OpenMP BLAS result can differ from the naive loop, so it seems I am missing some control option.
What would be the problem with using OpenMP here?
Edit
I added OpenMP lines to the initialization in the c5 section and commented out the two print lines,
Program reshape_for_blas
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Real( wp ), Dimension( :, : ), Allocatable :: a
Real( wp ), Dimension( :, :, : ), Allocatable :: b
Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
Real( wp ), Dimension( :, : ), Allocatable :: d
Real( wp ), Dimension( :, : ), Allocatable :: e
Integer :: na, nb, nc, nd, ne
Integer :: la, lb, lc, ld
Integer( li ) :: start, finish, rate, numthreads
numthreads = 2
call omp_set_num_threads(numthreads)
Write( *, * ) 'na, nb, nc, nd ?'
Read( *, * ) na, nb, nc, nd
ne = nc * nd
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc, 1:nd ) )
Allocate( c1( 1:na, 1:nc, 1:nd ) )
Allocate( c2( 1:na, 1:nc, 1:nd ) )
Allocate( c3( 1:na, 1:nc, 1:nd ) )
Allocate( c4( 1:na, 1:nc, 1:nd ) )
Allocate( c5( 1:na, 1:nc, 1:nd ) )
Allocate( d ( 1:nb, 1:ne ) )
Allocate( e ( 1:na, 1:ne ) )
! Set up some data
Call Random_number( a )
Call Random_number( b )
! With reshapes
Call System_clock( start, rate )
!write (*,*) 'clock', start, rate
d = Reshape( b, Shape( d ) )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
d, Size( d, Dim = 1 ), &
0.0_wp, e, Size( e, Dim = 1 ) )
c1 = Reshape( e, Shape( c1 ) )
Call System_clock( finish, rate )
!write (*,*) 'clock', finish, rate
Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )
! Direct
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c2, Size( c2, Dim = 1 ) )
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method ', Real( finish - start, wp ) / rate
!naive loop
Call System_clock( start, rate )
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = c3(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate
!dgemm omp
Call System_clock( start, rate )
!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method omp', Real( finish - start, wp ) / rate
!loop omp
Call System_clock( start, rate )
!$omp parallel
do la = 1, na
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = 0.0_wp
enddo
enddo
enddo
!$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate
!single core: c1 c2 c3
! c1 reshape blas
! c2 blas
! c3 naive loop (reference)
! parallel: c4 c5
! c4 dgemm parallel
! c5 naive loop parallel
do la = 1, na
do lc = 1, nc
do ld = 1, nd
if ( dabs(c3(la,lc,ld) - c1(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c2(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c4(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
endif
if ( dabs(c3(la,lc,ld) - c5(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
endif
enddo
enddo
enddo
End Program reshape_for_blas
Then gfortran reshape.f90 -lblas -fopenmp with input 30 30 30 30 leads to
Time for reshaping method 1.3519000000000001E-003
Difference between result matrices 12.380937791257775
Time for straight method 6.2549999999999997E-004
Time for straight method omp 1.2600000000000001E-003
Time for naive loop 1.0008599999999999E-002
Time for naive loop omp 1.6678999999999999E-002
The speedup is still not great, though.
You are calling DGEMM in parallel using the same set of variables (since variables in a parallel region are shared by default in Fortran). That doesn't work and produces weird results due to data races. You have two options:
Find a parallel BLAS implementation where DGEMM is already threaded. Intel MKL and OpenBLAS are prime candidates. Intel MKL uses OpenMP, or more specifically it is built with the Intel OpenMP runtime, so it may not play very nicely with OpenMP code compiled with GCC, but it works perfectly with non-threaded code.
Call DGEMM in parallel, but not with the same set of arguments. Instead, perform a block decomposition of one or both tensors and have each thread contract a separate block. Since Fortran uses column-major storage, it may be appropriate to decompose the second tensor:
C[i,k,l=1..L] = A[i,j] * B[j,k,l=1..L]
becomes, with two threads:
thread 0: C[i,k,l=1..L/2] = A[i,j] * B[j,k,l=1..L/2]
thread 1: C[i,k,l=L/2+1..L] = A[i,j] * B[j,k,l=L/2+1..L]
For an arbitrary number of threads it boils down to computing the start and end values of the l index for each thread and adjusting the arguments of DGEMM accordingly.
Personally, I would go with a parallel BLAS implementation. With Intel MKL you only need to link in the parallel driver and it will automatically use all available CPUs, for example as sketched below.
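A minimal sketch of that option, assuming a threaded BLAS build is available (e.g. linking with gfortran reshape.f90 -lopenblas -fopenmp; the library name depends on your installation). The DGEMM call stays serial in your code, with no !$omp parallel region around it, and the library threads internally:
! Single call into a threaded BLAS; the library parallelizes internally.
! The number of BLAS threads is usually controlled by an environment
! variable such as OPENBLAS_NUM_THREADS or MKL_NUM_THREADS (this depends
! on the particular BLAS build).
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
            b , Size( b , Dim = 1 ), &
            0.0_wp, c4, Size( c4, Dim = 1 ) )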
A sample implementation of the block decomposition follows. Only the additions and changes relative to the original code are shown:
! ADD: Use the OpenMP module
Use :: omp_lib
! ADD: Variables used for the decomposition
Integer :: ithr, istart, iend
! CHANGE: OpenMP with block decomposition
!$omp parallel private(ithr, istart, iend)
ithr = omp_get_thread_num()
! First index (plane) in B for the current thread
istart = ithr * nd / omp_get_num_threads()
! First index (plane) in B for the next thread
iend = (ithr + 1) * nd / omp_get_num_threads()
Call dgemm('N', 'N', na, nc * (iend - istart), nb, 1.0_wp, a, Size(a, Dim = 1), &
b(1, 1, 1 + istart), Size(b, Dim = 1), &
0.0_wp, c4(1, 1, 1 + istart), Size(c4, Dim = 1))
!$omp end parallel
istart is the index of the first plane of B that a given thread works on. iend is the first plane of the next thread, so iend - istart is the number of planes for the current thread. b(1, 1, 1 + istart) is where the block of planes in B starts, and c4(1, 1, 1 + istart) is where the corresponding block in the result tensor starts. For example, with nd = 60 and two threads, thread 0 gets istart = 0 and iend = 30 (planes 1..30 of B) while thread 1 gets istart = 30 and iend = 60 (planes 31..60).
Make sure you do only one of the two, not both. That is, if your BLAS implementation is threaded but you decide to do the block decomposition, disable threading in the BLAS library; conversely, if you rely on threading inside the BLAS implementation, do not do the block decomposition in your code.
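How exactly you disable the BLAS threading depends on the library; as an assumption about typical builds, an environment variable does it, e.g.
export OPENBLAS_NUM_THREADS=1   ! serial BLAS, block decomposition done by your !$omp code
export MKL_NUM_THREADS=1        ! same idea for MKL
For the opposite choice, leave the BLAS threads enabled and remove the !$omp parallel region around the DGEMM call.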