在 fortran 中使用 openmp 并行创建稀疏矩阵
Sparse matrix parallel creation with openmp in fortran
我对 fortran 比较陌生,对 openmp 完全陌生,我有以下问题:
我想并行构造一个(大:~1% 非零元素,总计约 100 万到 10 亿个元素)稀疏矩阵(值、行、列),我没有打开 mp 的代码如下:
function M_sparse(..) result(M)
(variables declarations)
cnt=0
do i=1,n
do j=i,n
v = some_function(..)
if (v /= 0.) then
cnt=cnt+1
ht(cnt)=v
it(cnt)=dble(i)
jt(cnt)=dble(j)
endif
end do
enddo
allocate(M(cnt,3))
M(:,1)=ht(:cnt)
M(:,2)=it(:cnt)
M(:,3)=jt(:cnt)
return
end function
现在我真的很困惑如何并行化它。我至少需要连续更新 ht、it 和 jt,但到目前为止,在每次尝试中,cnt 的最终值在多次运行时甚至不稳定。
这是一个解决方案:创建一个矩阵大小的数组,计算所有 i,j
的 some_function
并记录有多少 j
给出非零结果,将其存储在位置 i
。这是完全平行的。
现在您知道需要多少 space 了,您可以为每个线程指定其在存储中的起点。再次遍历 some_function
并实际填充元素,cnt
对每个线程都是局部的。
好的,所以这会使标量工作量加倍。但是你让它完全平行,所以你真的不在乎,对吧?
另一个想法:将密集数组拆分为多个块,每个线程负责一个块。让每个线程从自己的密集数组部分生成一部分稀疏数组,然后在必要时将这些部分连接在一起。
这是我将如何做的一个破解版本 - 它本质上是@veryreverie 建议的一个版本:生成一组线程私有列表,然后将它们连接起来。注意
- 我假设您不关心元素的排列顺序。如果你这样做了,你现在就有了一个本质上非并行的问题,这将更难解决
- 无法测试其结果的程序毫无意义 - 因此我的程序会根据单线程结果检查 2、3 和 4 线程结果。请注意,因为现在是星期五晚上,而且我感觉很懒惰,所以这个检查虽然很重要,但完成得可怕 效率很低,而且实际上对于大型案例来说,花费的时间比计算本身要长得多!
这里是代码,它是如何编译的,以及在我的四核笔记本电脑上的一些示例结果:
ijb@ijb-Latitude-5410:~/work/stack$ cat listing.f90
Program listing
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Type element_type
Integer :: i, j
Real( wp ) :: Hij
End Type element_type
Type( element_type ), Dimension( : ), Allocatable :: list_of_elements_serial
Type( element_type ), Dimension( : ), Allocatable :: list_of_elements
Integer :: n
Integer :: nth
Integer( li ) :: start, finish, rate
Logical :: worked
Write( *, * ) 'n ?'
Read ( *, * ) n
nth = 1
Call system_clock( start, rate )
! On a Single thread generate a reference list to check against
Call generate_list( n, nth, list_of_elements_serial )
Call system_clock( finish, rate )
Write( *, * ) 'time on ', 1, ' threads = ', Real( finish - start, wp ) / rate, Size( list_of_elements_serial )
! On 2, 3, 4 generate the lists, compare performance, check the results are correct
Do nth = 2, 4
Call system_clock( start, rate )
Call generate_list( n, nth, list_of_elements )
Call system_clock( finish, rate )
Write( *, * ) 'time on ', nth, ' threads = ', Real( finish - start, wp ) / rate, Size( list_of_elements )
Call checkit( list_of_elements_serial, list_of_elements, worked )
Write( *, '( "Checking ... ")', Advance = 'No' )
If( .Not. worked ) Then
Write( *, * ) 'Failed on ', nth, Size( list_of_elements )
Else
Write( *, * ) 'Worked'
End If
End Do
Contains
Subroutine generate_list( n, nth, list_of_elements )
! Generate a list of the non-zero elements
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Use omp_lib, Only : omp_get_thread_num
Implicit None
Integer , Intent( In ) :: n ! Size of matrix
Integer , Intent( In ) :: nth ! number of threads
Type( element_type ), Dimension( : ), Allocatable, Intent( Out ) :: list_of_elements ! The list of elements
Real( wp ), Parameter :: tol = 1.0e-16_wp
Integer, Parameter :: n_chunk = 16384
Type( element_type ), Dimension( : ), Allocatable :: private_list
Type( element_type ), Dimension( : ), Allocatable :: temp_list
Real( wp ) :: v
Integer, Dimension( : ), Allocatable :: counts
Integer :: private_count
Integer :: my_start
Integer :: i, j
Interface
Pure Function func( n, i, j ) Result( v )
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Real( wp ) :: v
Integer, Intent( In ) :: n
Integer, Intent( In ) :: i
Integer, Intent( In ) :: j
End Function func
End Interface
!$omp parallel num_threads( nth ) default( none ) &
!$omp private( private_count, private_list, temp_list, my_start, v, i, j ) &
!$omp shared( n, nth, counts, list_of_elements )
! Generate a subset of the elements local to this thread
Allocate( private_list( 1:n_chunk ) )
private_count = 0
!$omp do
Do i = 1, n
Do j = 1, n
v = func( n, i, j )
If( Abs( v ) > tol ) Then
private_count = private_count + 1
If( private_count > Ubound( private_list, Dim = 1 ) ) Then
Allocate( temp_list( 1:Ubound( private_list, Dim = 1 ) + n_chunk ) )
temp_list( 1:Ubound( private_list, Dim = 1 ) ) = private_list
Call move_alloc( temp_list, private_list )
End If
private_list( private_count )%i = i
private_list( private_count )%j = j
private_list( private_count )%Hij = v
End If
End Do
End Do
! Concatenate the private lists into one shared list
!$omp single
Allocate( counts( 0:nth - 1 ) )
!$omp end single
counts( omp_get_thread_num() ) = private_count
!$omp barrier
!$omp single
Allocate( list_of_elements( 1:Sum( counts ) ) )
!$omp end single
my_start = Sum( counts( 0:omp_get_thread_num() - 1 ) ) + 1
list_of_elements( my_start:my_start + private_count - 1 ) = private_list( 1:private_count )
!$omp end parallel
End Subroutine generate_list
Pure Subroutine checkit( list_ref, list, worked )
! Check whether the given list is just a rearrangement of the reference list
! HORRIBLY inefficient, should really use sorting - can't be bothered.
Implicit None
Type( element_type ), Dimension( : ), Intent( In ) :: list_ref
Type( element_type ), Dimension( : ), Intent( In ) :: list
Logical , Intent( Out ) :: worked
Type( element_type ), Dimension( : ), Allocatable :: temp
Integer :: i, j
worked = .True.
If( Size( list_ref ) /= Size( list ) ) Then
worked = .False.
End If
Allocate( temp, Source = list )
Do i = 1, Size( list_ref )
Do j = 1, Size( list )
! Search for element i of the reference list in the list being checked
If( list_ref( i )%i == temp( j )%i .And. &
list_ref( i )%j == temp( j )%j .And. &
Abs( list_ref( i )%Hij - temp( j )%Hij ) < 1e-15_wp ) Then
Exit
End If
End Do
If( j == Size( list ) + 1 ) Then
worked = .False.
Return
End If
! Mark it as used already
temp( j )%i = -1
temp( j )%j = -1
temp( j )%Hij = Huge( temp( j )%Hij )
End Do
End Subroutine checkit
End Program listing
Pure Function func( n, i, j ) Result( v )
! silly function for sparse matrix
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Real( wp ) :: v
Integer, Intent( In ) :: n
Integer, Intent( In ) :: i
Integer, Intent( In ) :: j
If( 100 * i < n .And. 100 * j < n ) Then
v = 1.0_wp
Else
v = 0.0_wp
End If
End Function func
ijb@ijb-Latitude-5410:~/work/stack$ gfortran-11 --version
GNU Fortran (GCC) 11.1.0
Copyright © 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ijb@ijb-Latitude-5410:~/work/stack$ gfortran-11 -std=f2008 -Wall -Wextra -O3 -g -fopenmp listing.f90 -o gen_list
ijb@ijb-Latitude-5410:~/work/stack$ ./gen_list
n ?
10000
time on 1 threads = 6.7302687000000000E-002 9801
time on 2 threads = 2.6817233999999999E-002 9801
Checking ... Worked
time on 3 threads = 1.5919547999999999E-002 9801
Checking ... Worked
time on 4 threads = 1.1952938000000000E-002 9801
Checking ... Worked
ijb@ijb-Latitude-5410:~/work/stack$ ./gen_list
n ?
30000
time on 1 threads = 0.44568265400000001 89401
time on 2 threads = 0.21186449299999999 89401
Checking ... Worked
time on 3 threads = 0.14133034500000000 89401
Checking ... Worked
time on 4 threads = 0.12390519100000000 89401
Checking ... Worked
ijb@ijb-Latitude-5410:~/work/stack$ ./gen_list
n ?
60000
time on 1 threads = 1.7274770189999999 358801
time on 2 threads = 0.85456061200000000 358801
Checking ... Worked
time on 3 threads = 0.57058082499999996 358801
Checking ... Worked
time on 4 threads = 0.42949695500000001 358801
Checking ... Worked
ijb@ijb-Latitude-5410:~/work/stack$
我对 fortran 比较陌生,对 openmp 完全陌生,我有以下问题:
我想并行构造一个(大:~1% 非零元素,总计约 100 万到 10 亿个元素)稀疏矩阵(值、行、列),我没有打开 mp 的代码如下:
function M_sparse(..) result(M)
(variables declarations)
cnt=0
do i=1,n
do j=i,n
v = some_function(..)
if (v /= 0.) then
cnt=cnt+1
ht(cnt)=v
it(cnt)=dble(i)
jt(cnt)=dble(j)
endif
end do
enddo
allocate(M(cnt,3))
M(:,1)=ht(:cnt)
M(:,2)=it(:cnt)
M(:,3)=jt(:cnt)
return
end function
现在我真的很困惑如何并行化它。我至少需要连续更新 ht、it 和 jt,但到目前为止,在每次尝试中,cnt 的最终值在多次运行时甚至不稳定。
这是一个解决方案:创建一个矩阵大小的数组,计算所有 i,j
的 some_function
并记录有多少 j
给出非零结果,将其存储在位置 i
。这是完全平行的。
现在您知道需要多少 space 了,您可以为每个线程指定其在存储中的起点。再次遍历 some_function
并实际填充元素,cnt
对每个线程都是局部的。
好的,所以这会使标量工作量加倍。但是你让它完全平行,所以你真的不在乎,对吧?
另一个想法:将密集数组拆分为多个块,每个线程负责一个块。让每个线程从自己的密集数组部分生成一部分稀疏数组,然后在必要时将这些部分连接在一起。
这是我将如何做的一个破解版本 - 它本质上是@veryreverie 建议的一个版本:生成一组线程私有列表,然后将它们连接起来。注意
- 我假设您不关心元素的排列顺序。如果你这样做了,你现在就有了一个本质上非并行的问题,这将更难解决
- 无法测试其结果的程序毫无意义 - 因此我的程序会根据单线程结果检查 2、3 和 4 线程结果。请注意,因为现在是星期五晚上,而且我感觉很懒惰,所以这个检查虽然很重要,但完成得可怕 效率很低,而且实际上对于大型案例来说,花费的时间比计算本身要长得多!
这里是代码,它是如何编译的,以及在我的四核笔记本电脑上的一些示例结果:
ijb@ijb-Latitude-5410:~/work/stack$ cat listing.f90
Program listing
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Implicit None
Type element_type
Integer :: i, j
Real( wp ) :: Hij
End Type element_type
Type( element_type ), Dimension( : ), Allocatable :: list_of_elements_serial
Type( element_type ), Dimension( : ), Allocatable :: list_of_elements
Integer :: n
Integer :: nth
Integer( li ) :: start, finish, rate
Logical :: worked
Write( *, * ) 'n ?'
Read ( *, * ) n
nth = 1
Call system_clock( start, rate )
! On a Single thread generate a reference list to check against
Call generate_list( n, nth, list_of_elements_serial )
Call system_clock( finish, rate )
Write( *, * ) 'time on ', 1, ' threads = ', Real( finish - start, wp ) / rate, Size( list_of_elements_serial )
! On 2, 3, 4 generate the lists, compare performance, check the results are correct
Do nth = 2, 4
Call system_clock( start, rate )
Call generate_list( n, nth, list_of_elements )
Call system_clock( finish, rate )
Write( *, * ) 'time on ', nth, ' threads = ', Real( finish - start, wp ) / rate, Size( list_of_elements )
Call checkit( list_of_elements_serial, list_of_elements, worked )
Write( *, '( "Checking ... ")', Advance = 'No' )
If( .Not. worked ) Then
Write( *, * ) 'Failed on ', nth, Size( list_of_elements )
Else
Write( *, * ) 'Worked'
End If
End Do
Contains
Subroutine generate_list( n, nth, list_of_elements )
! Generate a list of the non-zero elements
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Use omp_lib, Only : omp_get_thread_num
Implicit None
Integer , Intent( In ) :: n ! Size of matrix
Integer , Intent( In ) :: nth ! number of threads
Type( element_type ), Dimension( : ), Allocatable, Intent( Out ) :: list_of_elements ! The list of elements
Real( wp ), Parameter :: tol = 1.0e-16_wp
Integer, Parameter :: n_chunk = 16384
Type( element_type ), Dimension( : ), Allocatable :: private_list
Type( element_type ), Dimension( : ), Allocatable :: temp_list
Real( wp ) :: v
Integer, Dimension( : ), Allocatable :: counts
Integer :: private_count
Integer :: my_start
Integer :: i, j
Interface
Pure Function func( n, i, j ) Result( v )
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Real( wp ) :: v
Integer, Intent( In ) :: n
Integer, Intent( In ) :: i
Integer, Intent( In ) :: j
End Function func
End Interface
!$omp parallel num_threads( nth ) default( none ) &
!$omp private( private_count, private_list, temp_list, my_start, v, i, j ) &
!$omp shared( n, nth, counts, list_of_elements )
! Generate a subset of the elements local to this thread
Allocate( private_list( 1:n_chunk ) )
private_count = 0
!$omp do
Do i = 1, n
Do j = 1, n
v = func( n, i, j )
If( Abs( v ) > tol ) Then
private_count = private_count + 1
If( private_count > Ubound( private_list, Dim = 1 ) ) Then
Allocate( temp_list( 1:Ubound( private_list, Dim = 1 ) + n_chunk ) )
temp_list( 1:Ubound( private_list, Dim = 1 ) ) = private_list
Call move_alloc( temp_list, private_list )
End If
private_list( private_count )%i = i
private_list( private_count )%j = j
private_list( private_count )%Hij = v
End If
End Do
End Do
! Concatenate the private lists into one shared list
!$omp single
Allocate( counts( 0:nth - 1 ) )
!$omp end single
counts( omp_get_thread_num() ) = private_count
!$omp barrier
!$omp single
Allocate( list_of_elements( 1:Sum( counts ) ) )
!$omp end single
my_start = Sum( counts( 0:omp_get_thread_num() - 1 ) ) + 1
list_of_elements( my_start:my_start + private_count - 1 ) = private_list( 1:private_count )
!$omp end parallel
End Subroutine generate_list
Pure Subroutine checkit( list_ref, list, worked )
! Check whether the given list is just a rearrangement of the reference list
! HORRIBLY inefficient, should really use sorting - can't be bothered.
Implicit None
Type( element_type ), Dimension( : ), Intent( In ) :: list_ref
Type( element_type ), Dimension( : ), Intent( In ) :: list
Logical , Intent( Out ) :: worked
Type( element_type ), Dimension( : ), Allocatable :: temp
Integer :: i, j
worked = .True.
If( Size( list_ref ) /= Size( list ) ) Then
worked = .False.
End If
Allocate( temp, Source = list )
Do i = 1, Size( list_ref )
Do j = 1, Size( list )
! Search for element i of the reference list in the list being checked
If( list_ref( i )%i == temp( j )%i .And. &
list_ref( i )%j == temp( j )%j .And. &
Abs( list_ref( i )%Hij - temp( j )%Hij ) < 1e-15_wp ) Then
Exit
End If
End Do
If( j == Size( list ) + 1 ) Then
worked = .False.
Return
End If
! Mark it as used already
temp( j )%i = -1
temp( j )%j = -1
temp( j )%Hij = Huge( temp( j )%Hij )
End Do
End Subroutine checkit
End Program listing
Pure Function func( n, i, j ) Result( v )
! silly function for sparse matrix
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Real( wp ) :: v
Integer, Intent( In ) :: n
Integer, Intent( In ) :: i
Integer, Intent( In ) :: j
If( 100 * i < n .And. 100 * j < n ) Then
v = 1.0_wp
Else
v = 0.0_wp
End If
End Function func
ijb@ijb-Latitude-5410:~/work/stack$ gfortran-11 --version
GNU Fortran (GCC) 11.1.0
Copyright © 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ijb@ijb-Latitude-5410:~/work/stack$ gfortran-11 -std=f2008 -Wall -Wextra -O3 -g -fopenmp listing.f90 -o gen_list
ijb@ijb-Latitude-5410:~/work/stack$ ./gen_list
n ?
10000
time on 1 threads = 6.7302687000000000E-002 9801
time on 2 threads = 2.6817233999999999E-002 9801
Checking ... Worked
time on 3 threads = 1.5919547999999999E-002 9801
Checking ... Worked
time on 4 threads = 1.1952938000000000E-002 9801
Checking ... Worked
ijb@ijb-Latitude-5410:~/work/stack$ ./gen_list
n ?
30000
time on 1 threads = 0.44568265400000001 89401
time on 2 threads = 0.21186449299999999 89401
Checking ... Worked
time on 3 threads = 0.14133034500000000 89401
Checking ... Worked
time on 4 threads = 0.12390519100000000 89401
Checking ... Worked
ijb@ijb-Latitude-5410:~/work/stack$ ./gen_list
n ?
60000
time on 1 threads = 1.7274770189999999 358801
time on 2 threads = 0.85456061200000000 358801
Checking ... Worked
time on 3 threads = 0.57058082499999996 358801
Checking ... Worked
time on 4 threads = 0.42949695500000001 358801
Checking ... Worked
ijb@ijb-Latitude-5410:~/work/stack$