为什么按元素矩阵行交换比 Fortran 中按数组行交换更有效?
Why is element-wise matrix row-exchanging more efficient than array-wise row-exchanging in Fortran?
我有一些执行矩阵行交换的 Fortran 代码。在遗留代码中,写成
do J = 1,N
save_val = B(IROW,J)
B(IROW,J) = B(JCOL,J)
B(JCOL,J) = save_val
end do
这会将行 IROW
与行 JCOL
交换(IROW
和 JCOL
是整数)。但是,该代码块的功能并不直观。在我看来,将其写成更直观,或者至少有助于提高可读性:
save_row = B(IROW,:)
B(IROW,:) = B(JCOL,:)
B(JCOL,:) = save_row
(更清楚的是正在移动行)。
从附图中可以明显看出,循环方法相对于数组操作提供了更好的性能。
为什么是这样?是因为当数组中的元素数量变大时,这会变成一个内存受限的过程吗? (即数组会被“分块”)还是别的东西?
编译为gfortran -O3 test.f95
。添加标志 fstack-arrays
没有显着差异。
program test
implicit none
integer :: N
integer :: M
integer :: loop_max = 1e7
integer :: i ! loop index
real :: t1, t2
real :: time_loop, time_array, time_sub_loop, time_sub_array
real, dimension(:, :), allocatable :: B
real, dimension(:) , allocatable :: save_row
real :: save_val
integer :: IROW, J, JCOL
character(*), parameter :: format_header = '(A5, 1X, 4(A12,1X))'
character(*), parameter :: format_data = '(I5, 1X, 4(ES12.5, 1X))'
open(1, file = 'TimingRowExchange.txt', status = 'unknown')
write(1, format_header) 'N', 't_loop', 't_array', 't_sub_loop', 't_sub_array'
do N = 1, 100
M = N + 1
allocate(B(N,N), save_row(M))
call random_number(B)
JCOL = 1
IROW = 3
call CPU_time(t1)
do i = 1, loop_max
do J = 1,N
save_val = B(IROW,J)
B(IROW,J) = B(JCOL,J)
B(JCOL,J) = save_val
end do
end do
call CPU_time(t2)
time_loop = t2 - t1
! write ( *, * ) 'Using Loop =', t2 - t1
call CPU_time(t1)
do i = 1, loop_max
save_row(1:N) = B(IROW,:)
B(IROW,:) = B(JCOL,:)
B(JCOL,:) = save_row(1:N)
end do
call CPU_time(t2)
time_array = t2 - t1
! write ( *, * ) 'Using Array =', t2 - t1
call CPU_time(t1)
do i = 1, loop_max
call exchange_rows_loop(B, JCOL, IROW)
end do
call CPU_time(t2)
time_sub_loop = t2 - t1
! write ( *, * ) 'Loop Subrout =', t2 - t1
call CPU_time(t1)
do i = 1, loop_max
call exchange_rows_array(B, JCOL, IROW)
end do
call CPU_time(t2)
time_sub_array = t2 - t1
! write ( *, * ) 'Array Subrout =', t2 - t1
deallocate(B, save_row)
write(1, format_data) N, time_loop, time_array, time_sub_loop, time_sub_array
end do
contains
subroutine print_mat(A)
implicit none
real, dimension(:,:), intent(in) :: A
integer :: n
n = size(A,1) ! # of rows
do i = 1,n
print*, A(i,:)
end do
print*,
end subroutine print_mat
subroutine exchange_rows_loop(A, row1, row2)
implicit none
real, dimension(:,:), intent(in out) :: A
integer, intent(in) :: row1, row2
integer :: J
real :: save_val
do J = 1, size(A,1)
save_val = A(row1,J)
A(row1,J) = A(row2,J)
A(row2,J) = save_val
end do
end subroutine exchange_rows_loop
subroutine exchange_rows_array(A, row1, row2)
implicit none
real, dimension(:,:), intent(in out) :: A
integer, intent(in) :: row1, row2
real, dimension(size(A,1)) :: save_row
save_row = A(row1,:)
A(row1,:) = A(row2,:)
A(row2,:) = save_row
end subroutine exchange_rows_array
end program test
我对 Fortran 哲学(优势)的理解是,该语言应该帮助用户专注于科学,同时处理大多数 computer-related 事情,例如 optimization-for-speed、垃圾收集等。
通过 pure
/elemental
函数和子例程的函数式编程风格是恕我直言,已被引入但未得到充分利用的最伟大的工具之一,因为它使代码清晰、更简单、更健壮。
所以我又添加了一个带有 elemental
交换例程的测试:
subroutine exchange_rows_elemental(A, row1, row2)
implicit none
real, dimension(:,:), intent(in out) :: A
integer, intent(in) :: row1, row2
call swap(A(row1,:),A(row2,:))
end subroutine exchange_rows_elemental
elemental subroutine swap(a,b)
real, intent(inout) :: a,b
real :: save_val
save_val = a
a = b
b = save_val
end subroutine swap
主要是:
call CPU_time(t1)
do i = 1, loop_max
call exchange_rows_elemental(B, JCOL, IROW)
end do
call CPU_time(t2)
time_elemental = t2 - t1
! write ( *, * ) 'Elemental =', t2 - t1
这是我在 Windows 上使用 gfortran 9.2.0
得到的结果:
elemental
版本几乎与最快的循环版本一样快,但它可能以矢量化方式运行。我确定在这种情况下,编译器可能正在内联 swap
例程(如果它在另一个文件中,可能无法内联),但仍然告诉编译器 swap
例程可以矢量化可能有助于它实现最佳性能。我喜欢它,因为它是一种很好的方式,可以充分利用编译器优化,而不会用嵌套循环和循环变量使源代码混乱。
这两个版本做的不一样,顺序不一样,编译器做的优化也不一样。该程序集位于 https://godbolt.org/z/hc8zcs
循环:
movss xmm0, DWORD PTR [rax+12]
movss xmm1, DWORD PTR [rax+4]
movss DWORD PTR [rax+4], xmm0
movss DWORD PTR [rax+12], xmm1
cmp ebx, 1
je .L6
movss xmm0, DWORD PTR [rsi]
movss xmm1, DWORD PTR [rcx]
movss DWORD PTR [rsi], xmm1
movss DWORD PTR [rcx], xmm0
cmp ebx, 2
je .L6
movss xmm0, DWORD PTR [r8]
movss xmm1, DWORD PTR [rdi]
movss DWORD PTR [r8], xmm1
movss DWORD PTR [rdi], xmm0
有点展开,使用了向量化指令(SSE)。总体来说还是很直接的。
子数组版本:
mov r10, QWORD PTR [rsp+216]
mov rsi, QWORD PTR [rsp+208]
mov r8d, 10000000
mov rdx, QWORD PTR [rsp+152]
mov rdi, QWORD PTR [rsp+144]
mov r11, r10
lea rbx, [r10+1]
lea r15, [r10+2]
mov r9, QWORD PTR [rsp+8]
imul r11, rsi
lea rax, [rdx+1]
mov rcx, QWORD PTR [rsp+224]
imul rbx, rsi
lea r12, [rdx+3]
imul r15, rsi
sal rsi, 2
lea rbp, [r12+r11]
add rdx, r11
add r11, rax
lea r14, [r12+rbx]
lea r13, [rdi+rdx*4]
add rbx, rax
add r12, r15
add r15, rax
mov QWORD PTR [rsp], r15
mov r15, QWORD PTR [rsp+80]
.L13:
movss xmm0, DWORD PTR [rdi+rbp*4]
movss DWORD PTR [r9], xmm0
cmp r15, 1
je .L10
movss xmm0, DWORD PTR [rdi+r14*4]
movss DWORD PTR [r9+4], xmm0
cmp r15, 2
je .L11
movss xmm0, DWORD PTR [rdi+r12*4]
movss DWORD PTR [r9+8], xmm0
.L11:
cmp r10, rcx
jg .L65
.L37:
mov rax, r13
mov rdx, r10
.L14:
movss xmm0, DWORD PTR [rax+4]
add rdx, 1
movss DWORD PTR [rax+12], xmm0
add rax, rsi
cmp rcx, rdx
jge .L14
movss xmm0, DWORD PTR [r9]
movss DWORD PTR [rdi+r11*4], xmm0
cmp r15, 1
je .L15
.L35:
movss xmm0, DWORD PTR [r9+4]
movss DWORD PTR [rdi+rbx*4], xmm0
cmp r15, 3
jne .L15
movss xmm0, DWORD PTR [r9+8]
mov rax, QWORD PTR [rsp]
movss DWORD PTR [rdi+rax*4], xmm0
涉及到很多指针运算、比较和分支。我怀疑编译器必须检查子数组的别名或零数组范围,并且更难执行有效的矢量化。
我有一些执行矩阵行交换的 Fortran 代码。在遗留代码中,写成
do J = 1,N
save_val = B(IROW,J)
B(IROW,J) = B(JCOL,J)
B(JCOL,J) = save_val
end do
这会将行 IROW
与行 JCOL
交换(IROW
和 JCOL
是整数)。但是,该代码块的功能并不直观。在我看来,将其写成更直观,或者至少有助于提高可读性:
save_row = B(IROW,:)
B(IROW,:) = B(JCOL,:)
B(JCOL,:) = save_row
(更清楚的是正在移动行)。
从附图中可以明显看出,循环方法相对于数组操作提供了更好的性能。 为什么是这样?是因为当数组中的元素数量变大时,这会变成一个内存受限的过程吗? (即数组会被“分块”)还是别的东西?
编译为gfortran -O3 test.f95
。添加标志 fstack-arrays
没有显着差异。
program test
implicit none
integer :: N
integer :: M
integer :: loop_max = 1e7
integer :: i ! loop index
real :: t1, t2
real :: time_loop, time_array, time_sub_loop, time_sub_array
real, dimension(:, :), allocatable :: B
real, dimension(:) , allocatable :: save_row
real :: save_val
integer :: IROW, J, JCOL
character(*), parameter :: format_header = '(A5, 1X, 4(A12,1X))'
character(*), parameter :: format_data = '(I5, 1X, 4(ES12.5, 1X))'
open(1, file = 'TimingRowExchange.txt', status = 'unknown')
write(1, format_header) 'N', 't_loop', 't_array', 't_sub_loop', 't_sub_array'
do N = 1, 100
M = N + 1
allocate(B(N,N), save_row(M))
call random_number(B)
JCOL = 1
IROW = 3
call CPU_time(t1)
do i = 1, loop_max
do J = 1,N
save_val = B(IROW,J)
B(IROW,J) = B(JCOL,J)
B(JCOL,J) = save_val
end do
end do
call CPU_time(t2)
time_loop = t2 - t1
! write ( *, * ) 'Using Loop =', t2 - t1
call CPU_time(t1)
do i = 1, loop_max
save_row(1:N) = B(IROW,:)
B(IROW,:) = B(JCOL,:)
B(JCOL,:) = save_row(1:N)
end do
call CPU_time(t2)
time_array = t2 - t1
! write ( *, * ) 'Using Array =', t2 - t1
call CPU_time(t1)
do i = 1, loop_max
call exchange_rows_loop(B, JCOL, IROW)
end do
call CPU_time(t2)
time_sub_loop = t2 - t1
! write ( *, * ) 'Loop Subrout =', t2 - t1
call CPU_time(t1)
do i = 1, loop_max
call exchange_rows_array(B, JCOL, IROW)
end do
call CPU_time(t2)
time_sub_array = t2 - t1
! write ( *, * ) 'Array Subrout =', t2 - t1
deallocate(B, save_row)
write(1, format_data) N, time_loop, time_array, time_sub_loop, time_sub_array
end do
contains
subroutine print_mat(A)
implicit none
real, dimension(:,:), intent(in) :: A
integer :: n
n = size(A,1) ! # of rows
do i = 1,n
print*, A(i,:)
end do
print*,
end subroutine print_mat
subroutine exchange_rows_loop(A, row1, row2)
implicit none
real, dimension(:,:), intent(in out) :: A
integer, intent(in) :: row1, row2
integer :: J
real :: save_val
do J = 1, size(A,1)
save_val = A(row1,J)
A(row1,J) = A(row2,J)
A(row2,J) = save_val
end do
end subroutine exchange_rows_loop
subroutine exchange_rows_array(A, row1, row2)
implicit none
real, dimension(:,:), intent(in out) :: A
integer, intent(in) :: row1, row2
real, dimension(size(A,1)) :: save_row
save_row = A(row1,:)
A(row1,:) = A(row2,:)
A(row2,:) = save_row
end subroutine exchange_rows_array
end program test
我对 Fortran 哲学(优势)的理解是,该语言应该帮助用户专注于科学,同时处理大多数 computer-related 事情,例如 optimization-for-speed、垃圾收集等。
通过 pure
/elemental
函数和子例程的函数式编程风格是恕我直言,已被引入但未得到充分利用的最伟大的工具之一,因为它使代码清晰、更简单、更健壮。
所以我又添加了一个带有 elemental
交换例程的测试:
subroutine exchange_rows_elemental(A, row1, row2)
implicit none
real, dimension(:,:), intent(in out) :: A
integer, intent(in) :: row1, row2
call swap(A(row1,:),A(row2,:))
end subroutine exchange_rows_elemental
elemental subroutine swap(a,b)
real, intent(inout) :: a,b
real :: save_val
save_val = a
a = b
b = save_val
end subroutine swap
主要是:
call CPU_time(t1)
do i = 1, loop_max
call exchange_rows_elemental(B, JCOL, IROW)
end do
call CPU_time(t2)
time_elemental = t2 - t1
! write ( *, * ) 'Elemental =', t2 - t1
这是我在 Windows 上使用 gfortran 9.2.0
得到的结果:
elemental
版本几乎与最快的循环版本一样快,但它可能以矢量化方式运行。我确定在这种情况下,编译器可能正在内联 swap
例程(如果它在另一个文件中,可能无法内联),但仍然告诉编译器 swap
例程可以矢量化可能有助于它实现最佳性能。我喜欢它,因为它是一种很好的方式,可以充分利用编译器优化,而不会用嵌套循环和循环变量使源代码混乱。
这两个版本做的不一样,顺序不一样,编译器做的优化也不一样。该程序集位于 https://godbolt.org/z/hc8zcs
循环:
movss xmm0, DWORD PTR [rax+12]
movss xmm1, DWORD PTR [rax+4]
movss DWORD PTR [rax+4], xmm0
movss DWORD PTR [rax+12], xmm1
cmp ebx, 1
je .L6
movss xmm0, DWORD PTR [rsi]
movss xmm1, DWORD PTR [rcx]
movss DWORD PTR [rsi], xmm1
movss DWORD PTR [rcx], xmm0
cmp ebx, 2
je .L6
movss xmm0, DWORD PTR [r8]
movss xmm1, DWORD PTR [rdi]
movss DWORD PTR [r8], xmm1
movss DWORD PTR [rdi], xmm0
有点展开,使用了向量化指令(SSE)。总体来说还是很直接的。
子数组版本:
mov r10, QWORD PTR [rsp+216]
mov rsi, QWORD PTR [rsp+208]
mov r8d, 10000000
mov rdx, QWORD PTR [rsp+152]
mov rdi, QWORD PTR [rsp+144]
mov r11, r10
lea rbx, [r10+1]
lea r15, [r10+2]
mov r9, QWORD PTR [rsp+8]
imul r11, rsi
lea rax, [rdx+1]
mov rcx, QWORD PTR [rsp+224]
imul rbx, rsi
lea r12, [rdx+3]
imul r15, rsi
sal rsi, 2
lea rbp, [r12+r11]
add rdx, r11
add r11, rax
lea r14, [r12+rbx]
lea r13, [rdi+rdx*4]
add rbx, rax
add r12, r15
add r15, rax
mov QWORD PTR [rsp], r15
mov r15, QWORD PTR [rsp+80]
.L13:
movss xmm0, DWORD PTR [rdi+rbp*4]
movss DWORD PTR [r9], xmm0
cmp r15, 1
je .L10
movss xmm0, DWORD PTR [rdi+r14*4]
movss DWORD PTR [r9+4], xmm0
cmp r15, 2
je .L11
movss xmm0, DWORD PTR [rdi+r12*4]
movss DWORD PTR [r9+8], xmm0
.L11:
cmp r10, rcx
jg .L65
.L37:
mov rax, r13
mov rdx, r10
.L14:
movss xmm0, DWORD PTR [rax+4]
add rdx, 1
movss DWORD PTR [rax+12], xmm0
add rax, rsi
cmp rcx, rdx
jge .L14
movss xmm0, DWORD PTR [r9]
movss DWORD PTR [rdi+r11*4], xmm0
cmp r15, 1
je .L15
.L35:
movss xmm0, DWORD PTR [r9+4]
movss DWORD PTR [rdi+rbx*4], xmm0
cmp r15, 3
jne .L15
movss xmm0, DWORD PTR [r9+8]
mov rax, QWORD PTR [rsp]
movss DWORD PTR [rdi+rax*4], xmm0
涉及到很多指针运算、比较和分支。我怀疑编译器必须检查子数组的别名或零数组范围,并且更难执行有效的矢量化。