NVMe SSD 上的 GFortran 未格式化 I/O 吞吐量
GFortran unformatted I/O throughput on NVMe SSDs
请帮助我了解如何使用 (G)Fortran 提高顺序的、未格式化的 I/O 吞吐量,尤其是在使用 NVMe SSD 时。
我写了一个小测试程序,见底部post。它的作用是并行打开一个或多个文件 (OpenMP) 并将一组随机数写入其中。然后它刷新系统缓存(需要 root,否则读取测试很可能会从内存中读取)打开文件并从中读取。时间以挂钟时间衡量(试图仅包括 I/O 相关时间),性能数字以 MiB/s 给出。程序循环直到中止。
我用于测试的硬件是三星 970 Evo Plus 1TB SSD,通过 2 个 PCIe 3.0 通道连接。所以理论上,它应该能够达到约 1500 MiB/s 的顺序读写速度。
预先使用 "dd if=/dev/zero of=./testfile bs=1G count=1 oflag=direct" 进行测试的结果约为 750MB/s。不太好,但仍然比我用 Gfortran 得到的要好。并且取决于您问的是谁,无论如何不应将 dd 用于基准测试。这只是为了确保硬件在理论上能够提供更多功能。
我的代码的结果往往随着文件大小的增加而变得更好,但即使是 1GiB,它的上限也大约为 200MiB/s 写入,420MiB/s 读取。使用更多线程(例如 4 个)可以稍微提高写入速度,但只能达到 270MiB/s 左右。
我确保保持基准运行时间短,并让 SSD 有时间在测试之间放松。
我的印象是,即使只有一个线程,也应该可以使 2 个 PCIe 3.0 通道的带宽饱和。至少在使用未格式化的 I/O 时是这样。
代码似乎没有 CPU 限制,如果我将 "values" 字段的分配和初始化移出循环,top 显示单个内核的使用率不到 50%。考虑到我希望看到至少高出 5 倍的数字,这对整体性能来说仍然不是好兆头。
我还尝试将 access=stream 用于 open 语句,但无济于事。
那么问题是什么?
是我的代码有错误或未经优化吗?还是我的期望值太高了?
使用的平台:
Opensuse Leap 15.1,内核 4.12.14-lp151.28.36-default
2x AMD Epyc 7551、Supermicro H11DSI、三星 970 Evo Plus 1TB (2xPCIe 3.0)
gcc 版本 8.2.1,编译器选项:-ffree-line-length-none -O3 -ffast-math -funroll-loops -flto
!> Kind parameters shared by the I/O benchmark.
module types
  implicit none
  save

  ! Real kinds: the processor's default real and double precision kinds.
  integer, parameter :: sp = kind(1.0)
  integer, parameter :: dp = kind(1.0d0)

  ! Integer kinds selected by decimal range (at least 9 and 18 digits).
  integer, parameter :: i4b = selected_int_kind(r=9)
  integer, parameter :: i8b = selected_int_kind(r=18)
end module types
!> Run-time settings of the benchmark, shared as module variables.
module parameters
  use types
  implicit none
  save

  !> Size of each test file in MiB.
  integer(i4b) :: filesize
  !> Number of threads used for parallel execution.
  integer(i4b) :: nthreads
  !> Size of the allocated data field (number of array elements).
  integer(i4b) :: alloc_size
end module parameters
! Benchmark of unformatted sequential-access file I/O with one file per
! OpenMP thread.
!
! Command-line arguments: (1) directory to test, (2) file size in MiB,
! (3) number of threads.
!
! The program loops until it is aborted. In every iteration each thread
! fills a private array with random numbers, writes it to
! <directory>/<thread number>.temp, the master thread asks the kernel to drop
! its caches, and each thread reads its file back. The master thread times
! both phases with system_clock and prints the rates in MiB/s.
PROGRAM iometer
use types
use parameters
use omp_lib
implicit none
CHARACTER(LEN=100) :: directory_char, filesize_char, nthreads_char
CHARACTER(LEN=40) :: dummy_char1
CHARACTER(LEN=110) :: filename
CHARACTER(LEN=10) :: filenumber
! NOTE(review): n is declared but not referenced anywhere in this program.
INTEGER(I4B) :: thread, tunit, n
INTEGER(I8B) :: counti, countf, count_rate
REAL(DP) :: telapsed_read, telapsed_write, mib_written, write_speed, mib_read, read_speed
REAL(SP), DIMENSION(:), ALLOCATABLE :: values
! Fetch the clock rate (counts per second); counti is overwritten before use.
call system_clock(counti,count_rate)
! Read and echo the three command-line arguments.
call getarg(1,directory_char)
dummy_char1 = ' directory to test:'
write(*,'(A40,A)') dummy_char1, trim(adjustl(directory_char))
call getarg(2,filesize_char)
dummy_char1 = ' file size (MiB):'
read(filesize_char,*) filesize
write(*,'(A40,I12)') dummy_char1, filesize
call getarg(3,nthreads_char)
dummy_char1 = ' number of parallel threads:'
read(nthreads_char,*) nthreads
write(*,'(A40,I12)') dummy_char1, nthreads
! Number of array elements per thread: 262144 elements per requested MiB.
! NOTE(review): alloc_size is INTEGER(I4B), so large file sizes can overflow
! this product - confirm the intended maximum.
alloc_size = filesize * 262144
dummy_char1 = ' allocation size:'
write(*,'(A40,I12)') dummy_char1, alloc_size
! NOTE(review): this divides an element count (not a byte count) by 1048576.
! If REAL(SP) occupies 4 bytes, the reported rates are 4x too low - confirm.
mib_written = real(alloc_size,kind=dp) * real(nthreads,kind=dp) / 1048576.0_dp
mib_read = mib_written
CALL OMP_SET_NUM_THREADS(nthreads)
! Endless benchmark loop; a new parallel region is opened in every iteration.
do while(.true.)
!$OMP PARALLEL default(shared) private(thread, filename, filenumber, values, tunit)
! Per-thread file name: <directory>/<zero-padded thread number>.temp
thread = omp_get_thread_num()
write(filenumber,'(I0.10)') thread
filename = trim(adjustl(directory_char)) // '/' // trim(adjustl(filenumber)) // '.temp'
! Each thread allocates and fills its own private data array; this happens
! before the start time is taken, so it is not part of the measurement.
allocate(values(alloc_size))
call random_seed()
call RANDOM_NUMBER(values)
! One distinct unit number per thread (100, 101, ...).
tunit = thread + 100
! Write phase: the master takes the start time between two barriers, so all
! threads begin their I/O only after the clock has been read.
!$OMP BARRIER
!$OMP MASTER
call system_clock(counti)
!$OMP END MASTER
!$OMP BARRIER
! The timed write phase covers open, write and close.
open(unit=tunit, file=trim(adjustl(filename)), status='replace', action='write', form='unformatted')
write(tunit) values
close(unit=tunit)
! Wait until every thread has closed its file before stopping the clock.
!$OMP BARRIER
!$OMP MASTER
call system_clock(countf)
telapsed_write = real(countf-counti,kind=dp)/real(count_rate,kind=dp)
write_speed = mib_written/telapsed_write
!write(*,*) 'write speed (MiB/s): ', write_speed
! Ask the kernel to drop its caches through a shell command (run outside the
! timed intervals), then take the start time of the read phase.
call execute_command_line ('echo 3 > /proc/sys/vm/drop_caches', wait=.true.)
call system_clock(counti)
!$OMP END MASTER
!$OMP BARRIER
! The timed read phase covers open, read and close.
open(unit=tunit, file=trim(adjustl(filename)), status='old', action='read', form='unformatted')
read(tunit) values
close(unit=tunit)
!$OMP BARRIER
!$OMP MASTER
call system_clock(countf)
telapsed_read = real(countf-counti,kind=dp)/real(count_rate,kind=dp)
read_speed = mib_read/telapsed_read
! Report both aggregate rates for this iteration.
write(*,'(A29,2F10.3)') ' write / read speed (MiB/s): ', write_speed, read_speed
!$OMP END MASTER
!$OMP BARRIER
deallocate(values)
!$OMP END PARALLEL
! Pause for one second between iterations.
call sleep(1)
end do
END PROGRAM iometer
您代码中的错误在于:计算 mib_written
时忘记考虑 real(sp)
变量的大小(4 个字节)。因此,您得到的结果偏低了 4 倍。例如,可以改为这样计算:
mib_written = filesize * nthreads
还有一些小问题,其中一些是 GFortran 特有的:
- 不要重复调用
random_seed
,尤其不要从每个线程调用。如果要调用的话,在程序的开头调用一次。
- 您可以使用
open(newunit=tunit, ...)
让编译器运行时为每个文件分配一个唯一的单元号。
- 如果你想要“标准”的 64 位整数/浮点类型,可以使用
iso_fortran_env
内在模块中的常量 int64
和 real64
。
- 为了测试更大的文件,您需要将
alloc_size
声明为 int64 类型
。
- 使用标准的内在过程
get_command_argument
,而不是非标准的 getarg
。
- access='stream'
比默认的顺序访问(sequential)稍快,因为不需要处理记录长度标记。
应用了这些修复(并将 parameters
模块合并到主程序中)之后,你的测试程序如下:
!> Sequential unformatted-I/O throughput benchmark using stream access and
!> one file per OpenMP thread.
!>
!> Usage: iometer <directory> <file size in MiB> <number of threads>
!>
!> Every thread fills a private array of default reals with random numbers
!> once and then loops until the program is aborted: write the array to
!> <directory>/<thread number>.temp, flush and drop the OS caches, read the
!> array back. The master thread times both phases with system_clock and
!> prints the aggregate write and read rates in MiB/s.
program iometer
  use, intrinsic :: iso_fortran_env, only: int64, real64, error_unit
  use omp_lib
  implicit none

  integer(int64), parameter :: bytes_per_mib = 1048576_int64

  character(len=100) :: directory_char, filesize_char, nthreads_char
  character(len=40)  :: dummy_char1
  ! Long enough for directory (100) + '/' + thread number (10) + '.temp' (5).
  character(len=116) :: filename
  character(len=10)  :: filenumber
  integer :: thread, tunit
  integer :: ios                  ! iostat of the command-line conversions
  integer :: drop_exit, drop_cmd  ! status codes of the cache-drop command
  logical :: drop_warned          ! .true. once the cache-drop warning was printed
  integer(int64) :: counti, countf, count_rate
  real(real64) :: telapsed_read, telapsed_write, mib_written, write_speed, mib_read, read_speed
  real, dimension(:), allocatable :: values
  integer :: filesize             ! file size in MiB
  integer :: nthreads             ! number of threads for parallel execution
  integer(int64) :: alloc_size    ! size of the allocated data field (elements)

  ! Only count_rate is needed here; counti is overwritten before it is used.
  call system_clock(counti, count_rate)

  if (command_argument_count() < 3) then
    write(error_unit, '(A)') 'usage: iometer <directory> <file size in MiB> <number of threads>'
    error stop 1
  end if

  call get_command_argument(1, directory_char)
  dummy_char1 = ' directory to test:'
  write(*,'(A40,A)') dummy_char1, trim(adjustl(directory_char))

  call get_command_argument(2, filesize_char)
  read(filesize_char, *, iostat=ios) filesize
  if (ios /= 0) filesize = 0
  if (filesize < 1) then
    write(error_unit, '(A)') 'error: file size must be a positive integer (MiB), got: ' // &
                             trim(filesize_char)
    error stop 1
  end if
  dummy_char1 = ' file size (MiB):'
  write(*,'(A40,I12)') dummy_char1, filesize

  call get_command_argument(3, nthreads_char)
  read(nthreads_char, *, iostat=ios) nthreads
  if (ios /= 0) nthreads = 0
  if (nthreads < 1) then
    write(error_unit, '(A)') 'error: number of threads must be a positive integer, got: ' // &
                             trim(nthreads_char)
    error stop 1
  end if
  dummy_char1 = ' number of parallel threads:'
  write(*,'(A40,I12)') dummy_char1, nthreads

  ! Elements per thread: derive the element size from the default real kind
  ! instead of hard-coding 262144 (= 1 MiB of 4-byte values).
  alloc_size = int(filesize, int64) * (bytes_per_mib * 8_int64 / storage_size(1.0))
  dummy_char1 = ' allocation size:'
  write(*,'(A40,I12)') dummy_char1, alloc_size

  ! Every thread transfers exactly filesize MiB per phase.
  mib_written = real(filesize, kind=real64) * real(nthreads, kind=real64)
  dummy_char1 = ' MiB written:'
  write(*, '(A40,g0)') dummy_char1, mib_written
  mib_read = mib_written

  drop_warned = .false.
  call omp_set_num_threads(nthreads)

  !$omp parallel default(shared) private(thread, filename, filenumber, values, tunit)
  ! Per-thread setup; none of this changes between iterations.
  thread = omp_get_thread_num()
  write(filenumber,'(I0.10)') thread
  filename = trim(adjustl(directory_char)) // '/' // trim(adjustl(filenumber)) // '.temp'
  allocate(values(alloc_size))
  call random_number(values)

  do while (.true.)
    ! ---- write phase: open is outside the timed interval ----
    open(newunit=tunit, file=trim(filename), status='replace', action='write', &
         form='unformatted', access='stream')
    ! The master reads the clock between two barriers so that no thread
    ! starts writing before the start time has been taken.
    !$omp barrier
    !$omp master
    call system_clock(counti)
    !$omp end master
    !$omp barrier
    write(tunit) values
    close(unit=tunit)
    ! NOTE(review): the timed interval ends at close(); whether the data has
    ! reached the device by then presumably depends on OS write-back - confirm.
    !$omp barrier
    !$omp master
    call system_clock(countf)
    telapsed_write = real(countf - counti, kind=real64)/real(count_rate, kind=real64)
    write_speed = mib_written/telapsed_write
    ! Dirty pages cannot be dropped, so flush them with sync first; otherwise
    ! the read phase may be served from memory instead of the device.
    drop_exit = 0
    drop_cmd = 0
    call execute_command_line('sync; echo 3 > /proc/sys/vm/drop_caches', wait=.true., &
                              exitstat=drop_exit, cmdstat=drop_cmd)
    if ((drop_exit /= 0 .or. drop_cmd /= 0) .and. .not. drop_warned) then
      write(error_unit, '(A)') ' warning: could not drop the OS caches (root required); ' // &
                               'read speeds may reflect cached data'
      drop_warned = .true.
    end if
    !$omp end master
    ! Keep every thread away from its file until the cache drop has finished.
    !$omp barrier

    ! ---- read phase: open is outside the timed interval ----
    open(newunit=tunit, file=trim(filename), status='old', action='read', &
         form='unformatted', access='stream')
    !$omp barrier
    !$omp master
    call system_clock(counti)
    !$omp end master
    !$omp barrier
    read(tunit) values
    close(unit=tunit)
    !$omp barrier
    !$omp master
    call system_clock(countf)
    telapsed_read = real(countf - counti, kind=real64)/real(count_rate, kind=real64)
    read_speed = mib_read/telapsed_read
    write(*,'(A29,2F10.3)') ' write / read speed (MiB/s): ', write_speed, read_speed
    !$omp end master

    ! Pause for one second before the next iteration.
    call sleep(1)
  end do
  !$omp end parallel
end program iometer
请帮助我了解如何使用 (G)Fortran 提高顺序的、未格式化的 I/O 吞吐量,尤其是在使用 NVMe SSD 时。
我写了一个小测试程序,见底部post。它的作用是并行打开一个或多个文件 (OpenMP) 并将一组随机数写入其中。然后它刷新系统缓存(需要 root,否则读取测试很可能会从内存中读取)打开文件并从中读取。时间以挂钟时间衡量(试图仅包括 I/O 相关时间),性能数字以 MiB/s 给出。程序循环直到中止。
我用于测试的硬件是三星 970 Evo Plus 1TB SSD,通过 2 个 PCIe 3.0 通道连接。所以理论上,它应该能够达到约 1500 MiB/s 的顺序读写速度。 预先使用 "dd if=/dev/zero of=./testfile bs=1G count=1 oflag=direct" 进行测试的结果约为 750MB/s。不太好,但仍然比我用 Gfortran 得到的要好。并且取决于您问的是谁,无论如何不应将 dd 用于基准测试。这只是为了确保硬件在理论上能够提供更多功能。
我的代码的结果往往随着文件大小的增加而变得更好,但即使是 1GiB,它的上限也大约为 200MiB/s 写入,420MiB/s 读取。使用更多线程(例如 4 个)可以稍微提高写入速度,但只能达到 270MiB/s 左右。 我确保保持基准运行时间短,并让 SSD 有时间在测试之间放松。
我的印象是,即使只有一个线程,也应该可以使 2 个 PCIe 3.0 通道的带宽饱和。至少在使用未格式化的 I/O 时是这样。
代码似乎没有 CPU 限制,如果我将 "values" 字段的分配和初始化移出循环,top 显示单个内核的使用率不到 50%。考虑到我希望看到至少高出 5 倍的数字,这对整体性能来说仍然不是好兆头。
我还尝试将 access=stream 用于 open 语句,但无济于事。
那么问题是什么?
是我的代码有错误或未经优化吗?还是我的期望值太高了?
使用的平台:
Opensuse Leap 15.1,内核 4.12.14-lp151.28.36-default
2x AMD Epyc 7551、Supermicro H11DSI、三星 970 Evo Plus 1TB (2xPCIe 3.0)
gcc 版本 8.2.1,编译器选项:-ffree-line-length-none -O3 -ffast-math -funroll-loops -flto
!> Kind parameters shared by the I/O benchmark.
module types
  implicit none
  save

  ! Real kinds: the processor's default real and double precision kinds.
  integer, parameter :: sp = kind(1.0)
  integer, parameter :: dp = kind(1.0d0)

  ! Integer kinds selected by decimal range (at least 9 and 18 digits).
  integer, parameter :: i4b = selected_int_kind(r=9)
  integer, parameter :: i8b = selected_int_kind(r=18)
end module types
!> Run-time settings of the benchmark, shared as module variables.
module parameters
  use types
  implicit none
  save

  !> Size of each test file in MiB.
  integer(i4b) :: filesize
  !> Number of threads used for parallel execution.
  integer(i4b) :: nthreads
  !> Size of the allocated data field (number of array elements).
  integer(i4b) :: alloc_size
end module parameters
! Benchmark of unformatted sequential-access file I/O with one file per
! OpenMP thread.
!
! Command-line arguments: (1) directory to test, (2) file size in MiB,
! (3) number of threads.
!
! The program loops until it is aborted. In every iteration each thread
! fills a private array with random numbers, writes it to
! <directory>/<thread number>.temp, the master thread asks the kernel to drop
! its caches, and each thread reads its file back. The master thread times
! both phases with system_clock and prints the rates in MiB/s.
PROGRAM iometer
use types
use parameters
use omp_lib
implicit none
CHARACTER(LEN=100) :: directory_char, filesize_char, nthreads_char
CHARACTER(LEN=40) :: dummy_char1
CHARACTER(LEN=110) :: filename
CHARACTER(LEN=10) :: filenumber
! NOTE(review): n is declared but not referenced anywhere in this program.
INTEGER(I4B) :: thread, tunit, n
INTEGER(I8B) :: counti, countf, count_rate
REAL(DP) :: telapsed_read, telapsed_write, mib_written, write_speed, mib_read, read_speed
REAL(SP), DIMENSION(:), ALLOCATABLE :: values
! Fetch the clock rate (counts per second); counti is overwritten before use.
call system_clock(counti,count_rate)
! Read and echo the three command-line arguments.
call getarg(1,directory_char)
dummy_char1 = ' directory to test:'
write(*,'(A40,A)') dummy_char1, trim(adjustl(directory_char))
call getarg(2,filesize_char)
dummy_char1 = ' file size (MiB):'
read(filesize_char,*) filesize
write(*,'(A40,I12)') dummy_char1, filesize
call getarg(3,nthreads_char)
dummy_char1 = ' number of parallel threads:'
read(nthreads_char,*) nthreads
write(*,'(A40,I12)') dummy_char1, nthreads
! Number of array elements per thread: 262144 elements per requested MiB.
! NOTE(review): alloc_size is INTEGER(I4B), so large file sizes can overflow
! this product - confirm the intended maximum.
alloc_size = filesize * 262144
dummy_char1 = ' allocation size:'
write(*,'(A40,I12)') dummy_char1, alloc_size
! NOTE(review): this divides an element count (not a byte count) by 1048576.
! If REAL(SP) occupies 4 bytes, the reported rates are 4x too low - confirm.
mib_written = real(alloc_size,kind=dp) * real(nthreads,kind=dp) / 1048576.0_dp
mib_read = mib_written
CALL OMP_SET_NUM_THREADS(nthreads)
! Endless benchmark loop; a new parallel region is opened in every iteration.
do while(.true.)
!$OMP PARALLEL default(shared) private(thread, filename, filenumber, values, tunit)
! Per-thread file name: <directory>/<zero-padded thread number>.temp
thread = omp_get_thread_num()
write(filenumber,'(I0.10)') thread
filename = trim(adjustl(directory_char)) // '/' // trim(adjustl(filenumber)) // '.temp'
! Each thread allocates and fills its own private data array; this happens
! before the start time is taken, so it is not part of the measurement.
allocate(values(alloc_size))
call random_seed()
call RANDOM_NUMBER(values)
! One distinct unit number per thread (100, 101, ...).
tunit = thread + 100
! Write phase: the master takes the start time between two barriers, so all
! threads begin their I/O only after the clock has been read.
!$OMP BARRIER
!$OMP MASTER
call system_clock(counti)
!$OMP END MASTER
!$OMP BARRIER
! The timed write phase covers open, write and close.
open(unit=tunit, file=trim(adjustl(filename)), status='replace', action='write', form='unformatted')
write(tunit) values
close(unit=tunit)
! Wait until every thread has closed its file before stopping the clock.
!$OMP BARRIER
!$OMP MASTER
call system_clock(countf)
telapsed_write = real(countf-counti,kind=dp)/real(count_rate,kind=dp)
write_speed = mib_written/telapsed_write
!write(*,*) 'write speed (MiB/s): ', write_speed
! Ask the kernel to drop its caches through a shell command (run outside the
! timed intervals), then take the start time of the read phase.
call execute_command_line ('echo 3 > /proc/sys/vm/drop_caches', wait=.true.)
call system_clock(counti)
!$OMP END MASTER
!$OMP BARRIER
! The timed read phase covers open, read and close.
open(unit=tunit, file=trim(adjustl(filename)), status='old', action='read', form='unformatted')
read(tunit) values
close(unit=tunit)
!$OMP BARRIER
!$OMP MASTER
call system_clock(countf)
telapsed_read = real(countf-counti,kind=dp)/real(count_rate,kind=dp)
read_speed = mib_read/telapsed_read
! Report both aggregate rates for this iteration.
write(*,'(A29,2F10.3)') ' write / read speed (MiB/s): ', write_speed, read_speed
!$OMP END MASTER
!$OMP BARRIER
deallocate(values)
!$OMP END PARALLEL
! Pause for one second between iterations.
call sleep(1)
end do
END PROGRAM iometer
您代码中的错误在于:计算 mib_written
时忘记考虑 real(sp)
变量的大小(4 个字节)。因此,您得到的结果偏低了 4 倍。例如,可以改为这样计算:
mib_written = filesize * nthreads
还有一些小问题,其中一些是 GFortran 特有的:
- 不要重复调用
random_seed
,尤其不要从每个线程调用。如果要调用的话,在程序的开头调用一次。 - 您可以使用
open(newunit=tunit, ...)
让编译器运行时为每个文件分配一个唯一的单元号。
- 如果你想要“标准”的 64 位整数/浮点类型,可以使用
iso_fortran_env
内在模块中的常量 int64
和 real64。
- 为了测试更大的文件,您需要将
alloc_size
声明为 int64 类型。
- 使用标准的内在过程
get_command_argument
,而不是非标准的 getarg。
- access='stream'
比默认值(顺序)稍快,因为不需要处理记录长度标记。
应用了这些修复(并将 parameters
模块合并到主程序中)之后,你的测试程序如下:
!> Sequential unformatted-I/O throughput benchmark using stream access and
!> one file per OpenMP thread.
!>
!> Usage: iometer <directory> <file size in MiB> <number of threads>
!>
!> Every thread fills a private array of default reals with random numbers
!> once and then loops until the program is aborted: write the array to
!> <directory>/<thread number>.temp, flush and drop the OS caches, read the
!> array back. The master thread times both phases with system_clock and
!> prints the aggregate write and read rates in MiB/s.
program iometer
  use, intrinsic :: iso_fortran_env, only: int64, real64, error_unit
  use omp_lib
  implicit none

  integer(int64), parameter :: bytes_per_mib = 1048576_int64

  character(len=100) :: directory_char, filesize_char, nthreads_char
  character(len=40)  :: dummy_char1
  ! Long enough for directory (100) + '/' + thread number (10) + '.temp' (5).
  character(len=116) :: filename
  character(len=10)  :: filenumber
  integer :: thread, tunit
  integer :: ios                  ! iostat of the command-line conversions
  integer :: drop_exit, drop_cmd  ! status codes of the cache-drop command
  logical :: drop_warned          ! .true. once the cache-drop warning was printed
  integer(int64) :: counti, countf, count_rate
  real(real64) :: telapsed_read, telapsed_write, mib_written, write_speed, mib_read, read_speed
  real, dimension(:), allocatable :: values
  integer :: filesize             ! file size in MiB
  integer :: nthreads             ! number of threads for parallel execution
  integer(int64) :: alloc_size    ! size of the allocated data field (elements)

  ! Only count_rate is needed here; counti is overwritten before it is used.
  call system_clock(counti, count_rate)

  if (command_argument_count() < 3) then
    write(error_unit, '(A)') 'usage: iometer <directory> <file size in MiB> <number of threads>'
    error stop 1
  end if

  call get_command_argument(1, directory_char)
  dummy_char1 = ' directory to test:'
  write(*,'(A40,A)') dummy_char1, trim(adjustl(directory_char))

  call get_command_argument(2, filesize_char)
  read(filesize_char, *, iostat=ios) filesize
  if (ios /= 0) filesize = 0
  if (filesize < 1) then
    write(error_unit, '(A)') 'error: file size must be a positive integer (MiB), got: ' // &
                             trim(filesize_char)
    error stop 1
  end if
  dummy_char1 = ' file size (MiB):'
  write(*,'(A40,I12)') dummy_char1, filesize

  call get_command_argument(3, nthreads_char)
  read(nthreads_char, *, iostat=ios) nthreads
  if (ios /= 0) nthreads = 0
  if (nthreads < 1) then
    write(error_unit, '(A)') 'error: number of threads must be a positive integer, got: ' // &
                             trim(nthreads_char)
    error stop 1
  end if
  dummy_char1 = ' number of parallel threads:'
  write(*,'(A40,I12)') dummy_char1, nthreads

  ! Elements per thread: derive the element size from the default real kind
  ! instead of hard-coding 262144 (= 1 MiB of 4-byte values).
  alloc_size = int(filesize, int64) * (bytes_per_mib * 8_int64 / storage_size(1.0))
  dummy_char1 = ' allocation size:'
  write(*,'(A40,I12)') dummy_char1, alloc_size

  ! Every thread transfers exactly filesize MiB per phase.
  mib_written = real(filesize, kind=real64) * real(nthreads, kind=real64)
  dummy_char1 = ' MiB written:'
  write(*, '(A40,g0)') dummy_char1, mib_written
  mib_read = mib_written

  drop_warned = .false.
  call omp_set_num_threads(nthreads)

  !$omp parallel default(shared) private(thread, filename, filenumber, values, tunit)
  ! Per-thread setup; none of this changes between iterations.
  thread = omp_get_thread_num()
  write(filenumber,'(I0.10)') thread
  filename = trim(adjustl(directory_char)) // '/' // trim(adjustl(filenumber)) // '.temp'
  allocate(values(alloc_size))
  call random_number(values)

  do while (.true.)
    ! ---- write phase: open is outside the timed interval ----
    open(newunit=tunit, file=trim(filename), status='replace', action='write', &
         form='unformatted', access='stream')
    ! The master reads the clock between two barriers so that no thread
    ! starts writing before the start time has been taken.
    !$omp barrier
    !$omp master
    call system_clock(counti)
    !$omp end master
    !$omp barrier
    write(tunit) values
    close(unit=tunit)
    ! NOTE(review): the timed interval ends at close(); whether the data has
    ! reached the device by then presumably depends on OS write-back - confirm.
    !$omp barrier
    !$omp master
    call system_clock(countf)
    telapsed_write = real(countf - counti, kind=real64)/real(count_rate, kind=real64)
    write_speed = mib_written/telapsed_write
    ! Dirty pages cannot be dropped, so flush them with sync first; otherwise
    ! the read phase may be served from memory instead of the device.
    drop_exit = 0
    drop_cmd = 0
    call execute_command_line('sync; echo 3 > /proc/sys/vm/drop_caches', wait=.true., &
                              exitstat=drop_exit, cmdstat=drop_cmd)
    if ((drop_exit /= 0 .or. drop_cmd /= 0) .and. .not. drop_warned) then
      write(error_unit, '(A)') ' warning: could not drop the OS caches (root required); ' // &
                               'read speeds may reflect cached data'
      drop_warned = .true.
    end if
    !$omp end master
    ! Keep every thread away from its file until the cache drop has finished.
    !$omp barrier

    ! ---- read phase: open is outside the timed interval ----
    open(newunit=tunit, file=trim(filename), status='old', action='read', &
         form='unformatted', access='stream')
    !$omp barrier
    !$omp master
    call system_clock(counti)
    !$omp end master
    !$omp barrier
    read(tunit) values
    close(unit=tunit)
    !$omp barrier
    !$omp master
    call system_clock(countf)
    telapsed_read = real(countf - counti, kind=real64)/real(count_rate, kind=real64)
    read_speed = mib_read/telapsed_read
    write(*,'(A29,2F10.3)') ' write / read speed (MiB/s): ', write_speed, read_speed
    !$omp end master

    ! Pause for one second before the next iteration.
    call sleep(1)
  end do
  !$omp end parallel
end program iometer