Fortran 中使用 PlanMany 的 cuFFT 双精度误差
Double precision error of cuFFT with PlanMany in Fortran
在 (answer of JackOLantern) 之后,我正在尝试使用 cufftPlanMany 计算批量 1D FFT。
下面的代码执行 nwfs=23
次 n=256
复杂数组的前向 1D FFT 和后向 1D FFT。这是为了训练我处理常规 cufftPlanMany。作为第二步,nwfs
数组将不同。最后,我检查每个数组的错误。
因为数据分配为:cinput_d(n,nwfs)
我这样使用函数:cufftPlanMany(planmany, 1, fftsize, inembed, nwfs,1, onembed, nwfs,1, CUFFT_C2C, nwfs)
其中:
rank = 1
fftsize = {n}
每个 FFT 的暗淡度相同
inembed = onembed = {0}
忽略
istride = ostride = nwfs
两个连续输入和输出之间的距离
idist = odist = 1
两个信号之间的距离
batch = nwfs
要完成的 fft 数
program fft
use cudafor
use precision_m
use cufft_m
implicit none
integer, allocatable:: kx(:)
complex(fp_kind), allocatable:: matrix(:)
complex(fp_kind), allocatable, pinned :: cinput(:,:),coutput(:,:)
complex(fp_kind), allocatable, device :: cinput_d(:,:),coutput_d(:,:)
integer:: i,j,k,n,nwfs
integer, allocatable :: fftsize(:),inembed(:),onembed(:)
type(c_ptr):: plan,planmany
real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h
integer::clock_start,clock_end,clock_rate,istat
real :: elapsed_time
character*1:: a
real(fp_kind):: w,x,y,z
integer:: nerrors
n=256
nwfs=23
h=twopi/real(n,fp_kind)
! allocate arrays on the host
allocate (cinput(n,nwfs),coutput(n,nwfs))
allocate (kx(n),matrix(n))
allocate (fftsize(nwfs),inembed(nwfs),onembed(nwfs))
! allocate arrays on the device
allocate (cinput_d(n,nwfs),coutput_d(n,nwfs))
fftsize(:) = n
inembed(:) = 0
onembed(:) = 0
!initialize arrays on host
kx =(/ ((i-0.5)*0.1953125, i=1,n/2), ((-n+i-0.5)*0.1953125, i=n/2+1,n) /)
matrix = (/ ... /)
!write(*,*) cinput
!copy arrays to device
do i =1,nwfs
cinput(:,i)=matrix(:)
end do
cinput_d=cinput
! Initialize the plan for complex to complex transform
if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1)
if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1)
if (fp_kind== doublePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, &
nwfs,1, &
onembed, &
nwfs,1, &
CUFFT_Z2Z, nwfs)
if (fp_kind== singlePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, &
nwfs,1, &
onembed, &
nwfs,1, &
CUFFT_C2C, nwfs)
!c_null_ptr fftsize,inembed,onembed
! cufftPlanMany(plan, rank, n, inembed, istride, idist, &
! onembed, ostride, odist, &
! type, batch)
!subroutine cufftPlan1d(plan, nx, type, batch)
call SYSTEM_CLOCK(COUNT_RATE=clock_rate)
istat=cudaThreadSynchronize()
call SYSTEM_CLOCK(count=clock_start)
! Forward transform out of place
call cufftExec(planmany,cinput_d,coutput_d,CUFFT_FORWARD)
!$cuf kernel do <<<*,*>>>
do i=1,n
do j =1,n
coutput_d(i,j) = coutput_d(i,j)/real(n,fp_kind)!sqrt(twopi*real(n,fp_kind))*sqrt(2.*pi)/sqrt(real(maxn))
end do
end do
call cufftExec(planmany,coutput_d,coutput_d,CUFFT_INVERSE)
istat=cudaThreadSynchronize()
call SYSTEM_CLOCK(count=clock_end)
! Copy results back to host
coutput=coutput_d
do i=1,n
! write(*,'(i2,1x,2(f8.4),1x,2(f8.4),2x,e13.7)') i,cinput(i),coutput(i),abs(coutput(i)-cinput(i))
end do
nerrors=0
do i=1,n
!write(*,'(i2,5(1x,2(f8.4),1x,2(f8.4),2x,3(e13.7,2x)))') i,cinput(i,1),coutput(i,1),abs(coutput(i,1)-cinput(i,1)),abs(coutput(i,6)-cinput(i,6)),abs(coutput(i,nwfs)-cinput(i,nwfs))
do j=1,nwfs
if (abs(coutput(i,j)-cinput(i,j))>1.d-5) then
write(*,'(i3,i3,1x,e13.7,2x,4(f8.4))') i,j,abs(coutput(i,j)-cinput(i,j)),cinput(i,j),coutput(i,j)
nerrors = nerrors + 1
end if
end do
end do
elapsed_time = REAL(clock_end-clock_start)/REAL(clock_rate)
write(*,*) 'elapsed_time :',elapsed_time,clock_start,clock_end,clock_rate
if (nerrors .eq. 0) then
print *, "Test Passed"
else
print *, "Test Failed"
endif
!release memory on the host and on the device
deallocate (cinput,coutput,kx,cinput_d,coutput_d)
! Destroy the plans
call cufftDestroy(plan)
end program fft
有人能告诉我为什么以下 "many-FFT" 有时在双精度上失败但在单精度上从不失败吗?
单精度:"Test Passed"总是!
双精度:"Test Failed" 有时!
确实,我检查了设备到主机的数据传输。好像不是。
感谢您的帮助。
多亏了利爪。这是 WDDM 超时检测和恢复限制。
见link,到change the TDR
在 (answer of JackOLantern) 之后,我正在尝试使用 cufftPlanMany 计算批量 1D FFT。
下面的代码执行 nwfs=23
次 n=256
复杂数组的前向 1D FFT 和后向 1D FFT。这是为了训练我处理常规 cufftPlanMany。作为第二步,nwfs
数组将不同。最后,我检查每个数组的错误。
因为数据分配为:cinput_d(n,nwfs)
我这样使用函数:cufftPlanMany(planmany, 1, fftsize, inembed, nwfs,1, onembed, nwfs,1, CUFFT_C2C, nwfs)
其中:
rank = 1
fftsize = {n}
每个 FFT 的暗淡度相同inembed = onembed = {0}
忽略istride = ostride = nwfs
两个连续输入和输出之间的距离idist = odist = 1
两个信号之间的距离batch = nwfs
要完成的 fft 数
program fft use cudafor use precision_m use cufft_m implicit none integer, allocatable:: kx(:) complex(fp_kind), allocatable:: matrix(:) complex(fp_kind), allocatable, pinned :: cinput(:,:),coutput(:,:) complex(fp_kind), allocatable, device :: cinput_d(:,:),coutput_d(:,:) integer:: i,j,k,n,nwfs integer, allocatable :: fftsize(:),inembed(:),onembed(:) type(c_ptr):: plan,planmany real(fp_kind):: twopi=8._fp_kind*atan(1._fp_kind),h integer::clock_start,clock_end,clock_rate,istat real :: elapsed_time character*1:: a real(fp_kind):: w,x,y,z integer:: nerrors n=256 nwfs=23 h=twopi/real(n,fp_kind) ! allocate arrays on the host allocate (cinput(n,nwfs),coutput(n,nwfs)) allocate (kx(n),matrix(n)) allocate (fftsize(nwfs),inembed(nwfs),onembed(nwfs)) ! allocate arrays on the device allocate (cinput_d(n,nwfs),coutput_d(n,nwfs)) fftsize(:) = n inembed(:) = 0 onembed(:) = 0 !initialize arrays on host kx =(/ ((i-0.5)*0.1953125, i=1,n/2), ((-n+i-0.5)*0.1953125, i=n/2+1,n) /) matrix = (/ ... /) !write(*,*) cinput !copy arrays to device do i =1,nwfs cinput(:,i)=matrix(:) end do cinput_d=cinput ! Initialize the plan for complex to complex transform if (fp_kind== singlePrecision) call cufftPlan1D(plan,n,CUFFT_C2C,1) if (fp_kind== doublePrecision) call cufftPlan1D(plan,n,CUFFT_Z2Z,1) if (fp_kind== doublePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, & nwfs,1, & onembed, & nwfs,1, & CUFFT_Z2Z, nwfs) if (fp_kind== singlePrecision) call cufftPlanMany(planmany, 1, fftsize, inembed, & nwfs,1, & onembed, & nwfs,1, & CUFFT_C2C, nwfs) !c_null_ptr fftsize,inembed,onembed ! cufftPlanMany(plan, rank, n, inembed, istride, idist, & ! onembed, ostride, odist, & ! type, batch) !subroutine cufftPlan1d(plan, nx, type, batch) call SYSTEM_CLOCK(COUNT_RATE=clock_rate) istat=cudaThreadSynchronize() call SYSTEM_CLOCK(count=clock_start) ! Forward transform out of place call cufftExec(planmany,cinput_d,coutput_d,CUFFT_FORWARD) !$cuf kernel do <<<*,*>>> do i=1,n do j =1,n coutput_d(i,j) = coutput_d(i,j)/real(n,fp_kind)!sqrt(twopi*real(n,fp_kind))*sqrt(2.*pi)/sqrt(real(maxn)) end do end do call cufftExec(planmany,coutput_d,coutput_d,CUFFT_INVERSE) istat=cudaThreadSynchronize() call SYSTEM_CLOCK(count=clock_end) ! Copy results back to host coutput=coutput_d do i=1,n ! write(*,'(i2,1x,2(f8.4),1x,2(f8.4),2x,e13.7)') i,cinput(i),coutput(i),abs(coutput(i)-cinput(i)) end do nerrors=0 do i=1,n !write(*,'(i2,5(1x,2(f8.4),1x,2(f8.4),2x,3(e13.7,2x)))') i,cinput(i,1),coutput(i,1),abs(coutput(i,1)-cinput(i,1)),abs(coutput(i,6)-cinput(i,6)),abs(coutput(i,nwfs)-cinput(i,nwfs)) do j=1,nwfs if (abs(coutput(i,j)-cinput(i,j))>1.d-5) then write(*,'(i3,i3,1x,e13.7,2x,4(f8.4))') i,j,abs(coutput(i,j)-cinput(i,j)),cinput(i,j),coutput(i,j) nerrors = nerrors + 1 end if end do end do elapsed_time = REAL(clock_end-clock_start)/REAL(clock_rate) write(*,*) 'elapsed_time :',elapsed_time,clock_start,clock_end,clock_rate if (nerrors .eq. 0) then print *, "Test Passed" else print *, "Test Failed" endif !release memory on the host and on the device deallocate (cinput,coutput,kx,cinput_d,coutput_d) ! Destroy the plans call cufftDestroy(plan) end program fft
有人能告诉我为什么以下 "many-FFT" 有时在双精度上失败但在单精度上从不失败吗?
单精度:"Test Passed"总是! 双精度:"Test Failed" 有时!
确实,我检查了设备到主机的数据传输。好像不是。
感谢您的帮助。
多亏了利爪。这是 WDDM 超时检测和恢复限制。
见link,到change the TDR