在 3-D numpy 数组的小切片上有效地使用 1-D pyfftw
Efficiently using 1-D pyfftw on small slices of a 3-D numpy array
我有一个大小为 10,000x512x512 的 3D 数据立方体。我想沿着 dim[0] 重复解析 window 个向量(比如 6),并有效地生成傅立叶变换。我认为我正在将数组复制到 pyfftw 包中,这给了我巨大的开销。我现在正在查看文档,因为我认为我需要设置一个选项,但我可以在语法上使用一些额外的帮助。
这段代码原来是别人用numpy.fft.rfft写的,用numba加速的。但是实现在我的工作站上不起作用,所以我重新编写了所有内容并选择使用 pyfftw。
import numpy as np
import pyfftw as ftw
from tkinter import simpledialog
from math import ceil
import multiprocessing
ftw.config.NUM_THREADS = multiprocessing.cpu_count()
ftw.interfaces.cache.enable()
def runme():
# normally I would load a file, but for Stack Overflow, I'm just going to generate a 3D data cube so I'll delete references to the binary saving/loading functions:
# load the file
dataChunk = np.random.random((1000,512,512))
numFrames = dataChunk.shape[0]
# select the window size
windowSize = int(simpledialog.askstring('Window Size',
'How many frames to demodulate a single time point?'))
numChannels = windowSize//2+1
# create fftw arrays
ftwIn = ftw.empty_aligned(windowSize, dtype='complex128')
ftwOut = ftw.empty_aligned(windowSize, dtype='complex128')
fftObject = ftw.FFTW(ftwIn,ftwOut)
# perform DFT on the data chunk
demodFrames = dataChunk.shape[0]//windowSize
channelChunks = np.zeros([numChannels,demodFrames,
dataChunk.shape[1],dataChunk.shape[2]])
channelChunks = getDFT(dataChunk,channelChunks,
ftwIn,ftwOut,fftObject,windowSize,numChannels)
return channelChunks
def getDFT(data,channelOut,ftwIn,ftwOut,fftObject,
windowSize,numChannels):
frameLen = data.shape[0]
demodFrames = frameLen//windowSize
for yy in range(data.shape[1]):
for xx in range(data.shape[2]):
index = 0
for i in range(0,frameLen-windowSize+1,windowSize):
ftwIn[:] = data[i:i+windowSize,yy,xx]
fftObject()
channelOut[:,index,yy,xx] = 2*np.abs(ftwOut[:numChannels])/windowSize
index+=1
return channelOut
if __name__ == '__main__':
runme()
我得到了一个 4 维数组;变量 channelChunks。我将每个通道保存到一个二进制文件中(上面的代码中没有包含,但保存部分工作正常)。
此过程适用于我们拥有的解调项目,然后将 4D 数据立方体 channelChunks 解析为 eval(numChannel) 3D 数据立方体(电影),根据我们的实验集,我们可以根据颜色分离电影向上。我希望我可以避免编写一个 C++ 函数,该函数通过 pyfftw 调用矩阵上的 fft。
实际上,我在给定索引 1 和 2 轴处沿 dataChunk 的 0 轴取 windowSize=6 个元素,并执行 1D FFT。我需要在 dataChunk 的整个 3D 体积中执行此操作以生成解调的电影。谢谢
FFTW advanced plans可以通过pyfftw自动构建
可以按以下方式修改代码:
可以使用实数到复数的变换代替复数到复数的变换。
使用 pyfftw,它通常会写:
ftwIn = ftw.empty_aligned(windowSize, dtype='float64')
ftwOut = ftw.empty_aligned(windowSize//2+1, dtype='complex128')
fftObject = ftw.FFTW(ftwIn,ftwOut)
向 FFTW 规划器添加一些 标志。例如,FFTW_MEASURE
将计算不同算法的时间并选择最佳算法。 FFTW_DESTROY_INPUT
表示可以修改输入数组:可以使用一些实现技巧。
fftObject = ftw.FFTW(ftwIn,ftwOut, flags=('FFTW_MEASURE','FFTW_DESTROY_INPUT',))
限制分割数。除法比乘法成本更高。
scale=1.0/windowSize
for ...
for ...
2*np.abs(ftwOut[:,:,:])*scale #instead of /windowSize
通过 pyfftw 使用 FFTW advanced plan 避免多个 for 循环。
nbwindow=numFrames//windowSize
# create fftw arrays
ftwIn = ftw.empty_aligned((nbwindow,windowSize,dataChunk.shape[2]), dtype='float64')
ftwOut = ftw.empty_aligned((nbwindow,windowSize//2+1,dataChunk.shape[2]), dtype='complex128')
fftObject = ftw.FFTW(ftwIn,ftwOut, axes=(1,), flags=('FFTW_MEASURE','FFTW_DESTROY_INPUT',))
...
for yy in range(data.shape[1]):
ftwIn[:] = np.reshape(data[0:nbwindow*windowSize,yy,:],(nbwindow,windowSize,data.shape[2]),order='C')
fftObject()
channelOut[:,:,yy,:]=np.transpose(2*np.abs(ftwOut[:,:,:])*scale, (1,0,2))
这是修改后的代码。我也将帧数减少到 100,设置随机生成器的种子以检查结果未被修改并评论 tkinter。 window的大小可以设置为2的幂,或者2、3、5、7的乘积,这样Cooley-Tuckey算法就可以得到有效应用。避免使用大质数。
import numpy as np
import pyfftw as ftw
#from tkinter import simpledialog
from math import ceil
import multiprocessing
import time
ftw.config.NUM_THREADS = multiprocessing.cpu_count()
ftw.interfaces.cache.enable()
ftw.config.PLANNER_EFFORT = 'FFTW_MEASURE'
def runme():
# normally I would load a file, but for Stack Overflow, I'm just going to generate a 3D data cube so I'll delete references to the binary saving/loading functions:
# load the file
np.random.seed(seed=42)
dataChunk = np.random.random((100,512,512))
numFrames = dataChunk.shape[0]
# select the window size
#windowSize = int(simpledialog.askstring('Window Size',
# 'How many frames to demodulate a single time point?'))
windowSize=32
numChannels = windowSize//2+1
nbwindow=numFrames//windowSize
# create fftw arrays
ftwIn = ftw.empty_aligned((nbwindow,windowSize,dataChunk.shape[2]), dtype='float64')
ftwOut = ftw.empty_aligned((nbwindow,windowSize//2+1,dataChunk.shape[2]), dtype='complex128')
#ftwIn = ftw.empty_aligned(windowSize, dtype='complex128')
#ftwOut = ftw.empty_aligned(windowSize, dtype='complex128')
fftObject = ftw.FFTW(ftwIn,ftwOut, axes=(1,), flags=('FFTW_MEASURE','FFTW_DESTROY_INPUT',))
# perform DFT on the data chunk
demodFrames = dataChunk.shape[0]//windowSize
channelChunks = np.zeros([numChannels,demodFrames,
dataChunk.shape[1],dataChunk.shape[2]])
channelChunks = getDFT(dataChunk,channelChunks,
ftwIn,ftwOut,fftObject,windowSize,numChannels)
return channelChunks
def getDFT(data,channelOut,ftwIn,ftwOut,fftObject,
windowSize,numChannels):
frameLen = data.shape[0]
demodFrames = frameLen//windowSize
printed=0
nbwindow=data.shape[0]//windowSize
scale=1.0/windowSize
for yy in range(data.shape[1]):
#for xx in range(data.shape[2]):
index = 0
ftwIn[:] = np.reshape(data[0:nbwindow*windowSize,yy,:],(nbwindow,windowSize,data.shape[2]),order='C')
fftObject()
channelOut[:,:,yy,:]=np.transpose(2*np.abs(ftwOut[:,:,:])*scale, (1,0,2))
#for i in range(nbwindow):
#channelOut[:,i,yy,xx] = 2*np.abs(ftwOut[i,:])*scale
if printed==0:
for j in range(channelOut.shape[0]):
print j,channelOut[j,0,yy,0]
printed=1
return channelOut
if __name__ == '__main__':
seconds=time.time()
runme()
print "time: ", time.time()-seconds
让我们知道它加快了您的计算速度!我在我的电脑上从 24s 到不到 2s...
我有一个大小为 10,000x512x512 的 3D 数据立方体。我想沿着 dim[0] 重复解析 window 个向量(比如 6),并有效地生成傅立叶变换。我认为我正在将数组复制到 pyfftw 包中,这给了我巨大的开销。我现在正在查看文档,因为我认为我需要设置一个选项,但我可以在语法上使用一些额外的帮助。
这段代码原来是别人用numpy.fft.rfft写的,用numba加速的。但是实现在我的工作站上不起作用,所以我重新编写了所有内容并选择使用 pyfftw。
import numpy as np
import pyfftw as ftw
from tkinter import simpledialog
from math import ceil
import multiprocessing
ftw.config.NUM_THREADS = multiprocessing.cpu_count()
ftw.interfaces.cache.enable()
def runme():
# normally I would load a file, but for Stack Overflow, I'm just going to generate a 3D data cube so I'll delete references to the binary saving/loading functions:
# load the file
dataChunk = np.random.random((1000,512,512))
numFrames = dataChunk.shape[0]
# select the window size
windowSize = int(simpledialog.askstring('Window Size',
'How many frames to demodulate a single time point?'))
numChannels = windowSize//2+1
# create fftw arrays
ftwIn = ftw.empty_aligned(windowSize, dtype='complex128')
ftwOut = ftw.empty_aligned(windowSize, dtype='complex128')
fftObject = ftw.FFTW(ftwIn,ftwOut)
# perform DFT on the data chunk
demodFrames = dataChunk.shape[0]//windowSize
channelChunks = np.zeros([numChannels,demodFrames,
dataChunk.shape[1],dataChunk.shape[2]])
channelChunks = getDFT(dataChunk,channelChunks,
ftwIn,ftwOut,fftObject,windowSize,numChannels)
return channelChunks
def getDFT(data,channelOut,ftwIn,ftwOut,fftObject,
windowSize,numChannels):
frameLen = data.shape[0]
demodFrames = frameLen//windowSize
for yy in range(data.shape[1]):
for xx in range(data.shape[2]):
index = 0
for i in range(0,frameLen-windowSize+1,windowSize):
ftwIn[:] = data[i:i+windowSize,yy,xx]
fftObject()
channelOut[:,index,yy,xx] = 2*np.abs(ftwOut[:numChannels])/windowSize
index+=1
return channelOut
if __name__ == '__main__':
runme()
我得到了一个 4 维数组;变量 channelChunks。我将每个通道保存到一个二进制文件中(上面的代码中没有包含,但保存部分工作正常)。
此过程适用于我们拥有的解调项目,然后将 4D 数据立方体 channelChunks 解析为 eval(numChannel) 3D 数据立方体(电影),根据我们的实验集,我们可以根据颜色分离电影向上。我希望我可以避免编写一个 C++ 函数,该函数通过 pyfftw 调用矩阵上的 fft。
实际上,我在给定索引 1 和 2 轴处沿 dataChunk 的 0 轴取 windowSize=6 个元素,并执行 1D FFT。我需要在 dataChunk 的整个 3D 体积中执行此操作以生成解调的电影。谢谢
FFTW advanced plans可以通过pyfftw自动构建 可以按以下方式修改代码:
可以使用实数到复数的变换代替复数到复数的变换。 使用 pyfftw,它通常会写:
ftwIn = ftw.empty_aligned(windowSize, dtype='float64') ftwOut = ftw.empty_aligned(windowSize//2+1, dtype='complex128') fftObject = ftw.FFTW(ftwIn,ftwOut)
向 FFTW 规划器添加一些 标志。例如,
FFTW_MEASURE
将计算不同算法的时间并选择最佳算法。FFTW_DESTROY_INPUT
表示可以修改输入数组:可以使用一些实现技巧。fftObject = ftw.FFTW(ftwIn,ftwOut, flags=('FFTW_MEASURE','FFTW_DESTROY_INPUT',))
限制分割数。除法比乘法成本更高。
scale=1.0/windowSize for ... for ... 2*np.abs(ftwOut[:,:,:])*scale #instead of /windowSize
通过 pyfftw 使用 FFTW advanced plan 避免多个 for 循环。
nbwindow=numFrames//windowSize # create fftw arrays ftwIn = ftw.empty_aligned((nbwindow,windowSize,dataChunk.shape[2]), dtype='float64') ftwOut = ftw.empty_aligned((nbwindow,windowSize//2+1,dataChunk.shape[2]), dtype='complex128') fftObject = ftw.FFTW(ftwIn,ftwOut, axes=(1,), flags=('FFTW_MEASURE','FFTW_DESTROY_INPUT',)) ... for yy in range(data.shape[1]): ftwIn[:] = np.reshape(data[0:nbwindow*windowSize,yy,:],(nbwindow,windowSize,data.shape[2]),order='C') fftObject() channelOut[:,:,yy,:]=np.transpose(2*np.abs(ftwOut[:,:,:])*scale, (1,0,2))
这是修改后的代码。我也将帧数减少到 100,设置随机生成器的种子以检查结果未被修改并评论 tkinter。 window的大小可以设置为2的幂,或者2、3、5、7的乘积,这样Cooley-Tuckey算法就可以得到有效应用。避免使用大质数。
import numpy as np
import pyfftw as ftw
#from tkinter import simpledialog
from math import ceil
import multiprocessing
import time
ftw.config.NUM_THREADS = multiprocessing.cpu_count()
ftw.interfaces.cache.enable()
ftw.config.PLANNER_EFFORT = 'FFTW_MEASURE'
def runme():
# normally I would load a file, but for Stack Overflow, I'm just going to generate a 3D data cube so I'll delete references to the binary saving/loading functions:
# load the file
np.random.seed(seed=42)
dataChunk = np.random.random((100,512,512))
numFrames = dataChunk.shape[0]
# select the window size
#windowSize = int(simpledialog.askstring('Window Size',
# 'How many frames to demodulate a single time point?'))
windowSize=32
numChannels = windowSize//2+1
nbwindow=numFrames//windowSize
# create fftw arrays
ftwIn = ftw.empty_aligned((nbwindow,windowSize,dataChunk.shape[2]), dtype='float64')
ftwOut = ftw.empty_aligned((nbwindow,windowSize//2+1,dataChunk.shape[2]), dtype='complex128')
#ftwIn = ftw.empty_aligned(windowSize, dtype='complex128')
#ftwOut = ftw.empty_aligned(windowSize, dtype='complex128')
fftObject = ftw.FFTW(ftwIn,ftwOut, axes=(1,), flags=('FFTW_MEASURE','FFTW_DESTROY_INPUT',))
# perform DFT on the data chunk
demodFrames = dataChunk.shape[0]//windowSize
channelChunks = np.zeros([numChannels,demodFrames,
dataChunk.shape[1],dataChunk.shape[2]])
channelChunks = getDFT(dataChunk,channelChunks,
ftwIn,ftwOut,fftObject,windowSize,numChannels)
return channelChunks
def getDFT(data,channelOut,ftwIn,ftwOut,fftObject,
windowSize,numChannels):
frameLen = data.shape[0]
demodFrames = frameLen//windowSize
printed=0
nbwindow=data.shape[0]//windowSize
scale=1.0/windowSize
for yy in range(data.shape[1]):
#for xx in range(data.shape[2]):
index = 0
ftwIn[:] = np.reshape(data[0:nbwindow*windowSize,yy,:],(nbwindow,windowSize,data.shape[2]),order='C')
fftObject()
channelOut[:,:,yy,:]=np.transpose(2*np.abs(ftwOut[:,:,:])*scale, (1,0,2))
#for i in range(nbwindow):
#channelOut[:,i,yy,xx] = 2*np.abs(ftwOut[i,:])*scale
if printed==0:
for j in range(channelOut.shape[0]):
print j,channelOut[j,0,yy,0]
printed=1
return channelOut
if __name__ == '__main__':
seconds=time.time()
runme()
print "time: ", time.time()-seconds
让我们知道它加快了您的计算速度!我在我的电脑上从 24s 到不到 2s...