如何使用 cuFFT 执行实数到复数的转换
How to perform a Real to Complex Transformation with cuFFT
以下代码改编自此处的示例,使用 cufftPlan1d 执行单个一维变换。我的最终目标是执行批处理的就地(in-place)R2C 变换,但下面的代码使用相互独立的输入和输出数组,只执行单个变换。
如何调整此代码以就地执行转换,从而减少设备上分配的内存量?
谢谢
CUDA 6.5。注意:我是在 MATLAB 2015a 的 mexFunction 中运行这段代码的。
代码:
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#define DATASIZE 8
#define BATCH 1
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr and, unless `abort` is false,
// terminates the process with the error code as its exit status.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (gpuErrchk passes __FILE__)
//   line  - source line of the call site (gpuErrchk passes __LINE__)
//   abort - when true (the default), exit(code) is called after reporting
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // Nothing to report for a successful call.
    if (code == cudaSuccess)
        return;

    const char *description = cudaGetErrorString(code);
    fprintf(stderr,"GPUassert: %s %s %d\n", description, file, line);

    if (!abort)
        return;
    exit(code);
}
// Standalone demo: out-of-place 1D real-to-complex FFT with cuFFT.
//
// Builds BATCH input signals of DATASIZE samples each (values 1..DATASIZE),
// uploads them, runs one R2C plan over all of them, downloads the
// (DATASIZE / 2 + 1) complex bins per signal and prints them.
//
// Returns EXIT_SUCCESS when the transform ran and was printed, EXIT_FAILURE
// when a host allocation or a cuFFT call failed. CUDA runtime failures are
// handled by gpuErrchk, which reports them and exits.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // All buffer sizes are derived once so that allocations, copies and the
    // plan agree on how many elements a batch of transforms needs.
    const int binsPerTransform = DATASIZE / 2 + 1;
    const size_t realCount = (size_t)DATASIZE * BATCH;
    const size_t complexCount = (size_t)binsPerTransform * BATCH;
    const size_t realBytes = realCount * sizeof(cufftReal);
    const size_t complexBytes = complexCount * sizeof(cufftComplex);

    cufftReal *hostInputData = NULL;
    cufftComplex *hostOutputData = NULL;
    cufftReal *deviceInputData = NULL;
    cufftComplex *deviceOutputData = NULL;
    cufftHandle handle = 0;
    bool planCreated = false;
    int exitCode = EXIT_FAILURE;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do {
        // --- Host side input/output allocation and input initialization
        hostInputData = (cufftReal*)malloc(realBytes);
        hostOutputData = (cufftComplex*)malloc(complexBytes);
        if (hostInputData == NULL || hostOutputData == NULL) {
            fprintf(stderr, "host allocation failed!\n");
            break;
        }
        for (size_t j = 0; j < realCount; j++)
            hostInputData[j] = (cufftReal)(j % DATASIZE + 1);

        // --- Device side input allocation and upload
        gpuErrchk(cudaMalloc((void**)&deviceInputData, realBytes));
        gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, realBytes, cudaMemcpyHostToDevice));

        // --- Device side output allocation
        gpuErrchk(cudaMalloc((void**)&deviceOutputData, complexBytes));

        // cuFFT reports through cufftResult, so compare against CUFFT_SUCCESS
        // (not cudaSuccess) and stop instead of running with a bad plan.
        cufftResult cufftStatus = cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
        if (cufftStatus != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlan1d failed!\n");
            break;
        }
        planCreated = true;

        cufftStatus = cufftExecR2C(handle, deviceInputData, deviceOutputData);
        if (cufftStatus != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftExecR2C failed!\n");
            break;
        }

        // --- Device->Host copy of the results
        gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, complexBytes, cudaMemcpyDeviceToHost));

        for (int j = 0; j < binsPerTransform * BATCH; j++)
            printf("%i %f %f\n", j, hostOutputData[j].x, hostOutputData[j].y);

        exitCode = EXIT_SUCCESS;
    } while (0);

    // --- Cleanup (cudaFree and free both accept null pointers)
    if (planCreated && cufftDestroy(handle) != CUFFT_SUCCESS)
        fprintf(stderr, "cufftDestroy failed!\n");
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));
    free(hostOutputData);
    free(hostInputData);

    return exitCode;
}
解决方案已经在另一个答案中给出:
对于您的示例,这意味着:
将输入分配为 cufftComplex:
// One buffer serves as both the real input and the complex output of the
// transform, so it is sized for the output: (DATASIZE / 2 + 1) complex
// elements per transform, which is always at least as large as the
// DATASIZE real input samples.
// NOTE(review): for BATCH > 1 an in-place R2C plan presumably expects each
// input signal padded to 2 * (DATASIZE / 2 + 1) reals -- confirm against the
// cuFFT data-layout documentation before uploading more than one signal.
cufftComplex *deviceInputData;
gpuErrchk(cudaMalloc((void**)&deviceInputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));
gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, DATASIZE * sizeof(cufftReal), cudaMemcpyHostToDevice));
就地转换:
// The same device buffer is passed as the real input (via a cast) and as the
// complex output, so the result overwrites the input data.
cufftStatus = cufftExecR2C(handle, (cufftReal *)deviceInputData, deviceInputData);
// Download the (DATASIZE / 2 + 1) complex output bins from that same buffer.
gpuErrchk(cudaMemcpy(hostOutputData, deviceInputData, (DATASIZE / 2 + 1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
顺便说一句:MATLAB 还包含 fft() 的 GPU 加速版本,也许这对您也有用:http://de.mathworks.com/help/distcomp/run-built-in-functions-on-a-gpu.html#btjw5gk
这是我自己的完整解决方案,它从 cufftReal 类型的设备缓冲区出发(而不是 cufftComplex):
// Runs an in-place 1D real-to-complex FFT of the first DATASIZE values of
// `x` with cuFFT and prints the (DATASIZE / 2 + 1) output bins via mexPrintf.
//
//   x - input samples; the first DATASIZE entries are read and narrowed to
//       cufftReal
//   y - accepted but not used by this implementation
//   n - accepted but not used by this implementation
//
// NOTE(review): `n` is not consulted, so `x` must hold at least DATASIZE
// elements -- confirm against the caller.
// NOTE(review): only one signal of DATASIZE samples is uploaded; with
// BATCH > 1 the remaining transforms would read uninitialized device memory.
//
// Host allocation and cuFFT failures are reported with mexPrintf and the
// function returns after releasing what it allocated. CUDA runtime failures
// are handled by gpuErrchk.
void process(double *x, double *y, size_t n){
    (void)y;
    (void)n;

    if (x == NULL) {
        mexPrintf("process: input pointer is NULL!\n");
        return;
    }

    // The single device buffer holds the real input first and the complex
    // output afterwards, so it is sized for the (larger) complex output.
    const size_t inputBytes = DATASIZE * sizeof(cufftReal);
    const size_t spectrumBytes = (size_t)(DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex);

    cufftReal *hostInputData = NULL;
    cufftComplex *hostOutputData = NULL;
    cufftReal *deviceData = NULL;
    cufftHandle handle = 0;
    bool planCreated = false;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do {
        // --- Host side input/output allocation and input initialization
        hostInputData = (cufftReal*)malloc(inputBytes);
        hostOutputData = (cufftComplex*)malloc(spectrumBytes);
        if (hostInputData == NULL || hostOutputData == NULL) {
            mexPrintf("host allocation failed!\n");
            break;
        }
        for (int j = 0; j < DATASIZE; j++)
            hostInputData[j] = (cufftReal)x[j];

        // --- Device side allocation and upload of the real input
        gpuErrchk(cudaMalloc((void**)&deviceData, spectrumBytes));
        gpuErrchk(cudaMemcpy(deviceData, hostInputData, inputBytes, cudaMemcpyHostToDevice));

        // cuFFT reports through cufftResult, so compare against CUFFT_SUCCESS
        // (not cudaSuccess) and stop instead of running with a bad plan.
        cufftResult cufftStatus = cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
        if (cufftStatus != CUFFT_SUCCESS) {
            mexPrintf("cufftPlan1d failed!");
            break;
        }
        planCreated = true;

        // Same buffer as input and output: the spectrum overwrites the samples.
        cufftStatus = cufftExecR2C(handle, deviceData, (cufftComplex*)deviceData);
        if (cufftStatus != CUFFT_SUCCESS) {
            mexPrintf("cufftExecR2C failed!");
            break;
        }

        // --- Device->Host copy of the results
        gpuErrchk(cudaMemcpy(hostOutputData, deviceData, spectrumBytes, cudaMemcpyDeviceToHost));

        for (int j = 0; j < (DATASIZE / 2 + 1); j++)
            mexPrintf("%i %f %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
    } while (0);

    // --- Cleanup (cudaFree and free both accept null pointers). Freeing the
    // host buffers matters here: the function may be called repeatedly.
    if (planCreated && cufftDestroy(handle) != CUFFT_SUCCESS)
        mexPrintf("cufftDestroy failed!\n");
    gpuErrchk(cudaFree(deviceData));
    free(hostOutputData);
    free(hostInputData);
}
以下代码改编自此处的示例,使用 cufftPlan1d 执行单个一维变换。我的最终目标是执行批处理的就地(in-place)R2C 变换,但下面的代码使用相互独立的输入和输出数组,只执行单个变换。
如何调整此代码以就地执行转换,从而减少设备上分配的内存量?
谢谢
CUDA 6.5。注意:我是在 MATLAB 2015a 的 mexFunction 中运行这段代码的。
代码:
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#define DATASIZE 8
#define BATCH 1
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr and, unless `abort` is false,
// terminates the process with the error code as its exit status.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (gpuErrchk passes __FILE__)
//   line  - source line of the call site (gpuErrchk passes __LINE__)
//   abort - when true (the default), exit(code) is called after reporting
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    // Nothing to report for a successful call.
    if (code == cudaSuccess)
        return;

    const char *description = cudaGetErrorString(code);
    fprintf(stderr,"GPUassert: %s %s %d\n", description, file, line);

    if (!abort)
        return;
    exit(code);
}
// Standalone demo: out-of-place 1D real-to-complex FFT with cuFFT.
//
// Builds BATCH input signals of DATASIZE samples each (values 1..DATASIZE),
// uploads them, runs one R2C plan over all of them, downloads the
// (DATASIZE / 2 + 1) complex bins per signal and prints them.
//
// Returns EXIT_SUCCESS when the transform ran and was printed, EXIT_FAILURE
// when a host allocation or a cuFFT call failed. CUDA runtime failures are
// handled by gpuErrchk, which reports them and exits.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // All buffer sizes are derived once so that allocations, copies and the
    // plan agree on how many elements a batch of transforms needs.
    const int binsPerTransform = DATASIZE / 2 + 1;
    const size_t realCount = (size_t)DATASIZE * BATCH;
    const size_t complexCount = (size_t)binsPerTransform * BATCH;
    const size_t realBytes = realCount * sizeof(cufftReal);
    const size_t complexBytes = complexCount * sizeof(cufftComplex);

    cufftReal *hostInputData = NULL;
    cufftComplex *hostOutputData = NULL;
    cufftReal *deviceInputData = NULL;
    cufftComplex *deviceOutputData = NULL;
    cufftHandle handle = 0;
    bool planCreated = false;
    int exitCode = EXIT_FAILURE;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do {
        // --- Host side input/output allocation and input initialization
        hostInputData = (cufftReal*)malloc(realBytes);
        hostOutputData = (cufftComplex*)malloc(complexBytes);
        if (hostInputData == NULL || hostOutputData == NULL) {
            fprintf(stderr, "host allocation failed!\n");
            break;
        }
        for (size_t j = 0; j < realCount; j++)
            hostInputData[j] = (cufftReal)(j % DATASIZE + 1);

        // --- Device side input allocation and upload
        gpuErrchk(cudaMalloc((void**)&deviceInputData, realBytes));
        gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, realBytes, cudaMemcpyHostToDevice));

        // --- Device side output allocation
        gpuErrchk(cudaMalloc((void**)&deviceOutputData, complexBytes));

        // cuFFT reports through cufftResult, so compare against CUFFT_SUCCESS
        // (not cudaSuccess) and stop instead of running with a bad plan.
        cufftResult cufftStatus = cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
        if (cufftStatus != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftPlan1d failed!\n");
            break;
        }
        planCreated = true;

        cufftStatus = cufftExecR2C(handle, deviceInputData, deviceOutputData);
        if (cufftStatus != CUFFT_SUCCESS) {
            fprintf(stderr, "cufftExecR2C failed!\n");
            break;
        }

        // --- Device->Host copy of the results
        gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, complexBytes, cudaMemcpyDeviceToHost));

        for (int j = 0; j < binsPerTransform * BATCH; j++)
            printf("%i %f %f\n", j, hostOutputData[j].x, hostOutputData[j].y);

        exitCode = EXIT_SUCCESS;
    } while (0);

    // --- Cleanup (cudaFree and free both accept null pointers)
    if (planCreated && cufftDestroy(handle) != CUFFT_SUCCESS)
        fprintf(stderr, "cufftDestroy failed!\n");
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));
    free(hostOutputData);
    free(hostInputData);

    return exitCode;
}
解决方案已经在另一个答案中给出:
对于您的示例,这意味着:
将输入分配为 cufftComplex:
// One buffer serves as both the real input and the complex output of the
// transform, so it is sized for the output: (DATASIZE / 2 + 1) complex
// elements per transform, which is always at least as large as the
// DATASIZE real input samples.
// NOTE(review): for BATCH > 1 an in-place R2C plan presumably expects each
// input signal padded to 2 * (DATASIZE / 2 + 1) reals -- confirm against the
// cuFFT data-layout documentation before uploading more than one signal.
cufftComplex *deviceInputData;
gpuErrchk(cudaMalloc((void**)&deviceInputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));
gpuErrchk(cudaMemcpy(deviceInputData, hostInputData, DATASIZE * sizeof(cufftReal), cudaMemcpyHostToDevice));
就地转换:
// The same device buffer is passed as the real input (via a cast) and as the
// complex output, so the result overwrites the input data.
cufftStatus = cufftExecR2C(handle, (cufftReal *)deviceInputData, deviceInputData);
// Download the (DATASIZE / 2 + 1) complex output bins from that same buffer.
gpuErrchk(cudaMemcpy(hostOutputData, deviceInputData, (DATASIZE / 2 + 1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
顺便说一句:MATLAB 还包含 fft() 的 GPU 加速版本,也许这对您也有用:http://de.mathworks.com/help/distcomp/run-built-in-functions-on-a-gpu.html#btjw5gk
这是我自己的完整解决方案,它从 cufftReal 类型的设备缓冲区出发(而不是 cufftComplex):
// Runs an in-place 1D real-to-complex FFT of the first DATASIZE values of
// `x` with cuFFT and prints the (DATASIZE / 2 + 1) output bins via mexPrintf.
//
//   x - input samples; the first DATASIZE entries are read and narrowed to
//       cufftReal
//   y - accepted but not used by this implementation
//   n - accepted but not used by this implementation
//
// NOTE(review): `n` is not consulted, so `x` must hold at least DATASIZE
// elements -- confirm against the caller.
// NOTE(review): only one signal of DATASIZE samples is uploaded; with
// BATCH > 1 the remaining transforms would read uninitialized device memory.
//
// Host allocation and cuFFT failures are reported with mexPrintf and the
// function returns after releasing what it allocated. CUDA runtime failures
// are handled by gpuErrchk.
void process(double *x, double *y, size_t n){
    (void)y;
    (void)n;

    if (x == NULL) {
        mexPrintf("process: input pointer is NULL!\n");
        return;
    }

    // The single device buffer holds the real input first and the complex
    // output afterwards, so it is sized for the (larger) complex output.
    const size_t inputBytes = DATASIZE * sizeof(cufftReal);
    const size_t spectrumBytes = (size_t)(DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex);

    cufftReal *hostInputData = NULL;
    cufftComplex *hostOutputData = NULL;
    cufftReal *deviceData = NULL;
    cufftHandle handle = 0;
    bool planCreated = false;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do {
        // --- Host side input/output allocation and input initialization
        hostInputData = (cufftReal*)malloc(inputBytes);
        hostOutputData = (cufftComplex*)malloc(spectrumBytes);
        if (hostInputData == NULL || hostOutputData == NULL) {
            mexPrintf("host allocation failed!\n");
            break;
        }
        for (int j = 0; j < DATASIZE; j++)
            hostInputData[j] = (cufftReal)x[j];

        // --- Device side allocation and upload of the real input
        gpuErrchk(cudaMalloc((void**)&deviceData, spectrumBytes));
        gpuErrchk(cudaMemcpy(deviceData, hostInputData, inputBytes, cudaMemcpyHostToDevice));

        // cuFFT reports through cufftResult, so compare against CUFFT_SUCCESS
        // (not cudaSuccess) and stop instead of running with a bad plan.
        cufftResult cufftStatus = cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
        if (cufftStatus != CUFFT_SUCCESS) {
            mexPrintf("cufftPlan1d failed!");
            break;
        }
        planCreated = true;

        // Same buffer as input and output: the spectrum overwrites the samples.
        cufftStatus = cufftExecR2C(handle, deviceData, (cufftComplex*)deviceData);
        if (cufftStatus != CUFFT_SUCCESS) {
            mexPrintf("cufftExecR2C failed!");
            break;
        }

        // --- Device->Host copy of the results
        gpuErrchk(cudaMemcpy(hostOutputData, deviceData, spectrumBytes, cudaMemcpyDeviceToHost));

        for (int j = 0; j < (DATASIZE / 2 + 1); j++)
            mexPrintf("%i %f %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
    } while (0);

    // --- Cleanup (cudaFree and free both accept null pointers). Freeing the
    // host buffers matters here: the function may be called repeatedly.
    if (planCreated && cufftDestroy(handle) != CUFFT_SUCCESS)
        mexPrintf("cufftDestroy failed!\n");
    gpuErrchk(cudaFree(deviceData));
    free(hostOutputData);
    free(hostInputData);
}