使用 cucomplex.h 操作袖带数据
Using cucomplex.h to manipulate cufft data
我是新来的。我正在使用 FFT,我需要编写一个简单的代码,但它不起作用。我需要用 cufft 转换 sin(x) 并返回,但在转换之间,我需要将结果乘以 2,这样,当我使用逆变换返回结果时,我将收到 2*sin( x) 例如。使用 fftw.h,我只需将 d_signal[i] 乘以 2,当我返回时,我有 2*sin(x),但我曾经使用 complex.h。任何想法?谢谢
# define SIGNAL_SIZE 64
# define PI acos(-1.0)
# define x 2*PI/SIGNAL_SIZE
void runTest(int argc, char **argv)
{
printf("[simpleCUFFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = sin(i*x);
h_signal[i].y = 0;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
d_signal[i].x = 2*d_signal[i].x;
d_signal[i].y = 2*d_signal[i].y;
}
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
// check result
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
}
bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
简单的转换代码
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);
// Padding functions
int PadData(const Complex *, Complex **, int,
const Complex *, Complex **, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 32
#define FILTER_KERNEL_SIZE 11
#define PONTOS 32
#define PI acos(-1)
#define dx 2*PI/PONTOS
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
runTest(argc, argv);
system("Pause");
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
printf("[simpleCUFFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = sin(i*dx);;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
// check result
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y /= (float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
printf("1 Error %g %g \n", fabs(h_signal[i].x - h_reversed_signal[i].x), fabs(h_signal[i].y - h_reversed_signal[i].y));
}
bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
结果
[simpleCUFFT] is starting...
GPU Device 0: "GeForce GTX 570" with compute capability 2.0
Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
first : 0.001251 0.000000 after 0.001251 0.000000
first : 0.563585 0.195090 after 0.563585 0.195090
first : 0.193304 0.382683 after 0.193304 0.382683
first : 0.808740 0.555570 after 0.808740 0.555570
first : 0.585009 0.707107 after 0.585009 0.707107
first : 0.479873 0.831470 after 0.479873 0.831470
first : 0.350291 0.923880 after 0.350291 0.923879
first : 0.895962 0.980785 after 0.895962 0.980785
first : 0.822840 1.000000 after 0.822840 1.000000
first : 0.746605 0.980785 after 0.746605 0.980785
first : 0.174108 0.923880 after 0.174108 0.923879
first : 0.858943 0.831470 after 0.858943 0.831470
first : 0.710501 0.707107 after 0.710501 0.707107
first : 0.513535 0.555570 after 0.513535 0.555570
first : 0.303995 0.382683 after 0.303995 0.382683
first : 0.014985 0.195090 after 0.014985 0.195090
first : 0.091403 0.000000 after 0.091403 0.000000
first : 0.364452 -0.195090 after 0.364452 -0.195090
first : 0.147313 -0.382683 after 0.147313 -0.382683
first : 0.165899 -0.555570 after 0.165899 -0.555570
first : 0.988525 -0.707107 after 0.988525 -0.707107
first : 0.445692 -0.831470 after 0.445692 -0.831470
first : 0.119083 -0.923880 after 0.119083 -0.923879
first : 0.004669 -0.980785 after 0.004669 -0.980785
first : 0.008911 -1.000000 after 0.008911 -1.000000
first : 0.377880 -0.980785 after 0.377880 -0.980785
first : 0.531663 -0.923880 after 0.531663 -0.923879
first : 0.571184 -0.831470 after 0.571184 -0.831470
first : 0.601764 -0.707107 after 0.601764 -0.707107
first : 0.607166 -0.555570 after 0.607166 -0.555570
first : 0.166234 -0.382683 after 0.166234 -0.382683
first : 0.663045 -0.195090 after 0.663045 -0.195090
L2 = 3.00296e-007
这个代码序列是非法的:
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
d_signal[i].x = 2*d_signal[i].x;
d_signal[i].y = 2*d_signal[i].y;
}
d_signal
之前已创建为 device 指针(通过 cudaMalloc
)。在主机代码中直接使用即取消引用设备指针是非法的。
解决此问题的一种可能方法是将代码中将复杂的中间结果乘以 2 的这一部分替换为调用执行类似功能的 CUDA 内核。
虽然它不是精确地将中间结果缩放 2,但有一个 cuda sample code 演示了 FFT->kernel->IFFT 的顺序,可能有助于理解。
这是一个完整的示例,是围绕您展示的代码构建的:
$ cat t612.cu
# define SIGNAL_SIZE 64
# define PI acos(-1.0)
# define MY_X 2*PI/SIGNAL_SIZE
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <helper_cuda.h>
#define MPY_FACTOR 2.0
#define nTPB 256
__global__ void multiply_kernel(cufftComplex *data, int len){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < len){
data[idx].x *= MPY_FACTOR;
data[idx].y *= MPY_FACTOR;
}
}
void runTest()
{
printf("FFT multiply is starting...\n");
// Allocate host memory for the signal
cufftComplex *h_signal = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = sin(i*MY_X);
h_signal[i].y = 0;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex), cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
/*
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
d_signal[i].x = 2*d_signal[i].x;
d_signal[i].y = 2*d_signal[i].y;
}
*/
multiply_kernel<<<(SIGNAL_SIZE+nTPB-1)/nTPB, nTPB>>>(d_signal, SIGNAL_SIZE);
checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("Kernel execution failed [ scaling kernel]");
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
// check result
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
}
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
int main(){
runTest();
}
$ nvcc -arch=sm_20 -I/usr/local/cuda/samples/common/inc -o t612 t612.cu -lcufft
$ ./t612
[simpleCUFFT] is starting...
Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
first : 0.000000 0.000000 after 0.000000 0.000000
first : 0.098017 0.000000 after 0.196034 -0.000000
first : 0.195090 0.000000 after 0.390181 0.000000
first : 0.290285 0.000000 after 0.580569 0.000000
first : 0.382683 0.000000 after 0.765367 0.000000
first : 0.471397 0.000000 after 0.942793 0.000000
first : 0.555570 0.000000 after 1.111140 -0.000000
first : 0.634393 0.000000 after 1.268787 0.000000
first : 0.707107 0.000000 after 1.414214 0.000000
first : 0.773010 0.000000 after 1.546021 0.000000
first : 0.831470 0.000000 after 1.662939 -0.000000
first : 0.881921 0.000000 after 1.763842 0.000000
first : 0.923880 0.000000 after 1.847759 0.000000
first : 0.956940 0.000000 after 1.913881 -0.000000
first : 0.980785 0.000000 after 1.961570 0.000000
first : 0.995185 0.000000 after 1.990369 -0.000000
first : 1.000000 0.000000 after 2.000000 0.000000
first : 0.995185 0.000000 after 1.990370 0.000000
first : 0.980785 0.000000 after 1.961570 -0.000000
first : 0.956940 0.000000 after 1.913881 0.000000
first : 0.923880 0.000000 after 1.847759 0.000000
first : 0.881921 0.000000 after 1.763842 0.000000
first : 0.831470 0.000000 after 1.662939 0.000000
first : 0.773010 0.000000 after 1.546021 0.000000
first : 0.707107 0.000000 after 1.414214 0.000000
first : 0.634393 0.000000 after 1.268786 -0.000000
first : 0.555570 0.000000 after 1.111140 0.000000
first : 0.471397 0.000000 after 0.942793 0.000000
first : 0.382683 0.000000 after 0.765367 0.000000
first : 0.290285 0.000000 after 0.580569 0.000000
first : 0.195090 0.000000 after 0.390181 0.000000
first : 0.098017 0.000000 after 0.196034 0.000000
first : 0.000000 0.000000 after -0.000000 0.000000
first : -0.098017 0.000000 after -0.196034 0.000000
first : -0.195090 0.000000 after -0.390181 0.000000
first : -0.290285 0.000000 after -0.580569 0.000000
first : -0.382683 0.000000 after -0.765367 0.000000
first : -0.471397 0.000000 after -0.942793 0.000000
first : -0.555570 0.000000 after -1.111140 0.000000
first : -0.634393 0.000000 after -1.268787 0.000000
first : -0.707107 0.000000 after -1.414214 0.000000
first : -0.773010 0.000000 after -1.546021 -0.000000
first : -0.831470 0.000000 after -1.662939 0.000000
first : -0.881921 0.000000 after -1.763842 0.000000
first : -0.923880 0.000000 after -1.847759 0.000000
first : -0.956940 0.000000 after -1.913881 0.000000
first : -0.980785 0.000000 after -1.961570 -0.000000
first : -0.995185 0.000000 after -1.990369 0.000000
first : -1.000000 0.000000 after -2.000000 -0.000000
first : -0.995185 0.000000 after -1.990370 -0.000000
first : -0.980785 0.000000 after -1.961570 0.000000
first : -0.956940 0.000000 after -1.913881 -0.000000
first : -0.923880 0.000000 after -1.847759 0.000000
first : -0.881921 0.000000 after -1.763842 -0.000000
first : -0.831470 0.000000 after -1.662939 -0.000000
first : -0.773010 0.000000 after -1.546021 0.000000
first : -0.707107 0.000000 after -1.414214 0.000000
first : -0.634393 0.000000 after -1.268786 0.000000
first : -0.555570 0.000000 after -1.111140 -0.000000
first : -0.471397 0.000000 after -0.942793 0.000000
first : -0.382683 0.000000 after -0.765367 0.000000
first : -0.290285 0.000000 after -0.580569 0.000000
first : -0.195090 0.000000 after -0.390181 -0.000000
first : -0.098017 0.000000 after -0.196034 0.000000
$
我是新来的。我正在使用 FFT,我需要编写一个简单的代码,但它不起作用。我需要用 cufft 转换 sin(x) 并返回,但在转换之间,我需要将结果乘以 2,这样,当我使用逆变换返回结果时,我将收到 2*sin( x) 例如。使用 fftw.h,我只需将 d_signal[i] 乘以 2,当我返回时,我有 2*sin(x),但我曾经使用 complex.h。任何想法?谢谢
# define SIGNAL_SIZE 64
# define PI acos(-1.0)
# define x 2*PI/SIGNAL_SIZE
void runTest(int argc, char **argv)
{
printf("[simpleCUFFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = sin(i*x);
h_signal[i].y = 0;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
d_signal[i].x = 2*d_signal[i].x;
d_signal[i].y = 2*d_signal[i].y;
}
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
// check result
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
}
bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
简单的转换代码
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <helper_functions.h>
#include <helper_cuda.h>
// Complex data type
typedef float2 Complex;
// Filtering functions
void Convolve(const Complex *, int, const Complex *, int, Complex *);
// Padding functions
int PadData(const Complex *, Complex **, int,
const Complex *, Complex **, int);
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 32
#define FILTER_KERNEL_SIZE 11
#define PONTOS 32
#define PI acos(-1)
#define dx 2*PI/PONTOS
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
runTest(argc, argv);
system("Pause");
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
printf("[simpleCUFFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
// Allocate host memory for the signal
cufftComplex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = sin(i*dx);;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex),
cudaMemcpyDeviceToHost));
// check result
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y /= (float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
printf("1 Error %g %g \n", fabs(h_signal[i].x - h_reversed_signal[i].x), fabs(h_signal[i].y - h_reversed_signal[i].y));
}
bool bTestResult = sdkCompareL2fe((float *)h_reversed_signal, (float *)h_signal, 2 * SIGNAL_SIZE, 1e-5f);
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
// Pad data
int PadData(const Complex *signal, Complex **padded_signal, int signal_size,
const Complex *filter_kernel, Complex **padded_filter_kernel, int filter_kernel_size)
{
int minRadius = filter_kernel_size / 2;
int maxRadius = filter_kernel_size - minRadius;
int new_size = signal_size + maxRadius;
// Pad signal
Complex *new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, signal, signal_size * sizeof(Complex));
memset(new_data + signal_size, 0, (new_size - signal_size) * sizeof(Complex));
*padded_signal = new_data;
// Pad filter
new_data = (Complex *)malloc(sizeof(Complex) * new_size);
memcpy(new_data + 0, filter_kernel + minRadius, maxRadius * sizeof(Complex));
memset(new_data + maxRadius, 0, (new_size - filter_kernel_size) * sizeof(Complex));
memcpy(new_data + new_size - minRadius, filter_kernel, minRadius * sizeof(Complex));
*padded_filter_kernel = new_data;
return new_size;
}
结果
[simpleCUFFT] is starting...
GPU Device 0: "GeForce GTX 570" with compute capability 2.0
Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
first : 0.001251 0.000000 after 0.001251 0.000000
first : 0.563585 0.195090 after 0.563585 0.195090
first : 0.193304 0.382683 after 0.193304 0.382683
first : 0.808740 0.555570 after 0.808740 0.555570
first : 0.585009 0.707107 after 0.585009 0.707107
first : 0.479873 0.831470 after 0.479873 0.831470
first : 0.350291 0.923880 after 0.350291 0.923879
first : 0.895962 0.980785 after 0.895962 0.980785
first : 0.822840 1.000000 after 0.822840 1.000000
first : 0.746605 0.980785 after 0.746605 0.980785
first : 0.174108 0.923880 after 0.174108 0.923879
first : 0.858943 0.831470 after 0.858943 0.831470
first : 0.710501 0.707107 after 0.710501 0.707107
first : 0.513535 0.555570 after 0.513535 0.555570
first : 0.303995 0.382683 after 0.303995 0.382683
first : 0.014985 0.195090 after 0.014985 0.195090
first : 0.091403 0.000000 after 0.091403 0.000000
first : 0.364452 -0.195090 after 0.364452 -0.195090
first : 0.147313 -0.382683 after 0.147313 -0.382683
first : 0.165899 -0.555570 after 0.165899 -0.555570
first : 0.988525 -0.707107 after 0.988525 -0.707107
first : 0.445692 -0.831470 after 0.445692 -0.831470
first : 0.119083 -0.923880 after 0.119083 -0.923879
first : 0.004669 -0.980785 after 0.004669 -0.980785
first : 0.008911 -1.000000 after 0.008911 -1.000000
first : 0.377880 -0.980785 after 0.377880 -0.980785
first : 0.531663 -0.923880 after 0.531663 -0.923879
first : 0.571184 -0.831470 after 0.571184 -0.831470
first : 0.601764 -0.707107 after 0.601764 -0.707107
first : 0.607166 -0.555570 after 0.607166 -0.555570
first : 0.166234 -0.382683 after 0.166234 -0.382683
first : 0.663045 -0.195090 after 0.663045 -0.195090
L2 = 3.00296e-007
这个代码序列是非法的:
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
d_signal[i].x = 2*d_signal[i].x;
d_signal[i].y = 2*d_signal[i].y;
}
d_signal
之前已创建为 device 指针(通过 cudaMalloc
)。在主机代码中直接使用即取消引用设备指针是非法的。
解决此问题的一种可能方法是将代码中将复杂的中间结果乘以 2 的这一部分替换为调用执行类似功能的 CUDA 内核。
虽然它不是精确地将中间结果缩放 2,但有一个 cuda sample code 演示了 FFT->kernel->IFFT 的顺序,可能有助于理解。
这是一个完整的示例,是围绕您展示的代码构建的:
$ cat t612.cu
# define SIGNAL_SIZE 64
# define PI acos(-1.0)
# define MY_X 2*PI/SIGNAL_SIZE
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <helper_cuda.h>
#define MPY_FACTOR 2.0
#define nTPB 256
__global__ void multiply_kernel(cufftComplex *data, int len){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < len){
data[idx].x *= MPY_FACTOR;
data[idx].y *= MPY_FACTOR;
}
}
void runTest()
{
printf("FFT multiply is starting...\n");
// Allocate host memory for the signal
cufftComplex *h_signal = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
cufftComplex *h_reversed_signal = (cufftComplex *)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_signal[i].x = sin(i*MY_X);
h_signal[i].y = 0;
}
cufftComplex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, SIGNAL_SIZE*sizeof(cufftComplex)));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cufftComplex), cudaMemcpyHostToDevice));
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));
// Transform signal and kernel
printf("Transforming signal cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
/*
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
d_signal[i].x = 2*d_signal[i].x;
d_signal[i].y = 2*d_signal[i].y;
}
*/
multiply_kernel<<<(SIGNAL_SIZE+nTPB-1)/nTPB, nTPB>>>(d_signal, SIGNAL_SIZE);
checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("Kernel execution failed [ scaling kernel]");
// Transform signal back
printf("Transforming signal back cufftExecC2C\n");
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
// Copy device memory to host
checkCudaErrors(cudaMemcpy(h_reversed_signal, d_signal, SIGNAL_SIZE*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
// check result
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
h_reversed_signal[i].x = h_reversed_signal[i].x / (float)SIGNAL_SIZE;
h_reversed_signal[i].y = h_reversed_signal[i].y/(float)SIGNAL_SIZE;
printf("first : %f %f after %f %f \n", h_signal[i].x, h_signal[i].y, h_reversed_signal[i].x, h_reversed_signal[i].y);
}
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
// cleanup memory
free(h_signal);
free(h_reversed_signal);
checkCudaErrors(cudaFree(d_signal));
cudaDeviceReset();
}
int main(){
runTest();
}
$ nvcc -arch=sm_20 -I/usr/local/cuda/samples/common/inc -o t612 t612.cu -lcufft
$ ./t612
[simpleCUFFT] is starting...
Transforming signal cufftExecC2C
Transforming signal back cufftExecC2C
first : 0.000000 0.000000 after 0.000000 0.000000
first : 0.098017 0.000000 after 0.196034 -0.000000
first : 0.195090 0.000000 after 0.390181 0.000000
first : 0.290285 0.000000 after 0.580569 0.000000
first : 0.382683 0.000000 after 0.765367 0.000000
first : 0.471397 0.000000 after 0.942793 0.000000
first : 0.555570 0.000000 after 1.111140 -0.000000
first : 0.634393 0.000000 after 1.268787 0.000000
first : 0.707107 0.000000 after 1.414214 0.000000
first : 0.773010 0.000000 after 1.546021 0.000000
first : 0.831470 0.000000 after 1.662939 -0.000000
first : 0.881921 0.000000 after 1.763842 0.000000
first : 0.923880 0.000000 after 1.847759 0.000000
first : 0.956940 0.000000 after 1.913881 -0.000000
first : 0.980785 0.000000 after 1.961570 0.000000
first : 0.995185 0.000000 after 1.990369 -0.000000
first : 1.000000 0.000000 after 2.000000 0.000000
first : 0.995185 0.000000 after 1.990370 0.000000
first : 0.980785 0.000000 after 1.961570 -0.000000
first : 0.956940 0.000000 after 1.913881 0.000000
first : 0.923880 0.000000 after 1.847759 0.000000
first : 0.881921 0.000000 after 1.763842 0.000000
first : 0.831470 0.000000 after 1.662939 0.000000
first : 0.773010 0.000000 after 1.546021 0.000000
first : 0.707107 0.000000 after 1.414214 0.000000
first : 0.634393 0.000000 after 1.268786 -0.000000
first : 0.555570 0.000000 after 1.111140 0.000000
first : 0.471397 0.000000 after 0.942793 0.000000
first : 0.382683 0.000000 after 0.765367 0.000000
first : 0.290285 0.000000 after 0.580569 0.000000
first : 0.195090 0.000000 after 0.390181 0.000000
first : 0.098017 0.000000 after 0.196034 0.000000
first : 0.000000 0.000000 after -0.000000 0.000000
first : -0.098017 0.000000 after -0.196034 0.000000
first : -0.195090 0.000000 after -0.390181 0.000000
first : -0.290285 0.000000 after -0.580569 0.000000
first : -0.382683 0.000000 after -0.765367 0.000000
first : -0.471397 0.000000 after -0.942793 0.000000
first : -0.555570 0.000000 after -1.111140 0.000000
first : -0.634393 0.000000 after -1.268787 0.000000
first : -0.707107 0.000000 after -1.414214 0.000000
first : -0.773010 0.000000 after -1.546021 -0.000000
first : -0.831470 0.000000 after -1.662939 0.000000
first : -0.881921 0.000000 after -1.763842 0.000000
first : -0.923880 0.000000 after -1.847759 0.000000
first : -0.956940 0.000000 after -1.913881 0.000000
first : -0.980785 0.000000 after -1.961570 -0.000000
first : -0.995185 0.000000 after -1.990369 0.000000
first : -1.000000 0.000000 after -2.000000 -0.000000
first : -0.995185 0.000000 after -1.990370 -0.000000
first : -0.980785 0.000000 after -1.961570 0.000000
first : -0.956940 0.000000 after -1.913881 -0.000000
first : -0.923880 0.000000 after -1.847759 0.000000
first : -0.881921 0.000000 after -1.763842 -0.000000
first : -0.831470 0.000000 after -1.662939 -0.000000
first : -0.773010 0.000000 after -1.546021 0.000000
first : -0.707107 0.000000 after -1.414214 0.000000
first : -0.634393 0.000000 after -1.268786 0.000000
first : -0.555570 0.000000 after -1.111140 -0.000000
first : -0.471397 0.000000 after -0.942793 0.000000
first : -0.382683 0.000000 after -0.765367 0.000000
first : -0.290285 0.000000 after -0.580569 0.000000
first : -0.195090 0.000000 after -0.390181 -0.000000
first : -0.098017 0.000000 after -0.196034 0.000000
$