从 FPGA 上的 OpenCL FFT 内核获取 nan 值
Getting nan values from OpenCL FFT kernel on FPGA
我试图通过自己为英特尔 FPGA 编写主机程序来使用英特尔的 FFT1D 内核。 Link到Intel的FFT1d可以查到here
我还在下面给出了我的主机程序,其中,我保存了一个文件(其中包含一些数据),我的任务是读取该数据,计算其 FFT 并打印其中的一些。是一个4K点的FFT
#include <stdio.h>
#include <stdlib.h>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"
#include <string.h>
#include "fft_config.h"
#define N (1<<LOGN) //Please check the FFT Sample Code for Ref (2 to the power 12 gives 4K points)
#define DATA_FILE "complex_input.data"
using namespace aocl_utils;
cl_platform_id platform = NULL;
cl_device_id device = NULL;
cl_command_queue queue0 = NULL;
cl_command_queue queue1 = NULL;
cl_context context = NULL;
cl_program program = NULL;
cl_kernel kernel0, kernel1;
cl_mem d_inData, d_outData;
cl_int err = 0;
typedef struct {
float x;
float y;
} float2;
//float2 h_outData[N], h_inData[N];
float2 *h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
float2 *h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
void init(); //Function that does the job of Querying Platform and Device, creating Context, Command Queues, Program and required Kernels to do the job.
void cleanup(); //Function that releases all the Created Contexts, Buffers etc, in order to finish the execution.
void read_data(); //Reads data from the complex numbers from .data file and fills in the float2 struct h_inData[].
int temp_value = 1;
int main()
{
// h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
//h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
int inverse = false;
int temp =1;
init();
read_data();
d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float2)*N, NULL, &err);
checkError(err,"Failed to allocate Buffer for input array\n");
d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(float2)*N, NULL, &err);
checkError(err, "Failed to allocate the Buffer for output\n");
//WE FINISH THE FETCH KERNEL
err = clEnqueueWriteBuffer(queue1,d_inData, CL_TRUE, 0, sizeof(float2)*N, h_inData, 0, NULL, NULL);
checkError(err,"Failed to Write the input Buffer\n");
err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), (void *)&d_inData);
checkError(err, "Failed to set KerArg for Kernel1 - 0\n");
err = clSetKernelArg(kernel0, 0, sizeof(cl_mem), (void *)&d_outData);
checkError(err, "Failed to set KerArg for Kernel0 - 0\n");
err = clSetKernelArg(kernel0, 1, sizeof(cl_int), (void *)&temp_value);
checkError(err, "Failed to set KerArg for Kernel0 - 1\n");
err = clSetKernelArg(kernel0, 2, sizeof(cl_int), (void *)&inverse);
checkError(err, "Failed to set KerArg for Kernel0 - 2\n");
printf("FFT Initialization Complete!\n\n");
err = clEnqueueTask(queue0, kernel0, 0, NULL, NULL);
checkError(err, "Failed to Launch the Kernel for FFT\n");
size_t local_work_size = N/8;
size_t global_work_size = local_work_size * 1; //Coz the number of Iterations is just 1
err = clEnqueueNDRangeKernel(queue1, kernel1, 1, NULL, &local_work_size, &global_work_size, 0, NULL, NULL);
checkError(err, "Failed to launch the Fetch Kernel\n");
err = clFinish(queue0);
checkError(err, "Failed to finish FFT\n");
err = clFinish(queue1);
checkError(err, "Failed to finish Fetch kernel\n");
err = clEnqueueReadBuffer(queue0, d_outData, CL_TRUE, 0, sizeof(float2)*N, h_outData, 0, NULL, NULL);
checkError(err, "Failed to Read back the Buffer output\n");
printf("FFT is Complete!\n\n");
printf("Printing some of the values, just to make sure they are non-zero\n\n");
for(int ii=100;ii<125;ii++)
{
printf("%f + %f j -> %f + %f j\n",h_inData[ii].x,h_inData[ii].y,h_outData[ii].x,h_outData[ii].y);
}
printf("\n\n");
cleanup();
return 0;
}
void read_data()
{
size_t sourceSize;
float* temp;
FILE *fp = fopen(DATA_FILE,"r");
if(fp==NULL)
{
printf("Could not find the Random Data File! Exiting!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
sourceSize=ftell(fp);
rewind(fp);
temp = (float *)alignedMalloc(sourceSize);
fread(temp, sizeof(float),sourceSize,fp);
fclose(fp);
for(int i=0;i<N;i++)
{
h_inData[i].x = temp[2*i];
h_inData[i].y = temp[(2*i)+1];
}
}
void init()
{
platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
if(platform == NULL)
{
printf("Could not find the Platform\n");
exit(1);
}
scoped_array<cl_device_id> devices;
cl_uint num_devices;
devices.reset(getDevices(platform, CL_DEVICE_TYPE_ACCELERATOR, &num_devices));
device = devices[0];
context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &err);
checkError(err, "Failed to create Context\n");
queue0 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue0\n");
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue1\n");
program = createProgramFromBinary(context, "bin/fft1d.aocx", &device, 1);
err = clBuildProgram(program, 1, &device, "", NULL, NULL);
checkError(err, "Failed to Build Program\n");
kernel0 = clCreateKernel(program, "fft1d", &err);
checkError(err,"Could not Create Kernel0\n");
kernel1 = clCreateKernel(program, "fetch", &err);
checkError(err, "Could not Create Kernel1\n");
printf("Finished with the Initial Setup!\n");
}
void cleanup()
{
if(kernel0)
clReleaseKernel(kernel0);
if(kernel1)
clReleaseKernel(kernel1);
if(program)
clReleaseProgram(program);
if(queue0)
clReleaseCommandQueue(queue0);
if(queue1)
clReleaseCommandQueue(queue1);
if(d_inData)
clReleaseMemObject(d_inData);
if(d_outData)
clReleaseMemObject(d_outData);
if(context)
clReleaseContext(context);
}
我检查了文件中的数据是否正在正常读取,它是正确的并且符合预期。
请告诉我哪里出了问题!
更新!
我找到了解决办法。在这里,从自身读取并不是一个好主意。我尝试在执行期间在那里生成随机数,它工作得很好!
我试图通过自己为英特尔 FPGA 编写主机程序来使用英特尔的 FFT1D 内核。 Link到Intel的FFT1d可以查到here
我还在下面给出了我的主机程序,其中,我保存了一个文件(其中包含一些数据),我的任务是读取该数据,计算其 FFT 并打印其中的一些。是一个4K点的FFT
#include <stdio.h>
#include <stdlib.h>
#include "CL/opencl.h"
#include "AOCLUtils/aocl_utils.h"
#include <string.h>
#include "fft_config.h"
#define N (1<<LOGN) //Please check the FFT Sample Code for Ref (2 to the power 12 gives 4K points)
#define DATA_FILE "complex_input.data"
using namespace aocl_utils;
cl_platform_id platform = NULL;
cl_device_id device = NULL;
cl_command_queue queue0 = NULL;
cl_command_queue queue1 = NULL;
cl_context context = NULL;
cl_program program = NULL;
cl_kernel kernel0, kernel1;
cl_mem d_inData, d_outData;
cl_int err = 0;
typedef struct {
float x;
float y;
} float2;
//float2 h_outData[N], h_inData[N];
float2 *h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
float2 *h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
void init(); //Function that does the job of Querying Platform and Device, creating Context, Command Queues, Program and required Kernels to do the job.
void cleanup(); //Function that releases all the Created Contexts, Buffers etc, in order to finish the execution.
void read_data(); //Reads data from the complex numbers from .data file and fills in the float2 struct h_inData[].
int temp_value = 1;
int main()
{
// h_inData = (float2 *)alignedMalloc(sizeof(float2)*N);
//h_outData = (float2 *)alignedMalloc(sizeof(float2)*N);
int inverse = false;
int temp =1;
init();
read_data();
d_inData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float2)*N, NULL, &err);
checkError(err,"Failed to allocate Buffer for input array\n");
d_outData = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_CHANNEL_2_INTELFPGA, sizeof(float2)*N, NULL, &err);
checkError(err, "Failed to allocate the Buffer for output\n");
//WE FINISH THE FETCH KERNEL
err = clEnqueueWriteBuffer(queue1,d_inData, CL_TRUE, 0, sizeof(float2)*N, h_inData, 0, NULL, NULL);
checkError(err,"Failed to Write the input Buffer\n");
err = clSetKernelArg(kernel1, 0, sizeof(cl_mem), (void *)&d_inData);
checkError(err, "Failed to set KerArg for Kernel1 - 0\n");
err = clSetKernelArg(kernel0, 0, sizeof(cl_mem), (void *)&d_outData);
checkError(err, "Failed to set KerArg for Kernel0 - 0\n");
err = clSetKernelArg(kernel0, 1, sizeof(cl_int), (void *)&temp_value);
checkError(err, "Failed to set KerArg for Kernel0 - 1\n");
err = clSetKernelArg(kernel0, 2, sizeof(cl_int), (void *)&inverse);
checkError(err, "Failed to set KerArg for Kernel0 - 2\n");
printf("FFT Initialization Complete!\n\n");
err = clEnqueueTask(queue0, kernel0, 0, NULL, NULL);
checkError(err, "Failed to Launch the Kernel for FFT\n");
size_t local_work_size = N/8;
size_t global_work_size = local_work_size * 1; //Coz the number of Iterations is just 1
err = clEnqueueNDRangeKernel(queue1, kernel1, 1, NULL, &local_work_size, &global_work_size, 0, NULL, NULL);
checkError(err, "Failed to launch the Fetch Kernel\n");
err = clFinish(queue0);
checkError(err, "Failed to finish FFT\n");
err = clFinish(queue1);
checkError(err, "Failed to finish Fetch kernel\n");
err = clEnqueueReadBuffer(queue0, d_outData, CL_TRUE, 0, sizeof(float2)*N, h_outData, 0, NULL, NULL);
checkError(err, "Failed to Read back the Buffer output\n");
printf("FFT is Complete!\n\n");
printf("Printing some of the values, just to make sure they are non-zero\n\n");
for(int ii=100;ii<125;ii++)
{
printf("%f + %f j -> %f + %f j\n",h_inData[ii].x,h_inData[ii].y,h_outData[ii].x,h_outData[ii].y);
}
printf("\n\n");
cleanup();
return 0;
}
void read_data()
{
size_t sourceSize;
float* temp;
FILE *fp = fopen(DATA_FILE,"r");
if(fp==NULL)
{
printf("Could not find the Random Data File! Exiting!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
sourceSize=ftell(fp);
rewind(fp);
temp = (float *)alignedMalloc(sourceSize);
fread(temp, sizeof(float),sourceSize,fp);
fclose(fp);
for(int i=0;i<N;i++)
{
h_inData[i].x = temp[2*i];
h_inData[i].y = temp[(2*i)+1];
}
}
void init()
{
platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
if(platform == NULL)
{
printf("Could not find the Platform\n");
exit(1);
}
scoped_array<cl_device_id> devices;
cl_uint num_devices;
devices.reset(getDevices(platform, CL_DEVICE_TYPE_ACCELERATOR, &num_devices));
device = devices[0];
context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &err);
checkError(err, "Failed to create Context\n");
queue0 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue0\n");
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
checkError(err, "Failed to create Command Queue1\n");
program = createProgramFromBinary(context, "bin/fft1d.aocx", &device, 1);
err = clBuildProgram(program, 1, &device, "", NULL, NULL);
checkError(err, "Failed to Build Program\n");
kernel0 = clCreateKernel(program, "fft1d", &err);
checkError(err,"Could not Create Kernel0\n");
kernel1 = clCreateKernel(program, "fetch", &err);
checkError(err, "Could not Create Kernel1\n");
printf("Finished with the Initial Setup!\n");
}
void cleanup()
{
if(kernel0)
clReleaseKernel(kernel0);
if(kernel1)
clReleaseKernel(kernel1);
if(program)
clReleaseProgram(program);
if(queue0)
clReleaseCommandQueue(queue0);
if(queue1)
clReleaseCommandQueue(queue1);
if(d_inData)
clReleaseMemObject(d_inData);
if(d_outData)
clReleaseMemObject(d_outData);
if(context)
clReleaseContext(context);
}
我检查了文件中的数据是否正在正常读取,它是正确的并且符合预期。
请告诉我哪里出了问题!
更新!
我找到了解决办法。在这里,从自身读取并不是一个好主意。我尝试在执行期间在那里生成随机数,它工作得很好!