使用 OpenCL 处理 3D 数组,以及带有无效操作数错误的程序构建错误
Dealing with 3D array with OpenCL, and program build error with invalid operand error
我正在编写这段 OpenCL 代码,它使用 leapfrog 方案求解平流方程。我想我已经正确设置了主机代码和内核代码,但我在内核编译期间得到 CL_BUILD_PROGRAM_FAILURE。
我确实查看了内核编译日志,这就是我得到的结果
/tmp/OCL114018T1.cl:72:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
~~~~~ ^ ~
/tmp/OCL114018T1.cl:73:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
~~~~~ ^ ~
/tmp/OCL114018T1.cl:74:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
~~~~~ ^ ~
/tmp/OCL114018T1.cl:76:32: error: passing '__global float *' to parameter of type 'float *' changes address space of pointer
pbndry(x_siz,y_siz,z_siz,in_p_tf);
^~~~~~~
/tmp/OCL114018T1.cl:1:62: note: passing argument to parameter 'in_arr' here
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
^
4 errors generated.
error: Clang front-end compilation failed!
Frontend phase failed compilation.
Error: Compiling CL to IR
在我看来 u_vel
和 C
都是 float
所以应该不是问题。我在这里做错了什么?
下面是主机代码和内核代码。
主机代码
#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include "cl_err.h"
// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define fname "/home/rangke/temp/leap3d.nc"
// Variable sizes and dimensions (constants)
#define ndims 4
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
int main()
{
int i,j,k;
int Nx = 128,
Ny = 128,
Nz = 16,
Nt = 1000;
int *p_nx = &Nx,
*p_ny = &Ny,
*p_nz = &Nz,
*p_nt = &Nt;
float u = 0.0,
v = 5.0,
w = 0.0,
dtdl = 0.01;
float *p_u = &u,
*p_v = &v,
*p_w = &w,
*p_dtdl = &dtdl;
// p_tf : p at future
// p_tn : p at now
// p_tp : p at past
float q_tf[Nz+2][Ny+2][Nx+2];
float q_tn[Nz+2][Ny+2][Nx+2];
float q_tp[Nz+2][Ny+2][Nx+2];
float (*p_tf)[Ny+2][Nx+2] = q_tf;
float (*p_tn)[Ny+2][Nx+2] = q_tn;
float (*p_tp)[Ny+2][Nx+2] = q_tp;
size_t p_siz = sizeof(float) * (Nx+2) * (Ny+2) * (Nz+2);
size_t n_siz = sizeof(int) * 1,
c_siz = sizeof(float) * 1;
int ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int dimids[ndims];
size_t start[ndims], count[ndims];
// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
err(retval);
// Define dimensions
if ((retval = nc_def_dim(ncid, "z", Nz+2, &z_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "y", Ny+2, &y_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "x", Nx+2, &x_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
err(retval);
// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;
// Variable for writing netCDF data one timestep at a time
count[0] = 1; // For time dimension : 1 timestep
count[1] = Nz+2; // For z : write everything
count[2] = Ny+2; // For y : write everything
count[3] = Nx+2; // For x : write everything
start[1] = 0; // For z : don't do anything
start[2] = 0; // For y : don't do anything
start[3] = 0; // For x : don't do anything
printf("line 231\n");
if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
err(retval);
if ((retval = nc_enddef(ncid)))
err(retval);
data_init(Nx,Ny,Nz,(float*)p_tf);
data_init(Nx,Ny,Nz,(float*)p_tn);
data_init(Nx,Ny,Nz,(float*)p_tp);
// for(i=1;i<123;i++)
// printf("",p_tf[])
// Euler scheme for the first time step
for(k=1;k<Nz+1;k++)
for(j=1;j<Ny+1;j++)
for(i=1;i<Nx+1;i++)
{
p_tf[k][j][i] = p_tn[k][j][i]
- u * dtdl * (p_tn[k][j][i] - p_tn[k][j][i-1])
- v * dtdl * (p_tn[k][j][i] - p_tn[k][j-1][i])
- w * dtdl * (p_tn[k][j][i] - p_tn[k-1][j][i]);
}
pbndry(Nx,Ny,Nz,(float*)p_tf);
p_tp = p_tn;
p_tn = p_tf;
start[0] = 0;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
// OpenCL part //
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
printf("line 299\n");
// for(i=0;i<numDevices;i++)
// {
// status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
//
// printf("Device Name #%d: %s\n", i, name_data);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(comp_units), &comp_units, NULL);
//
// printf("Max Work-Group %d\n", comp_units);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
//
// printf("Double FP config = %llu, Support? = %d\n", cfg, status);
// }
printf("line 313\n");
// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
printf("line 317\n");
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("line 323\n");
cl_mem buf_p_tf = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
cl_mem buf_p_tn = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
cl_mem buf_p_tp = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
cl_mem buf_nx = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_ny = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_nz = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_nt = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_u = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_v = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_w = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_c = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
printf("line 335\n");
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, p_tf ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tn , CL_FALSE, 0, p_siz, p_tn ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tp , CL_FALSE, 0, p_siz, p_tp ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nx , CL_FALSE, 0, n_siz, p_nx ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_ny , CL_FALSE, 0, n_siz, p_ny ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nz , CL_FALSE, 0, n_siz, p_nz ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nt , CL_FALSE, 0, n_siz, p_nt ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_u , CL_FALSE, 0, c_siz, p_u ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_v , CL_FALSE, 0, c_siz, p_v ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_w , CL_FALSE, 0, c_siz, p_w ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_c , CL_FALSE, 0, c_siz, p_dtdl,0, NULL, NULL);
printf("line 348\n");
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog.cl","r");
printf("line 357\n");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '[=11=]';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);
printf("line 366\n");
program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
printf("line 370\n");
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if(status != CL_SUCCESS)
{
printf("Code : %d\n",status);
printf("Program 1 %s\n",getErrorString(status));
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
}
// Create a kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "leapfrog3d", &status);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &buf_nx );
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &buf_ny );
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &buf_nz );
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nt );
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_p_tf);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_p_tn);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buf_p_tp);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &buf_u );
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &buf_v );
status = clSetKernelArg(kernel, 9, sizeof(cl_mem), &buf_w );
status = clSetKernelArg(kernel,10, sizeof(cl_mem), &buf_c );
// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t glbworksiz[3] = {Nx,Ny,Nz};
printf("\nLine 395\n");
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 3, NULL, glbworksiz, NULL, 0, NULL, NULL);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("\nLine 401\n");
// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tf, CL_TRUE, 0, p_siz, p_tf, 0, NULL, NULL);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("\nLine 407\n");
start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
if ((retval = nc_close(ncid)))
err(retval);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseMemObject(buf_nt);
clReleaseMemObject(buf_u);
clReleaseMemObject(buf_v);
clReleaseMemObject(buf_w);
clReleaseMemObject(buf_c);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
printf("\nDone. . .\n");
return 0;
}
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
int i_min = 50,
i_max = 70,
j_min = 50,
j_max = 70;
for(k=0;k<in_z_siz+2;k++)
for(j=0;j<in_y_siz+2;j++)
for(i=0;i<in_x_siz+2;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 0.0;
for(k=1;k<in_z_siz+1;k++)
for(j=j_min;j<j_max;j++)
for(i=i_min;i<i_max;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 3.0;
}
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
内核代码
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
kernel void leapfrog3d(
const int x_siz,
const int y_siz,
const int z_siz,
const int t_siz,
global float *in_p_tf,
global float *in_p_tn,
global float *in_p_tp,
global float *u_vel,
global float *v_vel,
global float *w_vel,
global float *C
)
{
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;
for(int t=1;t<t_siz;t++)
{
idx0 = i + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_i0 = (i+1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_j0 = i + (j+1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_k0 = i + j * (x_siz+2) + (k+1) * (x_siz+2) * (y_siz+2);
idx_i1 = (i-1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_j1 = i + (j-1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_k1 = i + j * (x_siz+2) + (k-1) * (x_siz+2) * (y_siz+2);
in_p_tf[idx0] = in_p_tp[idx0]
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
pbndry(x_siz,y_siz,z_siz,in_p_tf);
in_p_tp = in_p_tn;
in_p_tn = in_p_tf;
}
}
两件事:
C
是一个数组或指针,但您访问它时就好像它是一个标量值一样。使用 C[some_index]
访问数组元素。如果只是一个常量,使用(*C)
或C[0]
.
x_siz
/y_siz
/z_siz
/t_siz
都在 global
内存中 space 因为它们是内核参数,无论是否你明确地写 global const int x_siz
或 const int x_siz
。您需要创建 private
个内核变量,将它们设置为全局变量,并将私有变量传递给函数,因为默认情况下函数参数是私有的。所以在内核中,创建一个变量 int x_siz_private = x_siz;
并将其传递给函数调用。默认情况下,内核中声明的变量位于私有内存 space 中,因此您无需显式写入 private
。在汇编中,这对应于 ld
(从全局内存加载到寄存器)指令。
我正在编写这段 OpenCL 代码,它使用 leapfrog 方案求解平流方程。我想我已经正确设置了主机代码和内核代码,但我在内核编译期间得到 CL_BUILD_PROGRAM_FAILURE。
我确实查看了内核编译日志,这就是我得到的结果
/tmp/OCL114018T1.cl:72:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
~~~~~ ^ ~
/tmp/OCL114018T1.cl:73:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
~~~~~ ^ ~
/tmp/OCL114018T1.cl:74:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
~~~~~ ^ ~
/tmp/OCL114018T1.cl:76:32: error: passing '__global float *' to parameter of type 'float *' changes address space of pointer
pbndry(x_siz,y_siz,z_siz,in_p_tf);
^~~~~~~
/tmp/OCL114018T1.cl:1:62: note: passing argument to parameter 'in_arr' here
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
^
4 errors generated.
error: Clang front-end compilation failed!
Frontend phase failed compilation.
Error: Compiling CL to IR
在我看来 u_vel
和 C
都是 float
所以应该不是问题。我在这里做错了什么?
下面是主机代码和内核代码。
主机代码
#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include "cl_err.h"
// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}
#define fname "/home/rangke/temp/leap3d.nc"
// Variable sizes and dimensions (constants)
#define ndims 4
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
int main()
{
int i,j,k;
int Nx = 128,
Ny = 128,
Nz = 16,
Nt = 1000;
int *p_nx = &Nx,
*p_ny = &Ny,
*p_nz = &Nz,
*p_nt = &Nt;
float u = 0.0,
v = 5.0,
w = 0.0,
dtdl = 0.01;
float *p_u = &u,
*p_v = &v,
*p_w = &w,
*p_dtdl = &dtdl;
// p_tf : p at future
// p_tn : p at now
// p_tp : p at past
float q_tf[Nz+2][Ny+2][Nx+2];
float q_tn[Nz+2][Ny+2][Nx+2];
float q_tp[Nz+2][Ny+2][Nx+2];
float (*p_tf)[Ny+2][Nx+2] = q_tf;
float (*p_tn)[Ny+2][Nx+2] = q_tn;
float (*p_tp)[Ny+2][Nx+2] = q_tp;
size_t p_siz = sizeof(float) * (Nx+2) * (Ny+2) * (Nz+2);
size_t n_siz = sizeof(int) * 1,
c_siz = sizeof(float) * 1;
int ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
int dimids[ndims];
size_t start[ndims], count[ndims];
// netCDF file operation
// Creating netCDF file
if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
err(retval);
// Define dimensions
if ((retval = nc_def_dim(ncid, "z", Nz+2, &z_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "y", Ny+2, &y_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "x", Nx+2, &x_dimid)))
err(retval);
if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
err(retval);
// Dimension ids
dimids[0] = t_dimid;
dimids[1] = z_dimid;
dimids[2] = y_dimid;
dimids[3] = x_dimid;
// Variable for writing netCDF data one timestep at a time
count[0] = 1; // For time dimension : 1 timestep
count[1] = Nz+2; // For z : write everything
count[2] = Ny+2; // For y : write everything
count[3] = Nx+2; // For x : write everything
start[1] = 0; // For z : don't do anything
start[2] = 0; // For y : don't do anything
start[3] = 0; // For x : don't do anything
printf("line 231\n");
if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
err(retval);
if ((retval = nc_enddef(ncid)))
err(retval);
data_init(Nx,Ny,Nz,(float*)p_tf);
data_init(Nx,Ny,Nz,(float*)p_tn);
data_init(Nx,Ny,Nz,(float*)p_tp);
// for(i=1;i<123;i++)
// printf("",p_tf[])
// Euler scheme for the first time step
for(k=1;k<Nz+1;k++)
for(j=1;j<Ny+1;j++)
for(i=1;i<Nx+1;i++)
{
p_tf[k][j][i] = p_tn[k][j][i]
- u * dtdl * (p_tn[k][j][i] - p_tn[k][j][i-1])
- v * dtdl * (p_tn[k][j][i] - p_tn[k][j-1][i])
- w * dtdl * (p_tn[k][j][i] - p_tn[k-1][j][i]);
}
pbndry(Nx,Ny,Nz,(float*)p_tf);
p_tp = p_tn;
p_tn = p_tf;
start[0] = 0;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
// OpenCL part //
// Use this to check the output of each API call
cl_int status;
// Retrieve the number of Platforms
cl_uint numPlatforms = 0;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each Platform
cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
// Fill in the Platforms
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Retrieve the number of Devices
cl_uint numDevices = 0;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
// Allocate enough spaces for each Devices
char name_data[100];
int *comp_units;
cl_device_fp_config cfg;
cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
// Fill in the Devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
printf("line 299\n");
// for(i=0;i<numDevices;i++)
// {
// status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
//
// printf("Device Name #%d: %s\n", i, name_data);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(comp_units), &comp_units, NULL);
//
// printf("Max Work-Group %d\n", comp_units);
// status = clGetDeviceInfo(devices[i], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
//
// printf("Double FP config = %llu, Support? = %d\n", cfg, status);
// }
printf("line 313\n");
// Create a context and associate it with the devices
cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
printf("line 317\n");
// Create a command queue and associate it with the devices
cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("line 323\n");
cl_mem buf_p_tf = clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
cl_mem buf_p_tn = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
cl_mem buf_p_tp = clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
cl_mem buf_nx = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_ny = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_nz = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_nt = clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
cl_mem buf_u = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_v = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_w = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
cl_mem buf_c = clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
printf("line 335\n");
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tf , CL_FALSE, 0, p_siz, p_tf ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tn , CL_FALSE, 0, p_siz, p_tn ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_p_tp , CL_FALSE, 0, p_siz, p_tp ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nx , CL_FALSE, 0, n_siz, p_nx ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_ny , CL_FALSE, 0, n_siz, p_ny ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nz , CL_FALSE, 0, n_siz, p_nz ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_nt , CL_FALSE, 0, n_siz, p_nt ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_u , CL_FALSE, 0, c_siz, p_u ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_v , CL_FALSE, 0, c_siz, p_v ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_w , CL_FALSE, 0, c_siz, p_w ,0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buf_c , CL_FALSE, 0, c_siz, p_dtdl,0, NULL, NULL);
printf("line 348\n");
// Create Program with the source code
cl_program program = NULL;
size_t program_size;
char *program_source;
FILE *program_handle = fopen("leapfrog.cl","r");
printf("line 357\n");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_source = (char*)malloc(program_size+1);
program_source[program_size] = '[=11=]';
fread(program_source, sizeof(char), program_size, program_handle);
fclose(program_handle);
printf("line 366\n");
program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);
printf("line 370\n");
// Compile the Program for the Device
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
if(status != CL_SUCCESS)
{
printf("Code : %d\n",status);
printf("Program 1 %s\n",getErrorString(status));
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
}
// Create a kernel
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "leapfrog3d", &status);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &buf_nx );
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &buf_ny );
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &buf_nz );
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nt );
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_p_tf);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_p_tn);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buf_p_tp);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &buf_u );
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &buf_v );
status = clSetKernelArg(kernel, 9, sizeof(cl_mem), &buf_w );
status = clSetKernelArg(kernel,10, sizeof(cl_mem), &buf_c );
// Define index space (global work size) of work items for execution
// A workgroup size (local work size) is not required, but can be used
size_t glbworksiz[3] = {Nx,Ny,Nz};
printf("\nLine 395\n");
// Execute the kernel for execution
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 3, NULL, glbworksiz, NULL, 0, NULL, NULL);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("\nLine 401\n");
// Read the Device output buffer to the host output array
status = clEnqueueReadBuffer(cmdQueue, buf_p_tf, CL_TRUE, 0, p_siz, p_tf, 0, NULL, NULL);
if(status != CL_SUCCESS)
printf("%s\n",getErrorString(status));
printf("\nLine 407\n");
start[0] = 1;
if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
err(retval);
if ((retval = nc_close(ncid)))
err(retval);
clReleaseMemObject(buf_p_tf);
clReleaseMemObject(buf_p_tn);
clReleaseMemObject(buf_p_tp);
clReleaseMemObject(buf_nx);
clReleaseMemObject(buf_ny);
clReleaseMemObject(buf_nz);
clReleaseMemObject(buf_nt);
clReleaseMemObject(buf_u);
clReleaseMemObject(buf_v);
clReleaseMemObject(buf_w);
clReleaseMemObject(buf_c);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
printf("\nDone. . .\n");
return 0;
}
void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
int i_min = 50,
i_max = 70,
j_min = 50,
j_max = 70;
for(k=0;k<in_z_siz+2;k++)
for(j=0;j<in_y_siz+2;j++)
for(i=0;i<in_x_siz+2;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 0.0;
for(k=1;k<in_z_siz+1;k++)
for(j=j_min;j<j_max;j++)
for(i=i_min;i<i_max;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 3.0;
}
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
内核代码
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
int i,j,k;
// Periodic boundary
// x-direction
for(k=1;k<in_z_siz+1;k++)
for(j=1;j<in_y_siz+1;j++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];
// y-direction
for(k=1;k<in_z_siz+1;k++)
for(i=1;i<in_x_siz+1;i++)
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];
// z-direction
for(j=1;j<in_y_siz+1;j++)
for(i=1;i<in_x_siz+1;i++)
in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}
kernel void leapfrog3d(
const int x_siz,
const int y_siz,
const int z_siz,
const int t_siz,
global float *in_p_tf,
global float *in_p_tn,
global float *in_p_tp,
global float *u_vel,
global float *v_vel,
global float *w_vel,
global float *C
)
{
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;
for(int t=1;t<t_siz;t++)
{
idx0 = i + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_i0 = (i+1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_j0 = i + (j+1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_k0 = i + j * (x_siz+2) + (k+1) * (x_siz+2) * (y_siz+2);
idx_i1 = (i-1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_j1 = i + (j-1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
idx_k1 = i + j * (x_siz+2) + (k-1) * (x_siz+2) * (y_siz+2);
in_p_tf[idx0] = in_p_tp[idx0]
- u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
- v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
- w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
pbndry(x_siz,y_siz,z_siz,in_p_tf);
in_p_tp = in_p_tn;
in_p_tn = in_p_tf;
}
}
两件事:
C
是一个数组或指针,但您访问它时就好像它是一个标量值一样。使用C[some_index]
访问数组元素。如果只是一个常量,使用(*C)
或C[0]
.x_siz
/y_siz
/z_siz
/t_siz
都在global
内存中 space 因为它们是内核参数,无论是否你明确地写global const int x_siz
或const int x_siz
。您需要创建private
个内核变量,将它们设置为全局变量,并将私有变量传递给函数,因为默认情况下函数参数是私有的。所以在内核中,创建一个变量int x_siz_private = x_siz;
并将其传递给函数调用。默认情况下,内核中声明的变量位于私有内存 space 中,因此您无需显式写入private
。在汇编中,这对应于ld
(从全局内存加载到寄存器)指令。