使用 OpenCL 处理 3D 数组,以及带有无效操作数错误的程序构建错误

Dealing with 3D array with OpenCL, and program build error with invalid operand error

我正在编写这段 OpenCL 代码,它使用 leapfrog 方案求解平流方程。我想我已经正确设置了主机代码和内核代码,但我在内核编译期间得到 CL_BUILD_PROGRAM_FAILURE。

我确实查看了内核编译日志,这就是我得到的结果

/tmp/OCL114018T1.cl:72:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
                   - u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
                     ~~~~~ ^ ~
/tmp/OCL114018T1.cl:73:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
                   - v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
                     ~~~~~ ^ ~
/tmp/OCL114018T1.cl:74:28: error: invalid operands to binary expression ('__global float *' and '__global float *')
                   - w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);
                     ~~~~~ ^ ~
/tmp/OCL114018T1.cl:76:32: error: passing '__global float *' to parameter of type 'float *' changes address space of pointer
      pbndry(x_siz,y_siz,z_siz,in_p_tf);
                               ^~~~~~~
/tmp/OCL114018T1.cl:1:62: note: passing argument to parameter 'in_arr' here
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
                                                             ^
4 errors generated.

error: Clang front-end compilation failed!
Frontend phase failed compilation.
Error: Compiling CL to IR

在我看来 u_velC 都是 float 所以应该不是问题。我在这里做错了什么?

下面是主机代码和内核代码。

主机代码

#include <stdio.h>
#include <stdlib.h>
#include <netcdf.h>

#define CL_TARGET_OPENCL_VERSION 120 
#include <CL/cl.h>
#include "cl_err.h"

// netCDF constants
#define err(e) {printf("Error: %s\n", nc_strerror(e)); return(2);}

#define fname "/home/rangke/temp/leap3d.nc"

// Variable sizes and dimensions (constants)
#define ndims 4


void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);
void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr);




int main()
{
   int      i,j,k;

   int      Nx       =  128,
            Ny       =  128,
            Nz       =  16,
            Nt       =  1000;
  
   int      *p_nx    =  &Nx,
            *p_ny    =  &Ny,
            *p_nz    =  &Nz,
            *p_nt    =  &Nt;

   float    u        =  0.0,
            v        =  5.0,
            w        =  0.0,
            dtdl     =  0.01;

   float    *p_u     =  &u,
            *p_v     =  &v,
            *p_w     =  &w,
            *p_dtdl  =  &dtdl;
   

   // p_tf : p at future
   // p_tn : p at now
   // p_tp : p at past
   float    q_tf[Nz+2][Ny+2][Nx+2];
   float    q_tn[Nz+2][Ny+2][Nx+2];
   float    q_tp[Nz+2][Ny+2][Nx+2];
   
   float    (*p_tf)[Ny+2][Nx+2] = q_tf;
   float    (*p_tn)[Ny+2][Nx+2] = q_tn;
   float    (*p_tp)[Ny+2][Nx+2] = q_tp;

   size_t   p_siz =  sizeof(float) * (Nx+2) * (Ny+2) * (Nz+2);
   size_t   n_siz =  sizeof(int) * 1,
            c_siz =  sizeof(float) * 1;


   int      ncid, retval, varid, x_dimid, y_dimid, z_dimid, t_dimid;
   int      dimids[ndims];
   size_t   start[ndims], count[ndims];


   // netCDF file operation
   // Creating netCDF file
   if ((retval = nc_create(fname, NC_CLOBBER, &ncid)))
      err(retval);

   // Define dimensions
   if ((retval = nc_def_dim(ncid, "z", Nz+2, &z_dimid)))
      err(retval);
   if ((retval = nc_def_dim(ncid, "y", Ny+2, &y_dimid)))
      err(retval);
   if ((retval = nc_def_dim(ncid, "x", Nx+2, &x_dimid)))
      err(retval);
   if ((retval = nc_def_dim(ncid, "t", NC_UNLIMITED, &t_dimid)))
      err(retval);


   // Dimension ids
   dimids[0] = t_dimid;
   dimids[1] = z_dimid;
   dimids[2] = y_dimid;
   dimids[3] = x_dimid;

   // Variable for writing netCDF data one timestep at a time
   count[0] = 1;       // For time dimension : 1 timestep
   count[1] = Nz+2;    // For z              : write everything
   count[2] = Ny+2;    // For y              : write everything
   count[3] = Nx+2;    // For x              : write everything
   
   start[1] = 0;       // For z              : don't do anything
   start[2] = 0;       // For y              : don't do anything
   start[3] = 0;       // For x              : don't do anything
   
   printf("line 231\n");

   if ((retval = nc_def_var(ncid, "data", NC_FLOAT, ndims, dimids, &varid)))
      err(retval);
   
   if ((retval = nc_enddef(ncid)))
      err(retval);


   data_init(Nx,Ny,Nz,(float*)p_tf); 
   data_init(Nx,Ny,Nz,(float*)p_tn); 
   data_init(Nx,Ny,Nz,(float*)p_tp); 

//   for(i=1;i<123;i++)
//      printf("",p_tf[])


   // Euler scheme for the first time step
   for(k=1;k<Nz+1;k++)
      for(j=1;j<Ny+1;j++)
         for(i=1;i<Nx+1;i++)
         {
            p_tf[k][j][i] = p_tn[k][j][i]
                           - u * dtdl * (p_tn[k][j][i] - p_tn[k][j][i-1])
                           - v * dtdl * (p_tn[k][j][i] - p_tn[k][j-1][i])
                           - w * dtdl * (p_tn[k][j][i] - p_tn[k-1][j][i]);
         }

   pbndry(Nx,Ny,Nz,(float*)p_tf);

   p_tp  =  p_tn;
   p_tn  =  p_tf;

   start[0] = 0;
   
   if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
      err(retval);



   // OpenCL part //

   // Use this to check the output of each API call
   cl_int status;

   // Retrieve the number of Platforms
   cl_uint numPlatforms = 0;
   status = clGetPlatformIDs(0, NULL, &numPlatforms);

   // Allocate enough space for each Platform
   cl_platform_id *platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));

   // Fill in the Platforms
   status = clGetPlatformIDs(numPlatforms, platforms, NULL);

   // Retrieve the number of Devices
   cl_uint numDevices = 0;
   status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);

   // Allocate enough spaces for each Devices
   char name_data[100];
   int *comp_units;
   cl_device_fp_config cfg;
   cl_device_id *devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));

   // Fill in the Devices
   status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);

   printf("line 299\n");
//   for(i=0;i<numDevices;i++)
//   {
//      status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name_data), name_data, NULL);
//
//      printf("Device Name #%d: %s\n", i, name_data);
//      status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(comp_units), &comp_units, NULL);
// 
//      printf("Max Work-Group %d\n", comp_units);
//      status = clGetDeviceInfo(devices[i], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cfg), &cfg, NULL);
//
//      printf("Double FP config = %llu, Support? = %d\n", cfg, status);
//   }

   printf("line 313\n");
   // Create a context and associate it with the devices
   cl_context context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

   printf("line 317\n");
   // Create a command queue and associate it with the devices
   cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
   if(status != CL_SUCCESS)
      printf("%s\n",getErrorString(status));

   printf("line 323\n");
   cl_mem buf_p_tf    =  clCreateBuffer(context, CL_MEM_READ_WRITE, p_siz, NULL, &status);
   cl_mem buf_p_tn    =  clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
   cl_mem buf_p_tp    =  clCreateBuffer(context, CL_MEM_READ_ONLY , p_siz, NULL, &status);
   cl_mem buf_nx      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
   cl_mem buf_ny      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
   cl_mem buf_nz      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
   cl_mem buf_nt      =  clCreateBuffer(context, CL_MEM_READ_ONLY , n_siz, NULL, &status);
   cl_mem buf_u       =  clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
   cl_mem buf_v       =  clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
   cl_mem buf_w       =  clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);
   cl_mem buf_c       =  clCreateBuffer(context, CL_MEM_READ_ONLY , c_siz, NULL, &status);


   printf("line 335\n");
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tf  , CL_FALSE, 0, p_siz, p_tf  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tn  , CL_FALSE, 0, p_siz, p_tn  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_p_tp  , CL_FALSE, 0, p_siz, p_tp  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_nx    , CL_FALSE, 0, n_siz, p_nx  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_ny    , CL_FALSE, 0, n_siz, p_ny  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_nz    , CL_FALSE, 0, n_siz, p_nz  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_nt    , CL_FALSE, 0, n_siz, p_nt  ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_u     , CL_FALSE, 0, c_siz, p_u   ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_v     , CL_FALSE, 0, c_siz, p_v   ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_w     , CL_FALSE, 0, c_siz, p_w   ,0, NULL, NULL);
   status   =  clEnqueueWriteBuffer(cmdQueue, buf_c     , CL_FALSE, 0, c_siz, p_dtdl,0, NULL, NULL);

   printf("line 348\n");
  

   // Create Program with the source code
   cl_program program = NULL;
   size_t program_size;
   char *program_source;
   FILE *program_handle = fopen("leapfrog.cl","r");

   printf("line 357\n");
   fseek(program_handle, 0, SEEK_END);
   program_size = ftell(program_handle);
   rewind(program_handle);
   program_source = (char*)malloc(program_size+1);
   program_source[program_size] = '[=11=]';
   fread(program_source, sizeof(char), program_size, program_handle);
   fclose(program_handle);

   printf("line 366\n");
   program = clCreateProgramWithSource(context, 1, (const char**)&program_source, &program_size, &status);


   printf("line 370\n");
   // Compile the Program for the Device
   status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
   if(status != CL_SUCCESS)
   {
      printf("Code : %d\n",status);
      printf("Program 1 %s\n",getErrorString(status));

      size_t log_size;
      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

      char *log = (char *) malloc(log_size);

      clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

      printf("%s\n", log);
   }
   // Create a kernel
   cl_kernel kernel = NULL;
   kernel = clCreateKernel(program, "leapfrog3d", &status);
   if(status != CL_SUCCESS)
      printf("%s\n",getErrorString(status));


   // Associate the input and output buffers with the kernel
   status = clSetKernelArg(kernel, 0, sizeof(cl_int), &buf_nx  );
   status = clSetKernelArg(kernel, 1, sizeof(cl_int), &buf_ny  );
   status = clSetKernelArg(kernel, 2, sizeof(cl_int), &buf_nz  );
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_nt  );
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_p_tf);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buf_p_tn);
   status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buf_p_tp);
   status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &buf_u   );
   status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &buf_v   );
   status = clSetKernelArg(kernel, 9, sizeof(cl_mem), &buf_w   );
   status = clSetKernelArg(kernel,10, sizeof(cl_mem), &buf_c   );


   // Define index space (global work size) of work items for execution
   // A workgroup size (local work size) is not required, but can be used
   size_t glbworksiz[3] = {Nx,Ny,Nz};


   printf("\nLine 395\n");
   // Execute the kernel for execution
   status = clEnqueueNDRangeKernel(cmdQueue, kernel, 3, NULL, glbworksiz, NULL, 0, NULL, NULL);
   if(status != CL_SUCCESS)
      printf("%s\n",getErrorString(status));

   printf("\nLine 401\n");
   // Read the Device output buffer to the host output array
   status = clEnqueueReadBuffer(cmdQueue, buf_p_tf, CL_TRUE, 0, p_siz, p_tf, 0, NULL, NULL);
   if(status != CL_SUCCESS)
      printf("%s\n",getErrorString(status));

   printf("\nLine 407\n");

   start[0] = 1;
   
   if (retval = nc_put_vara_float(ncid, varid, start, count, &p_tf[0][0][0]))
      err(retval);

   if ((retval = nc_close(ncid)))
      err(retval);


   clReleaseMemObject(buf_p_tf);
   clReleaseMemObject(buf_p_tn);
   clReleaseMemObject(buf_p_tp);
   clReleaseMemObject(buf_nx);
   clReleaseMemObject(buf_ny);
   clReleaseMemObject(buf_nz);
   clReleaseMemObject(buf_nt);
   clReleaseMemObject(buf_u);
   clReleaseMemObject(buf_v);
   clReleaseMemObject(buf_w);
   clReleaseMemObject(buf_c);
   clReleaseContext(context); 
   clReleaseKernel(kernel);
   clReleaseProgram(program);
   clReleaseCommandQueue(cmdQueue);

   printf("\nDone. . .\n");

   return 0;                     
}                             


void data_init(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
   int i,j,k;

   int i_min   =  50,
       i_max   =  70,
       j_min   =  50,
       j_max   =  70;

   for(k=0;k<in_z_siz+2;k++)
      for(j=0;j<in_y_siz+2;j++)
         for(i=0;i<in_x_siz+2;i++)
            in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 0.0;

    for(k=1;k<in_z_siz+1;k++)
      for(j=j_min;j<j_max;j++)
         for(i=i_min;i<i_max;i++)
            in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) +i] = 3.0;

}


void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
   int i,j,k;

   // Periodic boundary
   // x-direction
   for(k=1;k<in_z_siz+1;k++)
      for(j=1;j<in_y_siz+1;j++)
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];

         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];


   // y-direction
   for(k=1;k<in_z_siz+1;k++)
      for(i=1;i<in_x_siz+1;i++)
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];

         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];


   // z-direction
   for(j=1;j<in_y_siz+1;j++)
      for(i=1;i<in_x_siz+1;i++)
         in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
         in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];

         in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
         in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];

}

内核代码

void pbndry(int in_x_siz, int in_y_siz, int in_z_siz, float *in_arr)
{
   int i,j,k;

   // Periodic boundary
   // x-direction
   for(k=1;k<in_z_siz+1;k++)
      for(j=1;j<in_y_siz+1;j++)
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 0] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + in_x_siz];

         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + (in_x_siz+1)] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + 1];


   // y-direction
   for(k=1;k<in_z_siz+1;k++)
      for(i=1;i<in_x_siz+1;i++)
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 0 * (in_x_siz+2) + i] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + in_y_siz * (in_x_siz+2) + i];

         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + (in_y_siz+1) * (in_x_siz+2) + i] =
         in_arr[k * (in_y_siz+2) * (in_x_siz+2) + 1 * (in_x_siz+2) + i];


   // z-direction
   for(j=1;j<in_y_siz+1;j++)
      for(i=1;i<in_x_siz+1;i++)
         in_arr[0 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
         in_arr[in_z_siz * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];

         in_arr[(in_z_siz+1) * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i] =
         in_arr[1 * (in_y_siz+2) * (in_x_siz+2) + j * (in_x_siz+2) + i];
}


kernel void leapfrog3d(
                        const  int   x_siz,
                        const  int   y_siz,
                        const  int   z_siz,
                        const  int   t_siz,
                        global float *in_p_tf,
                        global float *in_p_tn,
                        global float *in_p_tp,
                        global float *u_vel,
                        global float *v_vel,
                        global float *w_vel,
                        global float *C
                      )
{
   int i = get_global_id(0);
   int j = get_global_id(1);
   int k = get_global_id(2);

   int idx0, idx_i0, idx_i1, idx_j0, idx_j1, idx_k0, idx_k1;


   for(int t=1;t<t_siz;t++)
   {

      idx0     =  i + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);

      idx_i0   =  (i+1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
      idx_j0   =  i + (j+1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
      idx_k0   =  i + j * (x_siz+2) + (k+1) * (x_siz+2) * (y_siz+2);
      
      idx_i1   =  (i-1) + j * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
      idx_j1   =  i + (j-1) * (x_siz+2) + k * (x_siz+2) * (y_siz+2);
      idx_k1   =  i + j * (x_siz+2) + (k-1) * (x_siz+2) * (y_siz+2);

      in_p_tf[idx0] = in_p_tp[idx0] 
                   - u_vel * C * (in_p_tn[idx_i0] - in_p_tn[idx_i1])
                   - v_vel * C * (in_p_tn[idx_j0] - in_p_tn[idx_j1])
                   - w_vel * C * (in_p_tn[idx_k0] - in_p_tn[idx_k1]);

      pbndry(x_siz,y_siz,z_siz,in_p_tf);

      in_p_tp = in_p_tn;
      in_p_tn = in_p_tf;
   }
}

两件事:

  1. C 是一个数组或指针,但您访问它时就好像它是一个标量值一样。使用 C[some_index] 访问数组元素。如果只是一个常量,使用(*C)C[0].
  2. x_siz/y_siz/z_siz/t_siz 都在 global 内存中 space 因为它们是内核参数,无论是否你明确地写 global const int x_sizconst int x_siz。您需要创建 private 个内核变量,将它们设置为全局变量,并将私有变量传递给函数,因为默认情况下函数参数是私有的。所以在内核中,创建一个变量 int x_siz_private = x_siz; 并将其传递给函数调用。默认情况下,内核中声明的变量位于私有内存 space 中,因此您无需显式写入 private。在汇编中,这对应于 ld(从全局内存加载到寄存器)指令。