OpenCL:CL_OUT_OF_HOST_MEMORY on clCreateCommandQueueWithProperties(带有最小、完整和可验证的示例)

OpenCL: CL_OUT_OF_HOST_MEMORY on clCreateCommandQueueWithProperties (with Minimal, Complete, and Verifiable example)

我有一个 MSI Radeon R9 390X 8GB 显卡(名称为 "Hawaii",如下所示)。我在 Windows 10 桌面上安装了 OpenCL,我正在使用 Cygwin 编译和 运行 程序。

我正在尝试 运行 一个示例 OpenCL 程序,我从大学时代 class 保留下来,稍作修改。

它不会 运行 在我的显卡上。这是我得到的:

$ ./ex26.exe -v 30 40
Bw=30 Bn=40 n=1200
OpenCL Platform 0: AMD Accelerated Parallel Processing
 ----- OpenCL Device # 0: Hawaii-----
Gflops: 47.520000
Max Compute Units: 44
Max Clock Frequency: 1080
Total Memory of Device (bytes): 8589934592
Max Size of Memory Object Allocation (bytes): 4244635648
Max Work Group Size: 256

Fastest OpenCL Device: Hawaii
Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY
winnerPlatform: 140717488209200

您可以查看下面的代码以查看此错误语句打印出的位置。无论出于何种原因,clCreateCommandQueueWithProperties 正在返回 CL_OUT_OF_HOST_MEMORY。我不明白我的 CPU 端内存怎么会接近 运行 内存不足而无法运行。我真的不知道。特别是因为这个方法所做的就是创建队列。

事实上,如果我将 CL_DEVICE_TYPE_GPU 切换为 CL_DEVICE_TYPE_CPU,则程序在 CPU.

上执行时没有问题

这一切都在 .cpp 文件中。我真的找不到任何可以削减的东西来使 MCV 更小,因为它已经是一个例子,所以你很漂亮你看到的代码是否正是我所拥有的。

下面是所有代码:

#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <CL/opencl.h>
#include <windows.h>
#include <sys/time.h>

/*
 *  Return elapsed wall time since last call (seconds)
 */
static double t0=0;
float Elapsed(void)
{
#ifdef _WIN32
   //  Windows version of wall time
   LARGE_INTEGER tv,freq;
   QueryPerformanceCounter((LARGE_INTEGER*)&tv);
   QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
   double t = tv.QuadPart/(double)freq.QuadPart;
#else
   //  Unix/Linux/OSX version of wall time
   struct timeval tv;
   gettimeofday(&tv,NULL);
   double t = tv.tv_sec+1e-6*tv.tv_usec;
#endif
   float s = t-t0;
   t0 = t;
   return s;
}

/*
 *  Print message to stderr and exit
 */
void Fatal(const char* format , ...)
{
   va_list args;
   va_start(args,format);
   vfprintf(stderr,format,args);
   va_end(args);
   exit(1);
}

/*
 *  Initialize matrix with random values
 */
void RandomInit(float x[],const unsigned int n)
{
   for (unsigned int i=0;i<n*n;i++)
      x[i] = rand() / (float)RAND_MAX;
}

/*
 *  OpenCL notify callback (echo to stderr)
 */
void Notify(const char* errinfo,const void* private_info,size_t cb,void* user_data)
{
   fprintf(stderr,"%s\n",errinfo);
}


class ErrorReader {

public:

private:

};

/*
 *  Initialize fastest OpenCL device
 */
cl_device_id _DEV_ID;
cl_context _CONTEXT;
cl_command_queue _QUEUE;
int InitGPU(int verbose)
{
   cl_uint Nplat;
   cl_int  err;
   char name[1024];
   int  MaxGflops = -1;

   cl_platform_id winnerPlatform = 0;

   // Get platforms
   cl_platform_id platforms[8];
   if (clGetPlatformIDs(8, platforms, &Nplat)) Fatal("Cannot get number of OpenCL platforms\n");
   else if (Nplat<1) Fatal("No OpenCL platforms found\n");

   // Loop over platforms
   for (unsigned int platform = 0; platform < Nplat; platform++) {

      if (clGetPlatformInfo(platforms[platform], CL_PLATFORM_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL platform name\n");
      if (verbose) printf("OpenCL Platform %d: %s\n", platform, name);

      // Get GPU device IDs
      cl_uint Ndev;
      cl_device_id id[1024];
      if (clGetDeviceIDs(platforms[platform], CL_DEVICE_TYPE_GPU, 1024, id, &Ndev))
         Fatal("Cannot get number of OpenCL devices\n");
      else if (Ndev<1)
         Fatal("No OpenCL devices found\n");

      // Find the fastest device
      for (unsigned int devId = 0; devId < Ndev; devId++) {

         cl_uint compUnits, freq;
         cl_ulong memSize, maxAlloc;
         size_t maxWorkGrps;

         if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compUnits), &compUnits, NULL)) Fatal("Cannot get OpenCL device units\n");
         if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(freq), &freq, NULL)) Fatal("Cannot get OpenCL device frequency\n");
         if (clGetDeviceInfo(id[devId], CL_DEVICE_NAME, sizeof(name), name, NULL)) Fatal("Cannot get OpenCL device name\n");

         if (clGetDeviceInfo(id[devId], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(memSize), &memSize, NULL)) Fatal("Cannot get OpenCL memory size.\n");
         if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(memSize), &maxAlloc, NULL)) Fatal("Cannot get OpenCL memory size.\n");

         if (clGetDeviceInfo(id[devId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGrps), &maxWorkGrps, NULL)) Fatal("Cannot get OpenCL max work group size\n");

         int Gflops = compUnits * freq;

         if (verbose) printf(" ----- OpenCL Device # %d: %s-----\n"
           "Gflops: %f\n"
           "Max Compute Units: %d\n"
           "Max Clock Frequency: %d\n"
           "Total Memory of Device (bytes): %lu\n"
           "Max Size of Memory Object Allocation (bytes): %lu\n"
           "Max Work Group Size: %zu\n\n",
           devId,
           name,
           1e-3*Gflops,
           compUnits,
           freq,
           memSize,
           maxAlloc,
           maxWorkGrps);

         if (Gflops > MaxGflops)
         {
            _DEV_ID = id[devId];
            MaxGflops = Gflops;

            winnerPlatform = platforms[platform];
         }
      }
   }

   //  Print fastest device info
   if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
   printf("Fastest OpenCL Device: %s\n",name);

   //  Check thread count
   size_t mwgs;
   if (clGetDeviceInfo(_DEV_ID,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");

   printf("winnerPlatform: %zu", winnerPlatform);

   // cl_platform_id platform = NULL;
   // int retValue = GetPlatform(&platform, winnerPlatform, true);

   //  Create OpenCL _CONTEXT for fastest device
   // _CONTEXT = clCreateContext(0,1,&_DEV_ID,Notify,NULL,&err);
   cl_context_properties cps[3] =
   {
      CL_CONTEXT_PLATFORM,
      (cl_context_properties)winnerPlatform,
      (cl_context_properties)0
   };
   _CONTEXT = clCreateContextFromType(cps,
      CL_DEVICE_TYPE_GPU, NULL, NULL, &err);

   if (!_CONTEXT || err) Fatal("Cannot create OpenCL Context\n");

   cl_command_queue_properties *propers;

   cl_command_queue_properties prop = 0;
   //prop |= CL_QUEUE_PROFILING_ENABLE;
   //prop |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;

   propers = &prop;

   _QUEUE = clCreateCommandQueueWithProperties(_CONTEXT, _DEV_ID, propers, &err); //  Create OpenCL command _QUEUE for fastest device
   if (err) { 
      if (err == CL_INVALID_CONTEXT) Fatal("Cannot create OpenCL command cue: CL_INVALID_CONTEXT\n");
      else if (err == CL_INVALID_DEVICE) Fatal("Cannot create OpenCL command cue: CL_INVALID_DEVICE\n");
      else if (err == CL_INVALID_VALUE) Fatal("Cannot create OpenCL command cue: CL_INVALID_VALUE\n");
      else if (err == CL_INVALID_QUEUE_PROPERTIES) Fatal("Cannot create OpenCL command cue: CL_INVALID_QUEUE_PROPERTIES\n");
      else if (err == CL_OUT_OF_HOST_MEMORY) Fatal("Cannot create OpenCL command cue: CL_OUT_OF_HOST_MEMORY\n");
      else Fatal("Cannot create OpenCL command cue: ???????????? Unknown Error\n");
   } else if (!_QUEUE) {
      Fatal("Cannot create OpenCL command cue: NULL\n");
   }

   return mwgs;
} 

/*
 * C = A * B -- host
 */
void AxBh(float C[], const float A[], const float B[], unsigned int n)
{
   for (unsigned int i=0;i<n;i++)
      for (unsigned int j=0;j<n;j++)
      {
         double sum=0;
         for (unsigned int k=0;k<n;k++)
            sum += (double)A[i*n+k] * (double)B[k*n+j];
         C[i*n+j] = (float)sum;
      }
}

/*
* Compute one element of A * B
*/
const char* source =
  "__kernel void AxB(__global float C[],__global const float A[],__global const float B[],const unsigned int n)\n"
  "{\n"
  "   unsigned int j = get_global_id(0);\n"
  "   unsigned int i = get_global_id(1);\n"
  "   float sum =0;\n"
  "   for (int k=0;k<n;k++)\n"
  "      sum += A[i*n+k] * B[k*n+j];\n"
  "   C[i*n+j] = sum;\n"
  "}\n";

/*
 * C = A * B -- device
 */
void AxBd(float Ch[],float Ah[],float Bh[],const unsigned int Bw,const unsigned int Bn)
{
   //  Calculate matrix dimensions
   int n = Bw*Bn;
   int N = n*n*sizeof(float);

   // Allocate device memory and copy A&B from host to device
   cl_int  err;
   cl_mem Ad = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Ah, &err);
   if (err) Fatal("Cannot create and copy A from host to device\n");
   cl_mem Bd = clCreateBuffer(_CONTEXT, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_PERSISTENT_MEM_AMD, N, Bh, &err);
   if (err) Fatal("Cannot create and copy B from host to device\n");

   //  Allocate device memory for C on device
   cl_mem Cd = clCreateBuffer(_CONTEXT,CL_MEM_WRITE_ONLY,N,NULL,&err);
   if (err) Fatal("Cannot create C on device\n");

   //  Compile kernel
   cl_program prog = clCreateProgramWithSource(_CONTEXT,1,&source,0,&err);
   if (err) Fatal("Cannot create program\n");
   if (clBuildProgram(prog,0,NULL,NULL,NULL,NULL))
   {
      char log[1048576];
      if (clGetProgramBuildInfo(prog,_DEV_ID,CL_PROGRAM_BUILD_LOG,sizeof(log),log,NULL))
         Fatal("Cannot get build log\n");
      else
         Fatal("Cannot build program\n%s\n",log);
   }
   cl_kernel kernel = clCreateKernel(prog,"AxB",&err);
   if (err) Fatal("Cannot create kernel\n");

   //  Set parameters for kernel
   if (clSetKernelArg(kernel,0,sizeof(cl_mem),&Cd)) Fatal("Cannot set kernel parameter Cd\n");
   if (clSetKernelArg(kernel,1,sizeof(cl_mem),&Ad)) Fatal("Cannot set kernel parameter Ad\n");
   if (clSetKernelArg(kernel,2,sizeof(cl_mem),&Bd)) Fatal("Cannot set kernel parameter Bd\n");
   if (clSetKernelArg(kernel,3,sizeof(int),&n)) Fatal("Cannot set kernel parameter n\n");

   //  Run kernel
   size_t Global[2] = {(size_t)n, (size_t)n};
   size_t Local[2]  = {(size_t)Bw, (size_t)Bw};
   if (clEnqueueNDRangeKernel(_QUEUE,kernel,2,NULL,Global,Local,0,NULL,NULL)) Fatal("Cannot run kernel\n");

   //  Release kernel and program
   if (clReleaseKernel(kernel)) Fatal("Cannot release kernel\n");
   if (clReleaseProgram(prog)) Fatal("Cannot release program\n");

   // Copy C from device to host (block until done)
   if (clEnqueueReadBuffer(_QUEUE,Cd,CL_TRUE,0,N,Ch,0,NULL,NULL)) Fatal("Cannot copy C from device to host\n");

   //  Free device memory
   clReleaseMemObject(Ad);
   clReleaseMemObject(Bd);
   clReleaseMemObject(Cd);
}

/*
 *  main
 */
int main(int argc, char* argv[])
{
   //  Process options
   int opt;
   int verbose=0;
   while ((opt=getopt(argc,argv,"v"))!=-1)
   {
      if (opt=='v')
         verbose++;
      else
         Fatal("Usage: [-v] <block width> <number of blocks>\n");
   }
   argc -= optind;
   argv += optind;

   //  Get width and number of blocks
   if (argc!=2) Fatal("Usage: [-v] <block width> <number of blocks>\n");
   int Bw = atoi(argv[0]);
   if (Bw<1) Fatal("Block width out of range %d\n",Bw);
   int Bn = atoi(argv[1]);
   if (Bn<1) Fatal("Number of blocks out of range %d\n",Bn);
   //  Total width is block times number of blocks
   int n = Bw*Bn;
   int N = n*n*sizeof(float);
   printf("Bw=%d Bn=%d n=%d\n",Bw,Bn,n);

   //  Initialize GPU
   int Mw = InitGPU(verbose);
   if (Mw<Bw*Bw) Fatal("Thread count %d exceeds max work group size of %d\n",Bw*Bw,Mw);

   // Allocate host matrices A/B/C/R
   float* Ah = (float*)malloc(N);
   float* Bh = (float*)malloc(N);
   float* Ch = (float*)malloc(N);
   float* Rh = (float*)malloc(N);
   if (!Ah || !Bh || !Ch || !Rh) Fatal("Cannot allocate host memory\n");

   // Initialize A & B
   srand(9999);
   RandomInit(Ah,n);
   RandomInit(Bh,n);

   //  Compute R = AB on host
   Elapsed();
   AxBh(Rh,Ah,Bh,n);
   float Th = Elapsed();

   //  Compute C = AB on device
   Elapsed();
   AxBd(Ch,Ah,Bh,Bw,Bn);
   float Td = Elapsed();

   //  Compute difference between R and C
   double r2=0;
   for (int i=0;i<n*n;i++)
      r2 += fabs(Ch[i]-Rh[i]);
   r2 /= n*n;

   //  Free host memory
   free(Ah);
   free(Bh);
   free(Ch);
   free(Rh);

   //  Print results
   printf("Host   Time = %6.3f s\n",Th);
   printf("Device Time = %6.3f s\n",Td);
   printf("Speedup = %.1f\n",Th/Td);
   printf("Difference = %.2e\n",r2);

   //  Done
   return 0;
}

我编译它使用(你显然需要稍微改变一下):

g++ -Wall -o exMatrixMult -I"/cygdrive/c/Program Files (x86)/AMD APP SDK/3.0/include" -L"/cygdrive/c/Program Files (x86)/AMD APP SDK/3.0/lib/x86_64" exMatrixMult.cpp -lOpenCL

我的 "Hawaii" 显卡可以 运行 OpenCL SDK 附带的示例程序(在 "AMD APP SDK.0\samples\opencl\bin\x86_64" 中)。我花了大半个下午的时间看他们的源代码和我的有什么不同,目前还没有成功。

如果有用,发生错误的代码看起来像这样(仍然出现同样的问题):

   //  Print fastest device info
   if (clGetDeviceInfo(devid,CL_DEVICE_NAME,sizeof(name),name,NULL)) Fatal("Cannot get OpenCL device name\n");
   printf("Fastest OpenCL Device: %s\n",name);

   //  Check thread count
   size_t mwgs;
   if (clGetDeviceInfo(devid,CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(mwgs),&mwgs,NULL)) Fatal("Cannot get OpenCL max work group size\n");

   //  Create OpenCL context for fastest device
   context = clCreateContext(0,1,&devid,Notify,NULL,&err);
   if(!context || err) Fatal("Cannot create OpenCL context\n");

   //  Create OpenCL command queue for fastest device
   queue = clCreateCommandQueueWithProperties(context, devid, 0, &err);
   if (err) Fatal("Cannot create OpenCL command cue\n");

现在如果你没有类似的显卡可能无法验证问题。但是我不知道。

更新显卡驱动程序后问题消失了。