如何使用 OpenCL 在多 GPU 环境中唯一标识一个 GPU?

How to uniquely identify a GPU in a multi-GPU environment using OpenCL?

我正在学习 OpenCL 并且已经编写了初步代码来查询机器并找出与之关联的平台和设备。

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

int main(int argc,char** argv){

printf("Let's see what devices are there in this Node\n\n");

cl_int errNum,errCPU,errGPU;
cl_uint numPlatforms;
cl_platform_id *platformIds;
cl_context context=NULL;
char dname[500];
int i,dc,dg;
cl_device_id *cpuDevices,*gpuDevices;
cl_uint numCPUDevices,numGPUDevices,entries;
cl_ulong long_entries;
size_t p_size;

errNum = clGetPlatformIDs(0,NULL,&numPlatforms);

if(errNum==CL_SUCCESS){ printf("Number of Platforms on this Node: %d\n\n",numPlatforms); }
else{ printf("Error:Failure in clGetPlatformIDs,error code=%d\n",errNum); }

platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id)*numPlatforms);
errNum = clGetPlatformIDs(numPlatforms,platformIds,NULL);

if(errNum==CL_SUCCESS){

for(i=0;i<numPlatforms;i++){

   printf("Platform Information on %d Platform\n",i+1);
   /*Obtain information about platform*/
   clGetPlatformInfo(platformIds[i],CL_PLATFORM_NAME,500,dname,NULL);
   printf("\tCL_PLATFORM_NAME = %s\n",dname);
   clGetPlatformInfo(platformIds[i],CL_PLATFORM_VERSION,500,dname,NULL);
   printf("\tCL_PLATFORM_VERSION = %s\n",dname);

   /*obtain list of devices available on platform*/
   clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,0,NULL,&numCPUDevices);
   printf("\t%d CPU devices found\n",numCPUDevices);
   clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,0,NULL,&numGPUDevices);
   printf("\t%d GPU devices found\n",numGPUDevices);       
   cpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numCPUDevices);
   gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
   printf("\tPrinting devices Information\n");

   if(numCPUDevices>0){
    for(dc=0;dc<numCPUDevices;dc++){
     printf("\t\tPrinting CPU Devices Information\n");
     errCPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_CPU,1,&cpuDevices[dc],NULL);
     if(errCPU==CL_SUCCESS){
        printf("\t\tDevice Id is %d\n",cpuDevices[dc]);
        printf("\t\tDevice Information of %d device on %d platform\n",dc+1,i+1);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_NAME,500,dname,NULL);
        printf("\t\tDevice # %d name = %s\n",dc,dname);

        clGetDeviceInfo(cpuDevices[dc],CL_DRIVER_VERSION,500,dname,NULL);
        printf("\t\tDriver version = %s\n",dname);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory (MB) : \t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tLocal Memory (KB): \t%llu\n",long_entries/1024);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tMax clock (MHz) : \t%llu\n",long_entries);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
        printf("\t\tMax Work Group Size: \t%d\n",p_size);

        clGetDeviceInfo(cpuDevices[dc],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
        printf("\t\tNumber of parallel compute cores:\t%d\n",entries);

     }else{printf("\t\tError:Failure in clGetDeviceIds,error code = %d\n",errCPU);}
    }
   }else{printf("\t\tZero CPU Devices found\n");}
   /* query devices for information */
   if(numGPUDevices>0){
    for(dg = 0;dg<numGPUDevices;dg++ ){
     printf("\t\tPrinting GPU Devices Information\n");
     errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,1,&gpuDevices[dg],NULL);
     if(errGPU==CL_SUCCESS){

        printf("\t\tDevice Id is %d\n",gpuDevices[dg]);
        printf("\t\tDevice Information of %d device on %d platform\n",dg+1,i+1);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_NAME,500,dname,NULL);
        printf("\t\tDevice # %d name = %s\n",dg,dname);

        clGetDeviceInfo(gpuDevices[dg],CL_DRIVER_VERSION,500,dname,NULL);
        printf("\t\tDriver version = %s\n",dname);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory (MB) : \t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tGlobal Memory Cache (MB):\t%llu\n",long_entries/1024/1024);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tLocal Memory (KB): \t%llu\n",long_entries/1024);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
        printf("\t\tMax clock (MHz) : \t%llu\n",long_entries);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
        printf("\t\tMax Work Group Size: \t%d\n",p_size);

        clGetDeviceInfo(gpuDevices[dg],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
        printf("\t\tNumber of parallel compute cores:\t%d\n",entries);

     }else{printf("\t\tError:Failure in clGetDeviceIds,error code = %d\n\t\t,Or,This platform cannot interact with the GPUs.Check for the drivers\n",errGPU);}
    }
   }else{printf("\t\tZero GPU Devices found\n");}

 }

  }
  else{
    printf("Error:Failure in clGetPlatformIDs,error code = %d\n",errNum);
  }


  return 0;

}

当我执行它时,我得到了这样的示例输出

            Let's see what devices are there in this Node

            Number of Platforms on this Node: 2

            Platform Information on 1 Platform
                CL_PLATFORM_NAME = AMD Accelerated Parallel Processing
                CL_PLATFORM_VERSION = OpenCL 2.1 AMD-APP (2527.3)
                0 CPU devices found
                3 GPU devices found
                Printing devices Information
                    Zero CPU Devices found
                    Printing GPU Devices Information
                    Device Id is 13401120
                    Device Information of 1 device on 1 platform
                    Device # 0 name = gfx900
                    Driver version = 2527.3 (HSA1.1,HSAIL)
                    Global Memory (MB) :    16368
                    Global Memory Cache (MB):   0
                    Local Memory (KB):  64
                    Max clock (MHz) :   1500
                    Max Work Group Size:    256
                    Number of parallel compute cores:   64
                    Printing GPU Devices Information
                    Device Id is 13401120
                    Device Information of 2 device on 1 platform
                    Device # 1 name = gfx900
                    Driver version = 2527.3 (HSA1.1,HSAIL)
                    Global Memory (MB) :    16368
                    Global Memory Cache (MB):   0
                    Local Memory (KB):  64
                    Max clock (MHz) :   1500
                    Max Work Group Size:    256
                    Number of parallel compute cores:   64
                    Printing GPU Devices Information
                    Device Id is 13401120
                    Device Information of 3 device on 1 platform
                    Device # 2 name = gfx900
                    Driver version = 2527.3 (HSA1.1,HSAIL)
                    Global Memory (MB) :    16368
                    Global Memory Cache (MB):   0
                    Local Memory (KB):  64
                    Max clock (MHz) :   1500
                    Max Work Group Size:    256
                    Number of parallel compute cores:   64
            Platform Information on 2 Platform
                CL_PLATFORM_NAME = Intel(R) OpenCL
                CL_PLATFORM_VERSION = OpenCL 2.0 LINUX
                1 CPU devices found
                3 GPU devices found
                Printing devices Information
                    Printing CPU Devices Information
                    Device Id is 16035224
                    Device Information of 1 device on 2 platform
                    Device # 0 name = Intel(R) Xeon(R) Platinum 8164 CPU @ 2.00GHz
                    Driver version = 1.2.0.37
                    Global Memory (MB) :    47782
                    Global Memory Cache (MB):   0
                    Local Memory (KB):  32
                    Max clock (MHz) :   2000
                    Max Work Group Size:    8192
                    Number of parallel compute cores:   52
                    Printing GPU Devices Information
                    Error:Failure in clGetDeviceIds,error code = -1
                    ,Or,This platform cannot interact with the GPUs.Check for the drivers
                    Printing GPU Devices Information
                    Error:Failure in clGetDeviceIds,error code = -1
                    ,Or,This platform cannot interact with the GPUs.Check for the drivers
                    Printing GPU Devices Information
                    Error:Failure in clGetDeviceIds,error code = -1
                    ,Or,This platform cannot interact with the GPUs.Check for the drivers

我的问题是:为什么三个 GPU 的设备 ID 都一样?如果我改用 %p 打印,得到的是十六进制的值。

一般来说,如何在给定的平台上唯一标识一个设备,以便我可以在那个特定的设备上运行内核?比如,在多 GPU 环境中,我想把带有特定数据的内核发送到某一个 GPU。

/* Pseudo-code summary of the loop in the question: every iteration asks
 * clGetDeviceIDs for exactly one entry (num_entries = 1) and stores the
 * result at gpuDevices[dg]. */
for (dg in gpuDevices)
    errGPU = clGetDeviceIDs(platformIds[i],CL_DEVICE_TYPE_GPU,1,&gpuDevices[dg],NULL);

这段代码每次只请求一个设备 ID(始终返回第一个设备),并把它写入 gpuDevices 中的不同位置。

应该把 clGetDeviceIDs 从设备循环中拿出来,改为在获取设备数量之后立即调用一次。这样就能一次性填充所有设备 ID。

gpuDevices = (cl_device_id *)malloc(sizeof(cl_device_id)*numGPUDevices);
/* Request all numGPUDevices IDs in one call so each array slot receives its own device. */
errGPU = clGetDeviceIDs(platformIds[i], CL_DEVICE_TYPE_GPU, numGPUDevices,  gpuDevices, NULL);