OpenCL 演示程序可以在一个系统上运行，但不能在另一个非常相似的 VirtualBox 系统上运行

Question

我正在尝试以下简单的 OpenCL 矢量加法程序（为简洁起见，我没有包括我的 printSystemInfo() 函数）：

// Vector addition demo similar to one from Oak Ridge lab:
// https://www.olcf.ornl.gov/tutorials/opencl-vector-addition/#vecAdd.c


#include <stdio.h>
#include <stdlib.h>

//To suppress warnings when using the deprecated clCreateCommandQueue of OpenCL v1.0:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS

#include <math.h>
#include <CL/opencl.h>

// Declaration of a printing helper whose definition is not shown in this Stack Overflow question.
// It takes the selected platform and device; its return-value convention is not visible here.
int printSystemInfo (cl_platform_id platform_id, cl_device_id device_id);

/*
 * OpenCL C source of the vecAdd kernel, handed to clCreateProgramWithSource
 * as a single NUL-terminated string.
 *
 * Each work item handles one element: c[id] = a[id] + b[id]. The "id < n"
 * guard inside the kernel skips work items whose global id is at or beyond n.
 *
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuations are needed between them.
 */
const char *kernelSource =
    "\n"
    "__kernel void vecAdd(  __global int *a,                       \n"
    "                       __global int *b,                       \n"
    "                       __global int *c,                       \n"
    "                       const unsigned int n)                    \n"
    "{                                                               \n"
    "    //Get our global thread ID                                  \n"
    "    int id = get_global_id(0);                                  \n"
    "                                                                \n"
    "    //Make sure we do not go out of bounds                      \n"
    "    if (id < n)                                                 \n"
    "        c[id] = a[id] + b[id];                                  \n"
    "}                                                               \n"
    "\n";

int main( int argc, char* argv[] )
{
    // Length of vectors
    unsigned int n = 10;

    // Host input vectors
    int *h_a;
    int *h_b;
    // Host output vector
    int *h_c;

    // Device input buffers
    cl_mem d_a;
    cl_mem d_b;
    // Device output buffer
    cl_mem d_c;

    cl_platform_id platform_id;        // OpenCL platform
    cl_device_id device_id;           // device ID
    cl_context context;               // context
    cl_command_queue queue;           // command queue
    cl_program program;               // program
    cl_kernel kernel;                 // kernel

    // Size, in bytes, of each vector
    size_t bytes = n*sizeof(int);

    // Allocate memory for each vector on host
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);

    // Initialize vectors on host
    int i;
    for( i = 0; i < n; i++ )
    {
        h_a[i] = i;
        h_b[i] = i+1;
    }

    size_t globalSize, localSize;
    cl_int err;

    // Number of work items in each local work group
    localSize = 64;

    // Number of total work items - localSize must be a divisor
    globalSize = ceil(n/(float)localSize)*localSize;

    // Bind to platform
    err = clGetPlatformIDs(1, &platform_id, NULL);

    // Get ID for the device
    //err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);

    // Call a local function that fetches and prints system info
    err = printSystemInfo (platform_id, device_id);

    // Create a context
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);

    // Create a command queue
    queue = clCreateCommandQueue(context, device_id, 0, &err);

    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1,
                            (const char **) & kernelSource, NULL, &err);

    // Build the program executable
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    // Create the compute kernel in the program we wish to run
    kernel = clCreateKernel(program, "vecAdd", &err);

    // Create the input and output arrays in device memory for our calculation
    d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
    d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);

    // Write our data set into the input array in device memory
    err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
                                   bytes, h_a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
                                   bytes, h_b, 0, NULL, NULL);

    // Set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
    err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);

    // Execute the kernel over the entire range of the data set
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                              0, NULL, NULL);

    // Wait for the command queue to get serviced before reading back results
    clFinish(queue);

    // Read the results from the device
    clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
                                bytes, h_c, 0, NULL, NULL );

    //Print vectors a, b and c=a+b
    for(i=0; i<n; i++)
        printf("a: %d  b: %d  c=a+b: %d \n", h_a[i], h_b[i], h_c[i] );

    // release OpenCL resources
    clReleaseMemObject(d_a);
    clReleaseMemObject(d_b);
    clReleaseMemObject(d_c);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    //release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

它在运行 Ubuntu 14.04 的 Amazon EC2 系统上可以正常工作……

ubuntu@ip-xxx:~/programs/OpenCL$ gcc ./cldemo.c ./printSystemInfo.c  -o ./cldemo -I/opt/intel/intel-opencl-1.2-5.0.0.43/opencl-1.2-sdk-5.0.0.43/include -L/opt/intel/intel-opencl-1.2-5.0.0.43/opencl-1.2-5.0.0.43/lib64 -lOpenCL -Wall -lm

ubuntu@ip-xxx:~/programs/OpenCL$ ./cldemo

OS name: Linux
Release:3.13.0-52-generic
Version:#86-Ubuntu SMP Mon May 4 04:32:59 UTC 2015
Machine:x86_64

Platform name = Intel(R) OpenCL
Platform version = OpenCL 1.2 LINUX
Platform extensions = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64

Device name =       Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Device version = OpenCL 1.2 (Build 43)
Device global memory size= 1040740352
Device extensions= cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64

a: 0  b: 1  c=a+b: 1
a: 1  b: 2  c=a+b: 3
a: 2  b: 3  c=a+b: 5
a: 3  b: 4  c=a+b: 7
a: 4  b: 5  c=a+b: 9
a: 5  b: 6  c=a+b: 11
a: 6  b: 7  c=a+b: 13
a: 7  b: 8  c=a+b: 15
a: 8  b: 9  c=a+b: 17
a: 9  b: 10  c=a+b: 19

但在家里的类似系统上却不行（同样是 Ubuntu 14.04，不过是作为 Windows 7 主机中的 Vagrant VirtualBox 虚拟机运行）：

OS name: Linux 
Release:3.13.0-53-generic 
Version:#89-Ubuntu SMP Wed May 20 10:34:39 UTC 2015 
Machine:x86_64 

Platform name = Intel(R) OpenCL 
Platform version = OpenCL 1.2 LINUX 
Platform extensions = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir  

Device name =        Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz 
Device version = OpenCL 1.2 (Build 43) 
Device global memory size= 3156189184
Device extensions= cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir  

a: 0  b: 1  c=a+b: 0 
a: 1  b: 2  c=a+b: 0 
a: 2  b: 3  c=a+b: 0 
a: 3  b: 4  c=a+b: 0 
a: 4  b: 5  c=a+b: 0 
a: 5  b: 6  c=a+b: 0 
a: 6  b: 7  c=a+b: 0 
a: 7  b: 8  c=a+b: 0 
a: 8  b: 9  c=a+b: 0 
a: 9  b: 10  c=a+b: 0

我是 OpenCL 的新手。任何有用的指示将不胜感激！

Answer 1

我也无法在 VirtualBox 下的 Ubuntu 14.04 上使用 Intel 的 SDK/驱动运行 Intel OpenCL。如果这对您没有影响（应该不会有），您可以安装 AMD APP SDK，它在 Intel CPU 上运行良好。

Link: AMD APP SDK

Answer 2

正如其他人在这里所说的那样，英特尔的 OpenCL SDK 在 VirtualBox 上并不能开箱即用。显然，该 SDK 要求 CPU 支持 SIMD 扩展 SSE4_1 和 SSE4_2，但 VirtualBox 默认将它们关闭（可以通过 cat /proc/cpuinfo 检查）。

因此，打开主机（Windows，在我的例子中）控制台，转到 VirtualBox 安装目录并输入：

VBoxManage setextradata "your-VM-name" VBoxInternal/CPUM/SSE4.1 1
VBoxManage setextradata "your-VM-name" VBoxInternal/CPUM/SSE4.2 1

现在重新启动 VM，OpenCL 应该可以工作（至少对我来说是这样）。

OpenCL 演示程序可以在一个系统上运行，但不能在另一个非常相似的 VirtualBox 系统上运行

OpenCL demo program works on one system, but not on an other very similar VirtualBox system

c

virtualbox

sse

opencl

vagrant