OpenCL 演示程序可以在一个系统上运行,但不能在另一个非常相似的 VirtualBox 系统上运行
OpenCL demo program works on one system, but not on another very similar VirtualBox system
我正在尝试以下简单的 OpenCL 矢量加法程序(为简洁起见,我没有包括我的 printSystemInfo() 函数):
// Vector addition demo similar to one from Oak Ridge lab:
// https://www.olcf.ornl.gov/tutorials/opencl-vector-addition/#vecAdd.c
#include <stdio.h>
#include <stdlib.h>
//To suppress warnings when using the deprecated clCreateCommandQueue of OpenCL v1.0:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <math.h>
#include <CL/opencl.h>
// Declaration of a printing function whose definition is not shown in this Stack Overflow question
int printSystemInfo (cl_platform_id platform_id, cl_device_id device_id);
/*
 * OpenCL C source of the vecAdd kernel, stored as a single string that the
 * host hands to clCreateProgramWithSource().
 *
 * Each work item handles one element: it reads its global id in dimension 0
 * and, if that id is below n, stores a[id] + b[id] into c[id].
 *
 * Adjacent string literals are concatenated by the compiler, so no
 * line-continuation backslashes are needed between them.
 */
const char *kernelSource =
    "\n"
    "__kernel void vecAdd( __global int *a, \n"
    " __global int *b, \n"
    " __global int *c, \n"
    " const unsigned int n) \n"
    "{ \n"
    " //Get our global thread ID \n"
    " int id = get_global_id(0); \n"
    " \n"
    " //Make sure we do not go out of bounds \n"
    " if (id < n) \n"
    " c[id] = a[id] + b[id]; \n"
    "} \n"
    "\n";
int main( int argc, char* argv[] )
{
// Length of vectors
unsigned int n = 10;
// Host input vectors
int *h_a;
int *h_b;
// Host output vector
int *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
cl_platform_id platform_id; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
// Size, in bytes, of each vector
size_t bytes = n*sizeof(int);
// Allocate memory for each vector on host
h_a = (int*)malloc(bytes);
h_b = (int*)malloc(bytes);
h_c = (int*)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
h_a[i] = i;
h_b[i] = i+1;
}
size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be a divisor
globalSize = ceil(n/(float)localSize)*localSize;
// Bind to platform
err = clGetPlatformIDs(1, &platform_id, NULL);
// Get ID for the device
//err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
// Call a local function that fetches and prints system info
err = printSystemInfo (platform_id, device_id);
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
//Print vectors a, b and c=a+b
for(i=0; i<n; i++)
printf("a: %d b: %d c=a+b: %d \n", h_a[i], h_b[i], h_c[i] );
// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
它可以在运行 Ubuntu 14.04 的 Amazon EC2 系统上正常工作...
ubuntu@ip-xxx:~/programs/OpenCL$ gcc ./cldemo.c ./printSystemInfo.c -o ./cldemo -I/opt/intel/intel-opencl-1.2-5.0.0.43/opencl-1.2-sdk-5.0.0.43/include -L/opt/intel/intel-opencl-1.2-5.0.0.43/opencl-1.2-5.0.0.43/lib64 -lOpenCL -Wall -lm
ubuntu@ip-xxx:~/programs/OpenCL$ ./cldemo
OS name: Linux
Release:3.13.0-52-generic
Version:#86-Ubuntu SMP Mon May 4 04:32:59 UTC 2015
Machine:x86_64
Platform name = Intel(R) OpenCL
Platform version = OpenCL 1.2 LINUX
Platform extensions = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64
Device name = Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Device version = OpenCL 1.2 (Build 43)
Device global memory size= 1040740352
Device extensions= cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64
a: 0 b: 1 c=a+b: 1
a: 1 b: 2 c=a+b: 3
a: 2 b: 3 c=a+b: 5
a: 3 b: 4 c=a+b: 7
a: 4 b: 5 c=a+b: 9
a: 5 b: 6 c=a+b: 11
a: 6 b: 7 c=a+b: 13
a: 7 b: 8 c=a+b: 15
a: 8 b: 9 c=a+b: 17
a: 9 b: 10 c=a+b: 19
但在家里的类似系统上却不行(Ubuntu 14.04 作为 Vagrant VirtualBox 虚拟机运行在 Windows 7 主机上):
OS name: Linux
Release:3.13.0-53-generic
Version:#89-Ubuntu SMP Wed May 20 10:34:39 UTC 2015
Machine:x86_64
Platform name = Intel(R) OpenCL
Platform version = OpenCL 1.2 LINUX
Platform extensions = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir
Device name = Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz
Device version = OpenCL 1.2 (Build 43)
Device global memory size= 3156189184
Device extensions= cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir
a: 0 b: 1 c=a+b: 0
a: 1 b: 2 c=a+b: 0
a: 2 b: 3 c=a+b: 0
a: 3 b: 4 c=a+b: 0
a: 4 b: 5 c=a+b: 0
a: 5 b: 6 c=a+b: 0
a: 6 b: 7 c=a+b: 0
a: 7 b: 8 c=a+b: 0
a: 8 b: 9 c=a+b: 0
a: 9 b: 10 c=a+b: 0
我是 OpenCL 的新手。任何有用的指示将不胜感激!
我也无法在 VirtualBox 下的 Ubuntu 14.04 上使用 Intel SDK/驱动程序运行 Intel OpenCL。如果这对您没有影响(应该不会有),您可以安装 AMD APP SDK,它在 Intel CPU 上运行良好。
Link:
AMD APP SDK
正如其他人在这里所说的那样,英特尔的 OpenCL SDK 在 VirtualBox 上无法开箱即用。显然,SDK 要求 CPU 支持 SIMD 扩展 SSE4_1 和 SSE4_2,但 VirtualBox 的默认设置将它们关闭了(可以通过 cat /proc/cpuinfo 检查)。
因此,打开主机(Windows,在我的例子中)控制台,转到 VirtualBox 安装目录并输入:
VBoxManage setextradata "your-VM-name" VBoxInternal/CPUM/SSE4.1 1
VBoxManage setextradata "your-VM-name" VBoxInternal/CPUM/SSE4.2 1
现在重新启动 VM,OpenCL 应该可以工作(至少对我来说是这样)。
我正在尝试以下简单的 OpenCL 矢量加法程序(为简洁起见,我没有包括我的 printSystemInfo() 函数):
// Vector addition demo similar to one from Oak Ridge lab:
// https://www.olcf.ornl.gov/tutorials/opencl-vector-addition/#vecAdd.c
#include <stdio.h>
#include <stdlib.h>
//To suppress warnings when using the deprecated clCreateCommandQueue of OpenCL v1.0:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <math.h>
#include <CL/opencl.h>
// Declaration of a printing function whose definition is not shown in this Stack Overflow question
int printSystemInfo (cl_platform_id platform_id, cl_device_id device_id);
/*
 * OpenCL C source of the vecAdd kernel, stored as a single string that the
 * host hands to clCreateProgramWithSource().
 *
 * Each work item handles one element: it reads its global id in dimension 0
 * and, if that id is below n, stores a[id] + b[id] into c[id].
 *
 * Adjacent string literals are concatenated by the compiler, so no
 * line-continuation backslashes are needed between them.
 */
const char *kernelSource =
    "\n"
    "__kernel void vecAdd( __global int *a, \n"
    " __global int *b, \n"
    " __global int *c, \n"
    " const unsigned int n) \n"
    "{ \n"
    " //Get our global thread ID \n"
    " int id = get_global_id(0); \n"
    " \n"
    " //Make sure we do not go out of bounds \n"
    " if (id < n) \n"
    " c[id] = a[id] + b[id]; \n"
    "} \n"
    "\n";
int main( int argc, char* argv[] )
{
// Length of vectors
unsigned int n = 10;
// Host input vectors
int *h_a;
int *h_b;
// Host output vector
int *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
cl_platform_id platform_id; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
// Size, in bytes, of each vector
size_t bytes = n*sizeof(int);
// Allocate memory for each vector on host
h_a = (int*)malloc(bytes);
h_b = (int*)malloc(bytes);
h_c = (int*)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
h_a[i] = i;
h_b[i] = i+1;
}
size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be a divisor
globalSize = ceil(n/(float)localSize)*localSize;
// Bind to platform
err = clGetPlatformIDs(1, &platform_id, NULL);
// Get ID for the device
//err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
// Call a local function that fetches and prints system info
err = printSystemInfo (platform_id, device_id);
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
//Print vectors a, b and c=a+b
for(i=0; i<n; i++)
printf("a: %d b: %d c=a+b: %d \n", h_a[i], h_b[i], h_c[i] );
// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
它可以在运行 Ubuntu 14.04 的 Amazon EC2 系统上正常工作...
ubuntu@ip-xxx:~/programs/OpenCL$ gcc ./cldemo.c ./printSystemInfo.c -o ./cldemo -I/opt/intel/intel-opencl-1.2-5.0.0.43/opencl-1.2-sdk-5.0.0.43/include -L/opt/intel/intel-opencl-1.2-5.0.0.43/opencl-1.2-5.0.0.43/lib64 -lOpenCL -Wall -lm
ubuntu@ip-xxx:~/programs/OpenCL$ ./cldemo
OS name: Linux
Release:3.13.0-52-generic
Version:#86-Ubuntu SMP Mon May 4 04:32:59 UTC 2015
Machine:x86_64
Platform name = Intel(R) OpenCL
Platform version = OpenCL 1.2 LINUX
Platform extensions = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64
Device name = Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Device version = OpenCL 1.2 (Build 43)
Device global memory size= 1040740352
Device extensions= cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir cl_khr_fp64
a: 0 b: 1 c=a+b: 1
a: 1 b: 2 c=a+b: 3
a: 2 b: 3 c=a+b: 5
a: 3 b: 4 c=a+b: 7
a: 4 b: 5 c=a+b: 9
a: 5 b: 6 c=a+b: 11
a: 6 b: 7 c=a+b: 13
a: 7 b: 8 c=a+b: 15
a: 8 b: 9 c=a+b: 17
a: 9 b: 10 c=a+b: 19
但在家里的类似系统上却不行(Ubuntu 14.04 作为 Vagrant VirtualBox 虚拟机运行在 Windows 7 主机上):
OS name: Linux
Release:3.13.0-53-generic
Version:#89-Ubuntu SMP Wed May 20 10:34:39 UTC 2015
Machine:x86_64
Platform name = Intel(R) OpenCL
Platform version = OpenCL 1.2 LINUX
Platform extensions = cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir
Device name = Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz
Device version = OpenCL 1.2 (Build 43)
Device global memory size= 3156189184
Device extensions= cl_khr_icd cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_3d_image_writes cl_intel_exec_by_local_thread cl_khr_spir
a: 0 b: 1 c=a+b: 0
a: 1 b: 2 c=a+b: 0
a: 2 b: 3 c=a+b: 0
a: 3 b: 4 c=a+b: 0
a: 4 b: 5 c=a+b: 0
a: 5 b: 6 c=a+b: 0
a: 6 b: 7 c=a+b: 0
a: 7 b: 8 c=a+b: 0
a: 8 b: 9 c=a+b: 0
a: 9 b: 10 c=a+b: 0
我是 OpenCL 的新手。任何有用的指示将不胜感激!
我也无法在 VirtualBox 下的 Ubuntu 14.04 上使用 Intel SDK/驱动程序运行 Intel OpenCL。如果这对您没有影响(应该不会有),您可以安装 AMD APP SDK,它在 Intel CPU 上运行良好。
Link: AMD APP SDK
正如其他人在这里所说的那样,英特尔的 OpenCL SDK 在 VirtualBox 上无法开箱即用。显然,SDK 要求 CPU 支持 SIMD 扩展 SSE4_1 和 SSE4_2,但 VirtualBox 的默认设置将它们关闭了(可以通过 cat /proc/cpuinfo 检查)。
因此,打开主机(Windows,在我的例子中)控制台,转到 VirtualBox 安装目录并输入:
VBoxManage setextradata "your-VM-name" VBoxInternal/CPUM/SSE4.1 1
VBoxManage setextradata "your-VM-name" VBoxInternal/CPUM/SSE4.2 1
现在重新启动 VM,OpenCL 应该可以工作(至少对我来说是这样)。