Radeon 上的 OpenCL (aparapi) 简单还原速度慢
OpenCL (aparapi) simple reduction slow on Radeon
我正在尝试对 OpenCL 中的大型双精度数组进行简单归约(在本例中为总和)。我看了网上的教程,发现基本上就是这样解决我的问题:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/* aparapi artifact: all kernel arguments are repackaged into this struct so the
 * generated code can mimic a Java object's `this` reference. Field order and
 * qualifiers must match the kernel's parameter list below. */
typedef struct This_s{
__global double *nums; /* input array to be summed (global memory) */
int nums__javaArrayLength; /* element count of nums (aparapi length convention) */
__local double *buffer; /* per-work-group scratch for the local reduction */
__global double *res; /* one partial sum per work-group is written here */
int passid; /* aparapi multi-pass id, exposed via get_pass_id() */
}This;
/* Accessor for the aparapi multi-pass id carried in the argument struct. */
int get_pass_id(This *self){
    return self->passid;
}
/*
 * Sum-reduction kernel (aparapi-generated scaffolding).
 *
 * Phase 1: each work-item accumulates a strided partial sum of `nums`
 *          into a private variable (grid-stride loop over the whole array).
 * Phase 2: unrolled tree reduction of the per-work-item partials in local
 *          memory; work-item 0 of each group writes the group's total to
 *          res[group_id]. The host (or a second pass) sums the group results.
 *
 * NOTE(review): the unrolled reduction assumes the work-group size is exactly
 * 64 — the first step reads buffer[tid + 32] for tid < 32. Launching with a
 * different local size reads out of bounds or drops data; confirm the launch
 * configuration matches.
 *
 * Fix: the generated code emitted two consecutive identical barriers after
 * phase 1; one is redundant and has been removed (no behavioral change).
 */
__kernel void run(
__global double *nums,
int nums__javaArrayLength,
__local double *buffer,
__global double *res,
int passid
){
/* aparapi artifact: repackage the arguments into a `This` struct. */
This thisStruct;
This* this=&thisStruct;
this->nums = nums;
this->nums__javaArrayLength = nums__javaArrayLength;
this->buffer = buffer;
this->res = res;
this->passid = passid;
{
int tid = get_local_id(0);
int i = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
int gridSize = get_local_size(0) * get_num_groups(0);
int n = this->nums__javaArrayLength;
/* Phase 1: private accumulation over a grid-stride range of nums. */
double cur = 0.0;
for (; i<n; i = i + gridSize){
cur = cur + this->nums[i];
}
this->buffer[tid] = cur;
/* One barrier is sufficient to publish every work-item's partial sum. */
barrier(CLK_LOCAL_MEM_FENCE);
/* Phase 2: unrolled tree reduction in local memory (64 -> 1). */
if (tid<32){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 32)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<16){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 16)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<8){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 8)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<4){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 4)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<2){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 2)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<1){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 1)];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Work-item 0 publishes this group's partial sum. */
if (tid==0){
this->res[get_group_id(0)] = this->buffer[0];
}
return;
}
}
如果您对代码中奇怪的 this 感到疑惑:那是 aparapi(我用它把 Java 转换为 OpenCL)的一个不幸但必需的产物。
我的内核产生了正确的结果,并且在相当强大的 Nvidia 硬件上,它比 Java 中的顺序求和快大约 10 倍。然而,在 Radeon R9 280 上,它的性能与简单的 Java 代码相当。
我已经使用 CodeXL 分析了内核。它告诉我 MemUnitBusy 仅占 6%。为什么这么低?
原来问题并不(直接)出在 OpenCL 代码本身,而是出在 aparapi 的缓冲区管理上。
我在不使用 aparapi 的情况下试过完全相同的内核,性能很好。但一旦使用 CL_MEM_USE_HOST_PTR
性能就变差了,而遗憾的是,这是 aparapi 唯一使用的方式。看起来对于用该选项创建的缓冲区,AMD 的驱动并没有把主机内存真正复制到设备上,即使在运行了几次 "warmup" 之后也是如此。
您可能要考虑迁移到 aparapi.com 上更活跃的项目。它修复了若干 bug,并且相比您上面链接的旧库增加了许多额外功能和性能改进。它也已经发布到 Maven Central(约有十几个版本),因此更易于使用。新的 GitHub 仓库也在那里。
我正在尝试对 OpenCL 中的大型双精度数组进行简单归约(在本例中为总和)。我看了网上的教程,发现基本上就是这样解决我的问题:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/* aparapi artifact: all kernel arguments are repackaged into this struct so the
 * generated code can mimic a Java object's `this` reference. Field order and
 * qualifiers must match the kernel's parameter list below. */
typedef struct This_s{
__global double *nums; /* input array to be summed (global memory) */
int nums__javaArrayLength; /* element count of nums (aparapi length convention) */
__local double *buffer; /* per-work-group scratch for the local reduction */
__global double *res; /* one partial sum per work-group is written here */
int passid; /* aparapi multi-pass id, exposed via get_pass_id() */
}This;
/* Accessor for the aparapi multi-pass id carried in the argument struct. */
int get_pass_id(This *self){
    return self->passid;
}
/*
 * Sum-reduction kernel (aparapi-generated scaffolding).
 *
 * Phase 1: each work-item accumulates a strided partial sum of `nums`
 *          into a private variable (grid-stride loop over the whole array).
 * Phase 2: unrolled tree reduction of the per-work-item partials in local
 *          memory; work-item 0 of each group writes the group's total to
 *          res[group_id]. The host (or a second pass) sums the group results.
 *
 * NOTE(review): the unrolled reduction assumes the work-group size is exactly
 * 64 — the first step reads buffer[tid + 32] for tid < 32. Launching with a
 * different local size reads out of bounds or drops data; confirm the launch
 * configuration matches.
 *
 * Fix: the generated code emitted two consecutive identical barriers after
 * phase 1; one is redundant and has been removed (no behavioral change).
 */
__kernel void run(
__global double *nums,
int nums__javaArrayLength,
__local double *buffer,
__global double *res,
int passid
){
/* aparapi artifact: repackage the arguments into a `This` struct. */
This thisStruct;
This* this=&thisStruct;
this->nums = nums;
this->nums__javaArrayLength = nums__javaArrayLength;
this->buffer = buffer;
this->res = res;
this->passid = passid;
{
int tid = get_local_id(0);
int i = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
int gridSize = get_local_size(0) * get_num_groups(0);
int n = this->nums__javaArrayLength;
/* Phase 1: private accumulation over a grid-stride range of nums. */
double cur = 0.0;
for (; i<n; i = i + gridSize){
cur = cur + this->nums[i];
}
this->buffer[tid] = cur;
/* One barrier is sufficient to publish every work-item's partial sum. */
barrier(CLK_LOCAL_MEM_FENCE);
/* Phase 2: unrolled tree reduction in local memory (64 -> 1). */
if (tid<32){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 32)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<16){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 16)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<8){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 8)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<4){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 4)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<2){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 2)];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid<1){
this->buffer[tid] = this->buffer[tid] + this->buffer[(tid + 1)];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Work-item 0 publishes this group's partial sum. */
if (tid==0){
this->res[get_group_id(0)] = this->buffer[0];
}
return;
}
}
如果您对代码中奇怪的 this 感到疑惑:那是 aparapi(我用它把 Java 转换为 OpenCL)的一个不幸但必需的产物。
我的内核产生了正确的结果,并且在相当强大的 Nvidia 硬件上,它比 Java 中的顺序求和快大约 10 倍。然而,在 Radeon R9 280 上,它的性能与简单的 Java 代码相当。
我已经使用 CodeXL 分析了内核。它告诉我 MemUnitBusy 仅占 6%。为什么这么低?
原来问题并不(直接)出在 OpenCL 代码本身,而是出在 aparapi 的缓冲区管理上。
我在不使用 aparapi 的情况下试过完全相同的内核,性能很好。但一旦使用 CL_MEM_USE_HOST_PTR
性能就变差了,而遗憾的是,这是 aparapi 唯一使用的方式。看起来对于用该选项创建的缓冲区,AMD 的驱动并没有把主机内存真正复制到设备上,即使在运行了几次 "warmup" 之后也是如此。
您可能要考虑迁移到 aparapi.com 上更活跃的项目。它修复了若干 bug,并且相比您上面链接的旧库增加了许多额外功能和性能改进。它也已经发布到 Maven Central(约有十几个版本),因此更易于使用。新的 GitHub 仓库也在那里。