并行归约(Parallel Reduction)无法正常工作
Parallel Reduction does not work correctly
我在 OpenCL 上编写了下面这个并行归约(reduction)内核,目的只是对 BlockSum 数组中的所有值求和。使用 work_group_reduce_add(BlockSum[GetIndex]); 时它工作正常,但换成我从 https://www.fz-juelich.de/SharedDocs/Downloads/IAS/JSC/EN/slides/opencl/opencl-05-reduction.pdf?__blob=publicationFile (幻灯片 11)中看到的优化代码后,结果就不正确了。问题出在哪里?global_work_size 和 local_work_size 都设置为 {16,16}(也就是说每个工作组共有 256 个线程)。使用 work_group_reduce_add 时我得到 255,这是正确的;但使用优化代码时我得到 0。
// Kernel from the question: every work-item stores 1 into its own slot of
// BlockSum and the work-group total is obtained with work_group_reduce_add.
// The commented-out tree reduction further down is the variant that fails.
// The question states the kernel is launched with a 16x16 work-group.
__kernel void Reduction()
{
unsigned char GetThreadX = get_local_id(0); //it takes values from 0..15
unsigned char GetThreadY = get_local_id(1); //it takes values from 0..15
unsigned char GetGroup = get_local_size(0); //16
// No address-space qualifier here, so this array is private: each work-item
// has its own separate copy and cannot see what other work-items wrote.
unsigned short BlockSum[256];
int SumOfAll= 0;
// Flatten the 2-D local id into a single row-major index.
unsigned short GetIndex = GetThreadX + (GetGroup * GetThreadY); // takes values 0..255, group=16
BlockSum[GetIndex] = 1;
barrier(CLK_LOCAL_MEM_FENCE);
// The built-in combines the value each work-item passes in, so it does not
// depend on BlockSum being shared between work-items.
SumOfAll= work_group_reduce_add(BlockSum[GetIndex]); //works great
// BUT CODE BELOW DOES NOT SUM CORRECTLY
// (It reads BlockSum[GetIndex + stride], an element this work-item never
// wrote, because BlockSum is private rather than __local.)
/*
for(unsigned short stride=128; stride>1; stride >>= 1) {
if(GetIndex < stride)
BlockSum[GetIndex] += BlockSum[GetIndex + stride];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(GetIndex==0)
SumOfAll = BlockSum[0] + BlockSum[1];
barrier(CLK_LOCAL_MEM_FENCE);
*/
// Executed by every work-item, so the line is printed once per work-item.
printf("SumOfAll=%d\n",SumOfAll);
}
好的,问题已解决。BlockSum[256]; 没有被声明为 __local,而是位于私有内存中(缺少 __local 地址空间限定符),这意味着每个线程(或核心)都有这份数据的独立副本;而优化的归约(reduction)代码需要从线程之间共享的本地内存(shared local memory)中读取数据,才能把各个值加起来。此外,变量 int SumOfAll; 也应当声明为 __local 或 private 并进行初始化,而在我的代码里它之前没有任何初始化。两种方式任选其一。
所以,可以正常工作的内核现在是下面这样。
希望记录下这类错误,能帮助到像我一样粗心的人。
// Sums 256 ones across a work-group with a tree reduction in __local memory
// and prints the total from every work-item.
//
// Written for a 16x16 work-group (256 work-items): the scratch array size and
// the initial stride of 128 are hard-coded for that size.
__kernel void Reduction()
{
    unsigned char GetThreadX = get_local_id(0); // 0..15 for a 16x16 work-group
    unsigned char GetThreadY = get_local_id(1); // 0..15 for a 16x16 work-group
    unsigned char GetGroup = get_local_size(0); // row width, 16

    // Must be __local: the reduction reads slots written by other work-items.
    // Without the qualifier the array is private and each work-item only ever
    // sees its own copy, which was the root of the original problem.
    __local unsigned short BlockSum[256];

    // Also __local: only work-item 0 computes the total, and the barrier after
    // that assignment makes it visible to every work-item before printing.
    // As a private variable it would be uninitialized in the other 255
    // work-items.
    __local int SumOfAll;

    // Flatten the 2-D local id into a single row-major index, 0..255.
    unsigned short GetIndex = GetThreadX + (GetGroup * GetThreadY);

    BlockSum[GetIndex] = 1;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction: each pass folds the upper half of the active range onto
    // the lower half. The barrier sits outside the `if` so that every
    // work-item reaches it on every pass.
    // (Equivalent built-in: work_group_reduce_add(BlockSum[GetIndex]).)
    for (unsigned short stride = 128; stride > 1; stride >>= 1) {
        if (GetIndex < stride) {
            BlockSum[GetIndex] += BlockSum[GetIndex + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // The loop stops with two partial sums left; work-item 0 combines them.
    if (GetIndex == 0) {
        SumOfAll = BlockSum[0] + BlockSum[1];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    printf("SumOfAll=%d\n", SumOfAll);
}
我在 OpenCL 上编写了下面这个并行归约(reduction)内核,目的只是对 BlockSum 数组中的所有值求和。使用 work_group_reduce_add(BlockSum[GetIndex]); 时它工作正常,但换成我从 https://www.fz-juelich.de/SharedDocs/Downloads/IAS/JSC/EN/slides/opencl/opencl-05-reduction.pdf?__blob=publicationFile (幻灯片 11)中看到的优化代码后,结果就不正确了。问题出在哪里?global_work_size 和 local_work_size 都设置为 {16,16}(也就是说每个工作组共有 256 个线程)。使用 work_group_reduce_add 时我得到 255,这是正确的;但使用优化代码时我得到 0。
// Kernel from the question: every work-item stores 1 into its own slot of
// BlockSum and the work-group total is obtained with work_group_reduce_add.
// The commented-out tree reduction further down is the variant that fails.
// The question states the kernel is launched with a 16x16 work-group.
__kernel void Reduction()
{
unsigned char GetThreadX = get_local_id(0); //it takes values from 0..15
unsigned char GetThreadY = get_local_id(1); //it takes values from 0..15
unsigned char GetGroup = get_local_size(0); //16
// No address-space qualifier here, so this array is private: each work-item
// has its own separate copy and cannot see what other work-items wrote.
unsigned short BlockSum[256];
int SumOfAll= 0;
// Flatten the 2-D local id into a single row-major index.
unsigned short GetIndex = GetThreadX + (GetGroup * GetThreadY); // takes values 0..255, group=16
BlockSum[GetIndex] = 1;
barrier(CLK_LOCAL_MEM_FENCE);
// The built-in combines the value each work-item passes in, so it does not
// depend on BlockSum being shared between work-items.
SumOfAll= work_group_reduce_add(BlockSum[GetIndex]); //works great
// BUT CODE BELOW DOES NOT SUM CORRECTLY
// (It reads BlockSum[GetIndex + stride], an element this work-item never
// wrote, because BlockSum is private rather than __local.)
/*
for(unsigned short stride=128; stride>1; stride >>= 1) {
if(GetIndex < stride)
BlockSum[GetIndex] += BlockSum[GetIndex + stride];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(GetIndex==0)
SumOfAll = BlockSum[0] + BlockSum[1];
barrier(CLK_LOCAL_MEM_FENCE);
*/
// Executed by every work-item, so the line is printed once per work-item.
printf("SumOfAll=%d\n",SumOfAll);
}
好的,问题已解决。BlockSum[256]; 没有被声明为 __local,而是位于私有内存中(缺少 __local 地址空间限定符),这意味着每个线程(或核心)都有这份数据的独立副本;而优化的归约(reduction)代码需要从线程之间共享的本地内存(shared local memory)中读取数据,才能把各个值加起来。此外,变量 int SumOfAll; 也应当声明为 __local 或 private 并进行初始化,而在我的代码里它之前没有任何初始化。两种方式任选其一。
所以,可以正常工作的内核现在是下面这样。
希望记录下这类错误,能帮助到像我一样粗心的人。
// Sums 256 ones across a work-group with a tree reduction in __local memory
// and prints the total from every work-item.
//
// Written for a 16x16 work-group (256 work-items): the scratch array size and
// the initial stride of 128 are hard-coded for that size.
__kernel void Reduction()
{
    unsigned char GetThreadX = get_local_id(0); // 0..15 for a 16x16 work-group
    unsigned char GetThreadY = get_local_id(1); // 0..15 for a 16x16 work-group
    unsigned char GetGroup = get_local_size(0); // row width, 16

    // Must be __local: the reduction reads slots written by other work-items.
    // Without the qualifier the array is private and each work-item only ever
    // sees its own copy, which was the root of the original problem.
    __local unsigned short BlockSum[256];

    // Also __local: only work-item 0 computes the total, and the barrier after
    // that assignment makes it visible to every work-item before printing.
    // As a private variable it would be uninitialized in the other 255
    // work-items.
    __local int SumOfAll;

    // Flatten the 2-D local id into a single row-major index, 0..255.
    unsigned short GetIndex = GetThreadX + (GetGroup * GetThreadY);

    BlockSum[GetIndex] = 1;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction: each pass folds the upper half of the active range onto
    // the lower half. The barrier sits outside the `if` so that every
    // work-item reaches it on every pass.
    // (Equivalent built-in: work_group_reduce_add(BlockSum[GetIndex]).)
    for (unsigned short stride = 128; stride > 1; stride >>= 1) {
        if (GetIndex < stride) {
            BlockSum[GetIndex] += BlockSum[GetIndex + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // The loop stops with two partial sums left; work-item 0 combines them.
    if (GetIndex == 0) {
        SumOfAll = BlockSum[0] + BlockSum[1];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    printf("SumOfAll=%d\n", SumOfAll);
}