在块中找到最大值的 OpenCL 障碍
OpenCL barrier of finding max in a block
我在 Nvidia 的开发者网站上找到了一段 OpenCL 内核示例代码
目的函数maxOneBlock
是找出数组maxValue
的最大值存入maxValue[0].
我完全理解循环部分,但对unroll
部分感到困惑:为什么展开部分不需要在每个步骤完成后同步线程?
例如:当一个线程完成localId和localId+32的比较时,它如何确保其他线程将其结果存储到localId+16?
内核代码:
void maxOneBlock(__local float maxValue[],
__local int maxInd[])
{
uint localId = get_local_id(0);
uint localSize = get_local_size(0);
int idx;
float m1, m2, m3;
for (uint s = localSize/2; s > 32; s >>= 1)
{
if (localId < s)
{
m1 = maxValue[localId];
m2 = maxValue[localId+s];
m3 = (m1 >= m2) ? m1 : m2;
idx = (m1 >= m2) ? localId : localId + s;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// unroll the final warp to reduce loop and sync overheads
if (localId < 32)
{
m1 = maxValue[localId];
m2 = maxValue[localId+32];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 32;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+16];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 16;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+8];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 8;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+4];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 4;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+2];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 2;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+1];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 1;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
}
}
Why the unroll part do not need to sync thread after each step is done?
样本不正确,每一步后确实需要一个屏障。
看起来样本是以warp-synchronous风格编写的,这是一种利用warp在NVIDIA硬件上的底层执行机制的方式,但是如果底层执行机制发生变化或不正确的同步会导致它中断存在编译器优化。
我在 Nvidia 的开发者网站上找到了一段 OpenCL 内核示例代码
目的函数maxOneBlock
是找出数组maxValue
的最大值存入maxValue[0].
我完全理解循环部分,但对unroll
部分感到困惑:为什么展开部分不需要在每个步骤完成后同步线程?
例如:当一个线程完成localId和localId+32的比较时,它如何确保其他线程将其结果存储到localId+16?
内核代码:
void maxOneBlock(__local float maxValue[],
__local int maxInd[])
{
uint localId = get_local_id(0);
uint localSize = get_local_size(0);
int idx;
float m1, m2, m3;
for (uint s = localSize/2; s > 32; s >>= 1)
{
if (localId < s)
{
m1 = maxValue[localId];
m2 = maxValue[localId+s];
m3 = (m1 >= m2) ? m1 : m2;
idx = (m1 >= m2) ? localId : localId + s;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// unroll the final warp to reduce loop and sync overheads
if (localId < 32)
{
m1 = maxValue[localId];
m2 = maxValue[localId+32];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 32;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+16];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 16;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+8];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 8;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+4];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 4;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+2];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 2;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
m1 = maxValue[localId];
m2 = maxValue[localId+1];
m3 = (m1 > m2) ? m1 : m2;
idx = (m1 > m2) ? localId : localId + 1;
maxValue[localId] = m3;
maxInd[localId] = maxInd[idx];
}
}
Why the unroll part do not need to sync thread after each step is done?
样本不正确,每一步后确实需要一个屏障。
看起来样本是以warp-synchronous风格编写的,这是一种利用warp在NVIDIA硬件上的底层执行机制的方式,但是如果底层执行机制发生变化或不正确的同步会导致它中断存在编译器优化。