Why does an extra __syncthreads() call cause unexpected behavior?
I'm using reduction code adapted from the threadFenceReduction sample in the CUDA samples, which is also described in this presentation (PDF).
While doing some debugging, I found that simply inserting one extra __syncthreads() call makes the reduction no longer produce the correct sum:
#include <stdint.h>

typedef int64_t SumType;

template <int blockSize>
static __device__ void
reduceBlock(
    SumType mySum,
    const unsigned int tid
    )
{
    // Each thread puts its local sum into shared memory
    extern __shared__ SumType sdata[];
    sdata[tid] = mySum;
    __syncthreads();

    // Sum values at an offset of 128 and 64
    if( blockSize >= 256 ) { if (tid < 128) { sdata[tid] = mySum = mySum + (sdata[tid + 128]); } __syncthreads(); }
    if( blockSize >= 128 ) { if (tid <  64) { sdata[tid] = mySum = mySum + (sdata[tid +  64]); } __syncthreads(); }

    if( tid < 32 )
    {
        __syncthreads(); // <=== Extra __syncthreads(), breaks reduction!

        // Within a warp, rely on implicit synchronization via a volatile pointer
        volatile SumType *smem = sdata;
        if( blockSize >= 64 ) { smem[tid] = mySum = mySum + (smem[tid + 32]); }
        if( blockSize >= 32 ) { smem[tid] = mySum = mySum + (smem[tid + 16]); }
        if( blockSize >= 16 ) { smem[tid] = mySum = mySum + (smem[tid +  8]); }
        if( blockSize >=  8 ) { smem[tid] = mySum = mySum + (smem[tid +  4]); }
        if( blockSize >=  4 ) { smem[tid] = mySum = mySum + (smem[tid +  2]); }
        if( blockSize >=  2 ) { smem[tid] = mySum = mySum + (smem[tid +  1]); }
    }
}
Why does inserting the extra __syncthreads() cause this code to stop working?
See my answer below for a self-contained code example.
Edit: moved the __syncthreads() into the if() statement in the example, to reflect the code that actually triggered the bug.
The problem is related to __syncthreads() being called by only some of the threads in a block. Note that tid < 32 evaluates differently from thread to thread, unlike the barriers guarded by if( blockSize >= 256 ), where blockSize is a template parameter with the same value in every thread. The end result is some very strange behavior. From the CUDA C Programming Guide, section B.6:
__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
I boiled it down to the simple example below. A flag s_onlyOneBlock in shared memory is set by one thread in each block; it is true in block 0 and false in all other blocks. One would expect every thread in block 0 to observe s_onlyOneBlock=true. However, because __syncthreads() is only called by threads 0 through 31, the behavior is unexpected: only threads 0 through 31 observe s_onlyOneBlock=true:
#include <stdio.h>

static __global__ void
kernel()
{
    __shared__ bool s_onlyOneBlock;
    const unsigned int tid = threadIdx.x;

    // Call __syncthreads() for only some threads (don't do this!)
    if( tid < 32 )
        __syncthreads();

    // Thread 0 sets s_onlyOneBlock
    if( tid == 0 )
        s_onlyOneBlock = ( blockIdx.x == 0 );
    __syncthreads();

    if( s_onlyOneBlock )
    {
        // Only block 0 should reach this point
        if( tid==0 || tid==31 || tid==32 || tid==128 )
            printf("s_onlyOneBlock is TRUE: block=%d thread=%d\n", blockIdx.x, threadIdx.x);
    }
    else
    {
        if( tid==0 || tid==31 || tid==32 || tid==128 )
            printf("s_onlyOneBlock is false: block=%d thread=%d\n", blockIdx.x, threadIdx.x);
    }
}

int main()
{
    kernel<<<2, 256>>>();
    cudaDeviceSynchronize();
}
Results:
nvcc syncproblem.cu -o syncproblem
./syncproblem
s_onlyOneBlock is false: block=0 thread=128 <--- should be true!
s_onlyOneBlock is false: block=1 thread=128
s_onlyOneBlock is false: block=0 thread=32 <--- should be true!
s_onlyOneBlock is false: block=1 thread=32
s_onlyOneBlock is TRUE: block=0 thread=0
s_onlyOneBlock is TRUE: block=0 thread=31
s_onlyOneBlock is false: block=1 thread=0
s_onlyOneBlock is false: block=1 thread=31
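The fix is to make every thread in the block execute the barrier. Below is a minimal sketch of the corrected toy kernel (assuming the only purpose of the broken if( tid < 32 ) block was the barrier itself): the __syncthreads() is hoisted out of the conditional so that all 256 threads reach it; simply deleting it would work just as well. With this change, threads 0, 31, 32, and 128 of block 0 should all print TRUE:

#include <stdio.h>

static __global__ void
kernel()
{
    __shared__ bool s_onlyOneBlock;
    const unsigned int tid = threadIdx.x;

    // Hoisted out of "if( tid < 32 )": every thread in the block now
    // reaches the same barrier, as __syncthreads() requires.
    __syncthreads();

    // Thread 0 sets s_onlyOneBlock
    if( tid == 0 )
        s_onlyOneBlock = ( blockIdx.x == 0 );
    __syncthreads();

    if( s_onlyOneBlock )
    {
        // Now all threads in block 0 (and only block 0) reach this point
        if( tid==0 || tid==31 || tid==32 || tid==128 )
            printf("s_onlyOneBlock is TRUE: block=%d thread=%d\n", blockIdx.x, threadIdx.x);
    }
    else
    {
        if( tid==0 || tid==31 || tid==32 || tid==128 )
            printf("s_onlyOneBlock is false: block=%d thread=%d\n", blockIdx.x, threadIdx.x);
    }
}

int main()
{
    kernel<<<2, 256>>>();
    cudaDeviceSynchronize();
}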
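The same reasoning fixes the original reduction: the extra barrier inside if( tid < 32 ) has to be removed (or hoisted above the conditional), since threads 32 and up never reach it. Here is a sketch of reduceBlock with only the offending line removed, otherwise unchanged from the question:

#include <stdint.h>

typedef int64_t SumType;

template <int blockSize>
static __device__ void
reduceBlock(
    SumType mySum,
    const unsigned int tid
    )
{
    // Each thread puts its local sum into shared memory
    extern __shared__ SumType sdata[];
    sdata[tid] = mySum;
    __syncthreads();

    // These barriers are legal: blockSize is a compile-time constant,
    // so the conditional evaluates identically for the whole block
    if( blockSize >= 256 ) { if (tid < 128) { sdata[tid] = mySum = mySum + (sdata[tid + 128]); } __syncthreads(); }
    if( blockSize >= 128 ) { if (tid <  64) { sdata[tid] = mySum = mySum + (sdata[tid +  64]); } __syncthreads(); }

    if( tid < 32 )
    {
        // No __syncthreads() in here: only threads 0..31 are active
        volatile SumType *smem = sdata;
        if( blockSize >= 64 ) { smem[tid] = mySum = mySum + (smem[tid + 32]); }
        if( blockSize >= 32 ) { smem[tid] = mySum = mySum + (smem[tid + 16]); }
        if( blockSize >= 16 ) { smem[tid] = mySum = mySum + (smem[tid +  8]); }
        if( blockSize >=  8 ) { smem[tid] = mySum = mySum + (smem[tid +  4]); }
        if( blockSize >=  4 ) { smem[tid] = mySum = mySum + (smem[tid +  2]); }
        if( blockSize >=  2 ) { smem[tid] = mySum = mySum + (smem[tid +  1]); }
    }
}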