CUDA 向右求和
CUDA sum to the right
我正在尝试使用 CUDA 实现总和缩减,但是我希望缩减在右侧而不是左侧..
我写了下面的代码,但我不确定为什么它不起作用
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
int count = 2;
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride < BLOCK_DIM;
stride = stride + (BLOCK_DIM / count))
{
if (threadIdx.x >= stride) {
count = count * 2;
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
printf("%d ", stride);
__syncthreads();
if (stride == BLOCK_DIM - 1) {
break;
}
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
知道我做错了什么吗?
只要输入的元素数是 2 的幂,这就应该完全符合您的要求。部分和应该在右边结束。这种算法的步幅必须从 1
增加到 BLOCK_DIM / 2
(产生更多的扭曲发散)或从 BLOCK_DIM / 2
缩小到 1
。无论哪种方式,它都应该通过 2
.
multiplied/divided 来实现。
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride > 0;
stride /= 2)
{
if (threadIdx.x >= BLOCK_DIM - stride) {
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
条件语句中的__syncthreads();
是另一个错误,因为块的所有线程都必须参与同步。否则会导致未定义的行为。
我正在尝试使用 CUDA 实现总和缩减,但是我希望缩减在右侧而不是左侧.. 我写了下面的代码,但我不确定为什么它不起作用
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
int count = 2;
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride < BLOCK_DIM;
stride = stride + (BLOCK_DIM / count))
{
if (threadIdx.x >= stride) {
count = count * 2;
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
printf("%d ", stride);
__syncthreads();
if (stride == BLOCK_DIM - 1) {
break;
}
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
知道我做错了什么吗?
只要输入的元素数是 2 的幂,这就应该完全符合您的要求。部分和应该在右边结束。这种算法的步幅必须从 1
增加到 BLOCK_DIM / 2
(产生更多的扭曲发散)或从 BLOCK_DIM / 2
缩小到 1
。无论哪种方式,它都应该通过 2
.
__global__ void reduce_kernel(
float *input,
float *partialSums,
unsigned int N)
{
unsigned int segment = blockIdx.x * blockDim.x * 2;
unsigned int i = segment + threadIdx.x;
__shared__ float input_s[BLOCK_DIM];
input_s[threadIdx.x] = input[i] + input[i + BLOCK_DIM];
__syncthreads();
for (unsigned int stride = BLOCK_DIM / 2;
stride > 0;
stride /= 2)
{
if (threadIdx.x >= BLOCK_DIM - stride) {
input_s[threadIdx.x] += input_s[threadIdx.x - stride];
}
__syncthreads();
}
if (threadIdx.x == BLOCK_DIM - 1) {
partialSums[blockIdx.x] = input_s[threadIdx.x];
}
}
条件语句中的__syncthreads();
是另一个错误,因为块的所有线程都必须参与同步。否则会导致未定义的行为。