How to find the sum of an array in CUDA by reduction
I am implementing a function that finds the sum of an array by reduction. My array has 32*32 elements holding the values 0 ... 1023, so the expected sum is 523776 (that is, 0 + 1 + ... + 1023 = 1023*1024/2), but my result is 15872, which is wrong.
Here is my code:
#include <stdio.h>
#include <cuda.h>

#define w 32
#define h 32
#define N w*h

__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);

int main( void ) {
    int a[N], b[N];     // host copies
    int *dev_a, *dev_b; // device copies
    int size = N * sizeof( int ); // we need space for 1024 integers

    // allocate device copies
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );

    fill_array( a, N );

    // copy inputs to device
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );

    dim3 blocksize(16,16);
    dim3 gridsize;
    gridsize.x = (w+blocksize.x-1)/blocksize.x;
    gridsize.y = (h+blocksize.y-1)/blocksize.y;

    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);

    // copy device result back to host
    cudaMemcpy( b, dev_b, sizeof( int ), cudaMemcpyDeviceToHost );
    printf("Reduced sum of Array elements = %d \n", b[0]);

    cudaFree( dev_a );
    cudaFree( dev_b );
    return 0;
}

__global__ void reduce(int *g_idata, int *g_odata) {
    __shared__ int sdata[256];

    // each thread loads one element from global to shared mem
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i];
    __syncthreads();

    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * threadIdx.x;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (threadIdx.x == 0)
        atomicAdd(g_odata, sdata[0]);
}

// CPU function to fill the array with the values 0 .. n-1
void fill_array (int *a, int n)
{
    for (int i = 0; i < n; i++)
        a[i] = i;
}
There are at least two problems in your code:

1. You are doing an atomicAdd to the first element of the dev_b array, but you are not initializing that element to a known value (i.e. 0). You are copying b to dev_b before running the kernel, of course, but since you haven't initialized b to any known values, that won't help. Arrays are not automatically initialized to zero in C or C++, if that is what you were thinking. We can fix this by setting b[0] to zero before copying b to dev_b.
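As a side note, a minimal alternative (my own variant, not something the fix above requires) is to zero the device location directly, which also removes the need to copy b to dev_b at all:

    // Zero the device accumulator before the kernel launch; only the
    // first int of dev_b is ever read or updated by the reduction.
    cudaMemset( dev_b, 0, sizeof( int ) );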
2. Your reduction kernel is written to handle a 1D case (i.e. the only thread indices used are 1D indices built from .x values), but you are launching the kernel with 2D threadblocks and a 2D grid. This mismatch will not work correctly: either launch 1D threadblocks and a 1D grid, or re-write the kernel to work with 2D indices (i.e. .x and .y). I've chosen the former (1D); the 2D alternative is sketched right after this list.
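For completeness, here is a sketch of the other option, keeping the 2D launch and flattening the indices inside the kernel. This rewrite is my own illustration, not part of the worked example below, and it assumes blockDim.x * blockDim.y == 256 so the shared-memory array still fits:

    // Flatten the 2D thread/block coordinates into linear indices, so each
    // of the N elements is loaded exactly once (the order in which blocks
    // cover the data does not matter for a sum).
    int tid   = threadIdx.y * blockDim.x + threadIdx.x;  // 0..255 within the block
    int block = blockIdx.y * gridDim.x + blockIdx.x;     // linear block id
    int i     = block * (blockDim.x * blockDim.y) + tid;
    sdata[tid] = g_idata[i];

The rest of the kernel would then use tid everywhere it currently uses threadIdx.x.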
Here is a worked example with those changes applied to your code; it appears to produce the correct result:
$ cat t1218.cu
#include <stdio.h>

#define w 32
#define h 32
#define N w*h

__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);

int main( void ) {
    int a[N], b[N];     // host copies
    int *dev_a, *dev_b; // device copies
    int size = N * sizeof( int ); // we need space for 1024 integers

    // allocate device copies
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );

    fill_array( a, N );
    b[0] = 0; // initialize the first value of b to zero

    // copy inputs to device
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );

    dim3 blocksize(256);          // create 1D threadblock
    dim3 gridsize(N/blocksize.x); // create 1D grid

    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);

    // copy device result back to host
    cudaMemcpy( b, dev_b, sizeof( int ), cudaMemcpyDeviceToHost );
    printf("Reduced sum of Array elements = %d \n", b[0]);
    printf("Value should be: %d \n", ((N-1)*(N/2)));

    cudaFree( dev_a );
    cudaFree( dev_b );
    return 0;
}

__global__ void reduce(int *g_idata, int *g_odata) {
    __shared__ int sdata[256];

    // each thread loads one element from global to shared mem
    // note use of 1D thread indices (only) in this kernel
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i];
    __syncthreads();

    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * threadIdx.x;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (threadIdx.x == 0)
        atomicAdd(g_odata, sdata[0]);
}

// CPU function to fill the array with the values 0 .. n-1
void fill_array (int *a, int n)
{
    for (int i = 0; i < n; i++)
        a[i] = i;
}
$ nvcc -o t1218 t1218.cu
$ cuda-memcheck ./t1218
========= CUDA-MEMCHECK
Reduced sum of Array elements = 523776
Value should be: 523776
========= ERROR SUMMARY: 0 errors
$
Notes:

The kernel, and your code as written, depend on N being an exact multiple of the threadblock size (256). That is satisfied for this case, but things will break if it is not; a bounds-checked variant is sketched below.
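One common way to lift that restriction (a sketch under my own assumptions, not part of the answer above) is to pass the element count into the kernel and let out-of-range threads contribute zero:

    // Sketch: bounds-checked load, so N need not be a multiple of the block size.
    __global__ void reduce(const int *g_idata, int *g_odata, int n) {
        __shared__ int sdata[256];
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        // threads past the end of the array contribute 0 to the sum
        sdata[threadIdx.x] = (i < n) ? g_idata[i] : 0;
        __syncthreads();
        for (int s = 1; s < blockDim.x; s *= 2) {
            int index = 2 * s * threadIdx.x;
            if (index < blockDim.x)
                sdata[index] += sdata[index + s];
            __syncthreads();
        }
        if (threadIdx.x == 0)
            atomicAdd(g_odata, sdata[0]);
    }

The launch would then round the grid size up, e.g. dim3 gridsize((N + blocksize.x - 1) / blocksize.x); and the call becomes reduce<<<gridsize, blocksize>>>(dev_a, dev_b, N);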
I don't see any evidence of proper cuda error checking. It would not have turned up anything here, but it is good practice; a minimal checking macro is sketched after this note. As a quick test, run your code with cuda-memcheck, as I have done here.
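For reference, one common error-checking pattern (my own sketch; many variants exist) wraps each runtime API call in a macro and also checks the kernel launch. It needs <stdio.h> and <stdlib.h>:

    // Minimal CUDA error-checking macro: report the failing call's location and exit.
    #define CUDA_CHECK(call)                                            \
        do {                                                            \
            cudaError_t err_ = (call);                                  \
            if (err_ != cudaSuccess) {                                  \
                fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                        cudaGetErrorString(err_), __FILE__, __LINE__);  \
                exit(EXIT_FAILURE);                                     \
            }                                                           \
        } while (0)

    // Usage:
    // CUDA_CHECK(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice));
    // reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
    // CUDA_CHECK(cudaGetLastError());      // catches launch-configuration errors
    // CUDA_CHECK(cudaDeviceSynchronize()); // catches asynchronous execution errors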