CUDA 大型输入数组
CUDA large input arrays
我是 CUDA 的新手,我一直在研究 "Reduce algorithm"。
该算法适用于小于 1<<24 的任何数组大小。
当我使用大小为 1<<25 的数组时,"total sum" 中的程序 returns 0 是错误的。总和应该是 2^25
编辑 cuda-memcheck compiled_code
========= CUDA-MEMCHECK
@@STARTING@@
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaLaunch.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so.1 [0x2f2d83]
========= Host Frame:test [0x3b37e]
========= Host Frame:test [0x2b71]
========= Host Frame:test [0x2a18]
========= Host Frame:test [0x2a4c]
========= Host Frame:test [0x2600]
========= Host Frame:test [0x2904]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
========= Host Frame:test [0x23e9]
=========
我的设置是:
- 列表项
- Nvidia Tesla K40
- CUDA 6.5
- 科学 Linux 版本 6.4(碳)
该程序由内核、内核包装器和执行内核包装器的 main 组成。
/* -------- KERNEL -------- */
__global__ void reduce_kernel(int * d_out, int * d_in, int size)
{
// position and threadId
int pos = blockIdx.x * blockDim.x + threadIdx.x;
int tid = threadIdx.x;
// do reduction in global memory
for (unsigned int s = blockDim.x / 2; s>0; s>>=1)
{
if (tid < s)
{
if (pos+s < size) // Handling out of bounds
{
d_in[pos] = d_in[pos] + d_in[pos+s];
}
}
__syncthreads();
}
// only thread 0 writes result, as thread
if ((tid==0) && (pos < size))
{
d_out[blockIdx.x] = d_in[pos];
}
}
这是内核包装器
/* -------- KERNEL WRAPPER -------- */
void reduce(int * d_out, int * d_in, int size, int num_threads)
{
// setting up blocks and intermediate result holder
int num_blocks;
if(((size) % num_threads))
{
num_blocks = ((size) / num_threads) + 1;
}
else
{
num_blocks = (size) / num_threads;
}
int * d_intermediate;
cudaMalloc(&d_intermediate, sizeof(int)*num_blocks);
cudaMemset(d_intermediate, 0, sizeof(int)*num_blocks);
int prev_num_blocks;
int i = 1;
int size_rest = 0;
// recursively solving, will run approximately log base num_threads times.
do
{
printf("Round:%.d\n", i);
printf("NumBlocks:%.d\n", num_blocks);
printf("NumThreads:%.d\n", num_threads);
printf("size of array:%.d\n", size);
i++;
reduce_kernel<<<num_blocks, num_threads>>>(d_intermediate, d_in, size);
size_rest = size % num_threads;
size = size / num_threads + size_rest;
// updating input to intermediate
cudaMemcpy(d_in, d_intermediate, sizeof(int)*num_blocks, cudaMemcpyDeviceToDevice);
// Updating num_blocks to reflect how many blocks we now want to compute on
prev_num_blocks = num_blocks;
if(size % num_threads)
{
num_blocks = size / num_threads + 1;
}
else
{
num_blocks = size / num_threads;
}
// updating intermediate
cudaFree(d_intermediate);
cudaMalloc(&d_intermediate, sizeof(int)*num_blocks);
}
while(size > num_threads); // if it is too small, compute rest.
// computing rest
reduce_kernel<<<1, size>>>(d_out, d_in, prev_num_blocks);
}
这里是主要的:
/* -------- MAIN -------- */
int main(int argc, char **argv)
{
printf("@@STARTING@@ \n");
// Setting num_threads
int num_threads = 512;
// Making non-bogus data and setting it on the GPU
const int size = 1<<24;
const int size_out = 1;
int * d_in;
int * d_out;
cudaMalloc(&d_in, sizeof(int)*size);
cudaMalloc(&d_out, sizeof(int)*size_out);
int * h_in = (int *)malloc(size*sizeof(int));
for (int i = 0; i < size; i++) h_in[i] = 1;
cudaMemcpy(d_in, h_in, sizeof(int)*size, cudaMemcpyHostToDevice);
// Running kernel wrapper
reduce(d_out, d_in, size, num_threads);
int result;
cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
printf("\nFINAL SUM IS: %d\n", result);
}
这种编译代码的方法:
nvcc -o my_reduce my_reduce.cu
builds for a compute architecture CC2.0 on CUDA 6.5
该架构在网格中 limited to 65535 blocks(在 x-dimension 中,这是您使用的唯一维度)。
在 size
of 1<<24
,num_threads=512
,启动的块数是:
num_blocks = (size) / num_threads;
即 1<<24/512 或 31250 个块
略高于 1<<25 的某个数字将超过 cc2.0 设备的块限制。
要解决此问题,请使用
进行编译
nvcc -o -arch=sm_35 my_reduce my_reduce.cu
这是您的 K40 的 correct compile architecture (i.e. compute capability),并将块限制提高到 2^31-1
任何时候在使用 CUDA 代码时遇到问题,请使用 proper cuda error checking,在 在此寻求帮助之前。即使您不理解错误结果,它也可能会帮助那些试图帮助您的人。
我是 CUDA 的新手,我一直在研究 "Reduce algorithm"。
该算法适用于小于 1<<24 的任何数组大小。
当我使用大小为 1<<25 的数组时,"total sum" 中的程序 returns 0 是错误的。总和应该是 2^25
编辑 cuda-memcheck compiled_code
========= CUDA-MEMCHECK
@@STARTING@@
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaLaunch.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/libcuda.so.1 [0x2f2d83]
========= Host Frame:test [0x3b37e]
========= Host Frame:test [0x2b71]
========= Host Frame:test [0x2a18]
========= Host Frame:test [0x2a4c]
========= Host Frame:test [0x2600]
========= Host Frame:test [0x2904]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed5d]
========= Host Frame:test [0x23e9]
=========
我的设置是:
- 列表项
- Nvidia Tesla K40
- CUDA 6.5
- 科学 Linux 版本 6.4(碳)
该程序由内核、内核包装器和执行内核包装器的 main 组成。
/* -------- KERNEL -------- */
__global__ void reduce_kernel(int * d_out, int * d_in, int size)
{
// position and threadId
int pos = blockIdx.x * blockDim.x + threadIdx.x;
int tid = threadIdx.x;
// do reduction in global memory
for (unsigned int s = blockDim.x / 2; s>0; s>>=1)
{
if (tid < s)
{
if (pos+s < size) // Handling out of bounds
{
d_in[pos] = d_in[pos] + d_in[pos+s];
}
}
__syncthreads();
}
// only thread 0 writes result, as thread
if ((tid==0) && (pos < size))
{
d_out[blockIdx.x] = d_in[pos];
}
}
这是内核包装器
/* -------- KERNEL WRAPPER -------- */
void reduce(int * d_out, int * d_in, int size, int num_threads)
{
// setting up blocks and intermediate result holder
int num_blocks;
if(((size) % num_threads))
{
num_blocks = ((size) / num_threads) + 1;
}
else
{
num_blocks = (size) / num_threads;
}
int * d_intermediate;
cudaMalloc(&d_intermediate, sizeof(int)*num_blocks);
cudaMemset(d_intermediate, 0, sizeof(int)*num_blocks);
int prev_num_blocks;
int i = 1;
int size_rest = 0;
// recursively solving, will run approximately log base num_threads times.
do
{
printf("Round:%.d\n", i);
printf("NumBlocks:%.d\n", num_blocks);
printf("NumThreads:%.d\n", num_threads);
printf("size of array:%.d\n", size);
i++;
reduce_kernel<<<num_blocks, num_threads>>>(d_intermediate, d_in, size);
size_rest = size % num_threads;
size = size / num_threads + size_rest;
// updating input to intermediate
cudaMemcpy(d_in, d_intermediate, sizeof(int)*num_blocks, cudaMemcpyDeviceToDevice);
// Updating num_blocks to reflect how many blocks we now want to compute on
prev_num_blocks = num_blocks;
if(size % num_threads)
{
num_blocks = size / num_threads + 1;
}
else
{
num_blocks = size / num_threads;
}
// updating intermediate
cudaFree(d_intermediate);
cudaMalloc(&d_intermediate, sizeof(int)*num_blocks);
}
while(size > num_threads); // if it is too small, compute rest.
// computing rest
reduce_kernel<<<1, size>>>(d_out, d_in, prev_num_blocks);
}
这里是主要的:
/* -------- MAIN -------- */
int main(int argc, char **argv)
{
printf("@@STARTING@@ \n");
// Setting num_threads
int num_threads = 512;
// Making non-bogus data and setting it on the GPU
const int size = 1<<24;
const int size_out = 1;
int * d_in;
int * d_out;
cudaMalloc(&d_in, sizeof(int)*size);
cudaMalloc(&d_out, sizeof(int)*size_out);
int * h_in = (int *)malloc(size*sizeof(int));
for (int i = 0; i < size; i++) h_in[i] = 1;
cudaMemcpy(d_in, h_in, sizeof(int)*size, cudaMemcpyHostToDevice);
// Running kernel wrapper
reduce(d_out, d_in, size, num_threads);
int result;
cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
printf("\nFINAL SUM IS: %d\n", result);
}
这种编译代码的方法:
nvcc -o my_reduce my_reduce.cu
builds for a compute architecture CC2.0 on CUDA 6.5
该架构在网格中 limited to 65535 blocks(在 x-dimension 中,这是您使用的唯一维度)。
在 size
of 1<<24
,num_threads=512
,启动的块数是:
num_blocks = (size) / num_threads;
即 1<<24/512 或 31250 个块
略高于 1<<25 的某个数字将超过 cc2.0 设备的块限制。
要解决此问题,请使用
进行编译nvcc -o -arch=sm_35 my_reduce my_reduce.cu
这是您的 K40 的 correct compile architecture (i.e. compute capability),并将块限制提高到 2^31-1
任何时候在使用 CUDA 代码时遇到问题,请使用 proper cuda error checking,在 在此寻求帮助之前。即使您不理解错误结果,它也可能会帮助那些试图帮助您的人。