CUDA - Invalid __global__ write of size 4
I have implemented the Mandelbrot set in CUDA. When I use the width and height shown in the attached code, running it under cuda-memcheck reports this error. What is causing it? I thought it might be an overflow of the index into the output vector, but I don't understand why the error occurs in this case and not with other values (for example, when I use height = 16384 * 4 and width = 8192 * 4).
Thank you all for your time.
#include <iostream>
#include <cstdio>
#include <cstdint>
static void
writePPM( int *buf, int width, int height, int Max_Iterations, const char *fn) {
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n");
fprintf(fp, "%d %d\n", width, height);
fprintf(fp, "255\n");
for (int i = 0; i < width*height; ++i) {
// Map the iteration count to colors by just alternating between
// two greys.
char c = (buf[i]== Max_Iterations) ? char(255): 20;
for (int j = 0; j < 3; ++j)
fputc(c, fp);
}
fclose(fp);
printf("Wrote image file %s\n", fn);
}
__device__ static int mandel(float c_re, float c_im, int count) {
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < count; ++i) {
if (z_re * z_re + z_im * z_im > 4.f)
break;
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_im = c_im + new_im;
z_re = c_re + new_re;
}
return i;
}
__global__ void kernel (float x0, float y0, float x1, float y1,
int width, int height, int maxIterations,
int *output)
{
int w= blockIdx.x*blockDim.x+threadIdx.x;
int h= blockIdx.y*blockDim.y+threadIdx.y;
float dx =(x1 - x0) / width;
float dy =(y1 - y0) / height;
if (h<height && w<width) {
//for (int i = 0; i < width; ++i) {
float x = x0 + w * dx;
float y = y0 + h * dy;
int index= (width*h+w);
output[index] = mandel(x, y, maxIterations);
//}
}
}
int main(){
unsigned int width =16384*8;
unsigned int height=8192*8;
float x0 = -2;
float x1 = 1;
float myy0 = -1;
float myy1 = 1;
uint32_t maxIterations = 1024;
size_t THREADSPERBLOCK = 1024;
size_t THREADSPERBLOCK_X = 256;
size_t THREADSPERBLOCK_Y = THREADSPERBLOCK / THREADSPERBLOCK_X;
int *buf_h= (int *)malloc(width * height * sizeof(unsigned int));
int *buf= (int *) malloc (width *height *sizeof(unsigned int));
int num_blocks_x= (width + THREADSPERBLOCK_X-1)/THREADSPERBLOCK_X;
int num_blocks_y=(height + THREADSPERBLOCK_Y-1)/THREADSPERBLOCK_Y;
cudaEvent_t start, stop;
float streamElapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord( start, 0 );
int *buf_d;
int *buff;
cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));
cudaMalloc ((void**) & buff, width *height * sizeof(unsigned int));
cudaMemcpy (buf_d,buf_h,width * height * sizeof(unsigned int),cudaMemcpyHostToDevice);
dim3 gridDims (num_blocks_x,num_blocks_y);
dim3 blockDims(THREADSPERBLOCK_X,THREADSPERBLOCK_Y);
kernel<<<gridDims,blockDims>>>(x0,myy0,x1,myy1,width,height,maxIterations,buf_d);
cudaMemcpy( buf,buf_d, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost );
writePPM(buf, width, height,maxIterations, "mandelbrot-parallel.ppm");
cudaEventRecord( stop, 0);
cudaEventSynchronize( stop);
cudaEventElapsedTime( &streamElapsedTime, start, stop );
cudaEventDestroy( start);
cudaEventDestroy( stop );
cudaFree(buf_d);
cudaFree(buff);
free(buf_h);
free(buf);
printf("\nCUDA stream elapsed time: %f", streamElapsedTime);
return 0;
}
Running under cuda-memcheck, this error is reported several times:
Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
========= Host Frame:./Mandelbrot [0x1d3eb]
========= Host Frame:./Mandelbrot [0x3a63e]
========= Host Frame:./Mandelbrot [0x678c]
========= Host Frame:./Mandelbrot [0x6655]
========= Host Frame:./Mandelbrot [0x66cd]
========= Host Frame:./Mandelbrot [0x63af]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
========= Invalid __global__ write of size 4
========= at 0x00000310 in kernel(float, float, float, float, int, int, int, int*)
========= by thread (176,3,0) in block (92,0,0)
========= Address 0x001972c0 is out of bounds
========= Device Frame:kernel(float, float, float, float, int, int, int, int*) (kernel(float, float, float, float, int, int, int, int*) : 0x310)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
========= Host Frame:./Mandelbrot [0x1d3eb]
========= Host Frame:./Mandelbrot [0x3a63e]
========= Host Frame:./Mandelbrot [0x678c]
========= Host Frame:./Mandelbrot [0x6655]
========= Host Frame:./Mandelbrot [0x66cd]
========= Host Frame:./Mandelbrot [0x63af]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
And finally:
Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
========= Host Frame:./Mandelbrot [0x4a9a0]
========= Host Frame:./Mandelbrot [0x641a]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventElapsedTime.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
========= Host Frame:./Mandelbrot [0x4a641]
========= Host Frame:./Mandelbrot [0x6434]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventDestroy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
========= Host Frame:./Mandelbrot [0x4a7f0]
========= Host Frame:./Mandelbrot [0x6440]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventDestroy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
========= Host Frame:./Mandelbrot [0x4a7f0]
========= Host Frame:./Mandelbrot [0x644c]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
========= Host Frame:./Mandelbrot [0x48350]
========= Host Frame:./Mandelbrot [0x6458]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
========= Host Frame:./Mandelbrot [0x48350]
========= Host Frame:./Mandelbrot [0x6464]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./Mandelbrot [0x5efa]
I believe that it may be due to an overflow error of the index
That is indeed the problem.
I would not understand why I would have the error in this case, and when I input other values, this does not happen (for example, when i have a height= 16384 * 4 and width=8192 * 4).
16384*8*8192*8 = 8,589,934,592
That number does not fit in an unsigned int variable. The calculation overflows.
If we multiply by 4 twice instead of by 8 twice, the product is of course 4 times smaller, and it does not overflow the unsigned int calculation. Later, when that intermediate product is multiplied by sizeof(...), sizeof returns a size_t quantity, so the calculation is converted to 64-bit form at that point.
In short, none of this has anything to do with CUDA. We can fix the problem by casting the quantities to size_t.
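As an illustration only (the variable names simply mirror the ones in the question), the difference between the overflowing computation and the size_t-promoted one looks like this:
unsigned int width  = 16384 * 8;   // 131072
unsigned int height = 8192 * 8;    // 65536
// Both operands are unsigned int, so width*height is computed in 32 bits and wraps
// before sizeof() widens the result to size_t:
size_t bytes_bad  = width * height * sizeof(unsigned int);
// Casting one operand first forces the whole product into 64-bit size_t arithmetic:
size_t bytes_good = (size_t)width * height * sizeof(unsigned int);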
The other issue is one of memory size. Given the desired numbers and the calculation above, you need a 32GB device buffer:
cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));
And you are also doing this:
cudaMalloc ((void**) & buff, width *height * sizeof(unsigned int));
The buff allocation isn't actually used anywhere in your code, but of course it requires another 32GB. So unless you are running on something like an A100 80GB GPU, that is not going to work.
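As a side note, if you are unsure whether a given GPU can hold a buffer of this size, a quick sanity check with cudaMemGetInfo (the surrounding error handling here is just a sketch) will tell you before the allocation fails:
size_t needed = (size_t)width * height * sizeof(unsigned int);
size_t freeMem = 0, totalMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);   // free and total device memory, in bytes
if (needed > freeMem) {
    fprintf(stderr, "Need %zu bytes on the device but only %zu are free\n", needed, freeMem);
    return 1;
}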
The GPU I am testing on has 32GB, so if I remove the unnecessary allocation and reduce the GPU memory requirement to 16GB, I get code that runs with no errors under cuda-memcheck (although it takes quite a while to run):
#include <iostream>
#include <cstdio>
#include <cstdint>
static void
writePPM( int *buf, size_t width, size_t height, int Max_Iterations, const char *fn) {
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n");
fprintf(fp, "%lu %lu\n", width, height);
fprintf(fp, "255\n");
for (size_t i = 0; i < width*height; ++i) {
// Map the iteration count to colors by just alternating between
// two greys.
char c = (buf[i]== Max_Iterations) ? char(255): 20;
for (int j = 0; j < 3; ++j)
fputc(c, fp);
}
fclose(fp);
printf("Wrote image file %s\n", fn);
}
__device__ static int mandel(float c_re, float c_im, int count) {
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < count; ++i) {
if (z_re * z_re + z_im * z_im > 4.f)
break;
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
z_im = c_im + new_im;
z_re = c_re + new_re;
}
return i;
}
__global__ void kernel (float x0, float y0, float x1, float y1,
size_t width, size_t height, int maxIterations,
int *output)
{
size_t w= blockIdx.x*blockDim.x+threadIdx.x;
size_t h= blockIdx.y*blockDim.y+threadIdx.y;
float dx =(x1 - x0) / width;
float dy =(y1 - y0) / height;
if (h<height && w<width) {
//for (int i = 0; i < width; ++i) {
float x = x0 + w * dx;
float y = y0 + h * dy;
size_t index= (width*h+w);
output[index] = mandel(x, y, maxIterations);
//}
}
}
int main(){
size_t width =16384*4;
size_t height=8192*8;
float x0 = -2;
float x1 = 1;
float myy0 = -1;
float myy1 = 1;
uint32_t maxIterations = 1024;
size_t THREADSPERBLOCK = 1024;
size_t THREADSPERBLOCK_X = 256;
size_t THREADSPERBLOCK_Y = THREADSPERBLOCK / THREADSPERBLOCK_X;
int *buf_h= (int *)malloc(width * height * sizeof(unsigned int));
int *buf= (int *) malloc (width *height *sizeof(unsigned int));
int num_blocks_x= (width + THREADSPERBLOCK_X-1)/THREADSPERBLOCK_X;
int num_blocks_y=(height + THREADSPERBLOCK_Y-1)/THREADSPERBLOCK_Y;
cudaEvent_t start, stop;
float streamElapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord( start, 0 );
int *buf_d;
cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));
cudaMemcpy (buf_d,buf_h,width * height * sizeof(unsigned int),cudaMemcpyHostToDevice);
dim3 gridDims (num_blocks_x,num_blocks_y);
dim3 blockDims(THREADSPERBLOCK_X,THREADSPERBLOCK_Y);
kernel<<<gridDims,blockDims>>>(x0,myy0,x1,myy1,width,height,maxIterations,buf_d);
cudaMemcpy( buf,buf_d, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost );
writePPM(buf, width, height,maxIterations, "mandelbrot-parallel.ppm");
cudaEventRecord( stop, 0);
cudaEventSynchronize( stop);
cudaEventElapsedTime( &streamElapsedTime, start, stop );
cudaEventDestroy( start);
cudaEventDestroy( stop );
cudaFree(buf_d);
free(buf_h);
free(buf);
printf("\nCUDA stream elapsed time: %f", streamElapsedTime);
return 0;
}