CUDA-大小为 4 的 __global__ 写入无效

CUDA- Invalid __global__ write of size 4

我已经在 Cuda 中实现了 Mandelbrot 集。当我输入附加代码中存在的高度和宽度时,我通过 运行 cuda-memcheck 命令收到此错误。它是由什么引起的?我认为这可能是由于输出结果向量的索引索引溢出错误,但我不明白为什么在这种情况下会出现错误,而当我输入其他值时,不会发生这种情况(例如,当我有一个高度= 16384 * 4和宽度= 8192 * 4)。 谢谢大家抽出时间。


#include <iostream> 
    
static void
writePPM( int *buf, int width, int height, int Max_Iterations, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");
    for (int i = 0; i < width*height; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.
        char c = (buf[i]== Max_Iterations) ? char(255): 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}

__device__ static int mandel(float c_re, float c_im, int count) {
   
   
   float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
        z_im = c_im + new_im;
        z_re = c_re + new_re;
      
    }

    return i;
}


__global__ void kernel (float x0, float y0, float x1, float y1,
                       int width, int height, int maxIterations,
                       int  *output)
{

    int w= blockIdx.x*blockDim.x+threadIdx.x;
    int h= blockIdx.y*blockDim.y+threadIdx.y; 

    float dx =(x1 - x0) / width;
    float dy =(y1 - y0) / height;
    
    if (h<height && w<width) {
        //for (int i = 0; i < width; ++i) {
            float x = x0 + w * dx;
            float y = y0 + h * dy;
        int index= (width*h+w);
            output[index] = mandel(x, y, maxIterations);
        //}
    }
}



int main(){
    

    unsigned int width =16384*8;
    unsigned int height=8192*8;
        float x0 = -2;
        float x1 = 1;
        float myy0 = -1;
        float myy1 = 1;
        
    uint32_t maxIterations = 1024; 
    size_t THREADSPERBLOCK = 1024;
    size_t THREADSPERBLOCK_X = 256;
    size_t THREADSPERBLOCK_Y = THREADSPERBLOCK / THREADSPERBLOCK_X;
    
    
         
    

    int *buf_h= (int *)malloc(width * height * sizeof(unsigned int));
    int *buf= (int *) malloc (width *height *sizeof(unsigned int)); 

    
    int num_blocks_x= (width + THREADSPERBLOCK_X-1)/THREADSPERBLOCK_X;
    int num_blocks_y=(height + THREADSPERBLOCK_Y-1)/THREADSPERBLOCK_Y;
    
    cudaEvent_t start, stop;
    float streamElapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord( start, 0 );

    
    int *buf_d;
    int *buff;
    
    cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));
    cudaMalloc ((void**) & buff, width *height * sizeof(unsigned int)); 

    
    cudaMemcpy (buf_d,buf_h,width * height * sizeof(unsigned int),cudaMemcpyHostToDevice);

    dim3 gridDims (num_blocks_x,num_blocks_y);
    dim3 blockDims(THREADSPERBLOCK_X,THREADSPERBLOCK_Y);


        
    kernel<<<gridDims,blockDims>>>(x0,myy0,x1,myy1,width,height,maxIterations,buf_d);

    cudaMemcpy( buf,buf_d, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost );
    writePPM(buf, width, height,maxIterations, "mandelbrot-parallel.ppm");  
    cudaEventRecord( stop, 0);
    cudaEventSynchronize( stop);
    cudaEventElapsedTime( &streamElapsedTime, start, stop );
    cudaEventDestroy( start);
    cudaEventDestroy( stop );
    
    cudaFree(buf_d);
    cudaFree(buff); 

    free(buf_h);
    free(buf);

    printf("\nCUDA stream elapsed time:  %f", streamElapsedTime);

    
    return 0;
}**strong text**

运行 cuda-memcheck 这个错误我报了好几次:

Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
=========     Host Frame:./Mandelbrot [0x1d3eb]
=========     Host Frame:./Mandelbrot [0x3a63e]
=========     Host Frame:./Mandelbrot [0x678c]
=========     Host Frame:./Mandelbrot [0x6655]
=========     Host Frame:./Mandelbrot [0x66cd]
=========     Host Frame:./Mandelbrot [0x63af]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Invalid __global__ write of size 4
=========     at 0x00000310 in kernel(float, float, float, float, int, int, int, int*)
=========     by thread (176,3,0) in block (92,0,0)
=========     Address 0x001972c0 is out of bounds
=========     Device Frame:kernel(float, float, float, float, int, int, int, int*) (kernel(float, float, float, float, int, int, int, int*) : 0x310)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2fe) [0x28ccce]
=========     Host Frame:./Mandelbrot [0x1d3eb]
=========     Host Frame:./Mandelbrot [0x3a63e]
=========     Host Frame:./Mandelbrot [0x678c]
=========     Host Frame:./Mandelbrot [0x6655]
=========     Host Frame:./Mandelbrot [0x66cd]
=========     Host Frame:./Mandelbrot [0x63af]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========

最后:

 Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a9a0]
=========     Host Frame:./Mandelbrot [0x641a]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventElapsedTime.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a641]
=========     Host Frame:./Mandelbrot [0x6434]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventDestroy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a7f0]
=========     Host Frame:./Mandelbrot [0x6440]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaEventDestroy.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x4a7f0]
=========     Host Frame:./Mandelbrot [0x644c]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x48350]
=========     Host Frame:./Mandelbrot [0x6458]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]
=========

========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3a0403]
=========     Host Frame:./Mandelbrot [0x48350]
=========     Host Frame:./Mandelbrot [0x6464]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./Mandelbrot [0x5efa]

I believe that it may be due to an overflow error of the index

这当然是个问题。

I would not understand why I would have the error in this case, and when I input other values, this does not happen (for example, when i have a height= 16384 * 4 and width=8192 * 4).

16384*8*8192*8 = 8,589,934,592

该数字不适合 unsigned int 变量。它会溢出计算。

如果我们乘以 4 两次而不是乘以 8 两次,那么乘积当然会变小 4 倍,并且不会溢出 unsigned int 计算。稍后,当中间产品乘以 sizeof(...) 时,sizeof returns 一个 size_t 的数量,因此计算转换为 64 位形式 那时.

总之,none这跟CUDA有关系。我们可以通过将数量转换为 size_t.

来解决问题

另一个问题是内存大小的问题。给定所需的数字和上述计算,您需要 32GB 的设备缓冲区:

cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));

你也在这样做:

cudaMalloc ((void**) & buff, width *height * sizeof(unsigned int)); 

buff 分配实际上并没有在您的代码中的任何地方使用,但是它当然需要另外 32GB。因此,除非您 运行 在 A100 80GB GPU 上运行,否则这是行不通的。

我正在测试的 GPU 有 32GB,所以如果我删除不必要的分配,并将 GPU 内存要求降低到 16GB,我得到 运行s 在 cuda-memcheck 下没有错误的代码(但是 运行 需要相当长的时间):

#include <iostream>

static void
writePPM( int *buf, size_t width, size_t height, int Max_Iterations, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    fprintf(fp, "P6\n");
    fprintf(fp, "%lu %lu\n", width, height);
    fprintf(fp, "255\n");
    for (size_t i = 0; i < width*height; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.
        char c = (buf[i]== Max_Iterations) ? char(255): 20;
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", fn);
}

__device__ static int mandel(float c_re, float c_im, int count) {


   float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
        z_im = c_im + new_im;
        z_re = c_re + new_re;

    }

    return i;
}


__global__ void kernel (float x0, float y0, float x1, float y1,
                       size_t width, size_t height, int maxIterations,
                       int  *output)
{

    size_t w= blockIdx.x*blockDim.x+threadIdx.x;
    size_t h= blockIdx.y*blockDim.y+threadIdx.y;

    float dx =(x1 - x0) / width;
    float dy =(y1 - y0) / height;

    if (h<height && w<width) {
        //for (int i = 0; i < width; ++i) {
            float x = x0 + w * dx;
            float y = y0 + h * dy;
        size_t index= (width*h+w);
            output[index] = mandel(x, y, maxIterations);
        //}
    }
}



int main(){


    size_t width =16384*4;
    size_t height=8192*8;
        float x0 = -2;
        float x1 = 1;
        float myy0 = -1;
        float myy1 = 1;

    uint32_t maxIterations = 1024;
    size_t THREADSPERBLOCK = 1024;
    size_t THREADSPERBLOCK_X = 256;
    size_t THREADSPERBLOCK_Y = THREADSPERBLOCK / THREADSPERBLOCK_X;





    int *buf_h= (int *)malloc(width * height * sizeof(unsigned int));
    int *buf= (int *) malloc (width *height *sizeof(unsigned int));


    int num_blocks_x= (width + THREADSPERBLOCK_X-1)/THREADSPERBLOCK_X;
    int num_blocks_y=(height + THREADSPERBLOCK_Y-1)/THREADSPERBLOCK_Y;

    cudaEvent_t start, stop;
    float streamElapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord( start, 0 );


    int *buf_d;

    cudaMalloc ((void **) & buf_d,width * height * sizeof(unsigned int));


    cudaMemcpy (buf_d,buf_h,width * height * sizeof(unsigned int),cudaMemcpyHostToDevice);

    dim3 gridDims (num_blocks_x,num_blocks_y);
    dim3 blockDims(THREADSPERBLOCK_X,THREADSPERBLOCK_Y);



    kernel<<<gridDims,blockDims>>>(x0,myy0,x1,myy1,width,height,maxIterations,buf_d);

    cudaMemcpy( buf,buf_d, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost );
    writePPM(buf, width, height,maxIterations, "mandelbrot-parallel.ppm");
    cudaEventRecord( stop, 0);
    cudaEventSynchronize( stop);
    cudaEventElapsedTime( &streamElapsedTime, start, stop );
    cudaEventDestroy( start);
    cudaEventDestroy( stop );

    cudaFree(buf_d);

    free(buf_h);
    free(buf);

    printf("\nCUDA stream elapsed time:  %f", streamElapsedTime);


    return 0;
}