内核启动失败:参数无效,cuda 运行时错误

Kernel launch failed: invalid argument ,cuda runtime error

我正在尝试在附加代码中启动内核。我正在接受按摩 "kernel launched failed:invalid argument"。

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
    if(cudaSuccess != err)
    {
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
        exit(-1);
    }
}

static const int MAX_FILTER_WIDTH = 7;

char *image_filename = "lena_bw_big.pgm";
char *out_filename = "lena_bw.out.pgm";
char *results_filename = "results.log";

// Loads filter configuration parameters from the command line
void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared)
{
    //forward declaration of a function that is being used here
    void parse_coefs(const char* coefs_txt, int radius, float* coefs);

    char* coefs_txt;

    if (argv==NULL || filt_width==NULL || factor==NULL || bias==NULL || coefs==NULL)
    {
        printf("Error: Bad params to load_coefs\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "filter_width"))
    {
        *filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width");
        if (*filt_width < 1 || *filt_width >  MAX_FILTER_WIDTH || (*filt_width % 2) != 1)
        {
            printf("Error: Invalid filter width (%d)\n",*filt_width);
            exit(-1);
        }
    }
    else
    {
        printf("Error: Filter width is not specified\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "bias"))
        *bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias");
    else
    {
        printf("Error: Bias is not specified\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "factor"))
        *factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor");
    else
    {
        printf("Error: Factor is not specified\n");
        exit(-1);
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "coefs"))
        getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt);

    parse_coefs(coefs_txt,*filt_width,coefs);

    if (checkCmdLineFlag(argc, (const char **)argv, "shared"))
        *use_shared = true;
    else
        *use_shared = false;
}

// Parse filter coefficients from string. The number of coefficients should be radius*radius.
void parse_coefs(const char* coefs_txt, int filt_width, float* coefs)
{
    const char* ptxt = coefs_txt;
    int skip_chars;

    memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float));

    for (int i = filt_width - 1; i >= 0; i--)
    {
        for (int j = filt_width - 1; j >= 0; j--)
        {
            if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1)
            {
                printf("Error: Not enough coefficients. Read %d/%d coefficients.\n",i*filt_width+j,filt_width*filt_width);
                exit(-1);
            }
            ptxt += skip_chars+1;
        }
    }
}

__global__ void convolution2D_kernel(
        unsigned char* inputImage,
        unsigned char* outputImage,
        float* filter,
        int imageWidth,
        int imageHeight,
        int imagePitch,
        int filterWidth,
        float hfactor,
        float hbias
        )
{/*
    int idx=blockDim.x*blockIdx.x+threadIdx.x;
    int idy=blockDim.y*blockIdx.y+threadIdx.y;
    if(0<idx<imageWidth && 0<idy<imageHeight){
        float sum = 0.f;

        //multiply every value of the filter with corresponding image pixel
        for(int filterX = 0; filterX < filterWidth; filterX++)
        for(int filterY = 0; filterY < filterWidth; filterY++)
        {
            int imageX = idx - filterWidth / 2 + filterX;
            int imageY = idy - filterWidth / 2 + filterY;
            if (imageX >=0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) {
                sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth];
            }
            //sum*=hfactor;
            //sum+=hbias;
            //sum=
                                                                                                                                                                                                                                                                                                                                                                                                                                                        //truncate values smaller than zero and larger than 255
            outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255);

        }
    }*/
}

__global__ void convolution2DShared_kernel(
        unsigned char* inputImage,
        unsigned char* outputImage,
        int imageWidth,
        int imageHeight,
        int imagePitch,
        int filterWidth
        )
{

}

void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height,
                int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared)
{
    // Allocate device memory
    unsigned char *d_in=NULL, *d_out=NULL;
    float *d_filter=NULL;

    int imgSize=sizeof(float)*width*height;
    int filterSize=sizeof(float)*hfilt_width*hfilt_width;

    int blockWidth=32;
    int gridx=width/blockWidth;
    if(width%blockWidth!=0)
        gridx++;
    printf("gridx size is %d\n",gridx);
    int gridy=height/blockWidth;
    if(height%blockWidth!=0)
        gridy++;
    printf("gridy size is %d\n",gridy);
    printf("blockWidth size is %d\n",blockWidth);

    // measure execution time
    cudaEvent_t start,stop;
    const int iters = 10;

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, NULL);

    printf("allocating mem\n");
    cudaMalloc((void **) d_in, imgSize);
    cudaMalloc((void **) d_out, imgSize);
    cudaMalloc((void **) &d_filter, filterSize);


    cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice);

    // Setup execution parameters
    dim3 threads(blockWidth, blockWidth);       
    dim3 grid(gridx,gridy);
    printf("kernel starts\n");
    // calculate execution time average over iters iterations
    for (int i=0; i<iters; i++)
    {
        if (!use_shared)
            convolution2D_kernel<<<grid,threads>>>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias);
        else
            convolution2DShared_kernel<<<grid,threads>>>(d_in, d_out, width, height, width, hfilt_width);
    }

    checkCudaErrors(cudaEventRecord(stop, NULL));
    checkCudaErrors(cudaEventSynchronize(stop));

    // check for errors during kernel launch
    cudaError_t err;
    if ((err = cudaGetLastError()) != cudaSuccess)
    {
        printf("Kernel launch failed: %s",cudaGetErrorString(err));
        exit(1);
    }

    float msec = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));

    printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms\n",
            hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters);

    // write results to results file
    unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000};
    if (true != sdkWriteFile(results_filename,result_values,6,0,false,true))
    {
        printf("Error: Writing results file failed.");
        exit(1);
    }

    cudaFree(d_in);
    cudaFree(d_out);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height,
        int hfilt_width, float hfactor, float hbias, float* hcoefs)
{
    for(int x = 0; x < width; x++)
        for(int y = 0; y < height; y++)
        {
            float sum = 0.f;

            //multiply every value of the filter with corresponding image pixel
            for(int filterX = 0; filterX < hfilt_width; filterX++)
            for(int filterY = 0; filterY < hfilt_width; filterY++)
            {
                int imageX = x - hfilt_width / 2 + filterX;
                int imageY = y - hfilt_width / 2 + filterY;
                if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) {
                    sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH];
                }
            }

            //truncate values smaller than zero and larger than 255
            output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255);
        }
}


/**
* Program main
*/
int main(int argc, char **argv)
{
    unsigned char* h_inimg = NULL;
    unsigned char* h_outimg = NULL;
    unsigned char* h_refimg = NULL;
    unsigned int width, height;
    int hfilt_width = -1;
    float hfactor = 1.f, hbias = 0.f;
    float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
    bool use_shared = false;

    // load parameters of filter
    if (argc > 1)
        load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared);
    else {
        hfilt_width = 5;
        hfactor = 1.0f / 13.0f;
        hbias = 0.0f;
        parse_coefs(
            "0,0,1,0,0,"
            "0,1,1,1,0,"
            "1,1,1,1,1,"
            "0,1,1,1,0,"
            "0,0,1,0,0,",
            hfilt_width,hcoefs);
    }


    char* image_path = sdkFindFilePath(image_filename, argv[0]);
    if (image_path == NULL) {
        printf("Unable to source image file: %s\n", image_filename);
        exit(-1);
    }
    // Load image from disk
    sdkLoadPGM(image_path, &h_inimg, &width, &height);
    h_outimg = (unsigned char*)malloc(width * height);

    printf("Starting convolution\n");
    convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared);

    printf("Validating...\n");
    h_refimg = (unsigned char*)malloc(width * height);
    convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs);
    int err_cnt = 0;
    for (int r=0; r<height; r++)
        for (int c=0; c<width; c++)
            if (h_outimg[c+r*width]!=h_refimg[c+r*width])
            {
                ++err_cnt;
                printf("Err %2d: [%d,%d] GPU %d | CPU %d\n",err_cnt,r,c,h_outimg[c+r*width],h_refimg[c+r*width]);
                if(err_cnt > 4)
                {
                    printf("Terminating...\n");
                    exit(1);
                }
            }
    if (0 == err_cnt)
        printf("OK\n");

    // Save image
    sdkSavePGM(out_filename,h_outimg,width,height);

    free(h_inimg);
    free(h_outimg);
}

如果我将第 191 行放入注释中,一切都运行良好且花花公子(内核中没有数据)。

谁能指出将数据传送到内核的正确方法?

首先,您在 proper cuda error checking 方面做得不够好。您应该检查 每个 CUDA API 调用的 return 值。

如果您这样做了,您会发现 "invalid argument" 错误与您的内核启动 没有任何关系,但是因为那是您唯一的地方正在检查错误,在那里收到报告。

实际错误发生在这些行上:

cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);

您可以通过添加 necessary ampersands:

来修复它
cudaMalloc((void **) &d_in, imgSize);
cudaMalloc((void **) &d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);

修复该错误后,您会发现下一个错误是 cudaMemcpy 操作的段错误:

cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);

根本原因在这里:

int imgSize=sizeof(float)*width*height;
            ^^^^^^^^^^^^^

因为你的 d_inunsigned char 而你的 input_imgunsigned char,我不确定你为什么认为你应该将图像大小乘以 sizeof(float).无论如何,将该行更改为:

int imgSize=width*height;

将修复段错误。进行这些更改可以让您的代码 运行 对我来说没有任何 CUDA 错误。显然结果是假的,因为你的内核什么都不做。