内核启动失败:参数无效,cuda 运行时错误
Kernel launch failed: invalid argument ,cuda runtime error
我正在尝试在附加代码中启动内核。我正在接受按摩 "kernel launched failed:invalid argument"。
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
if(cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}
static const int MAX_FILTER_WIDTH = 7;
char *image_filename = "lena_bw_big.pgm";
char *out_filename = "lena_bw.out.pgm";
char *results_filename = "results.log";
// Loads filter configuration parameters from the command line
void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared)
{
//forward declaration of a function that is being used here
void parse_coefs(const char* coefs_txt, int radius, float* coefs);
char* coefs_txt;
if (argv==NULL || filt_width==NULL || factor==NULL || bias==NULL || coefs==NULL)
{
printf("Error: Bad params to load_coefs\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "filter_width"))
{
*filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width");
if (*filt_width < 1 || *filt_width > MAX_FILTER_WIDTH || (*filt_width % 2) != 1)
{
printf("Error: Invalid filter width (%d)\n",*filt_width);
exit(-1);
}
}
else
{
printf("Error: Filter width is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "bias"))
*bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias");
else
{
printf("Error: Bias is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "factor"))
*factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor");
else
{
printf("Error: Factor is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "coefs"))
getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt);
parse_coefs(coefs_txt,*filt_width,coefs);
if (checkCmdLineFlag(argc, (const char **)argv, "shared"))
*use_shared = true;
else
*use_shared = false;
}
// Parse filter coefficients from string. The number of coefficients should be radius*radius.
void parse_coefs(const char* coefs_txt, int filt_width, float* coefs)
{
const char* ptxt = coefs_txt;
int skip_chars;
memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float));
for (int i = filt_width - 1; i >= 0; i--)
{
for (int j = filt_width - 1; j >= 0; j--)
{
if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1)
{
printf("Error: Not enough coefficients. Read %d/%d coefficients.\n",i*filt_width+j,filt_width*filt_width);
exit(-1);
}
ptxt += skip_chars+1;
}
}
}
__global__ void convolution2D_kernel(
unsigned char* inputImage,
unsigned char* outputImage,
float* filter,
int imageWidth,
int imageHeight,
int imagePitch,
int filterWidth,
float hfactor,
float hbias
)
{/*
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;
if(0<idx<imageWidth && 0<idy<imageHeight){
float sum = 0.f;
//multiply every value of the filter with corresponding image pixel
for(int filterX = 0; filterX < filterWidth; filterX++)
for(int filterY = 0; filterY < filterWidth; filterY++)
{
int imageX = idx - filterWidth / 2 + filterX;
int imageY = idy - filterWidth / 2 + filterY;
if (imageX >=0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) {
sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth];
}
//sum*=hfactor;
//sum+=hbias;
//sum=
//truncate values smaller than zero and larger than 255
outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255);
}
}*/
}
__global__ void convolution2DShared_kernel(
unsigned char* inputImage,
unsigned char* outputImage,
int imageWidth,
int imageHeight,
int imagePitch,
int filterWidth
)
{
}
void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height,
int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared)
{
// Allocate device memory
unsigned char *d_in=NULL, *d_out=NULL;
float *d_filter=NULL;
int imgSize=sizeof(float)*width*height;
int filterSize=sizeof(float)*hfilt_width*hfilt_width;
int blockWidth=32;
int gridx=width/blockWidth;
if(width%blockWidth!=0)
gridx++;
printf("gridx size is %d\n",gridx);
int gridy=height/blockWidth;
if(height%blockWidth!=0)
gridy++;
printf("gridy size is %d\n",gridy);
printf("blockWidth size is %d\n",blockWidth);
// measure execution time
cudaEvent_t start,stop;
const int iters = 10;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cudaEventRecord(start, NULL);
printf("allocating mem\n");
cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice);
// Setup execution parameters
dim3 threads(blockWidth, blockWidth);
dim3 grid(gridx,gridy);
printf("kernel starts\n");
// calculate execution time average over iters iterations
for (int i=0; i<iters; i++)
{
if (!use_shared)
convolution2D_kernel<<<grid,threads>>>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias);
else
convolution2DShared_kernel<<<grid,threads>>>(d_in, d_out, width, height, width, hfilt_width);
}
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
// check for errors during kernel launch
cudaError_t err;
if ((err = cudaGetLastError()) != cudaSuccess)
{
printf("Kernel launch failed: %s",cudaGetErrorString(err));
exit(1);
}
float msec = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));
printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms\n",
hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters);
// write results to results file
unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000};
if (true != sdkWriteFile(results_filename,result_values,6,0,false,true))
{
printf("Error: Writing results file failed.");
exit(1);
}
cudaFree(d_in);
cudaFree(d_out);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height,
int hfilt_width, float hfactor, float hbias, float* hcoefs)
{
for(int x = 0; x < width; x++)
for(int y = 0; y < height; y++)
{
float sum = 0.f;
//multiply every value of the filter with corresponding image pixel
for(int filterX = 0; filterX < hfilt_width; filterX++)
for(int filterY = 0; filterY < hfilt_width; filterY++)
{
int imageX = x - hfilt_width / 2 + filterX;
int imageY = y - hfilt_width / 2 + filterY;
if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) {
sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH];
}
}
//truncate values smaller than zero and larger than 255
output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255);
}
}
/**
* Program main
*/
int main(int argc, char **argv)
{
unsigned char* h_inimg = NULL;
unsigned char* h_outimg = NULL;
unsigned char* h_refimg = NULL;
unsigned int width, height;
int hfilt_width = -1;
float hfactor = 1.f, hbias = 0.f;
float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
bool use_shared = false;
// load parameters of filter
if (argc > 1)
load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared);
else {
hfilt_width = 5;
hfactor = 1.0f / 13.0f;
hbias = 0.0f;
parse_coefs(
"0,0,1,0,0,"
"0,1,1,1,0,"
"1,1,1,1,1,"
"0,1,1,1,0,"
"0,0,1,0,0,",
hfilt_width,hcoefs);
}
char* image_path = sdkFindFilePath(image_filename, argv[0]);
if (image_path == NULL) {
printf("Unable to source image file: %s\n", image_filename);
exit(-1);
}
// Load image from disk
sdkLoadPGM(image_path, &h_inimg, &width, &height);
h_outimg = (unsigned char*)malloc(width * height);
printf("Starting convolution\n");
convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared);
printf("Validating...\n");
h_refimg = (unsigned char*)malloc(width * height);
convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs);
int err_cnt = 0;
for (int r=0; r<height; r++)
for (int c=0; c<width; c++)
if (h_outimg[c+r*width]!=h_refimg[c+r*width])
{
++err_cnt;
printf("Err %2d: [%d,%d] GPU %d | CPU %d\n",err_cnt,r,c,h_outimg[c+r*width],h_refimg[c+r*width]);
if(err_cnt > 4)
{
printf("Terminating...\n");
exit(1);
}
}
if (0 == err_cnt)
printf("OK\n");
// Save image
sdkSavePGM(out_filename,h_outimg,width,height);
free(h_inimg);
free(h_outimg);
}
如果我将第 191 行放入注释中,一切都运行良好且花花公子(内核中没有数据)。
谁能指出将数据传送到内核的正确方法?
首先,您在 proper cuda error checking 方面做得不够好。您应该检查 每个 CUDA API 调用的 return 值。
如果您这样做了,您会发现 "invalid argument" 错误与您的内核启动 没有任何关系,但是因为那是您唯一的地方正在检查错误,在那里收到报告。
实际错误发生在这些行上:
cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
您可以通过添加 necessary ampersands:
来修复它
cudaMalloc((void **) &d_in, imgSize);
cudaMalloc((void **) &d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
修复该错误后,您会发现下一个错误是 cudaMemcpy
操作的段错误:
cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
根本原因在这里:
int imgSize=sizeof(float)*width*height;
^^^^^^^^^^^^^
因为你的 d_in
是 unsigned char
而你的 input_img
是 unsigned char
,我不确定你为什么认为你应该将图像大小乘以 sizeof(float)
.无论如何,将该行更改为:
int imgSize=width*height;
将修复段错误。进行这些更改可以让您的代码 运行 对我来说没有任何 CUDA 错误。显然结果是假的,因为你的内核什么都不做。
我正在尝试在附加代码中启动内核。我正在接受按摩 "kernel launched failed:invalid argument"。
// System includes
#include <stdio.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
if(cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
exit(-1);
}
}
static const int MAX_FILTER_WIDTH = 7;
char *image_filename = "lena_bw_big.pgm";
char *out_filename = "lena_bw.out.pgm";
char *results_filename = "results.log";
// Loads filter configuration parameters from the command line
void load_filter(int argc, char** argv, int* filt_width, float* factor, float* bias, float* coefs, bool* use_shared)
{
//forward declaration of a function that is being used here
void parse_coefs(const char* coefs_txt, int radius, float* coefs);
char* coefs_txt;
if (argv==NULL || filt_width==NULL || factor==NULL || bias==NULL || coefs==NULL)
{
printf("Error: Bad params to load_coefs\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "filter_width"))
{
*filt_width = getCmdLineArgumentInt(argc, (const char **)argv, "filter_width");
if (*filt_width < 1 || *filt_width > MAX_FILTER_WIDTH || (*filt_width % 2) != 1)
{
printf("Error: Invalid filter width (%d)\n",*filt_width);
exit(-1);
}
}
else
{
printf("Error: Filter width is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "bias"))
*bias = getCmdLineArgumentFloat(argc, (const char **)argv, "bias");
else
{
printf("Error: Bias is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "factor"))
*factor = getCmdLineArgumentFloat(argc, (const char **)argv, "factor");
else
{
printf("Error: Factor is not specified\n");
exit(-1);
}
if (checkCmdLineFlag(argc, (const char **)argv, "coefs"))
getCmdLineArgumentString(argc, (const char **)argv, "coefs",&coefs_txt);
parse_coefs(coefs_txt,*filt_width,coefs);
if (checkCmdLineFlag(argc, (const char **)argv, "shared"))
*use_shared = true;
else
*use_shared = false;
}
// Parse filter coefficients from string. The number of coefficients should be radius*radius.
void parse_coefs(const char* coefs_txt, int filt_width, float* coefs)
{
const char* ptxt = coefs_txt;
int skip_chars;
memset(coefs,0,MAX_FILTER_WIDTH*MAX_FILTER_WIDTH*sizeof(float));
for (int i = filt_width - 1; i >= 0; i--)
{
for (int j = filt_width - 1; j >= 0; j--)
{
if (sscanf(ptxt,"%f%n", &coefs[i*MAX_FILTER_WIDTH+j], &skip_chars) != 1)
{
printf("Error: Not enough coefficients. Read %d/%d coefficients.\n",i*filt_width+j,filt_width*filt_width);
exit(-1);
}
ptxt += skip_chars+1;
}
}
}
__global__ void convolution2D_kernel(
unsigned char* inputImage,
unsigned char* outputImage,
float* filter,
int imageWidth,
int imageHeight,
int imagePitch,
int filterWidth,
float hfactor,
float hbias
)
{/*
int idx=blockDim.x*blockIdx.x+threadIdx.x;
int idy=blockDim.y*blockIdx.y+threadIdx.y;
if(0<idx<imageWidth && 0<idy<imageHeight){
float sum = 0.f;
//multiply every value of the filter with corresponding image pixel
for(int filterX = 0; filterX < filterWidth; filterX++)
for(int filterY = 0; filterY < filterWidth; filterY++)
{
int imageX = idx - filterWidth / 2 + filterX;
int imageY = idy - filterWidth / 2 + filterY;
if (imageX >=0 && imageX < imageWidth && imageY >=0 && imageY < imageHeight) {
sum += inputImage[imageX+imageWidth*imageY] * filter[filterX + filterY*filterWidth];
}
//sum*=hfactor;
//sum+=hbias;
//sum=
//truncate values smaller than zero and larger than 255
outputImage[idx+imageWidth*idy] = fminf(fmaxf(int(hfactor * sum + hbias), 0), 255);
}
}*/
}
__global__ void convolution2DShared_kernel(
unsigned char* inputImage,
unsigned char* outputImage,
int imageWidth,
int imageHeight,
int imagePitch,
int filterWidth
)
{
}
void convolution2D(unsigned char* input_img, unsigned char* output_img, float* hfilter, int width, int height,
int hfilt_width, float hfactor, float hbias, float* hcoefs, bool use_shared)
{
// Allocate device memory
unsigned char *d_in=NULL, *d_out=NULL;
float *d_filter=NULL;
int imgSize=sizeof(float)*width*height;
int filterSize=sizeof(float)*hfilt_width*hfilt_width;
int blockWidth=32;
int gridx=width/blockWidth;
if(width%blockWidth!=0)
gridx++;
printf("gridx size is %d\n",gridx);
int gridy=height/blockWidth;
if(height%blockWidth!=0)
gridy++;
printf("gridy size is %d\n",gridy);
printf("blockWidth size is %d\n",blockWidth);
// measure execution time
cudaEvent_t start,stop;
const int iters = 10;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
cudaEventRecord(start, NULL);
printf("allocating mem\n");
cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
cudaMemcpy(d_filter,hfilter,filterSize,cudaMemcpyHostToDevice);
// Setup execution parameters
dim3 threads(blockWidth, blockWidth);
dim3 grid(gridx,gridy);
printf("kernel starts\n");
// calculate execution time average over iters iterations
for (int i=0; i<iters; i++)
{
if (!use_shared)
convolution2D_kernel<<<grid,threads>>>(d_in, d_out, d_filter, width, height, width, hfilt_width, hfactor, hbias);
else
convolution2DShared_kernel<<<grid,threads>>>(d_in, d_out, width, height, width, hfilt_width);
}
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
// check for errors during kernel launch
cudaError_t err;
if ((err = cudaGetLastError()) != cudaSuccess)
{
printf("Kernel launch failed: %s",cudaGetErrorString(err));
exit(1);
}
float msec = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msec, start, stop));
printf("Applying %dx%d filter on image of size %dx%d %s using shared memory took %f ms\n",
hfilt_width,hfilt_width,width,height,(use_shared?"with":"without"),msec/iters);
// write results to results file
unsigned long long result_values[] = {hfilt_width,hfilt_width,width,height,use_shared,msec/iters*1000};
if (true != sdkWriteFile(results_filename,result_values,6,0,false,true))
{
printf("Error: Writing results file failed.");
exit(1);
}
cudaFree(d_in);
cudaFree(d_out);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void convolution_cpu(unsigned char* input_img, unsigned char* output_img, int width, int height,
int hfilt_width, float hfactor, float hbias, float* hcoefs)
{
for(int x = 0; x < width; x++)
for(int y = 0; y < height; y++)
{
float sum = 0.f;
//multiply every value of the filter with corresponding image pixel
for(int filterX = 0; filterX < hfilt_width; filterX++)
for(int filterY = 0; filterY < hfilt_width; filterY++)
{
int imageX = x - hfilt_width / 2 + filterX;
int imageY = y - hfilt_width / 2 + filterY;
if (imageX >=0 && imageX < width && imageY >=0 && imageY < height) {
sum += input_img[imageX+width*imageY] * hcoefs[filterX + filterY*MAX_FILTER_WIDTH];
}
}
//truncate values smaller than zero and larger than 255
output_img[x+width*y] = std::min(std::max(int(hfactor * sum + hbias), 0), 255);
}
}
/**
* Program main
*/
int main(int argc, char **argv)
{
unsigned char* h_inimg = NULL;
unsigned char* h_outimg = NULL;
unsigned char* h_refimg = NULL;
unsigned int width, height;
int hfilt_width = -1;
float hfactor = 1.f, hbias = 0.f;
float hcoefs[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
bool use_shared = false;
// load parameters of filter
if (argc > 1)
load_filter(argc,argv,&hfilt_width,&hfactor,&hbias,hcoefs,&use_shared);
else {
hfilt_width = 5;
hfactor = 1.0f / 13.0f;
hbias = 0.0f;
parse_coefs(
"0,0,1,0,0,"
"0,1,1,1,0,"
"1,1,1,1,1,"
"0,1,1,1,0,"
"0,0,1,0,0,",
hfilt_width,hcoefs);
}
char* image_path = sdkFindFilePath(image_filename, argv[0]);
if (image_path == NULL) {
printf("Unable to source image file: %s\n", image_filename);
exit(-1);
}
// Load image from disk
sdkLoadPGM(image_path, &h_inimg, &width, &height);
h_outimg = (unsigned char*)malloc(width * height);
printf("Starting convolution\n");
convolution2D(h_inimg,h_outimg,hcoefs,width,height,hfilt_width,hfactor,hbias,hcoefs,use_shared);
printf("Validating...\n");
h_refimg = (unsigned char*)malloc(width * height);
convolution_cpu(h_inimg,h_refimg,width,height,hfilt_width,hfactor,hbias,hcoefs);
int err_cnt = 0;
for (int r=0; r<height; r++)
for (int c=0; c<width; c++)
if (h_outimg[c+r*width]!=h_refimg[c+r*width])
{
++err_cnt;
printf("Err %2d: [%d,%d] GPU %d | CPU %d\n",err_cnt,r,c,h_outimg[c+r*width],h_refimg[c+r*width]);
if(err_cnt > 4)
{
printf("Terminating...\n");
exit(1);
}
}
if (0 == err_cnt)
printf("OK\n");
// Save image
sdkSavePGM(out_filename,h_outimg,width,height);
free(h_inimg);
free(h_outimg);
}
如果我将第 191 行放入注释中,一切都运行良好且花花公子(内核中没有数据)。
谁能指出将数据传送到内核的正确方法?
首先,您在 proper cuda error checking 方面做得不够好。您应该检查 每个 CUDA API 调用的 return 值。
如果您这样做了,您会发现 "invalid argument" 错误与您的内核启动 没有任何关系,但是因为那是您唯一的地方正在检查错误,在那里收到报告。
实际错误发生在这些行上:
cudaMalloc((void **) d_in, imgSize);
cudaMalloc((void **) d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
您可以通过添加 necessary ampersands:
来修复它cudaMalloc((void **) &d_in, imgSize);
cudaMalloc((void **) &d_out, imgSize);
cudaMalloc((void **) &d_filter, filterSize);
修复该错误后,您会发现下一个错误是 cudaMemcpy
操作的段错误:
cudaMemcpy(d_in,input_img,imgSize,cudaMemcpyHostToDevice);
根本原因在这里:
int imgSize=sizeof(float)*width*height;
^^^^^^^^^^^^^
因为你的 d_in
是 unsigned char
而你的 input_img
是 unsigned char
,我不确定你为什么认为你应该将图像大小乘以 sizeof(float)
.无论如何,将该行更改为:
int imgSize=width*height;
将修复段错误。进行这些更改可以让您的代码 运行 对我来说没有任何 CUDA 错误。显然结果是假的,因为你的内核什么都不做。