cudaLaunchKernel 启动内核失败
cudaLaunchKernel failed to launch kernel
我正在尝试使用运行时 API 启动内核函数。出于某种原因,我无法直接调用 cudaLaunchKernel。相反,我调用了一个在其中调用 cudaLaunchKernel 的函数。这是一个示例,它只是从设备打印一条消息:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
/**
 * Trivial kernel: each thread that runs it prints one greeting line with
 * device-side printf. It takes no arguments and touches no memory buffers.
 */
__global__
void hello()
{
    printf("hello from kernel. \n");
}
/**
 * Launches `kernel` through the runtime API call cudaLaunchKernel.
 *
 * @tparam T     Type of the kernel handle; it is cast to void* for the launch.
 * @param kernel Kernel to launch. No kernel arguments are forwarded (args is NULL).
 * @param grid   Grid extents in x, y, z order; each is converted to unsigned int.
 * @param block  Block extents in x, y, z order; each is converted to unsigned int.
 * @return 0 when the launch call reports success, -1 otherwise (after printing
 *         a fixed message that does not identify the error).
 */
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
    cudaError_t res;
    dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
    dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
    // Trailing arguments: no kernel args, 0 bytes of dynamic shared memory, NULL stream.
    res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
    // NOTE(review): res is a cudaError_t, while CUDA_SUCCESS is the driver-API
    // (cuda.h) status constant; the runtime-API constant is cudaSuccess.
    if (res != CUDA_SUCCESS)
    {
        char msg[256];  // declared but never used
        printf ("error during kernel launch \n");
        return -1;
    }
    return 0;
}
// Entry point for the example: allocates a 32-float buffer on the host and on the
// device, launches hello via launchKernel (option 2), copies the device buffer back
// to the host, and frees both buffers. None of the CUDA return codes are checked here.
int main(void)
{
float *hx, *dx;
hx = (float*)malloc(32 * sizeof(float));
cudaMalloc(&dx, 32 * sizeof(float));
// Launch configuration: 1 block of 32 threads.
unsigned int threads = 32;
unsigned int blocks = 1;
///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
//cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
//////////////////////////////////////////////////////////////////////////////////
///////// option 2: call a function which further calls cudaLaunchKernel /////////
// The y and z extents are passed as 0 in both arrays.
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
// The int status returned by launchKernel is discarded.
launchKernel (hello , grid3d , block3d);
//////////////////////////////////////////////////////////////////////////////////
// Nothing visible in this program writes dx before this copy.
// NOTE(review): presumably this blocking copy is here to wait for the kernel so
// that its printf output appears - confirm.
cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dx);
free(hx);
return 0;
}
直接调用 cudaLaunchKernel 的选项 1 可以正常工作。但是,间接调用 cudaLaunchKernel 的选项 2 不起作用:使用选项 2 时,设备没有打印任何消息,并且 cudaLaunchKernel 的返回值不等于 CUDA_SUCCESS。
我想知道是否有人对这个问题有任何见解。
提前感谢您的帮助和时间。
网格和块维度不能为零:
// As written in the question: the y and z extents are 0 in both arrays.
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
两次启动行为不同,是因为两种方式构造出的网格和块维度不同:选项 1 使用 dim3(blocks) 和 dim3(threads),dim3 的构造函数会把未指定的 y、z 分量默认设为 1;而选项 2 显式地把 y、z 分量设成了 0。
如果你改为:
// Corrected version: the unused y and z extents are set to 1 instead of 0.
const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};
那么两种方式都能正常工作。
顺便说一下,下面这种错误检查方式对你排查问题没有什么帮助:
// Error check as written in the question: on failure it prints a fixed message
// that does not say which error occurred.
if (res != CUDA_SUCCESS)
{
    char msg[256];  // declared but never used
    printf ("error during kernel launch \n");
    return -1;
}
下面这样写能提供更多信息:
// Compare against the runtime API's cudaSuccess and include the runtime's
// description of the error in the message.
if (res != cudaSuccess)
{
    printf ("error during kernel launch: %s \n", cudaGetErrorString(res));
    return -1;
}
我正在尝试使用运行时 API 启动内核函数。出于某种原因,我无法直接调用 cudaLaunchKernel。相反,我调用了一个在其中调用 cudaLaunchKernel 的函数。这是一个示例,它只是从设备打印一条消息:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
/**
 * Trivial kernel: each thread that runs it prints one greeting line with
 * device-side printf. It takes no arguments and touches no memory buffers.
 */
__global__
void hello()
{
    printf("hello from kernel. \n");
}
/**
 * Launches `kernel` through the runtime API call cudaLaunchKernel.
 *
 * @tparam T     Type of the kernel handle; it is cast to void* for the launch.
 * @param kernel Kernel to launch. No kernel arguments are forwarded (args is NULL).
 * @param grid   Grid extents in x, y, z order; each is converted to unsigned int.
 * @param block  Block extents in x, y, z order; each is converted to unsigned int.
 * @return 0 when the launch call reports success, -1 otherwise (after printing
 *         a fixed message that does not identify the error).
 */
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
    cudaError_t res;
    dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
    dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
    // Trailing arguments: no kernel args, 0 bytes of dynamic shared memory, NULL stream.
    res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
    // NOTE(review): res is a cudaError_t, while CUDA_SUCCESS is the driver-API
    // (cuda.h) status constant; the runtime-API constant is cudaSuccess.
    if (res != CUDA_SUCCESS)
    {
        char msg[256];  // declared but never used
        printf ("error during kernel launch \n");
        return -1;
    }
    return 0;
}
// Entry point for the example: allocates a 32-float buffer on the host and on the
// device, launches hello via launchKernel (option 2), copies the device buffer back
// to the host, and frees both buffers. None of the CUDA return codes are checked here.
int main(void)
{
float *hx, *dx;
hx = (float*)malloc(32 * sizeof(float));
cudaMalloc(&dx, 32 * sizeof(float));
// Launch configuration: 1 block of 32 threads.
unsigned int threads = 32;
unsigned int blocks = 1;
///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
//cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
//////////////////////////////////////////////////////////////////////////////////
///////// option 2: call a function which further calls cudaLaunchKernel /////////
// The y and z extents are passed as 0 in both arrays.
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
// The int status returned by launchKernel is discarded.
launchKernel (hello , grid3d , block3d);
//////////////////////////////////////////////////////////////////////////////////
// Nothing visible in this program writes dx before this copy.
// NOTE(review): presumably this blocking copy is here to wait for the kernel so
// that its printf output appears - confirm.
cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dx);
free(hx);
return 0;
}
直接调用 cudaLaunchKernel 的选项 1 可以正常工作。但是,间接调用 cudaLaunchKernel 的选项 2 不起作用:使用选项 2 时,设备没有打印任何消息,并且 cudaLaunchKernel 的返回值不等于 CUDA_SUCCESS。
我想知道是否有人对这个问题有任何见解。
提前感谢您的帮助和时间。
网格和块维度不能为零:
// As written in the question: the y and z extents are 0 in both arrays.
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
两次启动行为不同,是因为两种方式构造出的网格和块维度不同:选项 1 使用 dim3(blocks) 和 dim3(threads),dim3 的构造函数会把未指定的 y、z 分量默认设为 1;而选项 2 显式地把 y、z 分量设成了 0。
如果你改为:
// Corrected version: the unused y and z extents are set to 1 instead of 0.
const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};
那么两种方式都能正常工作。
顺便说一下,下面这种错误检查方式对你排查问题没有什么帮助:
// Error check as written in the question: on failure it prints a fixed message
// that does not say which error occurred.
if (res != CUDA_SUCCESS)
{
    char msg[256];  // declared but never used
    printf ("error during kernel launch \n");
    return -1;
}
下面这样写能提供更多信息:
// Compare against the runtime API's cudaSuccess and include the runtime's
// description of the error in the message.
if (res != cudaSuccess)
{
    printf ("error during kernel launch: %s \n", cudaGetErrorString(res));
    return -1;
}