cudaLaunchKernel 启动内核失败

cudaLaunchKernel failed to launch kernel

我正在尝试使用运行时 API 启动内核函数。出于某种原因，我无法直接调用 cudaLaunchKernel，而是需要调用另一个函数，由它在内部调用 cudaLaunchKernel。下面是一个示例，它只是从设备端打印一条消息：

#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>

// Minimal test kernel: takes no arguments and touches no memory; each thread
// that runs it emits one fixed line through device-side printf.
__global__
void hello()
{
  printf("hello from kernel. \n");
}

// Launches `kernel` through the runtime API call cudaLaunchKernel.
//
// kernel : kernel entry point; it is cast to void* and launched with a NULL
//          argument array.
// grid   : grid extents in x, y, z order (each narrowed to unsigned int).
// block  : block extents in x, y, z order (each narrowed to unsigned int).
//
// Returns 0 when cudaLaunchKernel reports success; otherwise prints a fixed
// message (the error code itself is not reported) and returns -1.
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
  cudaError_t res;
  // All three extents are forwarded exactly as given by the caller.
  dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
  dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
  // Trailing arguments: args = NULL, sharedMem = 0, stream = NULL.
  res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
  // CUDA_SUCCESS is the driver-API constant from <cuda.h>; the runtime API's
  // own success value is cudaSuccess.
  if (res != CUDA_SUCCESS)
  {
    char msg[256];  // declared but never used
    printf ("error during kernel launch \n");
    return -1;
  }
  return 0;
}

// Host entry point of the reproduction. Allocates a 32-float host buffer and a
// 32-float device buffer, launches `hello` via launchKernel (option 2; option 1
// is left commented out), copies the device buffer back to the host, then
// releases both buffers. No return code (malloc, cudaMalloc, launchKernel,
// cudaMemcpy, cudaFree) is checked.
int main(void)
{
  float *hx, *dx;
  hx = (float*)malloc(32 * sizeof(float));
  cudaMalloc(&dx, 32 * sizeof(float));
  // Launch configuration: one block of 32 threads along x.
  unsigned int threads = 32;
  unsigned int blocks = 1;
  ///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
  //cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
  //////////////////////////////////////////////////////////////////////////////////
  ///////// option 2: call a function which further calls cudaLaunchKernel /////////
  // Extents in x, y, z order: x comes from blocks/threads, y and z are literal 0.
  const size_t grid3d[3] = {blocks, 0, 0};
  const size_t block3d[3] = {threads, 0, 0};
  // The int status returned by launchKernel is discarded here.
  launchKernel (hello , grid3d , block3d);
  //////////////////////////////////////////////////////////////////////////////////
  // `hello` takes no arguments, so it never writes dx; presumably this blocking
  // copy is only here to wait for the kernel and flush its printf output — TODO confirm.
  cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(dx);
  free(hx);
  return 0;
}

直接调用 cudaLaunchKernel 的选项 1 可以正常工作。但是，间接调用 cudaLaunchKernel 的选项 2 不起作用：使用选项 2 时，设备没有打印任何消息，并且返回值不等于 CUDA_SUCCESS。

我想知道是否有人对这个问题有任何见解。

提前感谢您的帮助和时间。

网格和块维度不能为零:

const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};

您的两次启动行为不同，是因为两种情况下网格和块维度变量的构造方式不同：选项 1 中的 dim3(blocks) 和 dim3(threads) 会把未指定的 y、z 维度默认设为 1，而选项 2 中的数组显式地把 y、z 维度设为了 0。

如果您改为：

const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};

那么两种方式都能正常工作。

顺便说一下，这种错误检查方式对您排查问题没有任何帮助：

  if (res != CUDA_SUCCESS)
  {
    char msg[256];
    printf ("error during kernel launch \n");
    return -1;
  }

这会更有启发性:

  if (res != cudaSuccess)
  {
    printf ("error during kernel launch: %s \n", cudaGetErrorString(res));
    return -1;
  }