Error compiling CUDA dynamic parallelism code with Driver API

I am trying to implement NestedHelloWorld as an example of dynamic parallelism. The environment is VS2019, and it has to be implemented through the Driver API interface, in C. The code is:

CUdevice* gpu_initialize(CUdevice *dev_ptr, int int_of_dev);        //signature of the init function
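// (The body of gpu_initialize is not shown in the question. As a rough sketch of
//  what such an init routine presumably does -- an assumption, not the asker's code:
//      CUdevice* gpu_initialize(CUdevice *dev_ptr, int int_of_dev)
//      {
//          if (cuInit(0) != CUDA_SUCCESS) return NULL;                          // initialize the driver API
//          if (cuDeviceGet(dev_ptr, int_of_dev) != CUDA_SUCCESS) return NULL;   // get a handle to device[int_of_dev]
//          return dev_ptr;                                                      // non-NULL on success
//      }
//  )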

// Host code
int main()
{
printf("Nested Parallelism : Hello World (Driver API)\n");

CUdevice device;
if (gpu_initialize(&device,0) == NULL) {                //Initialize the device[0] and populate the device pointer contents
    printf("Error Initializing GPU... exiting");
    exit(-1);
}

CUresult result;
const char* error_string;

//Create a context
CUcontext  context;
unsigned int flags = CU_CTX_SCHED_YIELD;            //set to mode where the GPU yields when awaiting CPU : increases latency
if ((result = cuCtxCreate(&context,flags,device)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error creating context : %s\n", error_string);
        return -1;
    }
    else {
        printf("Unknown error creating context\n");
        return -1;
    }
}

//Load the module by specifying the filename
CUmodule module;
char filename[] = "C:\\Users\\gautam\\OneDrive\\Projects\\VS 2019\\repos\\learn_cuda_nesting\\dynamic_parallelism\\x64\\Debug\\nestedHelloWorld.cu.obj";        //Use for VS 2019 compilation

if ((result = cuModuleLoad(&module,filename)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error loading Module using filename %s: %s \n",filename,error_string);
        return -1;
    }
    else {
        printf("Unknown error loading Module from filename %s\n",filename);
        return -1;
    }
}
else printf("Successfully loaded module %s\n", filename);

//Load the function from the module
CUfunction function;
char        function_name[120];
strcpy_s(function_name,120,"nestedHelloWorld");
if ((result = cuModuleGetFunction(&function,module,function_name)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error loading function %s: %s\n",function_name, error_string);
        return -1;
    }
    else {
        printf("Unknown error loading function %s\n",function_name);
        return -1;
    }
}
else printf("Successfully loaded function %s\n", function_name);

//Set up kernel grid parameters

int size = 8;
int blocksize = 8;   // initial block size
int igrid = 1;
int threads_per_block_x = 8;
int threads_per_block_y = 1;
int blocks_per_grid_x = (size + threads_per_block_x - 1) / threads_per_block_x;
int blocks_per_grid_y = 1;

//Launch the first function in the kernel
//Case 1 : Invoke kernel with 8 x 1 grid size
void* args[] = {(void *)&size, (void *)&igrid};

if ((result = cuLaunchKernel(function, blocks_per_grid_x, blocks_per_grid_y, 1, threads_per_block_x, threads_per_block_y, 1, 0, 0, args, 0)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error launching kernel %s: %s\n",function_name, error_string);
        result = cuCtxDestroy(context);
        return -1;
    }
    else {
        printf("Unknown error launching kernel %s\n", function_name);
        result = cuCtxDestroy(context);
        return -1;
    }
}
else printf("CUDA kernel launch with (%d,%d) blocks per grid, each with (%d,%d) threads per block\n", blocks_per_grid_x, blocks_per_grid_y, threads_per_block_x, threads_per_block_y);

result = cuCtxSynchronize();        //wait for kernel run to finish
result = cuCtxDestroy(context);
exit(0);
}

The corresponding .cu file code is:

extern "C" __global__ void nestedHelloWorld(int const iSize, int iDepth)
{
int tid = threadIdx.x;
printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, tid,blockIdx.x);

// condition to stop recursive execution
if (iSize == 1) return;

// reduce block size to half
int nthreads = iSize >> 1;

// thread 0 launches child grid recursively
if(tid == 0 && nthreads > 0)
{
    //nestedHelloWorld(nthreads, ++iDepth);
    nestedHelloWorld<<<1, nthreads>>>(nthreads, ++iDepth);
    printf("-------> nested execution depth: %d\n", iDepth);
}
}

The project settings are as follows:

And at the end, the GPU parameters and the invalid PTX error:

While this may look like "basic code", the real purpose is to identify which of the compilation parameters being used are wrong. The compilation itself goes through cleanly.

Any help is much appreciated...

EDIT: Added a screenshot of the successful compilation, as well as a screenshot link of the external command used when trying to compile the file separately. The error did not change.

We can build a complete example by starting from the ptxjit CUDA sample code and then following the additional instructions. That is basically everything you need.

Here is a complete example on Linux, including the Makefile:

$ cat kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat ptxjit.cpp

/*
 */

// System includes
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <iostream>

// CUDA driver & runtime
#include <cuda.h>
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#define CUDA_DRIVER_API
#include <helper_cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA Samples

#define PTX_FILE "kernel.ptx"

const char *sSDKname = "CDP Recursion Test (Driver API)";

bool inline findModulePath(const char *module_file, std::string &module_path,
                           char **argv, std::string &ptx_source) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  } else {
    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != std::string::npos) {
      FILE *fp = fopen(module_path.c_str(), "rb");
      fseek(fp, 0, SEEK_END);
      int file_size = ftell(fp);
      char *buf = new char[file_size + 1];
      fseek(fp, 0, SEEK_SET);
      fread(buf, sizeof(char), file_size, fp);
      fclose(fp);
      buf[file_size] = '\0';
      ptx_source = buf;
      delete[] buf;
    }

    return true;
  }
}

void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel,
            CUlinkState *lState) {
  const int options_num = 5;
  CUjit_option options[options_num];
  void *optionVals[options_num];
  float walltime;
  char error_log[8192], info_log[8192];
  unsigned int logSize = 8192;
  void *cuOut;
  size_t outSize;
  int myErr = 0;
  std::string module_path, ptx_source;

  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void *)&walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void *)info_log;
  // Pass the size of the info buffer
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void *)(long)logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void *)error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void *)(long)logSize;

  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(options_num, options, optionVals, lState));

  // first search for the module path before we load the results
  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
    printf("> findModulePath could not find <kernel> ptx\n");
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  // Load the PTX from the ptx file
  printf("Loading ptxjit_kernel[] program\n");
  myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(),
                        strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0);

  if (myErr != CUDA_SUCCESS) {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
    // above.
    fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
  }
  myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL);
  if (myErr != CUDA_SUCCESS) {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
    // above.
    fprintf(stderr, "Library Linker Error:\n%s\n", error_log);
  }
  // Complete the linker step
  checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));

  // Linker walltime and info_log were requested in options above.
  printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime,
         info_log);

  // Load resulting cuBin into module
  checkCudaErrors(cuModuleLoadData(phModule, cuOut));

  // Locate the kernel entry point
  checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "k"));

  // Destroy the linker invocation
  checkCudaErrors(cuLinkDestroy(*lState));
}

// Variables
CUcontext cuContext;

int main(int argc, char **argv) {
  const unsigned int nThreads = 1;
  const unsigned int nBlocks = 1;

  CUmodule hModule = 0;
  CUfunction hKernel = 0;
  CUlinkState lState;

  int cuda_device = 0;

  printf("[%s] - Starting...\n", sSDKname);
  // Initialize
  checkCudaErrors(cuInit(0));

  CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
  int driverVersion;
  cudaDriverGetVersion(&driverVersion);
  if (driverVersion < CUDART_VERSION) {
    printf("driverVersion = %d < CUDART_VERSION = %d \n"
      "Enhanced compatibility is not supported for this sample.. waving execution\n", driverVersion, CUDART_VERSION);
    exit(EXIT_WAIVED);
  }
  // Create context
  checkCudaErrors(cuCtxCreate(&cuContext, 0, dev));


  // JIT Compile the Kernel from PTX and get the Handles (Driver API)
  ptxJIT(argc, argv, &hModule, &hKernel, &lState);

  // Set the kernel parameters (Driver API)
  dim3 block(nThreads, 1, 1);
  dim3 grid(nBlocks, 1, 1);
  int my_N = 4;
  void *args[1] = {&my_N};

  // Launch the kernel (Driver API)
  checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x,
                                 block.y, block.z, 0, NULL, args, NULL));
  std::cout << "CUDA kernel launched" << std::endl;

  cuCtxSynchronize();

  if (hModule) {
    checkCudaErrors(cuModuleUnload(hModule));
    hModule = 0;
  }

  return EXIT_SUCCESS;
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

PTX_FILE := kernel.ptx

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 70

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

LIBRARIES += -lcudart_static

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: ptxjit $(PTX_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(PTX_FILE): kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -o $@ -ptx $<

ptxjit.o:ptxjit.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

ptxjit: ptxjit.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./ptxjit

clean:
        rm -f ptxjit ptxjit.o  $(PTX_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f ptxjit ptxjit.o  kernel.ptx
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit.o -c ptxjit.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit ptxjit.o  -L/usr/local/cuda/lib64/stubs -lcuda -lcudart_static
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -rdc=true -o kernel.ptx -ptx kernel.cu
$ cuda-memcheck ./ptxjit
========= CUDA-MEMCHECK
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath <./kernel.ptx>
> initCUDA loading module: <./kernel.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in 0.000000ms. Linker Output:

CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

On Windows, you would follow the same path using the files above, starting from the ptxjit project. For simplicity, you may wish to rename the kernel file so that it exactly matches the name of the kernel file used in the ptxjit project.

Here are the detailed steps I followed using VS2019:

  1. Open the ptxjit solution; on my machine it is here: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit
  2. Take the ptxjit.cpp code from the Linux version above and use it to replace the contents of ptxjit.cpp in that solution/project.
  3. Change the define statement back to: #define PTX_FILE "ptxjit_kernel64.ptx"
  4. Change the location of the device runtime library to match your machine. Specifically, this line: myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL); needs to be changed to: myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\lib\x64\cudadevrt.lib", 0, NULL, NULL); (a sketch with properly escaped backslashes follows this list)
  5. In that project's ptxjit_kernel.cu file, replace the contents with the contents of the kernel.cu file from the Linux version above.
  6. In the Solution Explorer window, right-click the ptxjit_kernel.cu file and select Properties. In the Configuration Properties pane on the left, expand the CUDA C/C++ section and select Common. In the right-hand pane, change the "Generate Relocatable Device Code" option from No to Yes. Click OK.
  7. In the same Solution Explorer window, right-click the ptxjit project and select Properties. Go to Configuration Properties...CUDA Linker...General and change "Perform Device Link" from Yes to No. Click OK.
  8. Select Build...Rebuild Solution.
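Note that when that cuLinkAddFile line is edited in ptxjit.cpp, the backslashes in the Windows path must be escaped in the C/C++ string literal. A sketch of what the edited line can look like (the path assumes the default CUDA v11.1 install location; adjust it to your machine):

// Windows: device-link against the static device runtime library cudadevrt.lib
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY,
                      "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\lib\\x64\\cudadevrt.lib",
                      0, NULL, NULL);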

When I do that, I get build console output like this:

1>------ Rebuild All started: Project: ptxjit, Configuration: Debug x64 ------
1>Compiling CUDA source file ptxjit_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -ptx -cudart static -Xcompiler "/wd 4819" -o data/ptxjit_kernel64.ptx "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit_kernel.cu"
1>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
1>ptxjit_kernel.cu
1>Done building project "ptxjit_vs2019.vcxproj".
1>ptxjit.cpp
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1_Advanced\ptxjit\ptxjit.cpp(318,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1_Advanced\ptxjit\ptxjit.cpp(324,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>LINK : ..\..\bin\win64\Debug\ptxjit.exe not found or not built by the last incremental link; performing full link
1>   Creating library ../../bin/win64/Debug/ptxjit.lib and object ../../bin/win64/Debug/ptxjit.exp
1>ptxjit_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
1>Done building project "ptxjit_vs2019.vcxproj".
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

At this point we can go to the indicated location of the exe file:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe

and run it in a command console. When I do that, I see output like this:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>ptxjit.exe
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <ptxjit_kernel64.ptx> in ./
...
sdkFindFilePath <ptxjit_kernel64.ptx> in ../../../6_Advanced/ptxjit/data/
> findModulePath <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
> initCUDA loading module: <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in -107374176.000000ms. Linker Output:

CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

Notes:

  1. The C4312 warnings in the compilation above are present in the original project; they can be removed by switching from long to long long on the relevant lines (a sketch follows these notes). They are not the actual issue here.
  2. The extended sequence of sdkFindFilePath messages printed at run time can be shortened by copying the ptx file from its location to the location of the exe file. The final sdkFindFilePath output tells you where the ptx file was found.
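A minimal sketch of that long to long long change (the exact line numbers in your copy of ptxjit.cpp may differ; the point is simply to make the integer cast pointer-sized on a 64-bit Windows build):

// before (truncating cast, triggers C4312 on a 64-bit build):
//   optionVals[2] = (void *)(long)logSize;
//   optionVals[4] = (void *)(long)logSize;
// after:
optionVals[2] = (void *)(long long)logSize;
optionVals[4] = (void *)(long long)logSize;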

In response to a question posted below my answer:

is there a way to avoid this "jit linking during runtime" process while still being with Driver API interface

Yes. (I am providing a separate answer because I ran into the character limit on my previous answer.)

In this case, we want to create a fatbin object rather than PTX during the compilation of the kernel code itself. This fatbin needs to be compiled with -rdc=true, as you would expect for dynamic parallelism, and it also needs to be device-linked against the CUDA device runtime library.

The host-side mechanics are simpler in this case, because we don't need any linking step. The CUDA sample code that looks closest to this flow is vectorAddDrv, so I will start from that code/sample project to demonstrate it.
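To see why no host-side link step is needed, here is a stripped-down sketch of the flow (my own illustration, not part of the sample; it assumes the device-linked vectorAdd_kernel64.fatbin sits next to the executable and omits error checking for brevity):

#include <cuda.h>
#include <fstream>
#include <iterator>
#include <vector>

int main()
{
    cuInit(0);
    CUdevice dev;   cuDeviceGet(&dev, 0);
    CUcontext ctx;  cuCtxCreate(&ctx, 0, dev);

    // read the whole device-linked fatbin image into memory
    std::ifstream f("vectorAdd_kernel64.fatbin", std::ios::binary);
    std::vector<char> image((std::istreambuf_iterator<char>(f)),
                            std::istreambuf_iterator<char>());

    // no cuLink* calls: the device link already happened at build time,
    // so the image can be handed straight to the module loader
    CUmodule mod;    cuModuleLoadData(&mod, image.data());
    CUfunction kern; cuModuleGetFunction(&kern, mod, "k");

    int N = 4;
    void *args[] = { &N };
    cuLaunchKernel(kern, 1, 1, 1, 1, 1, 1, 0, NULL, args, NULL);
    cuCtxSynchronize();

    cuModuleUnload(mod);
    cuCtxDestroy(ctx);
    return 0;
}

This works because cuModuleLoadData accepts a fatbin image as well as PTX or cubin.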

Here is the Linux version:

$ cat vectorAdd_kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>

// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>

// includes, CUDA
#include <builtin_types.h>

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;

// Functions
bool findModulePath(const char *, string &, char **, string &);

//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Linked CDP demo (Driver API)\n");
    int N = 4, devID = 0;

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
    {
        exit(EXIT_FAILURE);
    }
    else
    {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size())
    {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "k"));


        // Grid/Block configuration
        int threadsPerBlock = 1;
        int blocksPerGrid   = 1;

        void *args[] = { &N };

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel,  blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL));

    checkCudaErrors(cuCtxSynchronize());

    exit(EXIT_SUCCESS);
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 52 60 61 70 75 80 86

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: vectorAddDrv $(FATBIN_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(FATBIN_FILE): vectorAdd_kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -lcudadevrt -dlink  -o $@ -fatbin $<

vectorAddDrv.o:vectorAddDrv.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAddDrv: vectorAddDrv.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./vectorAddDrv

clean:
        rm -f vectorAddDrv vectorAddDrv.o  $(FATBIN_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f vectorAddDrv vectorAddDrv.o  vectorAdd_kernel64.fatbin
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv.o -c vectorAddDrv.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv vectorAddDrv.o  -L/usr/local/cuda/lib64/stubs -lcuda
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -rdc=true -lcudadevrt -dlink  -o vectorAdd_kernel64.fatbin -fatbin vectorAdd_kernel.cu
$ cuda-memcheck ./vectorAddDrv
========= CUDA-MEMCHECK
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

On Windows/VS 2019/CUDA 11.1, I followed these steps:

  1. Open the vectorAddDrv project/solution; on my machine it is located at: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv
  2. Replace the code in the vectorAddDrv.cpp file with the code from the same file in the Linux example above.
  3. Replace the code in the vectorAdd_kernel.cu file with the code from the same file in the Linux example above.
  4. In the Solution Explorer pane on the left, right-click the vectorAdd_kernel.cu file and open Properties. In Configuration Properties...CUDA C/C++...Common, change "Generate Relocatable Device Code" from No to Yes. Then in Configuration Properties...CUDA C/C++...Command Line, add -dlink. Also make sure Configuration Properties...CUDA C/C++...Device...Code Generation matches the device you want to run on. Click OK.
  5. In the same Solution Explorer pane on the left, right-click the vectorAddDrv project, select Properties, and in Configuration Properties...CUDA Linker...General change "Perform Device Link" from Yes to No. Click OK.
  6. Select Build...Rebuild Solution.

When I do that, I see console build output like this:

1>------ Rebuild All started: Project: vectorAddDrv, Configuration: Debug x64 ------
1>Compiling CUDA source file vectorAdd_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_61,code=sm_61 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -fatbin -cudart static -dlink -Xcompiler "/wd 4819" -o data/vectorAdd_kernel64.fatbin "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\vectorAdd_kernel.cu"
1>vectorAdd_kernel.cu
1>vectorAddDrv.cpp
1>LINK : ..\..\bin\win64\Debug\vectorAddDrv.exe not found or not built by the last incremental link; performing full link
1>vectorAddDrv_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\../../bin/win64/Debug/vectorAddDrv.exe
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

If we then open a command prompt, navigate to the indicated location of the executable, and run it, I see:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>vectorAddDrv
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ./
...
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ../../../0_Simple/vectorAddDrv/data/
> findModulePath found file at <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
> initCUDA loading module: <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

A note from the other answer applies here as well: the extended sequence of sdkFindFilePath messages printed at run time can be shortened by copying the fatbin file from its location to the location of the exe file. The final sdkFindFilePath output tells you where it found the fatbin file.