在 WSL2 上使用 CUDA 时报错 "no kernel image is available for execution on the device"

Using Cuda on WSL2 gives me "no kernel image is available for execution on the device."

我正在尝试在 WSL2 上的 C++ 程序中使用 CUDA 和 Thrust。我按照这篇说明(原文此处为链接)在 WSL2 上启用了 CUDA。下面是一个小示例程序:

首先,我设置以下环境变量:

# Environment consumed by the build below.
# Library directory; CMakeLists.txt passes it to link_directories() via $ENV{...}.
export CUDA_LIBRARY_DIRECTORY=/usr/local/cuda-11.0/lib64
# Header directory; CMakeLists.txt passes it to include_directories() via $ENV{...}.
export CUDA_INCLUDE_DIRECTORY=/usr/local/cuda-11.0/include
# Path of the CUDA compiler; the configure log reports this same nvcc as the CUDA compiler.
export CUDACXX=/usr/local/cuda-11.0/bin/nvcc

CMakeLists.txt

# Build script for the `proj` executable (one CUDA source plus one C++ source).
# NOTE(review): project() below enables CUDA as a first-class language, which
# presumably needs a much newer CMake than 2.8 -- confirm the minimum to declare.
cmake_minimum_required(VERSION 2.8)
project(proj LANGUAGES CXX CUDA)

set (CMAKE_CXX_STANDARD 14)

#### use cuda ####
# NOTE(review): these flags do not reach the compiler in this setup -- the nvcc
# command in the verbose build log carries no -gencode (or any other) option
# from this list. With project(... LANGUAGES CUDA) the flags presumably belong
# in CMAKE_CUDA_FLAGS instead; CUDA_NVCC_FLAGS looks like the legacy FindCUDA
# variable -- confirm against the CMake documentation.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50;-lineinfo; -cudart=static; -Xptxas; -v)

# Header and library search paths are taken from environment variables that
# must be exported before cmake is run.
include_directories($ENV{CUDA_INCLUDE_DIRECTORY})
link_directories($ENV{CUDA_LIBRARY_DIRECTORY})

# Single target: the CUDA translation unit and the host-only main are linked together.
ADD_EXECUTABLE(
proj 
src/cudafile.cu
src/main.cpp)

main.cpp

#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include<thrust/device_ptr.h>

void func(int size, int* a1, int* a2, int* a3);
void FillWithValue(int* arr, int size, int val);

int main()
{

    int size=1000;
    int *arr1, *arr2, *arr3;
    
    cudaMalloc((void**)&arr1, size * sizeof(int));
    FillWithValue(arr1,size,1);

    cudaMalloc((void**)&arr2, size * sizeof(int));
    FillWithValue(arr2,size,2);

    cudaMalloc((void**)&arr3, size * sizeof(int));

    int* harr = new int [size];
    cudaMemcpy(harr,arr1,size*sizeof(int),cudaMemcpyDeviceToHost);
    fprintf(stdout, "%d\n",harr[0]);


    func(size, arr1, arr2, arr3);
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
        fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString(err));
    

    return 1;

}

cudafile.cu

#include<thrust/host_vector.h>
#include<thrust/device_vector.h>
#include<thrust/device_ptr.h>

#define blocksize 512
#define maxblocks 65535

/*
 * Element-wise addition: a3[i] = a1[i] + a2[i] for every i in [0, size).
 *
 * Expects a 1D grid of 1D blocks. Each thread starts at its global index and
 * then advances by the total number of launched threads, so the loop
 * terminates and all elements are covered for any grid size, including a
 * grid smaller than the data.
 *
 * size : number of elements in each array
 * a1   : first input array (read)
 * a2   : second input array (read)
 * a3   : output array (written)
 */
__global__ void funcKernel(int size, int* a1, int* a2, int* a3)
{
    // 64-bit index and stride so that the increment cannot overflow an int
    // when size is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < size;
         i += stride)
    {
        a3[i] = a1[i] + a2[i];
    }
}

/*
 * Host wrapper that launches funcKernel on a 1D grid.
 *
 * Every block has `blocksize` threads. The number of blocks is
 * size / blocksize + 1, limited to `maxblocks`.
 * The launch is not checked here; no status is returned to the caller.
 *
 * size       : element count forwarded to the kernel
 * a1, a2, a3 : pointers forwarded unchanged to the kernel
 */
void func(int size, int* a1, int* a2, int* a3)
{
    const int wantedBlocks = size / blocksize + 1;
    const int numBlocks = (wantedBlocks > maxblocks) ? maxblocks : wantedBlocks;

    funcKernel<<<numBlocks, blocksize>>>(size, a1, a2, a3);
}

/*
 * Sets the first `size` elements of `arr` to `val` using thrust::fill.
 *
 * The raw pointer is wrapped in a thrust::device_ptr before the call, so
 * `arr` is handed to Thrust as a device pointer.
 *
 * arr  : start of the array to fill
 * size : number of elements to write
 * val  : value stored in every element
 */
void FillWithValue(int* arr, int size, int val)
{
    const thrust::device_ptr<int> first(arr);
    const thrust::device_ptr<int> last = first + size;

    thrust::fill(first, last, val);
}

输出

0
Cuda error: no kernel image is available for execution on the device.

第一个 fprintf 输出的是 0(而不是预期的 1),说明 thrust::fill 没能填充数组;cudaGetLastError() 返回的错误则说明内核启动同样失败了。

这是详细的 cmake 构建:

cmake ..

-- The CXX compiler identification is GNU 9.3.0
-- The CUDA compiler identification is NVIDIA 11.0.221
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Check for working CUDA compiler: /usr/local/cuda-11.0/bin/nvcc
-- Check for working CUDA compiler: /usr/local/cuda-11.0/bin/nvcc -- works
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Configuring done
-- Generating done
-- Build files have been written to: /mnt/d/work/wsl2-projects/tests/kernels/build

make

/usr/bin/cmake -S/mnt/d/work/wsl2-projects/tests/kernels -B/mnt/d/work/wsl2-projects/tests/kernels/build --check-build-system CMakeFiles/Makefile.cmake 0
/usr/bin/cmake -E cmake_progress_start /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles/progress.marks
make -f CMakeFiles/Makefile2 all
make[1]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
make -f CMakeFiles/proj.dir/build.make CMakeFiles/proj.dir/depend
make[2]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
cd /mnt/d/work/wsl2-projects/tests/kernels/build && /usr/bin/cmake -E cmake_depends "Unix Makefiles" /mnt/d/work/wsl2-projects/tests/kernels /mnt/d/work/wsl2-projects/tests/kernels /mnt/d/work/wsl2-projects/tests/kernels/build /mnt/d/work/wsl2-projects/tests/kernels/build /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles/proj.dir/DependInfo.cmake --color=
Scanning dependencies of target proj
make[2]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
make -f CMakeFiles/proj.dir/build.make CMakeFiles/proj.dir/build
make[2]: Entering directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
[ 33%] Building CUDA object CMakeFiles/proj.dir/src/cudafile.cu.o
/usr/local/cuda-11.0/bin/nvcc     -x cu -c /mnt/d/work/wsl2-projects/tests/kernels/src/cudafile.cu -o CMakeFiles/proj.dir/src/cudafile.cu.o
[ 66%] Building CXX object CMakeFiles/proj.dir/src/main.cpp.o
/usr/bin/c++   -I/usr/local/cuda-11.0/include  -std=gnu++14 -o CMakeFiles/proj.dir/src/main.cpp.o -c /mnt/d/work/wsl2-projects/tests/kernels/src/main.cpp
[100%] Linking CXX executable proj
/usr/bin/cmake -E cmake_link_script CMakeFiles/proj.dir/link.txt --verbose=1
/usr/bin/c++    -rdynamic CMakeFiles/proj.dir/src/cudafile.cu.o CMakeFiles/proj.dir/src/main.cpp.o  -o proj   -L/usr/local/cuda-11.0/lib64  -L/usr/local/cuda-11.0/targets/x86_64-linux/lib/stubs  -L/usr/local/cuda-11.0/targets/x86_64-linux/lib  -lcudadevrt -lcudart_static -lrt -lpthread -ldl
make[2]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
[100%] Built target proj
make[1]: Leaving directory '/mnt/d/work/wsl2-projects/tests/kernels/build'
/usr/bin/cmake -E cmake_progress_start /mnt/d/work/wsl2-projects/tests/kernels/build/CMakeFiles 0

这是否与 CUDA 版本和我的 GPU 不匹配有关?我想降级到 CUDA 10 或 9,但不知道怎样才能完全按照前面那篇说明(原文此处为链接)的方式安装,以免它用另一个 NVIDIA 驱动程序替换掉现有的驱动。

附加信息:

根据 Robert Crovella 的评论,我成功让程序正确运行:输出正确,也没有报错。

在CMakeLists.txt中,我使用了

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_50,code=sm_50 -lineinfo -cudart=static -Xptxas -v")

而不是

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=sm_50;-lineinfo; -cudart=static; -Xptxas; -v)

现在输出是

1