为什么这个 NVIDIA CUDA PTX 没有按预期工作？

Question

我有这段代码尝试使用手写的 PTX 函数添加两个向量：

//kernel.cu
#include <stdio.h>
#include <cuda.h>


int main()
{
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction function;
    char* moduleFile = "test.ptx";
    char* kernelName = "test";

    CUresult err = cuInit(0);
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);

    err = cuModuleLoad(&module, moduleFile);
    if (err != CUDA_SUCCESS) { printf("cuModuleLoad %4d \n", err); return 1; }

    err = cuModuleGetFunction(&function, module, kernelName);
    if (err != CUDA_SUCCESS) { printf("cuModuleGetFunction %4d \n", err); return 1; }


    int size = 4;
    unsigned int byteSize = size * sizeof(int);
    int* h_a = (int*)malloc(byteSize);
    int* h_b = (int*)malloc(byteSize);
    int* h_c = (int*)malloc(byteSize);
    CUdeviceptr d_a; cuMemAlloc(&d_a, byteSize);
    CUdeviceptr d_b; cuMemAlloc(&d_b, byteSize);
    CUdeviceptr d_c; cuMemAlloc(&d_c, byteSize);

    for (int i = 0; i < size; i++)
    {
        h_a[i] = i;
        h_b[i] = i;
        h_c[i] = 0;
    }
    printf("before\n");
    for (int i = 0; i < size; i++)
        printf("i:%-3d a:%-3d b:%-3d c:%-3d \n", i, h_a[i], h_b[i], h_c[i]);

    cuMemcpyHtoD(d_a, h_a, byteSize);
    cuMemcpyHtoD(d_b, h_b, byteSize);
    void* args[]{ &d_a, &d_b, &d_c };
    cudaLaunchKernel(function, { 1, 1, 1 }, { byteSize, 1, 1 }, (void**)args);

    cudaDeviceSynchronize();
    cuMemcpyDtoH(h_c, d_c, byteSize);

    printf("\nafter\n");
    for (int i = 0; i < size; i++)
        printf("i:%-3d a:%-3d b:%-3d c:%-3d \n", i, h_a[i], h_b[i], h_c[i]);

    free(h_a);
    free(h_b);
    free(h_c);
    cuMemFree(d_a);
    cuMemFree(d_b);
    cuMemFree(d_c);

    cuCtxDestroy(context);
    cudaDeviceReset();
    return 0;
}

PTX函数：

//test.ptx
.version 6.5
.target sm_75
.address_size 64

.visible .entry test(
    .param .u64 .ptr .global .align 8 a,
    .param .u64 .ptr .global .align 8 b,
    .param .u64 .ptr .global .align 8 c
)
{
    .reg .u64 %a;
    .reg .u64 %b;
    .reg .u64 %c;
    .reg .u64 %i;

    ld.param.u64 %a, [a];
    ld.param.u64 %b, [b];
    ld.param.u64 %c, [c];
    cvt.u64.u32 %i, %tid.x;
    shl.b64 %i, %i, 2;
    add.u64 %a, %a, %i;
    add.u64 %b, %b, %i;
    add.u64 %c, %c, %i;

    .reg .s32 %s<3>;
    ld.s32 %s0, [%a];
    ld.s32 %s1, [%b];
    add.s32 %s0, %s0, %s1;
    st.s32 [%c], %s0;

    ret;
}

输出为：

before
i:0   a:0   b:0   c:0
i:1   a:1   b:1   c:0
i:2   a:2   b:2   c:0
i:3   a:3   b:3   c:0
test took 52.0913 ms

after
i:0   a:0   b:0   c:0
i:1   a:1   b:1   c:0
i:2   a:2   b:2   c:0
i:3   a:3   b:3   c:0

什么时候应该：

before
i:0   a:0   b:0   c:0
i:1   a:1   b:1   c:0
i:2   a:2   b:2   c:0
i:3   a:3   b:3   c:0
test took 52.0913 ms

after
i:0   a:0   b:0   c:0
i:1   a:1   b:1   c:2
i:2   a:2   b:2   c:4
i:3   a:3   b:3   c:6

我正在使用驱动程序 API 加载 PTX 文件和运行内核 test。我相信这要么与我使用 cuda 上下文的方式有关，要么与我加载和访问 gpu 内存的方式有关。我错过了什么？

Answer 1

运行您使用 cuda-memcheck 的代码显示函数启动失败：

$ cuda-memcheck ./saitama
========= CUDA-MEMCHECK
before
i:0   a:0   b:0   c:0   
i:1   a:1   b:1   c:0   
i:2   a:2   b:2   c:0   
i:3   a:3   b:3   c:0   
========= Program hit cudaErrorInvalidDeviceFunction (error 98) due to "invalid device function" on CUDA API call to cudaLaunchKernel. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3b9803]
=========     Host Frame:./saitama [0x4a585]
=========     Host Frame:./saitama [0x72b8]
=========     Host Frame:./saitama [0x7071]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./saitama [0x6c1a]

您的主机代码是设备和运行时的奇怪混合体API，我认为它永远无法正常工作。像这样修改您的主机代码：

#include <stdio.h>
#include <cuda.h>

int main()
{
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction function;
    char moduleFile[] = "test.ptx";
    char kernelName[] = "test";

    CUresult err = cuInit(0);
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);

    err = cuModuleLoad(&module, moduleFile);
    if (err != CUDA_SUCCESS) { printf("cuModuleLoad %4d \n", err); return 1; }

    err = cuModuleGetFunction(&function, module, kernelName);
    if (err != CUDA_SUCCESS) { printf("cuModuleGetFunction %4d \n", err); return 1; }

    int size = 4;
    unsigned int byteSize = size * sizeof(int);
    int* h_a = (int*)malloc(byteSize);
    int* h_b = (int*)malloc(byteSize);
    int* h_c = (int*)malloc(byteSize);
    CUdeviceptr d_a; cuMemAlloc(&d_a, byteSize);
    CUdeviceptr d_b; cuMemAlloc(&d_b, byteSize);
    CUdeviceptr d_c; cuMemAlloc(&d_c, byteSize);

    for (int i = 0; i < size; i++)
    {
        h_a[i] = i;
        h_b[i] = i;
        h_c[i] = 0;
    }
    printf("before\n");
    for (int i = 0; i < size; i++)
        printf("i:%-3d a:%-3d b:%-3d c:%-3d \n", i, h_a[i], h_b[i], h_c[i]);

    cuMemcpyHtoD(d_a, h_a, byteSize);
    cuMemcpyHtoD(d_b, h_b, byteSize);
    void* args[]{ &d_a, &d_b, &d_c };
    cuLaunchKernel(function,  1, 1, 1, size, 1, 1, 0, 0, (void**)args, 0);

    cuCtxSynchronize();
    cuMemcpyDtoH(h_c, d_c, byteSize);

    printf("\nafter\n");
    for (int i = 0; i < size; i++)
        printf("i:%-3d a:%-3d b:%-3d c:%-3d \n", i, h_a[i], h_b[i], h_c[i]);

    free(h_a);
    free(h_b);
    free(h_c);
    cuMemFree(d_a);
    cuMemFree(d_b);
    cuMemFree(d_c);

    cuCtxDestroy(context);
    return 0;
}

给我这个：

$ nvcc -o saitama saitama.cu  -lcuda

$ cat test.ptx
//test.ptx
.version 6.4
.target sm_52
.address_size 64

.visible .entry test(
    .param .u64 .ptr .global .align 8 a,
    .param .u64 .ptr .global .align 8 b,
    .param .u64 .ptr .global .align 8 c
)
{
    .reg .u64 %a;
    .reg .u64 %b;
    .reg .u64 %c;
    .reg .u64 %i;

    ld.param.u64 %a, [a];
    ld.param.u64 %b, [b];
    ld.param.u64 %c, [c];
    cvt.u64.u32 %i, %tid.x;
    shl.b64 %i, %i, 2;
    add.u64 %a, %a, %i;
    add.u64 %b, %b, %i;
    add.u64 %c, %c, %i;

    .reg .s32 %s<3>;
    ld.s32 %s0, [%a];
    ld.s32 %s1, [%b];
    add.s32 %s0, %s0, %s1;
    st.s32 [%c], %s0;

    ret;
}

$ cuda-memcheck ./saitama
========= CUDA-MEMCHECK
before
i:0   a:0   b:0   c:0   
i:1   a:1   b:1   c:0   
i:2   a:2   b:2   c:0   
i:3   a:3   b:3   c:0   

after
i:0   a:0   b:0   c:0   
i:1   a:1   b:1   c:2   
i:2   a:2   b:2   c:4   
i:3   a:3   b:3   c:6   
========= ERROR SUMMARY: 0 errors

[注意我需要更改 PTX 版本和目标，并且永远不要忽略编译器警告，它们会帮助你]。

为什么这个 NVIDIA CUDA PTX 没有按预期工作？

Why is this NVIDIA CUDA PTX not working as intended?

c++

cuda

ptx