Create a functor in CUDA with a static __global__ function as executor

I have a C++ design in which I can generate function calls at compile time for a templated function, based on std::integer_sequence. This works for the CPU, but I would like to extend it to the GPU using CUDA. However, I am not allowed to put a __global__ function into a struct. Is there another way to achieve what I am trying to do?

#include <vector>
#include <utility>

struct Functor_cpu
{
    template<int I>
    static void exec(int* __restrict__ data, const int itot)
    {
        for (int i=0; i<itot; ++i)
            data[i] *= I;
    }
};

struct Functor_gpu
{
    template<int I> __global__
    static void exec(int* __restrict__ data, const int itot)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;

        if (i < itot)
            data[i] *= I;
    }
};

template<class Func, class... Args, int... Is>
void test(std::integer_sequence<int, Is...>, Args... args)
{
    (Func::template exec<Is>(args...), ...);
}

template<class Func, class... Args, int... Is>
void test_gpu(const int problem_size, std::integer_sequence<int, Is...>, Args... args)
{
    dim3 grid(problem_size/64 + (problem_size%64 > 0)), block(64);
    (Func::template exec<Is><<<grid, block>>>(args...), ...);
}

int main()
{
    const int itot = 1024;

    // CPU
    std::vector<int> data_cpu(itot);
    test<Functor_cpu>(std::integer_sequence<int, 2, 3, 4>{}, data_cpu.data(), itot);

    // GPU.
    int* data_gpu;
    cudaMalloc((void **) &data_gpu, itot*sizeof(int));

    dim3 block{64};
    dim3 grid{itot/block.x + (itot%block.x > 0)};
    test_gpu<Functor_gpu>(itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu, itot);

    cudaFree(data_gpu);

    return 0;
}

I am not allowed to put a __global__ function into a struct.

Correct.

Is there another way to achieve what I am trying to do?

I don't think you have fully specified what you are trying to do. For example, I assume you may have functionality in mind beyond multiplying by a compile-time constant as shown here. However, I think one possible method to overcome the obstacle you have identified would be to:

  1. Put the __global__ function definition at global scope (since this is essentially required, we may as well start there) rather than as part of a class/struct, templated on the functionality you want.
  2. Pass the functionality you want to realize via a template parameter, wrapped in a functor.

Here is a rough example:

$ cat t58.cu
#include <vector>
#include <utility>

// Functors supplying the per-element operation; marked __host__ __device__ so
// the same code can be used by both the CPU and the GPU dispatcher.
struct f_mult
{
    template <typename T>
    __host__ __device__
    void operator ()(T *d, T I, int i) { d[i] = d[i]*I;}
};

struct f_div
{
    template <typename T>
    __host__ __device__
    void operator ()(T *d, T I, int i) { d[i] = d[i]/I;}
};

struct f_saxpy
{
    template <typename T>
    __host__ __device__
    void operator ()(T *x, T y, T a, int i) {x[i] = a*x[i]+y;}
};

// Host-side dispatcher: instantiates the functor and applies it element by element.
template<class f, int I>
static void exec_cpu(int* __restrict__ data, const int itot)
{
    f my_F;
    for (int i=0; i<itot; ++i)
        my_F(data, I, i);
}

// Device-side dispatcher: a free __global__ kernel at namespace scope, templated
// on the functor type, the compile-time constant I, and the forwarded argument pack.
template<class f, int I, class... Args> __global__
void exec_gpu(const int itot, Args... args)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    f my_F;
    if (i < itot)
        my_F(args..., I, i);
}

template<class Func, class... Args, int... Is>
void test(std::integer_sequence<int, Is...>, Args... args)
{
    (exec_cpu<Func, Is>(args...), ...);
}

template<class Func, class... Args, int... Is>
void test_gpu(const int problem_size, std::integer_sequence<int, Is...>, Args... args)
{
    dim3 grid(problem_size/64 + (problem_size%64 > 0)), block(64);
    (exec_gpu<Func, Is><<<grid, block>>>(problem_size, args...), ...);
}

int main()
{
    const int itot = 1024;

    // CPU
    std::vector<int> data_cpu(itot);
    test<f_mult>(std::integer_sequence<int, 2, 3, 4>{}, data_cpu.data(), itot);

    // GPU.
    int* data_gpu;
    cudaMalloc((void **) &data_gpu, itot*sizeof(int));
    int y = 7;
    test_gpu<f_mult>(itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu);
    test_gpu<f_div> (itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu);
    test_gpu<f_saxpy> (itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu, y);

    cudaFree(data_gpu);

    return 0;
}
$ nvcc -o t58 t58.cu -std=c++17
$ cuda-memcheck ./t58
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t58
==55176== NVPROF is profiling process 55176, command: ./t58
==55176== Profiling application: ./t58
==55176== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   16.19%  2.1760us         1  2.1760us  2.1760us  2.1760us  _Z8exec_gpuI6f_multLi2EJPiEEviDpT1_
                   11.43%  1.5360us         1  1.5360us  1.5360us  1.5360us  _Z8exec_gpuI6f_multLi3EJPiEEviDpT1_
                   10.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  _Z8exec_gpuI7f_saxpyLi4EJPiiEEviDpT1_
                   10.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  _Z8exec_gpuI6f_multLi4EJPiEEviDpT1_
                   10.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  _Z8exec_gpuI5f_divLi3EJPiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI5f_divLi2EJPiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI7f_saxpyLi2EJPiiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI7f_saxpyLi3EJPiiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI5f_divLi4EJPiEEviDpT1_
      API calls:   90.99%  263.81ms         1  263.81ms  263.81ms  263.81ms  cudaMalloc
                    4.85%  14.073ms         8  1.7591ms  1.0394ms  2.2642ms  cuDeviceTotalMem
                    3.68%  10.668ms       808  13.203us     200ns  970.67us  cuDeviceGetAttribute
                    0.35%  1.0199ms         8  127.49us  85.773us  170.92us  cuDeviceGetName
                    0.09%  256.61us         1  256.61us  256.61us  256.61us  cudaFree
                    0.02%  67.286us         9  7.4760us  4.2950us  27.253us  cudaLaunchKernel
                    0.01%  24.842us         8  3.1050us  1.0410us  5.0730us  cuDeviceGetPCIBusId
                    0.00%  8.6170us        16     538ns     214ns  1.8110us  cuDeviceGet
                    0.00%  3.9830us         8     497ns     254ns  1.0350us  cuDeviceGetUuid
                    0.00%  3.4770us         3  1.1590us     697ns  1.9520us  cuDeviceGetCount
$

I have updated the GPU code from a previous version of this answer to show how different functor "prototypes" can be handled without any changes to the overall dispatch structure.
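
To illustrate that point, here is a minimal sketch of how yet another prototype could be added. It reuses the exec_gpu/test_gpu dispatchers from the listing above unchanged; the f_axpby functor, the buffer names, and the error check are hypothetical additions for illustration, not part of the original code:

#include <utility>
#include <cstdio>

// Same device-side dispatcher as above, unchanged.
template<class f, int I, class... Args> __global__
void exec_gpu(const int itot, Args... args)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    f my_F;
    if (i < itot)
        my_F(args..., I, i);
}

// Same host-side launcher as above, unchanged.
template<class Func, class... Args, int... Is>
void test_gpu(const int problem_size, std::integer_sequence<int, Is...>, Args... args)
{
    dim3 grid(problem_size/64 + (problem_size%64 > 0)), block(64);
    (exec_gpu<Func, Is><<<grid, block>>>(problem_size, args...), ...);
}

// Hypothetical functor with yet another prototype: x[i] = a*x[i] + b*y[i].
// Nothing in exec_gpu or test_gpu needs to change to accommodate it.
struct f_axpby
{
    template <typename T>
    __host__ __device__
    void operator ()(T *x, const T *y, T b, T a, int i) { x[i] = a*x[i] + b*y[i]; }
};

int main()
{
    const int itot = 1024;
    int *x, *y;
    cudaMalloc((void **) &x, itot*sizeof(int));
    cudaMalloc((void **) &y, itot*sizeof(int));
    cudaMemset(x, 0, itot*sizeof(int));
    cudaMemset(y, 0, itot*sizeof(int));
    const int b = 2;

    // The trailing pack (x, y, b) is forwarded to the functor, followed by the
    // compile-time constant Is and the thread index i inside exec_gpu.
    test_gpu<f_axpby>(itot, std::integer_sequence<int, 2, 3, 4>{}, x, y, b);

    cudaDeviceSynchronize();
    printf("last error: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(x);
    cudaFree(y);
    return 0;
}

Compiled the same way (nvcc -std=c++17), this adds three more kernel instantiations, one per compile-time constant in the sequence, without touching the dispatch code.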