Create a functor in CUDA with a static __global__ function as executor
I have a C++ design in which I generate function calls at compile time for a templated function, based on std::integer_sequence. This works on the CPU, but I would like to extend it to the GPU with CUDA. However, I am not allowed to put a __global__ function into a struct. Is there another way to achieve what I am trying to do?
#include <vector>
#include <utility>

struct Functor_cpu
{
    template<int I>
    static void exec(int* __restrict__ data, const int itot)
    {
        for (int i=0; i<itot; ++i)
            data[i] *= I;
    }
};

struct Functor_gpu
{
    // error: CUDA does not permit a __global__ function to be a struct member
    template<int I> __global__
    static void exec(int* __restrict__ data, const int itot)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;

        if (i < itot)
            data[i] *= I;
    }
};

template<class Func, class... Args, int... Is>
void test(std::integer_sequence<int, Is...>, Args... args)
{
    (Func::template exec<Is>(args...), ...);
}

template<class Func, class... Args, int... Is>
void test_gpu(const int problem_size, std::integer_sequence<int, Is...>, Args... args)
{
    dim3 grid{problem_size/64 + (problem_size%64 > 0)}, block{64};
    (Func::template exec<Is><<<grid, block>>>(args...), ...);
}

int main()
{
    const int itot = 1024;

    // CPU
    std::vector<int> data_cpu(itot);
    test<Functor_cpu>(std::integer_sequence<int, 2, 3, 4>{}, data_cpu.data(), itot);

    // GPU.
    int* data_gpu;
    cudaMalloc((void **) &data_gpu, itot*sizeof(int));

    dim3 block{64};
    dim3 grid{itot/block.x + (itot%block.x > 0)};

    test_gpu<Functor_gpu>(itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu);

    cudaFree(data_gpu);

    return 0;
}
I am not allowed to put a __global__ function into a struct.

Correct.

Is there another way to achieve what I am trying to do?
I don't think you have fully stated what you are trying to do. For example, I assume that beyond multiplying by a compile-time constant as shown here, you may have other operations in mind. However, I think one possible way to overcome the obstacle you have identified is:
- Put the __global__ function definition at global scope (since that is effectively required, we may as well start there) rather than inside a class/struct, with the implementation templated on the functionality you want.
- Pass the functionality you want to implement via a template parameter, wrapped in a functor.
Here is a rough example:
$ cat t58.cu
#include <vector>
#include <utility>

struct f_mult
{
    template <typename T>
    __host__ __device__
    void operator ()(T *d, T I, int i) { d[i] = d[i]*I; }
};

struct f_div
{
    template <typename T>
    __host__ __device__
    void operator ()(T *d, T I, int i) { d[i] = d[i]/I; }
};

struct f_saxpy
{
    template <typename T>
    __host__ __device__
    void operator ()(T *x, T y, T a, int i) { x[i] = a*x[i]+y; }
};

template<class f, int I>
static void exec_cpu(int* __restrict__ data, const int itot)
{
    f my_F;
    for (int i=0; i<itot; ++i)
        my_F(data, I, i);
}

template<class f, int I, class... Args> __global__
void exec_gpu(const int itot, Args... args)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    f my_F;
    if (i < itot)
        my_F(args..., I, i);
}

template<class Func, class... Args, int... Is>
void test(std::integer_sequence<int, Is...>, Args... args)
{
    (exec_cpu<Func, Is>(args...), ...);
}

template<class Func, class... Args, int... Is>
void test_gpu(const int problem_size, std::integer_sequence<int, Is...>, Args... args)
{
    dim3 grid(problem_size/64 + (problem_size%64 > 0)), block(64);
    (exec_gpu<Func, Is><<<grid, block>>>(problem_size, args...), ...);
}

int main()
{
    const int itot = 1024;

    // CPU
    std::vector<int> data_cpu(itot);
    test<f_mult>(std::integer_sequence<int, 2, 3, 4>{}, data_cpu.data(), itot);

    // GPU.
    int* data_gpu;
    cudaMalloc((void **) &data_gpu, itot*sizeof(int));
    int y = 7;
    test_gpu<f_mult>(itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu);
    test_gpu<f_div>(itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu);
    test_gpu<f_saxpy>(itot, std::integer_sequence<int, 2, 3, 4>{}, data_gpu, y);
    cudaFree(data_gpu);

    return 0;
}
$ nvcc -o t58 t58.cu -std=c++17
$ cuda-memcheck ./t58
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t58
==55176== NVPROF is profiling process 55176, command: ./t58
==55176== Profiling application: ./t58
==55176== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   16.19%  2.1760us         1  2.1760us  2.1760us  2.1760us  _Z8exec_gpuI6f_multLi2EJPiEEviDpT1_
                   11.43%  1.5360us         1  1.5360us  1.5360us  1.5360us  _Z8exec_gpuI6f_multLi3EJPiEEviDpT1_
                   10.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  _Z8exec_gpuI7f_saxpyLi4EJPiiEEviDpT1_
                   10.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  _Z8exec_gpuI6f_multLi4EJPiEEviDpT1_
                   10.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  _Z8exec_gpuI5f_divLi3EJPiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI5f_divLi2EJPiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI7f_saxpyLi2EJPiiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI7f_saxpyLi3EJPiiEEviDpT1_
                   10.24%  1.3760us         1  1.3760us  1.3760us  1.3760us  _Z8exec_gpuI5f_divLi4EJPiEEviDpT1_
      API calls:   90.99%  263.81ms         1  263.81ms  263.81ms  263.81ms  cudaMalloc
                    4.85%  14.073ms         8  1.7591ms  1.0394ms  2.2642ms  cuDeviceTotalMem
                    3.68%  10.668ms       808  13.203us     200ns  970.67us  cuDeviceGetAttribute
                    0.35%  1.0199ms         8  127.49us  85.773us  170.92us  cuDeviceGetName
                    0.09%  256.61us         1  256.61us  256.61us  256.61us  cudaFree
                    0.02%  67.286us         9  7.4760us  4.2950us  27.253us  cudaLaunchKernel
                    0.01%  24.842us         8  3.1050us  1.0410us  5.0730us  cuDeviceGetPCIBusId
                    0.00%  8.6170us        16     538ns     214ns  1.8110us  cuDeviceGet
                    0.00%  3.9830us         8     497ns     254ns  1.0350us  cuDeviceGetUuid
                    0.00%  3.4770us         3  1.1590us     697ns  1.9520us  cuDeviceGetCount
$
I have updated the previous version of the GPU code to show how to handle different functor "prototypes" without any change to the overall dispatch structure.
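For completeness, the example above omits error checking around the CUDA calls for brevity (cuda-memcheck covers that here). Below is a minimal sketch of how the launches could be verified at runtime; the check_cuda helper is a hypothetical addition, not part of the CUDA API, while cudaGetLastError, cudaDeviceSynchronize, and cudaMemcpy are standard runtime calls:

#include <cstdio>
#include <cstdlib>

// Hypothetical helper: abort with a readable message if a CUDA call fails.
static void check_cuda(cudaError_t err)
{
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// Intended usage, placed after the test_gpu calls in main() above:
//     check_cuda(cudaGetLastError());        // catches launch-configuration errors
//     check_cuda(cudaDeviceSynchronize());   // catches asynchronous kernel errors
//     std::vector<int> result(itot);
//     check_cuda(cudaMemcpy(result.data(), data_gpu, itot*sizeof(int),
//                           cudaMemcpyDeviceToHost));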