命名空间作为 CUDA 中的模板参数
Namespaces as template parameters in CUDA
在 C++ 中,不可能将 namespace
作为某种参数(通过模板参数或实际的函数参数)传递给类或函数。这同样适用于 CUDA(至少据我所知)。下面这个问题解释了其中的一些原因:Why can't namespaces be template parameters?
// Two experiments that expose the same set of free functions
// (repetitions / setup / f / teardown), each inside its own namespace.
// The function bodies other than repetitions() are elided placeholders.
namespace experiment1
{
    int repetitions() { return 2; }
    void setup() { /* ... */ }
    void f() { /* ... */ }
    void teardown() { /* ... */ }
}

namespace experiment2
{
    int repetitions() { return 4; }
    void setup() { /* ... */ }
    void f() { /* ... */ }
    void teardown() { /* ... */ }
}
// Beware, this is invalid C++ and invalid CUDA
// (a namespace cannot be used as a template parameter).
// It is shown only to illustrate the intent: select a whole group of
// functions by naming the namespace that contains them.
template<namespace NS>
void do_test()
// Do something with NS::repetitions(), NS::setup(), ...
这在 C++ 中无效的原因之一是:用这种方法能做到的事情,没有哪一件是用类做不到的。您确实可以把每个命名空间改成一个类,把其中的函数改成成员函数,然后将该类作为模板参数传递给 do_test
我同意这一点。但是,在 CUDA 这一特定情况下,有些事情可以用命名空间做到,却无法用类做到。假设 f
是一个内核,即一个 __global__
函数,并且 setup
或另一个函数用于指定(例如)为内核分配的共享内存大小。内核不能是类的成员(请参阅相关问题的答案)。但是,您可以把它和同一实验的其他相关函数放在同一个 namespace 中。
考虑一个更具体的情况:do_test 是负责设置计时器、准备输入、检查输出、测量时间并执行其他操作的函数。每个实验都是一组函数,它们具有相同的名称和相同的接口,其中一个是内核。您希望 do_test 是一个适用于所有实验的通用函数。
我有几个执行类似操作的非常简单的内核。它们从一个大数组中加载值,对这些值应用 stencil 操作(即"模板计算",与 C++ 模板无关),然后将结果写入输出数组(不同于输入数组)。所谓 stencil 操作,是指线程 idx
对输入值 idx
及其相邻值(比如从 idx-3
到 idx+3
)执行的操作。这些内核中最简单的只执行从输入到输出的复制:每个线程读取 input[idx]
并写入 output[idx]
。另一个示例是执行 output[idx] = input[idx+1] - input[idx-1]
的差分 stencil。(我省略了一些细节,但大意如此。)
// Each experiment groups its display name, its two kernels and its result
// checker in one namespace. Only name() is defined here; the kernels and
// check() are declarations.
namespace copy
{
    std::string name() { return "copy"; }
    __global__ void kernel(const float* input, float* output, int size);
    __global__ void kernelOptimized(const float* input, float* output, int size);
    bool check(const float* input, const float* output);
}

namespace difference
{
    std::string name() { return "difference"; }
    __global__ void kernel(const float* input, float* output, int size);
    __global__ void kernelOptimized(const float* input, float* output, int size);
    bool check(const float* input, const float* output);
}
我有一个函数 do_test
// Function types that make up one experiment's interface.
// NameFunction matches `std::string name()` as declared in the experiment
// namespaces (e.g. copy::name); it takes no arguments.
typedef std::string NameFunction();
typedef bool CheckFunction(const float* input, const float* output);
typedef void KernelFunction(const float* input, float* output, int size);

// Runs one experiment.
// The experiment's functions are supplied as non-type template arguments
// (function-type template parameters are adjusted to pointers to function),
// which matches the call syntax
//   do_test<ns::name, ns::kernel, ns::kernelOptimized, ns::check>();
template <NameFunction name, KernelFunction kernel1, KernelFunction kernel2, CheckFunction check>
void do_test()
{
    // Set up input and output array
    // Set up CUDA events
    // Warm up kernels
    // Run kernels
    // Check results
    // Measure time
    // Do standard output
}
int main()
do_test<copy::name, copy::kernel, copy::kernelOptimized, copy::check>()
do_test<difference::name, difference::kernel, difference::kernelOptimized, difference::check>()
现在,这样当然已经很不错了。但是,如果我再引入一个每个实验都必须提供的函数,我就需要修改所有调用 do_test 的代码行。
您可以将内核修改为 "just" __device__
函数,然后通过 kernel_wrapper 来调用它们:
#include <iostream>
#include <stdio.h>
typedef void (*kernel_ptr)(const float* input, float* output, int size);
template <kernel_ptr kernel>
void kernel_wrapper(const float* input, float* output, int size)
kernel(input, output, size);
struct copy
std::string name() { return "copy"; }
__device__ static void kernel(const float* input, float* output, int size){ printf("copy: %d\n",threadIdx.x); }
__device__ static void kernelOptimized(const float* input, float* output, int size){ printf("copy optimized: %d\n",threadIdx.x); }
struct difference
std::string name() { return "difference"; }
__device__ static void kernel(const float* input, float* output,i nt size){ printf("difference: %d\n",threadIdx.x); }
__device__ static void kernelOptimized(const float* input, float* output, int size){ printf("difference optimized: %d\n",threadIdx.x); }
template <typename Experiment>
void do_test()
dim3 dimBlock( 4, 1 );
dim3 dimGrid( 1, 1 );
Experiment e;
std::cout << "running experiment " << e.name() << std::endl;
std::cout << "launching the normal kernel" << std::endl;
kernel_wrapper<Experiment::kernel><<<dimGrid, dimBlock>>>(0,0,0);
std::cout << "launching the optimized kernel" << std::endl;
kernel_wrapper<Experiment::kernelOptimized><<<dimGrid, dimBlock>>>(0,0,0);
int main()
return 0;
running experiment copy
launching the normal kernel
copy: 0
copy: 1
copy: 2
copy: 3
launching the optimized kernel
copy optimized: 0
copy optimized: 1
copy optimized: 2
copy optimized: 3
running experiment difference
launching the normal kernel
difference: 0
difference: 1
difference: 2
difference: 3
launching the optimized kernel
difference optimized: 0
difference optimized: 1
difference optimized: 2
difference optimized: 3
或者,您可以结合使用 CRTP 和模板特化:
#include <iostream>
#include <stdio.h>
template <typename Experiment>
__global__ void f();
template <typename Derived>
struct experiment
void run()
int blocksize = static_cast<Derived*>(this)->blocksize();
int reps = static_cast<Derived*>(this)->repetitions();
for (int i = 0; i<reps; ++i)
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
f<Derived><<<dimGrid, dimBlock>>>();
struct experiment1 : experiment<experiment1>
int repetitions() { return 2; }
int blocksize() { return 4; }
experiment1() { std::cout << "setting up experiment 1" << std::endl; }
~experiment1() { std::cout << "shutting down experiment 1" << std::endl; }
template <>
void f<experiment1>()
printf("experiment1: %d\n",threadIdx.x);
struct experiment2 : experiment<experiment2>
int repetitions() { return 4; }
int blocksize() { return 2; }
experiment2() { std::cout << "setting up experiment 2" << std::endl; }
~experiment2() { std::cout << "shutting down experiment 2" << std::endl; }
template <>
void f<experiment2>()
printf("experiment2: %d\n",threadIdx.x);
template<typename Experiment>
void do_test()
Experiment e;
#include <iostream>
#include <stdio.h>
template <typename Experiment>
__global__ void f();
template <typename Derived>
struct experiment
void run()
int blocksize = static_cast<Derived*>(this)->blocksize();
int reps = static_cast<Derived*>(this)->repetitions();
for (int i = 0; i<reps; ++i)
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
f<Derived><<<dimGrid, dimBlock>>>();
struct experiment1 : experiment<experiment1>
int repetitions() { return 2; }
int blocksize() { return 4; }
experiment1() { std::cout << "setting up experiment 1" << std::endl; }
~experiment1() { std::cout << "shutting down experiment 1" << std::endl; }
template <>
void f<experiment1>()
printf("experiment1: %d\n",threadIdx.x);
struct experiment2 : experiment<experiment2>
int repetitions() { return 4; }
int blocksize() { return 2; }
experiment2() { std::cout << "setting up experiment 2" << std::endl; }
~experiment2() { std::cout << "shutting down experiment 2" << std::endl; }
template <>
void f<experiment2>()
printf("experiment2: %d\n",threadIdx.x);
template<typename Experiment>
void do_test()
Experiment e;
int main()
return 0;
setting up experiment 1
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
shutting down experiment 1
setting up experiment 2
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
shutting down experiment 2
在 C++ 中,不可能将 namespace
作为某种参数(通过模板参数或实际的函数参数)传递给类或函数。这同样适用于 CUDA(至少据我所知)。下面这个问题解释了其中的一些原因:Why can't namespaces be template parameters?
// Two experiments that expose the same set of free functions
// (repetitions / setup / f / teardown), each inside its own namespace.
// The function bodies other than repetitions() are elided placeholders.
namespace experiment1
{
    int repetitions() { return 2; }
    void setup() { /* ... */ }
    void f() { /* ... */ }
    void teardown() { /* ... */ }
}

namespace experiment2
{
    int repetitions() { return 4; }
    void setup() { /* ... */ }
    void f() { /* ... */ }
    void teardown() { /* ... */ }
}
// Beware, this is invalid C++ and invalid CUDA
// (a namespace cannot be used as a template parameter).
// It is shown only to illustrate the intent: select a whole group of
// functions by naming the namespace that contains them.
template<namespace NS>
void do_test()
// Do something with NS::repetitions(), NS::setup(), ...
这在 C++ 中无效的原因之一是:用这种方法能做到的事情,没有哪一件是用类做不到的。您确实可以把每个命名空间改成一个类,把其中的函数改成成员函数,然后将该类作为模板参数传递给 do_test
我同意这一点。但是,在 CUDA 这一特定情况下,有些事情可以用命名空间做到,却无法用类做到。假设 f
是一个内核,即一个 __global__
函数,并且 setup
或另一个函数用于指定(例如)为内核分配的共享内存大小。内核不能是类的成员(请参阅相关问题的答案)。但是,您可以把它和同一实验的其他相关函数放在同一个 namespace 中。
考虑一个更具体的情况:do_test 是负责设置计时器、准备输入、检查输出、测量时间并执行其他操作的函数。每个实验都是一组函数,它们具有相同的名称和相同的接口,其中一个是内核。您希望 do_test 是一个适用于所有实验的通用函数。
我有几个执行类似操作的非常简单的内核。它们从一个大数组中加载值,对这些值应用 stencil 操作(即"模板计算",与 C++ 模板无关),然后将结果写入输出数组(不同于输入数组)。所谓 stencil 操作,是指线程 idx
对输入值 idx
及其相邻值(比如从 idx-3
到 idx+3
)执行的操作。这些内核中最简单的只执行从输入到输出的复制:每个线程读取 input[idx]
并写入 output[idx]
。另一个示例是执行 output[idx] = input[idx+1] - input[idx-1]
的差分 stencil。(我省略了一些细节,但大意如此。)
// Each experiment groups its display name, its two kernels and its result
// checker in one namespace. Only name() is defined here; the kernels and
// check() are declarations.
namespace copy
{
    std::string name() { return "copy"; }
    __global__ void kernel(const float* input, float* output, int size);
    __global__ void kernelOptimized(const float* input, float* output, int size);
    bool check(const float* input, const float* output);
}

namespace difference
{
    std::string name() { return "difference"; }
    __global__ void kernel(const float* input, float* output, int size);
    __global__ void kernelOptimized(const float* input, float* output, int size);
    bool check(const float* input, const float* output);
}
我有一个函数 do_test
// Function types that make up one experiment's interface.
// NameFunction matches `std::string name()` as declared in the experiment
// namespaces (e.g. copy::name); it takes no arguments.
typedef std::string NameFunction();
typedef bool CheckFunction(const float* input, const float* output);
typedef void KernelFunction(const float* input, float* output, int size);

// Runs one experiment.
// The experiment's functions are supplied as non-type template arguments
// (function-type template parameters are adjusted to pointers to function),
// which matches the call syntax
//   do_test<ns::name, ns::kernel, ns::kernelOptimized, ns::check>();
template <NameFunction name, KernelFunction kernel1, KernelFunction kernel2, CheckFunction check>
void do_test()
{
    // Set up input and output array
    // Set up CUDA events
    // Warm up kernels
    // Run kernels
    // Check results
    // Measure time
    // Do standard output
}
int main()
do_test<copy::name, copy::kernel, copy::kernelOptimized, copy::check>()
do_test<difference::name, difference::kernel, difference::kernelOptimized, difference::check>()
现在,这样当然已经很不错了。但是,如果我再引入一个每个实验都必须提供的函数,我就需要修改所有调用 do_test 的代码行。
您可以将内核修改为 "just" __device__
函数,然后通过 kernel_wrapper 来调用它们:
#include <iostream>
#include <stdio.h>
typedef void (*kernel_ptr)(const float* input, float* output, int size);
template <kernel_ptr kernel>
void kernel_wrapper(const float* input, float* output, int size)
kernel(input, output, size);
struct copy
std::string name() { return "copy"; }
__device__ static void kernel(const float* input, float* output, int size){ printf("copy: %d\n",threadIdx.x); }
__device__ static void kernelOptimized(const float* input, float* output, int size){ printf("copy optimized: %d\n",threadIdx.x); }
struct difference
std::string name() { return "difference"; }
__device__ static void kernel(const float* input, float* output,i nt size){ printf("difference: %d\n",threadIdx.x); }
__device__ static void kernelOptimized(const float* input, float* output, int size){ printf("difference optimized: %d\n",threadIdx.x); }
template <typename Experiment>
void do_test()
dim3 dimBlock( 4, 1 );
dim3 dimGrid( 1, 1 );
Experiment e;
std::cout << "running experiment " << e.name() << std::endl;
std::cout << "launching the normal kernel" << std::endl;
kernel_wrapper<Experiment::kernel><<<dimGrid, dimBlock>>>(0,0,0);
std::cout << "launching the optimized kernel" << std::endl;
kernel_wrapper<Experiment::kernelOptimized><<<dimGrid, dimBlock>>>(0,0,0);
int main()
return 0;
running experiment copy
launching the normal kernel
copy: 0
copy: 1
copy: 2
copy: 3
launching the optimized kernel
copy optimized: 0
copy optimized: 1
copy optimized: 2
copy optimized: 3
running experiment difference
launching the normal kernel
difference: 0
difference: 1
difference: 2
difference: 3
launching the optimized kernel
difference optimized: 0
difference optimized: 1
difference optimized: 2
difference optimized: 3
或者,您可以结合使用 CRTP 和模板特化:
#include <iostream>
#include <stdio.h>
template <typename Experiment>
__global__ void f();
template <typename Derived>
struct experiment
void run()
int blocksize = static_cast<Derived*>(this)->blocksize();
int reps = static_cast<Derived*>(this)->repetitions();
for (int i = 0; i<reps; ++i)
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
f<Derived><<<dimGrid, dimBlock>>>();
struct experiment1 : experiment<experiment1>
int repetitions() { return 2; }
int blocksize() { return 4; }
experiment1() { std::cout << "setting up experiment 1" << std::endl; }
~experiment1() { std::cout << "shutting down experiment 1" << std::endl; }
template <>
void f<experiment1>()
printf("experiment1: %d\n",threadIdx.x);
struct experiment2 : experiment<experiment2>
int repetitions() { return 4; }
int blocksize() { return 2; }
experiment2() { std::cout << "setting up experiment 2" << std::endl; }
~experiment2() { std::cout << "shutting down experiment 2" << std::endl; }
template <>
void f<experiment2>()
printf("experiment2: %d\n",threadIdx.x);
template<typename Experiment>
void do_test()
Experiment e;
#include <iostream>
#include <stdio.h>
template <typename Experiment>
__global__ void f();
template <typename Derived>
struct experiment
void run()
int blocksize = static_cast<Derived*>(this)->blocksize();
int reps = static_cast<Derived*>(this)->repetitions();
for (int i = 0; i<reps; ++i)
dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
f<Derived><<<dimGrid, dimBlock>>>();
struct experiment1 : experiment<experiment1>
int repetitions() { return 2; }
int blocksize() { return 4; }
experiment1() { std::cout << "setting up experiment 1" << std::endl; }
~experiment1() { std::cout << "shutting down experiment 1" << std::endl; }
template <>
void f<experiment1>()
printf("experiment1: %d\n",threadIdx.x);
struct experiment2 : experiment<experiment2>
int repetitions() { return 4; }
int blocksize() { return 2; }
experiment2() { std::cout << "setting up experiment 2" << std::endl; }
~experiment2() { std::cout << "shutting down experiment 2" << std::endl; }
template <>
void f<experiment2>()
printf("experiment2: %d\n",threadIdx.x);
template<typename Experiment>
void do_test()
Experiment e;
int main()
return 0;
setting up experiment 1
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
experiment1: 0
experiment1: 1
experiment1: 2
experiment1: 3
shutting down experiment 1
setting up experiment 2
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
experiment2: 0
experiment2: 1
shutting down experiment 2