如何运行 cuda cooperative template kernel
How to run cuda cooperative template kernel
我试图在 CUDA C++ 中将模板内核作为协作内核(cooperative kernel)启动,但没有成功。我做错了什么?
错误
Error cannot determine which instance of function template "boolPrepareKernel" is intended
我尝试像下面这样调用内核
ForBoolKernelArgs<int> fbArgs = ...;
int device = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device);
cudaLaunchCooperativeKernel((void*)boolPrepareKernel, deviceProp.multiProcessorCount, fFArgs.threads, fbArgs) ;
内核定义如
template <typename TYO>
__global__ void boolPrepareKernel(ForBoolKernelArgs<TYO> fbArgs) {
...
}
我试过参数化启动(在本例中使用 int),例如
cudaLaunchCooperativeKernel((void*)(<int>boolPrepareKernel), deviceProp.multiProcessorCount, fFArgs.threads, fbArgs) ;
但我收到错误
no instance of overloaded function matches the argument list argument types are: (<error-type>, int, dim3, ForBoolKernelArgs<int>)
对于建议的案例
cudaLaunchCooperativeKernel((void*)(boolPrepareKernel<int>), deviceProp.multiProcessorCount, fFArgs.threads, fbArgs)
我的错误是
no instance of overloaded function matches the argument list argument types are: (void *, int, dim3, ForBoolKernelArgs<int>)
这可能很简单,但我卡住了 - 感谢您的帮助!!
作为参考,下面这种普通的内核启动方式
boolPrepareKernel << <fFArgs.blocks, fFArgs.threads >> > (fbArgs);
有效,但网格同步当然不可用。
这是一个可以编译的最小示例:
$ cat t1954.cu
// Argument bundle handed by value to boolPrepareKernel.
// TYO is the type of the single payload value it carries.
template <typename TYO>
struct ForBoolKernelArgs {
    TYO val;  // payload value; not initialized by the struct itself
};
// Placeholder kernel for the minimal example: it receives the argument
// bundle by value and performs no work. Being a template, it has to be
// named with an explicit type (e.g. boolPrepareKernel<int>) wherever its
// address is taken.
template <typename TYO>
__global__ void boolPrepareKernel(ForBoolKernelArgs<TYO> fbArgs)
{
    // intentionally empty
}
// Launches boolPrepareKernel<int> as a cooperative kernel with one block of
// one thread. Returns 0 when the launch and the kernel both succeed, and 1
// when the CUDA runtime reports an error at either step.
int main(){
    // Give the payload a defined value before the launch reads it.
    ForBoolKernelArgs<int> fbArgs;
    fbArgs.val = 0;

    // cudaLaunchCooperativeKernel does not take the kernel arguments
    // directly: it expects an array with one pointer per kernel parameter.
    void *kernel_args[] = {&fbArgs};

    // The template must be instantiated explicitly (<int>) before the cast,
    // otherwise the compiler cannot tell which instance is meant.
    cudaError_t err = cudaLaunchCooperativeKernel(
        (void*)(boolPrepareKernel<int>), dim3(1), dim3(1), kernel_args);

    // The launch is asynchronous; wait for it so execution errors surface
    // before the process exits.
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    return err == cudaSuccess ? 0 : 1;
}
$ nvcc -o t1954 t1954.cu
$
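您剩下的主要问题,可能是没有遵循正确的说明(instructions)来传递内核参数:cudaLaunchCooperativeKernel 的第四个参数必须是一个 void* 数组(每个元素指向一个内核参数,如上例中的 kernel_args),而不能像您那样直接传入参数对象 fbArgs。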
我试图在 CUDA C++ 中将模板内核作为协作内核(cooperative kernel)启动,但没有成功。我做错了什么?
错误
Error cannot determine which instance of function template "boolPrepareKernel" is intended
我尝试像下面这样调用内核
ForBoolKernelArgs<int> fbArgs = ...;
int device = 0;
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device);
cudaLaunchCooperativeKernel((void*)boolPrepareKernel, deviceProp.multiProcessorCount, fFArgs.threads, fbArgs) ;
内核定义如
template <typename TYO>
__global__ void boolPrepareKernel(ForBoolKernelArgs<TYO> fbArgs) {
...
}
我试过参数化启动(在本例中使用 int),例如
cudaLaunchCooperativeKernel((void*)(<int>boolPrepareKernel), deviceProp.multiProcessorCount, fFArgs.threads, fbArgs) ;
但我收到错误
no instance of overloaded function matches the argument list argument types are: (<error-type>, int, dim3, ForBoolKernelArgs<int>)
对于建议的案例
cudaLaunchCooperativeKernel((void*)(boolPrepareKernel<int>), deviceProp.multiProcessorCount, fFArgs.threads, fbArgs)
我的错误是
no instance of overloaded function matches the argument list argument types are: (void *, int, dim3, ForBoolKernelArgs<int>)
这可能很简单,但我卡住了 - 感谢您的帮助!!
作为参考,下面这种普通的内核启动方式
boolPrepareKernel << <fFArgs.blocks, fFArgs.threads >> > (fbArgs);
有效,但网格同步当然不可用。
这是一个可以编译的最小示例:
$ cat t1954.cu
// Argument bundle handed by value to boolPrepareKernel.
// TYO is the type of the single payload value it carries.
template <typename TYO>
struct ForBoolKernelArgs {
    TYO val;  // payload value; not initialized by the struct itself
};
// Placeholder kernel for the minimal example: it receives the argument
// bundle by value and performs no work. Being a template, it has to be
// named with an explicit type (e.g. boolPrepareKernel<int>) wherever its
// address is taken.
template <typename TYO>
__global__ void boolPrepareKernel(ForBoolKernelArgs<TYO> fbArgs)
{
    // intentionally empty
}
// Launches boolPrepareKernel<int> as a cooperative kernel with one block of
// one thread. Returns 0 when the launch and the kernel both succeed, and 1
// when the CUDA runtime reports an error at either step.
int main(){
    // Give the payload a defined value before the launch reads it.
    ForBoolKernelArgs<int> fbArgs;
    fbArgs.val = 0;

    // cudaLaunchCooperativeKernel does not take the kernel arguments
    // directly: it expects an array with one pointer per kernel parameter.
    void *kernel_args[] = {&fbArgs};

    // The template must be instantiated explicitly (<int>) before the cast,
    // otherwise the compiler cannot tell which instance is meant.
    cudaError_t err = cudaLaunchCooperativeKernel(
        (void*)(boolPrepareKernel<int>), dim3(1), dim3(1), kernel_args);

    // The launch is asynchronous; wait for it so execution errors surface
    // before the process exits.
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }
    return err == cudaSuccess ? 0 : 1;
}
$ nvcc -o t1954 t1954.cu
$
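您剩下的主要问题,可能是没有遵循正确的说明(instructions)来传递内核参数:cudaLaunchCooperativeKernel 的第四个参数必须是一个 void* 数组(每个元素指向一个内核参数,如上例中的 kernel_args),而不能像您那样直接传入参数对象 fbArgs。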