如何为 cudaMemAdviseSetPreferredLocation 指定 GPU id
How to specify GPU id for cudaMemAdviseSetPreferredLocation
在尝试将托管内存的首选位置设置为 GPU #0 时,我一直收到 "invalid device ordinal" 错误:
CUDA_ERR_CHECK(cudaMemAdvise(deviceMemoryHeap.pool, size,
cudaMemAdviseSetPreferredLocation, 0));
唯一有效的是 cudaCpuDeviceId。那么,如何指定GPU id呢?
编辑 添加一个简单的例子:
// Error-checking wrapper for CUDA runtime calls: on failure, prints the
// numeric error code, its message, and the source location, then exits.
// The do { ... } while (0) idiom makes the macro expand to exactly one
// statement; it must NOT end with a semicolon, otherwise
// `if (cond) CUDA_ERR_CHECK(x); else ...` fails to compile.
#define CUDA_ERR_CHECK(x) \
do { cudaError_t err = x; if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error %d \"%s\" at %s:%d\n", \
(int)err, cudaGetErrorString(err), \
__FILE__, __LINE__); \
exit(1); \
}} while (0)
#include <cstdio>
#include <cstdlib>
// Trivial kernel used only to exercise a launch: a single thread stores
// sizeof(T) into *value. Launched as <<<1, 1>>> by the caller, so no
// indexing or bounds check is needed.
template<typename T>
__global__ void kernel(size_t* value)
{
*value = sizeof(T);
}
// Minimal repro with the fix applied: advising a preferred GPU location
// for managed memory requires that device to report a non-zero
// cudaDevAttrConcurrentManagedAccess attribute (per the cudaMemAdvise
// documentation); querying it first avoids "invalid device ordinal".
int main()
{
    // 1 GiB allocation.
    size_t size = 1024 * 1024 * 1024;

    // --- managed-memory path ------------------------------------------
    size_t* managed = NULL;
    CUDA_ERR_CHECK(cudaMallocManaged(&managed, size, cudaMemAttachGlobal));

    // Only pass a GPU ordinal to cudaMemAdvise when the device supports
    // concurrent managed access; on devices without it, only
    // cudaCpuDeviceId is a valid preferred location, so skip the advice.
    int device_id = 0, concurrent = 0;
    CUDA_ERR_CHECK(cudaDeviceGetAttribute(&concurrent,
        cudaDevAttrConcurrentManagedAccess, device_id));
    if (concurrent)
        CUDA_ERR_CHECK(cudaMemAdvise(managed, size,
            cudaMemAdviseSetPreferredLocation, device_id));

    kernel<double><<<1, 1>>>(managed);
    CUDA_ERR_CHECK(cudaGetLastError());
    CUDA_ERR_CHECK(cudaDeviceSynchronize());
    CUDA_ERR_CHECK(cudaFree(managed));

    // --- plain device-memory path (launch-latency comparison) ---------
    size_t* memory = NULL;
    CUDA_ERR_CHECK(cudaMalloc(&memory, size));
    kernel<double><<<1, 1>>>(memory);
    CUDA_ERR_CHECK(cudaGetLastError());
    CUDA_ERR_CHECK(cudaDeviceSynchronize());
    CUDA_ERR_CHECK(cudaFree(memory));
    return 0;
}
抛出错误:
$ make
nvcc -arch=sm_30 managed.cu -o managed
$ ./managed
CUDA error 10 "invalid device ordinal" at managed.cu:24
CUDA 8.0
我的目标是摆脱巨大的 cudaLaunch 调用延迟,这种延迟只发生在托管内存内核启动的情况下。
错误似乎是由于缺少设备功能引起的。正如 cudaMemAdvise
函数的 CUDA 文档所述:
If device is a GPU, then it must have a non-zero value for the device attribute cudaDevAttrConcurrentManagedAccess.
您应该调用以下代码以确保设备可以并发托管使用:
// Query whether device 0 supports concurrent managed access; a non-zero
// result is required before cudaMemAdvise may target that GPU ordinal.
int device_id = 0, result = 0;
cudaDeviceGetAttribute (&result, cudaDevAttrConcurrentManagedAccess, device_id);
if (result) {
// Call cudaMemAdvise
}
在尝试将托管内存的首选位置设置为 GPU #0 时,我一直收到 "invalid device ordinal" 错误:
CUDA_ERR_CHECK(cudaMemAdvise(deviceMemoryHeap.pool, size,
cudaMemAdviseSetPreferredLocation, 0));
唯一有效的是 cudaCpuDeviceId。那么,如何指定GPU id呢?
编辑 添加一个简单的例子:
// Error-checking wrapper for CUDA runtime calls: on failure, prints the
// numeric error code, its message, and the source location, then exits.
// NOTE(review): the trailing ';' after `while (0)` defeats the
// do/while(0) single-statement idiom (an `if (c) CUDA_ERR_CHECK(x);
// else ...` will not compile) — the semicolon should be dropped.
#define CUDA_ERR_CHECK(x) \
do { cudaError_t err = x; if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error %d \"%s\" at %s:%d\n", \
(int)err, cudaGetErrorString(err), \
__FILE__, __LINE__); \
exit(1); \
}} while (0);
#include <cstdio>
// Trivial kernel used only to exercise a launch: a single thread stores
// sizeof(T) into *value. Launched as <<<1, 1>>> by the caller, so no
// indexing or bounds check is needed.
template<typename T>
__global__ void kernel(size_t* value)
{
*value = sizeof(T);
}
// Minimal reproducer: advises a preferred GPU location for a managed
// allocation, then launches a trivial kernel on both managed and plain
// device memory to compare launch latency.
int main()
{
// 1 GiB allocation.
size_t size = 1024 * 1024 * 1024;
size_t* managed = NULL;
CUDA_ERR_CHECK(cudaMallocManaged(&managed, size, cudaMemAttachGlobal));
// NOTE(review): this call fails with "invalid device ordinal" unless
// device 0 reports a non-zero cudaDevAttrConcurrentManagedAccess
// attribute — query it via cudaDeviceGetAttribute before advising.
CUDA_ERR_CHECK(cudaMemAdvise(managed, size,
cudaMemAdviseSetPreferredLocation, 0));
kernel<double><<<1, 1>>>(managed);
CUDA_ERR_CHECK(cudaGetLastError());
CUDA_ERR_CHECK(cudaDeviceSynchronize());
CUDA_ERR_CHECK(cudaFree(managed));
// Same kernel on a plain cudaMalloc allocation, for comparison.
size_t* memory = NULL;
CUDA_ERR_CHECK(cudaMalloc(&memory, size));
kernel<double><<<1, 1>>>(memory);
CUDA_ERR_CHECK(cudaGetLastError());
CUDA_ERR_CHECK(cudaDeviceSynchronize());
CUDA_ERR_CHECK(cudaFree(memory));
return 0;
}
抛出错误:
$ make
nvcc -arch=sm_30 managed.cu -o managed
$ ./managed
CUDA error 10 "invalid device ordinal" at managed.cu:24
CUDA 8.0
我的目标是摆脱巨大的 cudaLaunch 调用延迟,这种延迟只发生在托管内存内核启动的情况下。
错误似乎是由于缺少设备功能引起的。正如 cudaMemAdvise
函数的 CUDA 文档所述:
If device is a GPU, then it must have a non-zero value for the device attribute cudaDevAttrConcurrentManagedAccess.
您应该调用以下代码以确保设备可以并发托管使用:
// Query whether device 0 supports concurrent managed access; a non-zero
// result is required before cudaMemAdvise may target that GPU ordinal.
int device_id = 0, result = 0;
cudaDeviceGetAttribute (&result, cudaDevAttrConcurrentManagedAccess, device_id);
if (result) {
// Call cudaMemAdvise
}