设备内的 Cuda 复杂对象初始化:cudaDeviceSetLimit 的问题
Cuda complex object initialization within device: problem with cudaDeviceSetLimit
我正在尝试在设备内(在线程和块级别)初始化复杂对象。在我看来,问题出在 cudaDeviceSetLimit:根据我对问题的理解,我没有正确设置每个线程可用的堆内存量。文档中的这一部分与我的问题相关,但其中并没有初始化对象的示例;我也读过另一篇相关帖子,但我的代码仍无法正常工作。
编辑
与第一个答案相反:在我的问题配置中必须在内核中执行此操作,因为我想利用跨块并行初始化对象的优势。
我制作了以下玩具示例,它适用于少量块 (65) 但不适用于 65535 个块(我可以在我的设备上使用的最大块数):
// A single network node; its only payload is an identifier.
// Instances are created on the device via device-side new in mallocTest.
class NNode{
public:
// Identifier of the node within its network.
// NOTE(review): never assigned in the visible code — confirm it is set elsewhere.
int node_id;
};
// A network for the device-side-new approach: an array of pointers to
// individually heap-allocated NNode objects.
class cuNetwork{
public:
// Number of entries in all_nodes.
// NOTE(review): mallocTest never assigns this field — confirm initialization.
int num_allnodes;
// Device-heap array of NNode*; each node is a separate device-side allocation.
NNode** all_nodes;
};
// Per-block construction of one cuNetwork and its nodes on the device heap.
// Launch with <<<numBlocks, 1>>>; block b fills arr_gpu_net[b].
// Device-side new draws from the cudaLimitMallocHeapSize heap and returns
// nullptr on exhaustion (it does not throw). Dereferencing such an unchecked
// nullptr is exactly what produced the "Warp Illegal Address" at large block
// counts, so every allocation is guarded here; on failure the block publishes
// nullptr so the host can detect it.
__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){
    int bId = blockIdx.x;
    cuNetwork* gpu_net = new cuNetwork();
    if(gpu_net == nullptr){
        arr_gpu_net[bId] = nullptr; // heap exhausted — report failure
        return;
    }
    gpu_net->num_allnodes = num_allnodes; // fix: field was left uninitialized
    gpu_net->all_nodes = new NNode*[num_allnodes];
    if(gpu_net->all_nodes == nullptr){
        delete gpu_net;
        arr_gpu_net[bId] = nullptr;
        return;
    }
    for(int i=0; i<num_allnodes; i++){
        gpu_net->all_nodes[i] = new NNode();
        if(gpu_net->all_nodes[i] == nullptr){
            arr_gpu_net[bId] = nullptr; // partial allocations are abandoned
            return;
        }
    }
    arr_gpu_net[bId] = gpu_net;
}
// Sets up the device-side heap and launches mallocTest once per block.
// Returns 0 on success, 1 on any CUDA error.
int main(int argc, const char **argv){
    int numBlocks = 65;
    int num_allnodes = 200;

    // Device array of per-block network pointers.
    // (The original first did `new cuNetwork*[numBlocks]` on the host and then
    // immediately overwrote that pointer with cudaMalloc — a host memory leak.)
    cuNetwork** arr_gpu_net = nullptr;
    if(cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks) != cudaSuccess){
        return 1;
    }

    // Raw bytes one block allocates with device-side new:
    size_t size = sizeof(cuNetwork);           // new cuNetwork()
    size += sizeof(NNode*) * num_allnodes;     // new NNode*[num_allnodes]
    size += sizeof(NNode) * num_allnodes;      // num_allnodes x new NNode()
    // The device heap allocator adds per-allocation padding and bookkeeping,
    // so the exact raw sum is NOT sufficient — reserve generous headroom.
    size_t heapBytes = 2ull * numBlocks * size + (1ull << 20);
    // Must be called before the first kernel launch that uses the heap.
    if(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes) != cudaSuccess){
        return 1;
    }

    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);
    // cudaDeviceSynchronize surfaces asynchronous kernel-execution errors.
    cudaError_t status = cudaDeviceSynchronize();
    cudaFree(arr_gpu_net);
    return (status == cudaSuccess) ? 0 : 1;
}
一旦我开始向对象添加其他属性,或者如果我将 numBlocks
增加到 65535,我就会收到错误消息:
CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x555555efff90
Thread 1 "no_fun" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (7750,0,0), thread (0,0,0), device 0, sm 1, warp 3, lane 0]
0x0000555555f000b0 in mallocTest(int, cuNetwork**)<<<(65535,1,1),(1,1,1)>>> ()
我的问题是:在这个例子中,我应该如何正确地调用 cudaDeviceSetLimit,才能为每个线程初始化 cuNetwork 预留所需的内存量?
回答你的问题:
由于内存填充和分配粒度,每个块可能需要比计算大小更多的内存。
您应该始终检查 new
的 return 值。如果是nullptr
,则分配失败。
但是,如果预先知道所有网络的节点总数,则仅 cudaMalloc
所有节点(和所有网络)的内存会更有效。然后,在内核中相应地更新指针。
像这样:
struct cuNetwork2{
    int num_allnodes;
    NNode* all_nodes;
};
__global__ void kernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
    int index = ...
    d_networks[index].num_allnodes = numNodesPerNetwork;
    d_networks[index].all_nodes = d_nodes + index * numNodesPerNetwork;
}
...
int numBlocks = 65;
int num_allnodes = 200;
cuNetwork2* d_networks;
NNode* d_nodes;
cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);
kernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);
在这种情况下,您不需要 cudaDeviceSetLimit 或内核动态分配。
这是两种方法之间的简单性能比较。
#include <iostream>
#include <chrono>
#include <cassert>
// A single network node; its only payload is an identifier.
// Allocated either with device-side new (approach 1) or placement-new into a
// bulk cudaMalloc'd buffer (approach 2).
class NNode{
public:
// Identifier of the node within its network.
// NOTE(review): never assigned in the visible code — confirm it is set elsewhere.
int node_id;
};
// Network type for the device-side-new benchmark: an array of pointers to
// individually heap-allocated NNode objects.
class cuNetwork{
public:
// Number of entries in all_nodes.
// NOTE(review): the kernels below never assign this field — confirm elsewhere.
int num_allnodes;
// Device-heap array of NNode*; each node is a separate device-side allocation.
NNode** all_nodes;
};
// Benchmark kernel, approach 1: per-block construction of one cuNetwork and
// its nodes on the device heap. Launch with <<<numBlocks, 1>>>.
// Device-side new returns nullptr (it does not throw) when the
// cudaLimitMallocHeapSize heap is exhausted, so every allocation is guarded;
// an unchecked nullptr dereference here is what caused the original
// "Warp Illegal Address" failure at large block counts.
__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){
    int bId = blockIdx.x;
    cuNetwork* gpu_net = new cuNetwork();
    if(gpu_net == nullptr){
        arr_gpu_net[bId] = nullptr; // heap exhausted — report failure
        return;
    }
    gpu_net->num_allnodes = num_allnodes; // fix: field was left uninitialized
    gpu_net->all_nodes = new NNode*[num_allnodes];
    if(gpu_net->all_nodes == nullptr){
        delete gpu_net;
        arr_gpu_net[bId] = nullptr;
        return;
    }
    for(int i=0; i<num_allnodes; i++){
        gpu_net->all_nodes[i] = new NNode();
        if(gpu_net->all_nodes[i] == nullptr){
            arr_gpu_net[bId] = nullptr; // partial allocations are abandoned
            return;
        }
    }
    arr_gpu_net[bId] = gpu_net;
}
// Benchmark kernel, approach 1b: cooperative per-block construction.
// Thread 0 allocates the network and its pointer array; then all threads of
// the block allocate the NNode objects in a block-strided loop.
// Launch with <<<numBlocks, threadsPerBlock>>>.
__global__ void mallocTestMultiThreadBlock(int num_allnodes, cuNetwork** arr_gpu_net){
    int bId = blockIdx.x;
    if(threadIdx.x == 0){
        cuNetwork* gpu_net = new cuNetwork();
        assert(gpu_net != nullptr); // device new yields nullptr on heap exhaustion
        gpu_net->all_nodes = new NNode*[num_allnodes];
        assert(gpu_net->all_nodes != nullptr);
        arr_gpu_net[bId] = gpu_net;
    }
    // Barrier: all threads must observe thread 0's stores before using them.
    // Placed in uniform control flow — every thread of the block reaches it.
    __syncthreads();
    for(int i=threadIdx.x; i<num_allnodes; i += blockDim.x){
        NNode* node = new NNode();
        assert(node != nullptr); // fix: per-node allocation was unchecked
        arr_gpu_net[bId]->all_nodes[i] = node;
    }
}
// Network type for the bulk-allocation benchmark (approach 2): all_nodes
// points into one contiguous cudaMalloc'd buffer shared by every network,
// so no device-side dynamic allocation is needed.
struct cuNetwork2{
// Number of nodes belonging to this network (set by pointerassignmentkernel).
int num_allnodes;
// Pointer to this network's contiguous slice of the shared node buffer.
NNode* all_nodes;
};
// One thread per network: records the node count and points each network's
// all_nodes at its contiguous slice of the shared d_nodes buffer.
// Launch with <<<numNetworks, 1>>>; block b handles d_networks[b].
__global__
void pointerassignmentkernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
    const int net = blockIdx.x;
    cuNetwork2& network = d_networks[net];
    network.num_allnodes = numNodesPerNetwork;
    network.all_nodes = &d_nodes[net * numNodesPerNetwork];
}
// Placement-constructs one NNode per thread inside the preallocated buffer.
// Grid is sized as ceil(totalNumNodes / blockDim.x); tail threads exit early.
__global__
void nodeinitkernel(NNode* d_nodes, int totalNumNodes){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid >= totalNumNodes){
        return; // guard: grid may overshoot the buffer
    }
    new (d_nodes + tid) NNode();
}
// Benchmarks the two initialization strategies:
//   1) per-block device-side new (mallocTest)
//   2) two bulk cudaMallocs + pointer fix-up kernels (cuNetwork2)
// Prints each phase's wall-clock time to stderr; aborts on any CUDA error.
int main(int argc, const char **argv){
    int numBlocks = 64;
    int num_allnodes = 200;

    // ---- Approach 1: per-block device-side new ----
    // Device array of per-block network pointers.
    // (The original also did a host-side `new cuNetwork*[numBlocks]` whose
    // pointer was immediately overwritten by cudaMalloc — a host memory leak.
    // The unused raw `size` computation was likewise dead code and is removed.)
    cuNetwork** arr_gpu_net = nullptr;
    cudaError_t status = cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks);
    assert(status == cudaSuccess);

    // Device-side new needs headroom beyond the raw per-block byte count
    // (allocator padding/bookkeeping); 1 GiB is comfortably enough here.
    // Must be set before the first kernel launch that uses the heap.
    status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1ull * 1024ull * 1024ull * 1024ull);
    assert(status == cudaSuccess);

    std::chrono::time_point<std::chrono::system_clock> timeA = std::chrono::system_clock::now();
    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);
    //mallocTestMultiThreadBlock<<<numBlocks, num_allnodes>>>(num_allnodes, arr_gpu_net);
    status = cudaGetLastError(); // fix: catch launch-configuration errors too
    assert(status == cudaSuccess);
    status = cudaDeviceSynchronize(); // surfaces asynchronous execution errors
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeB = std::chrono::system_clock::now();
    std::cerr << "mallocTest kernel: " << std::chrono::duration<double>(timeB - timeA).count() << "s\n";

    // ---- Approach 2: two bulk cudaMallocs + pointer fix-up kernels ----
    std::chrono::time_point<std::chrono::system_clock> timeC = std::chrono::system_clock::now();
    cuNetwork2* d_networks = nullptr;
    NNode* d_nodes = nullptr;
    status = cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
    assert(status == cudaSuccess);
    status = cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeD = std::chrono::system_clock::now();
    std::cerr << "host cudaMalloc: " << std::chrono::duration<double>(timeD - timeC).count() << "s\n";

    pointerassignmentkernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeE = std::chrono::system_clock::now();
    std::cerr << "pointerassignmentkernel: " << std::chrono::duration<double>(timeE - timeD).count() << "s\n";

    nodeinitkernel<<<(numBlocks * num_allnodes + 128 - 1) / 128, 128>>>(d_nodes, numBlocks * num_allnodes);
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeF = std::chrono::system_clock::now();
    std::cerr << "nodeinitkernel: " << std::chrono::duration<double>(timeF - timeE).count() << "s\n";

    cudaDeviceReset(); // releases all device allocations made above
    return 0;
}
Compiled with: nvcc -arch=sm_61 -O3 kernelallocation.cu -o kernelallocation
mallocTest kernel: 0.0183772s
host cudaMalloc: 5.02e-06s
pointerassignmentkernel: 1.2739e-05s
nodeinitkernel: 1.213e-05s
我正在尝试在设备内(在线程和块级别)初始化复杂对象。在我看来,问题出在 cudaDeviceSetLimit:根据我对问题的理解,我没有正确设置每个线程可用的堆内存量。文档中的这一部分与我的问题相关,但其中并没有初始化对象的示例;我也读过另一篇相关帖子,但我的代码仍无法正常工作。
编辑
与第一个答案相反:在我的问题配置中必须在内核中执行此操作,因为我想利用跨块并行初始化对象的优势。
我制作了以下玩具示例,它适用于少量块 (65) 但不适用于 65535 个块(我可以在我的设备上使用的最大块数):
// A single network node; its only payload is an identifier.
// Instances are created on the device via device-side new in mallocTest.
class NNode{
public:
// Identifier of the node within its network.
// NOTE(review): never assigned in the visible code — confirm it is set elsewhere.
int node_id;
};
// A network for the device-side-new approach: an array of pointers to
// individually heap-allocated NNode objects.
class cuNetwork{
public:
// Number of entries in all_nodes.
// NOTE(review): mallocTest never assigns this field — confirm initialization.
int num_allnodes;
// Device-heap array of NNode*; each node is a separate device-side allocation.
NNode** all_nodes;
};
// Per-block construction of one cuNetwork and its nodes on the device heap.
// Launch with <<<numBlocks, 1>>>; block b fills arr_gpu_net[b].
// Device-side new draws from the cudaLimitMallocHeapSize heap and returns
// nullptr on exhaustion (it does not throw). Dereferencing such an unchecked
// nullptr is exactly what produced the "Warp Illegal Address" at large block
// counts, so every allocation is guarded here; on failure the block publishes
// nullptr so the host can detect it.
__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){
    int bId = blockIdx.x;
    cuNetwork* gpu_net = new cuNetwork();
    if(gpu_net == nullptr){
        arr_gpu_net[bId] = nullptr; // heap exhausted — report failure
        return;
    }
    gpu_net->num_allnodes = num_allnodes; // fix: field was left uninitialized
    gpu_net->all_nodes = new NNode*[num_allnodes];
    if(gpu_net->all_nodes == nullptr){
        delete gpu_net;
        arr_gpu_net[bId] = nullptr;
        return;
    }
    for(int i=0; i<num_allnodes; i++){
        gpu_net->all_nodes[i] = new NNode();
        if(gpu_net->all_nodes[i] == nullptr){
            arr_gpu_net[bId] = nullptr; // partial allocations are abandoned
            return;
        }
    }
    arr_gpu_net[bId] = gpu_net;
}
// Sets up the device-side heap and launches mallocTest once per block.
// Returns 0 on success, 1 on any CUDA error.
int main(int argc, const char **argv){
    int numBlocks = 65;
    int num_allnodes = 200;

    // Device array of per-block network pointers.
    // (The original first did `new cuNetwork*[numBlocks]` on the host and then
    // immediately overwrote that pointer with cudaMalloc — a host memory leak.)
    cuNetwork** arr_gpu_net = nullptr;
    if(cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks) != cudaSuccess){
        return 1;
    }

    // Raw bytes one block allocates with device-side new:
    size_t size = sizeof(cuNetwork);           // new cuNetwork()
    size += sizeof(NNode*) * num_allnodes;     // new NNode*[num_allnodes]
    size += sizeof(NNode) * num_allnodes;      // num_allnodes x new NNode()
    // The device heap allocator adds per-allocation padding and bookkeeping,
    // so the exact raw sum is NOT sufficient — reserve generous headroom.
    size_t heapBytes = 2ull * numBlocks * size + (1ull << 20);
    // Must be called before the first kernel launch that uses the heap.
    if(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes) != cudaSuccess){
        return 1;
    }

    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);
    // cudaDeviceSynchronize surfaces asynchronous kernel-execution errors.
    cudaError_t status = cudaDeviceSynchronize();
    cudaFree(arr_gpu_net);
    return (status == cudaSuccess) ? 0 : 1;
}
一旦我开始向对象添加其他属性,或者如果我将 numBlocks
增加到 65535,我就会收到错误消息:
CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x555555efff90
Thread 1 "no_fun" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (7750,0,0), thread (0,0,0), device 0, sm 1, warp 3, lane 0]
0x0000555555f000b0 in mallocTest(int, cuNetwork**)<<<(65535,1,1),(1,1,1)>>> ()
我的问题是:在这个例子中,我应该如何正确地调用 cudaDeviceSetLimit,才能为每个线程初始化 cuNetwork 预留所需的内存量?
回答你的问题:
由于内存填充和分配粒度,每个块可能需要比计算大小更多的内存。
您应该始终检查 new
的 return 值。如果是nullptr
,则分配失败。
但是,如果预先知道所有网络的节点总数,则仅 cudaMalloc
所有节点(和所有网络)的内存会更有效。然后,在内核中相应地更新指针。
像这样:
struct cuNetwork2{
    int num_allnodes;
    NNode* all_nodes;
};
__global__ void kernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
    int index = ...
    d_networks[index].num_allnodes = numNodesPerNetwork;
    d_networks[index].all_nodes = d_nodes + index * numNodesPerNetwork;
}
...
int numBlocks = 65;
int num_allnodes = 200;
cuNetwork2* d_networks;
NNode* d_nodes;
cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);
kernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);
在这种情况下,您不需要 cudaDeviceSetLimit 或内核动态分配。
这是两种方法之间的简单性能比较。
#include <iostream>
#include <chrono>
#include <cassert>
// A single network node; its only payload is an identifier.
// Allocated either with device-side new (approach 1) or placement-new into a
// bulk cudaMalloc'd buffer (approach 2).
class NNode{
public:
// Identifier of the node within its network.
// NOTE(review): never assigned in the visible code — confirm it is set elsewhere.
int node_id;
};
// Network type for the device-side-new benchmark: an array of pointers to
// individually heap-allocated NNode objects.
class cuNetwork{
public:
// Number of entries in all_nodes.
// NOTE(review): the kernels below never assign this field — confirm elsewhere.
int num_allnodes;
// Device-heap array of NNode*; each node is a separate device-side allocation.
NNode** all_nodes;
};
// Benchmark kernel, approach 1: per-block construction of one cuNetwork and
// its nodes on the device heap. Launch with <<<numBlocks, 1>>>.
// Device-side new returns nullptr (it does not throw) when the
// cudaLimitMallocHeapSize heap is exhausted, so every allocation is guarded;
// an unchecked nullptr dereference here is what caused the original
// "Warp Illegal Address" failure at large block counts.
__global__ void mallocTest(int num_allnodes, cuNetwork** arr_gpu_net){
    int bId = blockIdx.x;
    cuNetwork* gpu_net = new cuNetwork();
    if(gpu_net == nullptr){
        arr_gpu_net[bId] = nullptr; // heap exhausted — report failure
        return;
    }
    gpu_net->num_allnodes = num_allnodes; // fix: field was left uninitialized
    gpu_net->all_nodes = new NNode*[num_allnodes];
    if(gpu_net->all_nodes == nullptr){
        delete gpu_net;
        arr_gpu_net[bId] = nullptr;
        return;
    }
    for(int i=0; i<num_allnodes; i++){
        gpu_net->all_nodes[i] = new NNode();
        if(gpu_net->all_nodes[i] == nullptr){
            arr_gpu_net[bId] = nullptr; // partial allocations are abandoned
            return;
        }
    }
    arr_gpu_net[bId] = gpu_net;
}
// Benchmark kernel, approach 1b: cooperative per-block construction.
// Thread 0 allocates the network and its pointer array; then all threads of
// the block allocate the NNode objects in a block-strided loop.
// Launch with <<<numBlocks, threadsPerBlock>>>.
__global__ void mallocTestMultiThreadBlock(int num_allnodes, cuNetwork** arr_gpu_net){
    int bId = blockIdx.x;
    if(threadIdx.x == 0){
        cuNetwork* gpu_net = new cuNetwork();
        assert(gpu_net != nullptr); // device new yields nullptr on heap exhaustion
        gpu_net->all_nodes = new NNode*[num_allnodes];
        assert(gpu_net->all_nodes != nullptr);
        arr_gpu_net[bId] = gpu_net;
    }
    // Barrier: all threads must observe thread 0's stores before using them.
    // Placed in uniform control flow — every thread of the block reaches it.
    __syncthreads();
    for(int i=threadIdx.x; i<num_allnodes; i += blockDim.x){
        NNode* node = new NNode();
        assert(node != nullptr); // fix: per-node allocation was unchecked
        arr_gpu_net[bId]->all_nodes[i] = node;
    }
}
// Network type for the bulk-allocation benchmark (approach 2): all_nodes
// points into one contiguous cudaMalloc'd buffer shared by every network,
// so no device-side dynamic allocation is needed.
struct cuNetwork2{
// Number of nodes belonging to this network (set by pointerassignmentkernel).
int num_allnodes;
// Pointer to this network's contiguous slice of the shared node buffer.
NNode* all_nodes;
};
// One thread per network: records the node count and points each network's
// all_nodes at its contiguous slice of the shared d_nodes buffer.
// Launch with <<<numNetworks, 1>>>; block b handles d_networks[b].
__global__
void pointerassignmentkernel(cuNetwork2* d_networks, NNode* d_nodes, int numNodesPerNetwork){
    const int net = blockIdx.x;
    cuNetwork2& network = d_networks[net];
    network.num_allnodes = numNodesPerNetwork;
    network.all_nodes = &d_nodes[net * numNodesPerNetwork];
}
// Placement-constructs one NNode per thread inside the preallocated buffer.
// Grid is sized as ceil(totalNumNodes / blockDim.x); tail threads exit early.
__global__
void nodeinitkernel(NNode* d_nodes, int totalNumNodes){
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if(tid >= totalNumNodes){
        return; // guard: grid may overshoot the buffer
    }
    new (d_nodes + tid) NNode();
}
// Benchmarks the two initialization strategies:
//   1) per-block device-side new (mallocTest)
//   2) two bulk cudaMallocs + pointer fix-up kernels (cuNetwork2)
// Prints each phase's wall-clock time to stderr; aborts on any CUDA error.
int main(int argc, const char **argv){
    int numBlocks = 64;
    int num_allnodes = 200;

    // ---- Approach 1: per-block device-side new ----
    // Device array of per-block network pointers.
    // (The original also did a host-side `new cuNetwork*[numBlocks]` whose
    // pointer was immediately overwritten by cudaMalloc — a host memory leak.
    // The unused raw `size` computation was likewise dead code and is removed.)
    cuNetwork** arr_gpu_net = nullptr;
    cudaError_t status = cudaMalloc((void **)&arr_gpu_net, sizeof(cuNetwork*) * numBlocks);
    assert(status == cudaSuccess);

    // Device-side new needs headroom beyond the raw per-block byte count
    // (allocator padding/bookkeeping); 1 GiB is comfortably enough here.
    // Must be set before the first kernel launch that uses the heap.
    status = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1ull * 1024ull * 1024ull * 1024ull);
    assert(status == cudaSuccess);

    std::chrono::time_point<std::chrono::system_clock> timeA = std::chrono::system_clock::now();
    mallocTest<<<numBlocks, 1>>>(num_allnodes, arr_gpu_net);
    //mallocTestMultiThreadBlock<<<numBlocks, num_allnodes>>>(num_allnodes, arr_gpu_net);
    status = cudaGetLastError(); // fix: catch launch-configuration errors too
    assert(status == cudaSuccess);
    status = cudaDeviceSynchronize(); // surfaces asynchronous execution errors
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeB = std::chrono::system_clock::now();
    std::cerr << "mallocTest kernel: " << std::chrono::duration<double>(timeB - timeA).count() << "s\n";

    // ---- Approach 2: two bulk cudaMallocs + pointer fix-up kernels ----
    std::chrono::time_point<std::chrono::system_clock> timeC = std::chrono::system_clock::now();
    cuNetwork2* d_networks = nullptr;
    NNode* d_nodes = nullptr;
    status = cudaMalloc(&d_networks, sizeof(cuNetwork2) * numBlocks);
    assert(status == cudaSuccess);
    status = cudaMalloc(&d_nodes, sizeof(NNode) * numBlocks * num_allnodes);
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeD = std::chrono::system_clock::now();
    std::cerr << "host cudaMalloc: " << std::chrono::duration<double>(timeD - timeC).count() << "s\n";

    pointerassignmentkernel<<<numBlocks, 1>>>(d_networks, d_nodes, num_allnodes);
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeE = std::chrono::system_clock::now();
    std::cerr << "pointerassignmentkernel: " << std::chrono::duration<double>(timeE - timeD).count() << "s\n";

    nodeinitkernel<<<(numBlocks * num_allnodes + 128 - 1) / 128, 128>>>(d_nodes, numBlocks * num_allnodes);
    status = cudaDeviceSynchronize();
    assert(status == cudaSuccess);
    std::chrono::time_point<std::chrono::system_clock> timeF = std::chrono::system_clock::now();
    std::cerr << "nodeinitkernel: " << std::chrono::duration<double>(timeF - timeE).count() << "s\n";

    cudaDeviceReset(); // releases all device allocations made above
    return 0;
}
Compiled with: nvcc -arch=sm_61 -O3 kernelallocation.cu -o kernelallocation
mallocTest kernel: 0.0183772s
host cudaMalloc: 5.02e-06s
pointerassignmentkernel: 1.2739e-05s
nodeinitkernel: 1.213e-05s