Pass array of pointers to multiple devices to Cuda C Kernel
I have a one-dimensional array of data to process, but it is too large for a single GPU. I therefore spread the array across multiple GPUs to store it in memory, and the number of GPUs will vary with the problem size. If I pass an array of pointers to the arrays on the different GPUs, I cannot access the other arrays from my CUDA C kernel.
I have tried passing a simple array of device pointers to each device through the kernel call, but the code seems to break when I try to access the arrays. Even the device that is running the kernel cannot access the array in its own memory.
Data structures:
typedef struct ComplexArray
{
double *real;
} ComplexArray;
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
Allocator:
ComplexArrayArray stateVector;
stateVector.Arr = (ComplexArray*)malloc(sizeof(ComplexArray*) * numberOfGPU);
for (int dev = 0; dev < numberOfGPU; dev++)
{
...
cudaMalloc(&(stateVector.Arr[dev].real), numberOfElements * sizeof(*(stateVector.Arr[dev].real)) / numberOfGPU);
...
}
Kernel:
__global__ void kernel(..., ComplexArrayArray stateVector, ...)
{
// Calculate necessary device
int device_number = ...;
int index = ...;
double val = stateVector.Arr[device_number].real[index];
...
}
When I try to access the arrays this way, the kernel seems to "break". There is no error message, but the data is clearly not being read, and none of the printf statements after the data access are ever reached.
Any thoughts on the best way to pass an array of pointers to device memory to a CUDA C kernel?
Your attempt to use a structure holding a pointer to an array of structures, each with its own embedded pointer, makes for a very complex realization with cudaMalloc. It would probably be somewhat simpler with cudaMallocManaged, but still unnecessarily complex. The complexity arises because cudaMalloc allocates space on a particular device that is (by default) not accessible from any other device, and because your embedded pointers create the need for various "deep copies". Here is a worked example:
$ cat t1492.cu
#include <iostream>
#include <stdio.h>
typedef struct ComplexArray
{
double *real;
} ComplexArray;
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
__global__ void kernel(ComplexArrayArray stateVector, int dev, int ds)
{
// Calculate necessary device
int device_number = dev;
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < ds){
double val = stateVector.Arr[device_number].real[index] + dev;
stateVector.Arr[device_number].real[index] = val;
}
}
const int nTPB = 256;
int main(){
int numberOfGPU;
cudaGetDeviceCount(&numberOfGPU);
std::cout << "GPU count: " << numberOfGPU << std::endl;
ComplexArrayArray *stateVector = new ComplexArrayArray[numberOfGPU];
const int ds = 32;
double *hdata = new double[ds]();
ComplexArray *ddata = new ComplexArray[numberOfGPU];
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMalloc(&(stateVector[i].Arr), sizeof(ComplexArray) * numberOfGPU);
cudaMalloc(&(ddata[i].real), (ds/numberOfGPU)*sizeof(double));
cudaMemcpy(ddata[i].real, hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(stateVector[i].Arr, ddata, sizeof(ComplexArray)*numberOfGPU, cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, (ds/numberOfGPU));}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(hdata + i*(ds/numberOfGPU), ddata[i].real, (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
for (int i = 0; i < ds; i++)
std::cout << hdata[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1492 t1492.cu
$ cuda-memcheck ./t1492
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
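As an aside, the symptom described in the question (a kernel that silently "breaks" with no error message) usually means the CUDA API return codes and kernel launch status are not being checked. A minimal error-checking sketch using only standard runtime calls; the CUDA_CHECK macro name is my own:
#include <cstdio>
#include <cstdlib>
// Convenience macro (hypothetical name): abort with a message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    cudaError_t err__ = (call);                                       \
    if (err__ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                 \
              cudaGetErrorString(err__), __FILE__, __LINE__);         \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)
// Typical usage around an allocation, a copy, and a kernel launch:
//   CUDA_CHECK(cudaMalloc(&d_ptr, bytes));
//   CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, bytes, cudaMemcpyHostToDevice));
//   kernel<<<grid, block>>>(...);
//   CUDA_CHECK(cudaGetLastError());        // catches launch/configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised during execution
Running under cuda-memcheck, as shown above, provides a similar safety net.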
However, if you just want to take a single host array and break it into one chunk per GPU, you don't need that level of complexity. Here is a simpler example:
$ cat t1493.cu
#include <iostream>
#include <stdio.h>
typedef struct ComplexArray
{
double *real;
} ComplexArray;
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < ds){
double val = stateVector.real[index] + dev;
stateVector.real[index] = val;
}
}
const int nTPB = 256;
int main(){
int numberOfGPU;
cudaGetDeviceCount(&numberOfGPU);
std::cout << "GPU count: " << numberOfGPU << std::endl;
ComplexArray *stateVector = new ComplexArray[numberOfGPU];
const int ds = 32;
double *hdata = new double[ds]();
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMalloc(&(stateVector[i].real), (ds/numberOfGPU)*sizeof(double));
cudaMemcpy(stateVector[i].real, hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, (ds/numberOfGPU));}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(hdata + i*(ds/numberOfGPU), stateVector[i].real, (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
for (int i = 0; i < ds; i++)
std::cout << hdata[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1493 t1493.cu
$ cuda-memcheck ./t1493
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
Note that your question seems to suggest breaking the data into chunks while each kernel may potentially need access to all of the chunks. That would require either managed memory or knowledge that the system supports P2P access between the GPUs, which adds further complexity and is beyond the scope of this answer; the focus here is your report that the kernel cannot even access "its own" data.
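For reference only, a minimal sketch (not part of the examples in this answer) of how P2P access between a pair of devices could be probed and enabled with the standard runtime API; the helper name enablePeerAccess is my own:
// Hypothetical helper: try to enable bidirectional P2P access between devA and devB.
// Returns true only if both directions are supported and enabled.
bool enablePeerAccess(int devA, int devB)
{
    int aToB = 0, bToA = 0;
    cudaDeviceCanAccessPeer(&aToB, devA, devB);
    cudaDeviceCanAccessPeer(&bToA, devB, devA);
    if (!aToB || !bToA) return false;
    cudaSetDevice(devA);
    cudaDeviceEnablePeerAccess(devB, 0);   // flags argument must currently be 0
    cudaSetDevice(devB);
    cudaDeviceEnablePeerAccess(devA, 0);
    return true;
}
Once peer access is enabled in both directions, a kernel running on one device can dereference pointers returned by cudaMalloc on the other.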
Since we should be able to put an upper bound on the number of GPUs that can participate (let's set it to a maximum of 8), we can avoid the deep copies of the first approach while still letting every GPU carry all of the pointers. Here is a modified example:
$ cat t1495.cu
#include <iostream>
#include <stdio.h>
const int maxGPU=8;
typedef struct ComplexArray
{
double *real[maxGPU];
} ComplexArray;
__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < ds){
double val = stateVector.real[dev][index] + dev;
stateVector.real[dev][index] = val;
}
}
const int nTPB = 256;
int main(){
int numberOfGPU;
cudaGetDeviceCount(&numberOfGPU);
std::cout << "GPU count: " << numberOfGPU << std::endl;
ComplexArray stateVector;
const int ds = 32;
double *hdata = new double[ds]();
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMalloc(&(stateVector.real[i]), (ds/numberOfGPU)*sizeof(double));
cudaMemcpy(stateVector.real[i], hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector, i, (ds/numberOfGPU));}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(hdata + i*(ds/numberOfGPU), stateVector.real[i], (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
for (int i = 0; i < ds; i++)
std::cout << hdata[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1495 t1495.cu
$ cuda-memcheck ./t1495
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
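Finally, on the cudaMallocManaged route mentioned at the top: the following is a rough sketch of my own (not from the original answer) of what the original nested-pointer layout could look like with managed memory, so that the whole pointer tree is visible to the host and to every GPU, subject to the usual unified-memory support and performance caveats:
#include <iostream>
typedef struct ComplexArray { double *real; } ComplexArray;
typedef struct ComplexArrayArray { ComplexArray *Arr; } ComplexArrayArray;
// Trivial kernel: tag each element of a chunk with the index of the GPU that owns it.
__global__ void touch(ComplexArrayArray sv, int dev, int chunk)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < chunk) sv.Arr[dev].real[i] = dev;
}
int main(){
    int numberOfGPU;
    cudaGetDeviceCount(&numberOfGPU);
    const int ds = 32;
    const int chunk = ds / numberOfGPU;
    ComplexArrayArray stateVector;
    // The entire pointer tree lives in managed memory, so no deep copies are needed.
    cudaMallocManaged(&(stateVector.Arr), numberOfGPU * sizeof(ComplexArray));
    for (int dev = 0; dev < numberOfGPU; dev++){
        cudaSetDevice(dev);
        cudaMallocManaged(&(stateVector.Arr[dev].real), chunk * sizeof(double));}
    for (int dev = 0; dev < numberOfGPU; dev++){
        cudaSetDevice(dev);
        touch<<<(chunk+255)/256, 256>>>(stateVector, dev, chunk);}
    for (int dev = 0; dev < numberOfGPU; dev++){
        cudaSetDevice(dev);
        cudaDeviceSynchronize();}  // host must not touch managed data while kernels may be running
    for (int dev = 0; dev < numberOfGPU; dev++)
        for (int i = 0; i < chunk; i++)
            std::cout << stateVector.Arr[dev].real[i] << " ";
    std::cout << std::endl;
}
Whether this is actually preferable depends on the unified-memory behavior of the particular system (for example, whether the GPUs support concurrent managed access), which is why the explicit per-device allocations above remain the more predictable option.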