将非平凡的 类 复制到设备

Copy non-trivial classes to Device

我有一个 class 这样的:

class CudaArray
{
   CudaArray() : Ptr(new double[5]) {}
   double* Ptr;
   int Dimension;
}

然后是另一个 class,例如:

class Container
{
   short a;
   CudaArray* ArrayPtr;
   int b;
   int c;
}

现在我正在以这种方式在设备上创建数组:

CudaArray H_Array;
CudaArray* D_Array;

Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));

double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));

我希望能够在设备代码上使用类型为 Container 的对象,但我在从现有数组初始化 CudaArray 成员时遇到了问题。到目前为止我试过:

Container* Cont = nullptr;

Check(cudaMalloc(&Cont , sizeof(Container)));
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));

但我在最后一个 cudaMemcpy 上得到 GPUassert: invalid argument

如何初始化包含指向设备内存中现有对象 (class) 的指针的设备 class?

还有,有没有更简单或更优雅的方法来在主机和设备之间复制复杂的对象?

Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));

But I get GPUassert: invalid argument on the last cudaMemcpy.

错误是您指定了 cudaMemcpyDeviceToDevice&D_Array 是主机内存中的一个位置:

CudaArray* D_Array;

您应该使用 cudaMempcyHostToDevice

How can I initialize a device class that contains a pointer to an existing object(class) in device memory?

该更改似乎为我解决了问题:

$ cat t174.cu
#include <cstdio>
class CudaArray
{
  public:
   CudaArray() : Ptr(new double[5]) {}
   double* Ptr;
   int Dimension;
};

class Container
{
  public:
   short a;
   CudaArray* ArrayPtr;
   int b;
   int c;
};
#define Check(x) x
__global__ void k(Container *c){

  printf("%f\n", c->ArrayPtr->Ptr[0]);
}
int main(){

CudaArray H_Array;
H_Array.Ptr[0] = 1234.0;
CudaArray* D_Array;

Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));

double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));

Container* Cont = nullptr;

Check(cudaMalloc(&Cont , sizeof(Container)));
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyHostToDevice));
k<<<1,1>>>(Cont);
cudaDeviceSynchronize();
}
$ nvcc -o t174 t174.cu
$ cuda-memcheck ./t174
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$

And also, is there a simpler or more elegant way to copy complex objects between host and device?

可能有 impacts,但从代码复杂性的角度来看,如果您通过托管分配器进行 all 分配,事情可能会更简单(简单性在于旁观者之眼):

$ cat t175.cu
#include <cstdio>
#include <new>
class CudaArray
{
  public:
   CudaArray()  {cudaMallocManaged(&Ptr, 5*sizeof(double)); for (int i = 0; i < 5; i ++) Ptr[i] = 0.0;}
   double* Ptr;
   int Dimension;
};

class Container
{
  public:
   short a;
   CudaArray* ArrayPtr;
   int b;
   int c;
};
#define Check(x) x
__global__ void k(Container *c){

  printf("%f\n", c->ArrayPtr->Ptr[0]);
}
int main(){

CudaArray *my_Array;
cudaMallocManaged(&my_Array, sizeof(CudaArray));
new(my_Array) CudaArray();
my_Array[0].Ptr[0] = 1234.0;

Container* Cont = nullptr;

Check(cudaMallocManaged(&Cont , sizeof(Container)));
Cont[0].ArrayPtr = my_Array;
k<<<1,1>>>(Cont);
cudaDeviceSynchronize();
}
$ nvcc -o t175 t175.cu
$ cuda-memcheck ./t175
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$