将非平凡的 类 复制到设备
Copy non-trivial classes to Device
我有一个 class 这样的:
class CudaArray
{
CudaArray() : Ptr(new double[5]) {}
double* Ptr;
int Dimension;
}
然后是另一个 class,例如:
class Container
{
short a;
CudaArray* ArrayPtr;
int b;
int c;
}
现在我正在以这种方式在设备上创建数组:
CudaArray H_Array;
CudaArray* D_Array;
Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));
double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));
我希望能够在设备代码上使用类型为 Container
的对象,但我在从现有数组初始化 CudaArray
成员时遇到了问题。到目前为止我试过:
Container* Cont = nullptr;
Check(cudaMalloc(&Cont , sizeof(Container)));
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));
但我在最后一个 cudaMemcpy 上得到 GPUassert: invalid argument
。
如何初始化包含指向设备内存中现有对象 (class) 的指针的设备 class?
还有,有没有更简单或更优雅的方法来在主机和设备之间复制复杂的对象?
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));
But I get GPUassert: invalid argument on the last cudaMemcpy.
错误是您指定了 cudaMemcpyDeviceToDevice
但 &D_Array
是主机内存中的一个位置:
CudaArray* D_Array;
您应该使用 cudaMempcyHostToDevice
。
How can I initialize a device class that contains a pointer to an existing object(class) in device memory?
该更改似乎为我解决了问题:
$ cat t174.cu
#include <cstdio>
class CudaArray
{
public:
CudaArray() : Ptr(new double[5]) {}
double* Ptr;
int Dimension;
};
class Container
{
public:
short a;
CudaArray* ArrayPtr;
int b;
int c;
};
#define Check(x) x
__global__ void k(Container *c){
printf("%f\n", c->ArrayPtr->Ptr[0]);
}
int main(){
CudaArray H_Array;
H_Array.Ptr[0] = 1234.0;
CudaArray* D_Array;
Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));
double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));
Container* Cont = nullptr;
Check(cudaMalloc(&Cont , sizeof(Container)));
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyHostToDevice));
k<<<1,1>>>(Cont);
cudaDeviceSynchronize();
}
$ nvcc -o t174 t174.cu
$ cuda-memcheck ./t174
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$
And also, is there a simpler or more elegant way to copy complex objects between host and device?
可能有 impacts,但从代码复杂性的角度来看,如果您通过托管分配器进行 all 分配,事情可能会更简单(简单性在于旁观者之眼):
$ cat t175.cu
#include <cstdio>
#include <new>
class CudaArray
{
public:
CudaArray() {cudaMallocManaged(&Ptr, 5*sizeof(double)); for (int i = 0; i < 5; i ++) Ptr[i] = 0.0;}
double* Ptr;
int Dimension;
};
class Container
{
public:
short a;
CudaArray* ArrayPtr;
int b;
int c;
};
#define Check(x) x
__global__ void k(Container *c){
printf("%f\n", c->ArrayPtr->Ptr[0]);
}
int main(){
CudaArray *my_Array;
cudaMallocManaged(&my_Array, sizeof(CudaArray));
new(my_Array) CudaArray();
my_Array[0].Ptr[0] = 1234.0;
Container* Cont = nullptr;
Check(cudaMallocManaged(&Cont , sizeof(Container)));
Cont[0].ArrayPtr = my_Array;
k<<<1,1>>>(Cont);
cudaDeviceSynchronize();
}
$ nvcc -o t175 t175.cu
$ cuda-memcheck ./t175
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$
我有一个 class 这样的:
class CudaArray
{
CudaArray() : Ptr(new double[5]) {}
double* Ptr;
int Dimension;
}
然后是另一个 class,例如:
class Container
{
short a;
CudaArray* ArrayPtr;
int b;
int c;
}
现在我正在以这种方式在设备上创建数组:
CudaArray H_Array;
CudaArray* D_Array;
Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));
double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));
我希望能够在设备代码上使用类型为 Container
的对象,但我在从现有数组初始化 CudaArray
成员时遇到了问题。到目前为止我试过:
Container* Cont = nullptr;
Check(cudaMalloc(&Cont , sizeof(Container)));
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));
但我在最后一个 cudaMemcpy 上得到 GPUassert: invalid argument
。
如何初始化包含指向设备内存中现有对象 (class) 的指针的设备 class?
还有,有没有更简单或更优雅的方法来在主机和设备之间复制复杂的对象?
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyDeviceToDevice));
But I get GPUassert: invalid argument on the last cudaMemcpy.
错误是您指定了 cudaMemcpyDeviceToDevice
但 &D_Array
是主机内存中的一个位置:
CudaArray* D_Array;
您应该使用 cudaMempcyHostToDevice
。
How can I initialize a device class that contains a pointer to an existing object(class) in device memory?
该更改似乎为我解决了问题:
$ cat t174.cu
#include <cstdio>
class CudaArray
{
public:
CudaArray() : Ptr(new double[5]) {}
double* Ptr;
int Dimension;
};
class Container
{
public:
short a;
CudaArray* ArrayPtr;
int b;
int c;
};
#define Check(x) x
__global__ void k(Container *c){
printf("%f\n", c->ArrayPtr->Ptr[0]);
}
int main(){
CudaArray H_Array;
H_Array.Ptr[0] = 1234.0;
CudaArray* D_Array;
Check(cudaMalloc(&D_Array, sizeof(CudaArray)));
Check(cudaMemcpy(D_Array, &H_Array, sizeof(CudaArray), cudaMemcpyHostToDevice));
double* Tmp;
Check(cudaMalloc(&Tmp, sizeof(double) * 5));
Check(cudaMemcpy(Tmp, H_Array.Ptr, sizeof(double) * 5, cudaMemcpyHostToDevice));
Check(cudaMemcpy(&(D_Array->Ptr), &Tmp, sizeof(double*), cudaMemcpyHostToDevice));
Container* Cont = nullptr;
Check(cudaMalloc(&Cont , sizeof(Container)));
Check(cudaMemcpy(&(Cont->ArrayPtr), &D_Array, sizeof(CudaArray*), cudaMemcpyHostToDevice));
k<<<1,1>>>(Cont);
cudaDeviceSynchronize();
}
$ nvcc -o t174 t174.cu
$ cuda-memcheck ./t174
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$
And also, is there a simpler or more elegant way to copy complex objects between host and device?
可能有
$ cat t175.cu
#include <cstdio>
#include <new>
class CudaArray
{
public:
CudaArray() {cudaMallocManaged(&Ptr, 5*sizeof(double)); for (int i = 0; i < 5; i ++) Ptr[i] = 0.0;}
double* Ptr;
int Dimension;
};
class Container
{
public:
short a;
CudaArray* ArrayPtr;
int b;
int c;
};
#define Check(x) x
__global__ void k(Container *c){
printf("%f\n", c->ArrayPtr->Ptr[0]);
}
int main(){
CudaArray *my_Array;
cudaMallocManaged(&my_Array, sizeof(CudaArray));
new(my_Array) CudaArray();
my_Array[0].Ptr[0] = 1234.0;
Container* Cont = nullptr;
Check(cudaMallocManaged(&Cont , sizeof(Container)));
Cont[0].ArrayPtr = my_Array;
k<<<1,1>>>(Cont);
cudaDeviceSynchronize();
}
$ nvcc -o t175 t175.cu
$ cuda-memcheck ./t175
========= CUDA-MEMCHECK
1234.000000
========= ERROR SUMMARY: 0 errors
$