如何在 CUDA C++ 中复制 3 维数组的分段
How to copy the subsection of the 3 dimensional array in CUDA C++
我遵循了 Using cudaMemcpy3D to transfer *** pointer 的例子
然而,我的任务是将设备全局内存数组的 3d 子部分复制到设备全局内存数组,例如:
Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz))
... // here I am populating sourceTensor with some Data
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(int), NyTarget, NzTarget))
// here I get lost ...
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
所以在上面我正在寻找一种方法来将所有数据从 sourceTensor 复制到目标张量
x 索引在 (1,6)
范围内
y 指数在 (2,7)
范围内
z 索引在 (3,8)
范围内
所以只有源数组的一部分,但我不知道如何正确定义 make_cudaPitchedPtr 和 make_cudaExtent,以实现我的目标。
cudaMemcpy3DParams
中的 srcPos
参数应该可以让这变得非常简单。这是一个例子:
$ cat t1957.cu
#include <cstdio>
typedef int it; // index type
typedef int dt; // data type
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {
for (it z = 0; z < Nz; z++)
for (it y = 0; y < Ny; y++)
for (it x = 0; x < Nx; x++) {
char *ptr = (char *)sourceTensor.ptr + sourceTensor.pitch*(z*Ny+y);
((dt *)ptr)[x] = z*100+y*10+x;
}
};
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {
if (((dt *)targetTensor.ptr)[0] != 321) {
printf("%d\n", ((dt *)targetTensor.ptr)[0]);
}
};
int main(){
it Nx =10;
it Ny=10;
it Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz));
populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
it NxTarget = 5;
it NyTarget = 5;
it NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(dt), NyTarget, NzTarget));
cudaMemcpy3DParms cpy = { 0 };
it NxOffset = 1;
it NyOffset = 2;
it NzOffset = 3;
cpy.srcPos = make_cudaPos(NxOffset*sizeof(dt), NyOffset, NzOffset);
cpy.srcPtr = sourceTensor;
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
cudaDeviceSynchronize();
}
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
请注意,当源和目标都未指定为 cudaArray
类型时,元素大小总是 assumed 为无符号字符(即。 1 个字节)。
我遵循了 Using cudaMemcpy3D to transfer *** pointer 的例子 然而,我的任务是将设备全局内存数组的 3d 子部分复制到设备全局内存数组,例如:
Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz))
... // here I am populating sourceTensor with some Data
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(int), NyTarget, NzTarget))
// here I get lost ...
cudaMemcpy3DParms cpy = { 0 };
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
所以在上面我正在寻找一种方法来将所有数据从 sourceTensor 复制到目标张量
x 索引在 (1,6)
范围内y 指数在 (2,7)
范围内z 索引在 (3,8)
范围内所以只有源数组的一部分,但我不知道如何正确定义 make_cudaPitchedPtr 和 make_cudaExtent,以实现我的目标。
cudaMemcpy3DParams
中的 srcPos
参数应该可以让这变得非常简单。这是一个例子:
$ cat t1957.cu
#include <cstdio>
typedef int it; // index type
typedef int dt; // data type
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {
for (it z = 0; z < Nz; z++)
for (it y = 0; y < Ny; y++)
for (it x = 0; x < Nx; x++) {
char *ptr = (char *)sourceTensor.ptr + sourceTensor.pitch*(z*Ny+y);
((dt *)ptr)[x] = z*100+y*10+x;
}
};
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {
if (((dt *)targetTensor.ptr)[0] != 321) {
printf("%d\n", ((dt *)targetTensor.ptr)[0]);
}
};
int main(){
it Nx =10;
it Ny=10;
it Nz = 10;
struct cudaPitchedPtr sourceTensor;
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz));
populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
it NxTarget = 5;
it NyTarget = 5;
it NzTarget = 5;
struct cudaPitchedPtr targetTensor;
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(dt), NyTarget, NzTarget));
cudaMemcpy3DParms cpy = { 0 };
it NxOffset = 1;
it NyOffset = 2;
it NzOffset = 3;
cpy.srcPos = make_cudaPos(NxOffset*sizeof(dt), NyOffset, NzOffset);
cpy.srcPtr = sourceTensor;
cpy.dstPtr = targetTensor;
cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);
verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
cudaDeviceSynchronize();
}
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
请注意,当源和目标都未指定为 cudaArray
类型时,元素大小总是 assumed 为无符号字符(即。 1 个字节)。