如何在 CUDA C++ 中复制三维数组的子区域

How to copy a subsection of a 3-dimensional array in CUDA C++

我参考了“Using cudaMemcpy3D to transfer *** pointer”中的例子。然而,我的任务是把设备全局内存数组中的一个三维子区域复制到另一个设备全局内存数组,例如:

// Source volume dimensions, in elements.
Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
// The extent's width is passed in bytes (Nx * sizeof(int)); Ny and Nz are passed as plain counts.
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz))
... // here I am populating sourceTensor with some Data
// Dimensions of the sub-volume to copy out, in elements.
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
// Destination allocation sized to hold exactly the sub-volume.
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(int), NyTarget, NzTarget))

// here I get lost ...
cudaMemcpy3DParms cpy = { 0 };
// NOTE(review): sourceTensor is declared above as a struct cudaPitchedPtr, so
// sourceTensor[0][0] is not a valid way to address it; this line is the open question.
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
// The copy extent matches the target allocation: width in bytes, then NyTarget and NzTarget.
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);

也就是说,在上面的代码中,我想找到一种方法,把 sourceTensor 中满足以下条件的所有数据复制到 targetTensor:

- x 索引在 (1,6) 范围内
- y 索引在 (2,7) 范围内
- z 索引在 (3,8) 范围内

也就是只复制源数组的一部分,但我不知道如何正确定义 make_cudaPitchedPtr 和 make_cudaExtent,以实现我的目标。

cudaMemcpy3DParms 中的 srcPos 参数应该可以让这变得非常简单。这是一个例子:

$ cat t1957.cu
#include <cstdio>

typedef int it;  // index type
typedef int dt;  // data type

// Fills a pitched 3D allocation so that element (x, y, z) holds z*100 + y*10 + x.
//
// sourceTensor : pitched allocation to write; rows are sourceTensor.pitch bytes apart.
// Nx, Ny, Nz   : number of elements per row, rows per slice, and slices to write.
//
// The kernel does not use threadIdx/blockIdx: every launched thread walks the
// whole volume, so it is meant to be launched as <<<1,1>>>.
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {

  char *const base = (char *)sourceTensor.ptr;

  for (it slice = 0; slice < Nz; ++slice) {
    for (it row = 0; row < Ny; ++row) {
      // Rows are addressed in bytes: row `row` of slice `slice` starts
      // (slice*Ny + row) pitches past the base pointer.
      dt *const rowData = (dt *)(base + sourceTensor.pitch*(slice*Ny+row));
      for (it col = 0; col < Nx; ++col) {
        rowData[col] = slice*100+row*10+col;
      }
    }
  }
}

// Verifies that targetTensor holds the sub-volume copied out of a source that
// was filled with value z*100 + y*10 + x (as populate_kernel in this file does).
//
// targetTensor                   : pitched allocation holding the copied sub-volume.
// NxTarget, NyTarget, NzTarget   : sub-volume size in elements (row length, rows
//                                  per slice, slices).
// NxOffset, NyOffset, NzOffset   : element coordinates in the source at which
//                                  the sub-volume starts.
//
// Every element is checked; each mismatching value is printed on its own line,
// and nothing is printed when the copy is correct. Rows are located with the
// same pitch arithmetic as populate_kernel, which assumes the allocation has
// exactly NyTarget rows per slice.
//
// The kernel does not use threadIdx/blockIdx, so it is meant to be launched
// as <<<1,1>>>.
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {

  const char *base = (const char *)targetTensor.ptr;

  for (it z = 0; z < NzTarget; z++)
    for (it y = 0; y < NyTarget; y++) {
      const dt *row = (const dt *)(base + targetTensor.pitch*(z*NyTarget+y));
      for (it x = 0; x < NxTarget; x++) {
        // Target element (x, y, z) came from source element
        // (x+NxOffset, y+NyOffset, z+NzOffset).
        dt expected = (z+NzOffset)*100+(y+NyOffset)*10+(x+NxOffset);
        if (row[x] != expected) {
          printf("%d\n", row[x]);
          }
        }
      }

}


// Demonstrates copying a 5x5x5 sub-volume that starts at element (1,2,3) of a
// 10x10x10 device array into a second device array with cudaMemcpy3D, then
// verifies the result on the device.
//
// Returns 0 on success. If any CUDA call fails, the failing step and the CUDA
// error string are written to stderr and 1 is returned. Device allocations are
// released on every path.
int main(){

  it Nx =10;
  it Ny=10;
  it Nz = 10;
  it NxTarget = 5;
  it NyTarget = 5;
  it NzTarget = 5;
  it NxOffset = 1;
  it NyOffset = 2;
  it NzOffset = 3;

  // Zero-initialised so the cleanup below can call cudaFree unconditionally
  // (freeing a null pointer is a no-op).
  struct cudaPitchedPtr sourceTensor = { 0 };
  struct cudaPitchedPtr targetTensor = { 0 };

  // `step` names the operation in flight so a failure can be reported precisely.
  const char *step = "cudaMalloc3D(sourceTensor)";
  cudaError_t err = cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz));

  if (err == cudaSuccess) {
    step = "populate_kernel launch";
    populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
    // Kernel launches return no status; launch errors are read back here.
    err = cudaGetLastError();
  }

  if (err == cudaSuccess) {
    step = "cudaMalloc3D(targetTensor)";
    err = cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(dt), NyTarget, NzTarget));
  }

  if (err == cudaSuccess) {
    step = "cudaMemcpy3D";
    cudaMemcpy3DParms cpy = { 0 };
    // With no cudaArray involved, the x components of srcPos and extent are in
    // bytes, hence the sizeof(dt) scaling; y and z are row and slice counts.
    cpy.srcPos = make_cudaPos(NxOffset*sizeof(dt), NyOffset, NzOffset);
    cpy.srcPtr = sourceTensor;
    cpy.dstPtr = targetTensor;
    cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget , NzTarget );
    cpy.kind = cudaMemcpyDeviceToDevice;
    err = cudaMemcpy3D(&cpy);
  }

  if (err == cudaSuccess) {
    step = "verify_kernel launch";
    verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
    err = cudaGetLastError();
  }

  if (err == cudaSuccess) {
    // Waits for the kernels to finish (flushing device-side printf output) and
    // surfaces any error raised while they were executing.
    step = "cudaDeviceSynchronize";
    err = cudaDeviceSynchronize();
  }

  if (err != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
  }

  cudaFree(targetTensor.ptr);
  cudaFree(sourceTensor.ptr);
  return err == cudaSuccess ? 0 : 1;
}
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

请注意,当源和目标都不是 cudaArray 类型时,元素大小始终被假定为 unsigned char(即 1 个字节),因此 srcPos 和 extent 的 x 分量需要以字节为单位给出(如上例中乘以 sizeof(dt))。