Reset the values of a CUDA Array
When we have a linear array d_A in CUDA, we can reset all of its values to 0 with

cudaMemset(d_A, 0, K*K*sizeof(unsigned int))

and this runs very fast (I imagine at the speed of the global memory bandwidth). If we instead have a CUDA array cu_A, as needed for texture/surface memory, I could not find an equivalent function to reset its values, so I tried two workarounds. 1) Keep a linear array d_A full of zeros and copy it into the CUDA array:

cudaMemcpyToArray(cu_A, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);

I found that this copy runs at only about 10% of my global memory bandwidth, which is a bit disappointing. I then tried option 2), where I keep another CUDA array cu_B holding pre-copied zeros and copy that into the main CUDA array. Here is the minimal working example:
#include "mex.h"
#include "gpu/mxGPUArray.h"
#define K 4096 // data dimension
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
mxInitGPU();
// Declare the density field
mwSize const Asize[] = { K, K };
mxGPUArray *A = mxGPUCreateGPUArray(2, Asize, mxUINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); // initialized to zeros
unsigned int *d_A = (unsigned int *)(mxGPUGetData(A));
// Allocate CUDA arrays in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cu_A;
cudaArray* cu_B;
cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);
cudaMallocArray(&cu_B, &channelDesc, K, K, cudaArraySurfaceLoadStore);
/* Store the blank CUDA array here */
cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int timeStep = 0; timeStep<1000; timeStep++) {
cudaMemcpyArrayToArray ( cu_A, 0, 0, cu_B, 0, 0, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice ); // Reset the working memory
}
mxGPUDestroyGPUArray(A);
cudaFreeArray(cu_A);
cudaFreeArray(cu_B);
}
To my dismay, this array-to-array copy runs at a rate of only

(4096*4096 elements)*(1000 iterations)*(4 bytes)/(measured 9.6 s) = 7 GB/s

compared with the 288 GB/s my Quadro P5000 should be capable of.

Do these numbers make sense? Is there a faster way to reset a CUDA array?
If we take MATLAB out of the picture, things seem to work the way I would expect.

First, this line is incorrect, and I always recommend using proper cuda error checking:

cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);

This copies from a device pointer (d_A) to a cudaArray handle (cu_B) that is also on the device, so the correct transfer direction is cudaMemcpyDeviceToDevice.
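As a minimal sketch of both points (the macro and helper names below are illustrative, not from the original code), the check plus the corrected direction might look like this:

// Hypothetical helper, not from the original post: abort on any CUDA runtime error.
#include <cstdio>
#include <cstdlib>

#define cudaCheck(call)                                                \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

// The blank-array copy from the question, with the corrected transfer kind
// and wrapped in the check (cu_B, d_A and K as in the code above):
static void copyZerosToArray(cudaArray *cu_B, const unsigned int *d_A, int K)
{
    cudaCheck(cudaMemcpyToArray(cu_B, 0, 0, d_A,
                                K * K * sizeof(unsigned int),
                                cudaMemcpyDeviceToDevice));
}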
If I build a version of your code without the MATLAB harness and run it under nvprof, I find the timings roughly in line with my expectations. Here is a complete example that does not depend on MATLAB, following your code:
$ cat t444.cu
#include <stdio.h>

int main(){

    int K = 4096;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cu_A;
    cudaArray* cu_B;
    cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);
    cudaMallocArray(&cu_B, &channelDesc, K, K, cudaArraySurfaceLoadStore);
    unsigned int *d_A;
    cudaMalloc(&d_A, K*K*sizeof(unsigned int));

    /* Store the blank CUDA array here */
    cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice);

    for (int timeStep = 0; timeStep < 10; timeStep++) {
        cudaMemcpyArrayToArray(cu_A, 0, 0, cu_B, 0, 0, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice); // Reset the working memory
    }

    cudaDeviceSynchronize();
}
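For reference, this can be built with an ordinary nvcc invocation (the architecture flag here is an assumption for this Pascal device; plain nvcc t444.cu also works):

$ nvcc -arch=sm_61 -o t444 t444.cu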
When I compile that code and run it on a Pascal Titan X device (similar to a Quadro P5000), I get the following profiler output:
$ nvprof --print-gpu-trace ./t444
==16315== NVPROF is profiling process 16315, command: ./t444
==16315== Profiling application: ./t444
==16315== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
520.69ms 455.13us - - - - - 64.000MB 137.32GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
521.15ms 453.98us - - - - - 64.000MB 137.67GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
521.60ms 453.63us - - - - - 64.000MB 137.78GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
524.36ms 453.82us - - - - - 64.000MB 137.72GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
524.82ms 453.69us - - - - - 64.000MB 137.76GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
527.57ms 453.85us - - - - - 64.000MB 137.71GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
528.03ms 453.69us - - - - - 64.000MB 137.76GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
530.78ms 453.91us - - - - - 64.000MB 137.69GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
531.24ms 453.69us - - - - - 64.000MB 137.76GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
533.99ms 453.72us - - - - - 64.000MB 137.75GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
534.44ms 453.50us - - - - - 64.000MB 137.82GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
537.20ms 453.82us - - - - - 64.000MB 137.72GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
537.65ms 453.59us - - - - - 64.000MB 137.79GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
540.42ms 453.91us - - - - - 64.000MB 137.69GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
540.88ms 453.59us - - - - - 64.000MB 137.79GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
543.63ms 453.82us - - - - - 64.000MB 137.72GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
544.09ms 453.56us - - - - - 64.000MB 137.80GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
547.66ms 453.79us - - - - - 64.000MB 137.73GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
548.11ms 453.53us - - - - - 64.000MB 137.81GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
550.87ms 453.88us - - - - - 64.000MB 137.70GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
551.32ms 453.56us - - - - - 64.000MB 137.80GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
So, a few observations:
A properly formulated cudaMemcpyToArray operation runs at approximately the rate of device global memory bandwidth. The operation involves one read and one write per byte, and the measured throughput (bytes transferred / time) is about 140 GB/s, which means the bandwidth delivered by this operation (one read plus one write per byte) is roughly 280 GB/s. This is probably your best option for this array-initialization operation (a sketch of that reset pattern follows these observations).
The cudaMemcpyArrayToArray operation is broken by the CUDA runtime into 2 separate operations: a copy from the array to a linear buffer, followed by a copy from the linear buffer back to an array. I would therefore expect this option to run at about half the rate of option 1 above.
I would expect that if you profile your code, you will find these operations running at approximately these rates. The remainder of your 9.6 s measurement is probably due to operations other than these transfers, such as MATLAB overhead, CUDA initialization overhead, and the CUDA overhead associated with the other CUDA runtime calls in your code.
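To tie this back to the question, here is a minimal sketch of the reset pattern suggested by observation 1: keep one zeroed linear device buffer and issue a single properly formed device-to-array copy per time step. It reuses the same K and channel format as above; error checking is omitted for brevity, and the buffer name d_zeros is just illustrative.

// Sketch only: reset a cudaArray each time step with one DtoA copy.
int main() {
    const int K = 4096;
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);

    cudaArray *cu_A;
    cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);

    // One linear buffer of zeros, cleared once at device-memory speed.
    unsigned int *d_zeros;
    cudaMalloc(&d_zeros, K * K * sizeof(unsigned int));
    cudaMemset(d_zeros, 0, K * K * sizeof(unsigned int));

    for (int timeStep = 0; timeStep < 10; timeStep++) {
        // Single device-to-array copy per time step; profiled above at
        // ~140 GB/s of bytes transferred (~280 GB/s of read+write traffic).
        cudaMemcpyToArray(cu_A, 0, 0, d_zeros,
                          K * K * sizeof(unsigned int),
                          cudaMemcpyDeviceToDevice);
    }
    cudaDeviceSynchronize();

    cudaFree(d_zeros);
    cudaFreeArray(cu_A);
    return 0;
}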