Reset the values of a CUDA Array
When we have a linear array d_A in CUDA, we can reset all of its values to 0 with

cudaMemset(d_A, 0, K*K*sizeof(unsigned int))

and this runs very fast (I imagine at the speed of the global memory bandwidth). If we instead have a CUDA array cu_A, as needed for texture/surface memory, I could not find an equivalent function to reset its values, so I tried two workarounds. 1) Keep a linear array d_A full of zeros and copy it into the CUDA array:

cudaMemcpyToArray(cu_A, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);

I found that this copy runs at only about 10% of my global memory bandwidth, which is a bit disappointing. I then tried option 2), where I keep another CUDA array cu_B holding pre-copied zeros and copy that into the main CUDA array. Here is the minimal working example:
#include "mex.h"
#include "gpu/mxGPUArray.h"
#define K 4096 // data dimension
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
mxInitGPU();
// Declare the density field
mwSize const Asize[] = { K, K };
mxGPUArray *A = mxGPUCreateGPUArray(2, Asize, mxUINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); // initialized to zeros
unsigned int *d_A = (unsigned int *)(mxGPUGetData(A));
// Allocate CUDA arrays in device memory
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
cudaArray* cu_A;
cudaArray* cu_B;
cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);
cudaMallocArray(&cu_B, &channelDesc, K, K, cudaArraySurfaceLoadStore);
/* Store the blank CUDA array here */
cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int timeStep = 0; timeStep<1000; timeStep++) {
cudaMemcpyArrayToArray ( cu_A, 0, 0, cu_B, 0, 0, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice ); // Reset the working memory
}
mxGPUDestroyGPUArray(A);
cudaFreeArray(cu_A);
cudaFreeArray(cu_B);
}
To my dismay, this array-to-array copy runs at a rate of only

(4096*4096 elements)*(1000 iterations)*(4 bytes)/(measured 9.6 s) = 7 GB/s

compared with the 288 GB/s my Quadro P5000 should be capable of.

Do these numbers make sense? Is there a faster way to reset a CUDA array?
If we take MATLAB out of the picture, things seem to work the way I would expect.

First, this line is incorrect, and I always recommend using proper cuda error checking:

cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyHostToDevice);

This copies from a device pointer (d_A) to a cudaArray handle (cu_B) that is also on the device, so the correct transfer direction is cudaMemcpyDeviceToDevice.
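As a minimal sketch of both points (the macro and helper names below are illustrative, not from the original code), the check plus the corrected direction might look like this:

// Hypothetical helper, not from the original post: abort on any CUDA runtime error.
#include <cstdio>
#include <cstdlib>

#define cudaCheck(call)                                                \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                        \
        }                                                              \
    } while (0)

// The blank-array copy from the question, with the corrected transfer kind
// and wrapped in the check (cu_B, d_A and K as in the code above):
static void copyZerosToArray(cudaArray *cu_B, const unsigned int *d_A, int K)
{
    cudaCheck(cudaMemcpyToArray(cu_B, 0, 0, d_A,
                                K * K * sizeof(unsigned int),
                                cudaMemcpyDeviceToDevice));
}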
If I build a version of your code without the MATLAB harness and run it under nvprof, I find the timings roughly in line with my expectations. Here is a complete example that does not depend on MATLAB, following your code:
$ cat t444.cu
#include <stdio.h>

int main(){

    int K = 4096;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cudaArray* cu_A;
    cudaArray* cu_B;
    cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);
    cudaMallocArray(&cu_B, &channelDesc, K, K, cudaArraySurfaceLoadStore);
    unsigned int *d_A;
    cudaMalloc(&d_A, K*K*sizeof(unsigned int));

    /* Store the blank CUDA array here */
    cudaMemcpyToArray(cu_B, 0, 0, d_A, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice);

    for (int timeStep = 0; timeStep < 10; timeStep++) {
        cudaMemcpyArrayToArray(cu_A, 0, 0, cu_B, 0, 0, K*K*sizeof(unsigned int), cudaMemcpyDeviceToDevice); // Reset the working memory
    }

    cudaDeviceSynchronize();
}
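For reference, this can be built with an ordinary nvcc invocation (the architecture flag here is an assumption for this Pascal device; plain nvcc t444.cu also works):

$ nvcc -arch=sm_61 -o t444 t444.cu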
When I compile that code and run it on a Pascal Titan X device (similar to a Quadro P5000), I get the following profiler output:
$ nvprof --print-gpu-trace ./t444
==16315== NVPROF is profiling process 16315, command: ./t444
==16315== Profiling application: ./t444
==16315== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
520.69ms 455.13us - - - - - 64.000MB 137.32GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
521.15ms 453.98us - - - - - 64.000MB 137.67GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
521.60ms 453.63us - - - - - 64.000MB 137.78GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
524.36ms 453.82us - - - - - 64.000MB 137.72GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
524.82ms 453.69us - - - - - 64.000MB 137.76GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
527.57ms 453.85us - - - - - 64.000MB 137.71GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
528.03ms 453.69us - - - - - 64.000MB 137.76GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
530.78ms 453.91us - - - - - 64.000MB 137.69GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
531.24ms 453.69us - - - - - 64.000MB 137.76GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
533.99ms 453.72us - - - - - 64.000MB 137.75GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
534.44ms 453.50us - - - - - 64.000MB 137.82GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
537.20ms 453.82us - - - - - 64.000MB 137.72GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
537.65ms 453.59us - - - - - 64.000MB 137.79GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
540.42ms 453.91us - - - - - 64.000MB 137.69GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
540.88ms 453.59us - - - - - 64.000MB 137.79GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
543.63ms 453.82us - - - - - 64.000MB 137.72GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
544.09ms 453.56us - - - - - 64.000MB 137.80GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
547.66ms 453.79us - - - - - 64.000MB 137.73GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
548.11ms 453.53us - - - - - 64.000MB 137.81GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
550.87ms 453.88us - - - - - 64.000MB 137.70GB/s Array Device TITAN X (Pascal 1 7 [CUDA memcpy AtoD]
551.32ms 453.56us - - - - - 64.000MB 137.80GB/s Device Array TITAN X (Pascal 1 7 [CUDA memcpy DtoA]
So, a few observations:
A properly formulated cudaMemcpyToArray operation runs at approximately the rate of device global memory bandwidth. The operation involves one read and one write per byte, and the measured throughput (bytes transferred / time) is about 140 GB/s, which means the bandwidth delivered by this operation (one read plus one write per byte) is roughly 280 GB/s. This is probably your best option for this array-initialization operation (a sketch of that reset pattern follows these observations).
The cudaMemcpyArrayToArray operation is broken by the CUDA runtime into 2 separate operations: a copy from the array to a linear buffer, followed by a copy from the linear buffer back to an array. I would therefore expect this option to run at about half the rate of option 1 above.
I would expect that if you profile your code, you will find these operations running at approximately these rates. The remainder of your 9.6 s measurement is probably due to operations other than these transfers, such as MATLAB overhead, CUDA initialization overhead, and the CUDA overhead associated with the other CUDA runtime calls in your code.
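To tie this back to the question, here is a minimal sketch of the reset pattern suggested by observation 1: keep one zeroed linear device buffer and issue a single properly formed device-to-array copy per time step. It reuses the same K and channel format as above; error checking is omitted for brevity, and the buffer name d_zeros is just illustrative.

// Sketch only: reset a cudaArray each time step with one DtoA copy.
int main() {
    const int K = 4096;
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);

    cudaArray *cu_A;
    cudaMallocArray(&cu_A, &channelDesc, K, K, cudaArraySurfaceLoadStore);

    // One linear buffer of zeros, cleared once at device-memory speed.
    unsigned int *d_zeros;
    cudaMalloc(&d_zeros, K * K * sizeof(unsigned int));
    cudaMemset(d_zeros, 0, K * K * sizeof(unsigned int));

    for (int timeStep = 0; timeStep < 10; timeStep++) {
        // Single device-to-array copy per time step; profiled above at
        // ~140 GB/s of bytes transferred (~280 GB/s of read+write traffic).
        cudaMemcpyToArray(cu_A, 0, 0, d_zeros,
                          K * K * sizeof(unsigned int),
                          cudaMemcpyDeviceToDevice);
    }
    cudaDeviceSynchronize();

    cudaFree(d_zeros);
    cudaFreeArray(cu_A);
    return 0;
}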