Why is my vectorized access kernel so slow?

I am trying to understand vectorized memory access, so I implemented a simple example to evaluate its performance. However, I found that the vectorized kernel is slower than the naive one.

In the vectorized kernel, I recast the int pointer to an int2 pointer and then copy the data.

Here is the code I used:

#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#include <iostream>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void naiveCopy(int *d_in, int *d_out, int size)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for (int i = tid; i < size; i += blockDim.x*gridDim.x)
    {
        d_out[i] = d_in[i];
    }
}

__global__ void vecCopy(int *d_in, int *d_out, int size)
{
    int2* in = (int2*)d_in;
    int2* out = (int2*)d_out;
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for (int i = tid; i < size/2; i += blockDim.x*gridDim.x)
    {
        out[i] = in[i];
    }

    if(tid==size/2 && size%2==1)
        d_out[size-1] = d_in[size-1];
}

int main(int argc, char **argv)
{
    int size = 1<<24;
    //int size = 128;
    int nBytes = size*sizeof(int);
    int *d_h;
    cudaMallocHost((int**)&d_h, nBytes);
    initData_int(d_h, size);

    //printData(d_h, size);

    int *res = (int*)malloc(nBytes);

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int *d_in, *d_out;
    dim3 block(128, 1);
    dim3 grid((size-1)/block.x+1, 1);
    cudaMalloc((int**)&d_in, nBytes);
    cudaMalloc((int**)&d_out, nBytes);

    cudaMemcpyAsync(d_in, d_h, nBytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
    auto s_0 = std::chrono::system_clock::now();
    naiveCopy<<<grid, block, 0, stream>>>(d_in, d_out, size);
    cudaStreamSynchronize(stream);
    auto e_0 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = e_0 - s_0;
    printf("Naive Kernel time cost is: %2f.\n", diff.count());
    
    memset(res, 0, nBytes);
    cudaMemset(d_out, 0, nBytes);
    //vectorized access:
    cudaStreamSynchronize(stream);
    s_0 = std::chrono::system_clock::now();
    vecCopy<<<grid, block, 0, stream>>>(d_in, d_out, size);
    cudaStreamSynchronize(stream);
    e_0 = std::chrono::system_clock::now();
    diff = e_0 - s_0;
    printf("Vectorized kernel time cost is: %2f.\n", diff.count());

    cudaStreamDestroy(stream);
    cudaFree(d_h);
    cudaFree(d_in);
    cudaFree(d_out);
    free(res);

    return 0;
} 

Here is the data from nvprof:

            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   89.28%  5.5024ms         1  5.5024ms  5.5024ms  5.5024ms  [CUDA memcpy HtoD]
                    4.82%  296.94us         1  296.94us  296.94us  296.94us  vecCopy(int*, int*, int)
                    3.99%  246.19us         1  246.19us  246.19us  246.19us  naiveCopy(int*, int*, int)

Can you please explain what is causing the performance drop?

You have not sized your grid well. Your grid dimensions may be sensible for the naive kernel:

dim3 grid((size-1)/block.x+1, 1);

but they are unnecessarily twice the size needed for the vectorized copy kernel, since each of its threads copies an int2 (two int values) per loop iteration.

When I halve the grid size for the vectorized kernel (to match the sizing methodology of the naive kernel):

dim3 grid2((size/2+block.x-1)/block.x);

then, according to my testing, the vectorized copy kernel becomes faster:

                3.88%  233.99us         1  233.99us  233.99us  233.99us  naiveCopy(int*, int*, int)
                2.84%  171.33us         1  171.33us  171.33us  171.33us  vecCopy(int*, int*, int)
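
For reference, a minimal sketch of what the corresponding launch looks like with the halved grid (the rest of the host code from the question is assumed unchanged):

dim3 grid2((size/2 + block.x - 1)/block.x);               // half as many blocks: each thread copies an int2 (two ints)
vecCopy<<<grid2, block, 0, stream>>>(d_in, d_out, size);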

Notes:

  1. cudaFree is not the correct API to use with cudaMallocHost. The correct API is cudaFreeHost.

  2. As mentioned in the comments, we could do an even better job of grid sizing by sizing the grid to match the GPU you are running on; however, that step is not needed to demonstrate the improvement here. A sketch of what it could look like follows.
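
For remark 2, a minimal sketch of GPU-matched grid sizing based on the device's SM count; the choice of 4 blocks per SM is an illustrative assumption, not a tuned value, and the grid-stride loop inside vecCopy covers whatever elements the grid does not:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);                        // query the GPU being used
dim3 gridGPU(prop.multiProcessorCount * 4);               // assumption: 4 blocks per SM is enough to saturate memory bandwidth
vecCopy<<<gridGPU, block, 0, stream>>>(d_in, d_out, size);

// And per remark 1, the pinned allocation should be released with:
cudaFreeHost(d_h);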