Why do these two GPU kernels have a massive performance difference?

Hello, I am trying to understand some behavior of CUDA kernels. Below are the two CUDA kernels I have. I found that gpuReduce takes about twice as long as gpuReduceOpt. Is this caused by warp divergence?


#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}


__global__ void gpuReduce(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;

    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
        if((tid%(2*stride)) == 0){
            data[tid] += data[tid+stride];
            
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

__global__ void gpuReduceOpt(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;
        
    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
        int index = 2*stride*tid;
        if(index < blockDim.x){
            data[index] += data[index+stride];
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

int main(int agrc, char **argv)
{
    int size = 1<<24;
    int blocksize = 1024;

    
    dim3 block(blocksize, 1);
    dim3 grid((size-1)/block.x+1, 1);
    int nBytes = sizeof(int)*size;

    int *a_h = (int*)malloc(nBytes);
    int *tmp = (int*)malloc(sizeof(int)*grid.x);
    int *tmp1 = (int*)malloc(sizeof(int)*grid.x);
    initData_int(a_h, size);

    int *a_d, *output;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&output, grid.x*sizeof(int));

    int *a_d1, *output1;
    cudaMalloc((int**)&a_d1, nBytes);
    cudaMalloc((int**)&output1, grid.x*sizeof(int));
    cudaMemcpy(a_d1, a_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    

    auto start2 = std::chrono::system_clock::now();
    gpuReduce<<<grid, block>>>(a_d, output, size);
    cudaMemcpy(tmp, output, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result;

    for (int i =0; i < grid.x; i++)
    {
        gpu_result += tmp[i];
    }
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff2 = end2 - start2;
    printf("Gpu reduce take:%2f s\n", diff2.count());
    
    auto start3 = std::chrono::system_clock::now();
    gpuReduceOpt<<<grid, block>>>(a_d1, output1, size);
    cudaMemcpy(tmp1, output1, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result1;

    for (int i =0; i < grid.x; i++)
    {
        gpu_result1 += tmp1[i];
    }
    cudaDeviceSynchronize();
    auto end3 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff3 = end3 - start3;
    printf("Gpu reduce opt take:%2f s\n", diff3.count());
    printf("Result from gpuReduce and gpuReduceOpt are %6d and %6d\n", gpu_result, gpu_result1);


    cudaFree(a_d);
    cudaFree(output);
    free(a_h);
    free(tmp);
    cudaDeviceReset();
    return 0;
}

Here is the performance data I got:

Gpu reduce take:0.004238 s
Gpu reduce opt take:0.002606 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471

In the code you have now posted, there is still an error in the host code. This construct is not correct:

int gpu_result;   // not initialized

for (int i =0; i < grid.x; i++)
{
    gpu_result += tmp[i];
}

That is undefined behavior. There is no guarantee that the variable gpu_result above will start out at zero. The same problem exists with gpu_result1.
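
The fix is simply to zero-initialize those host-side accumulators before the loops, as is done in the full listing below:

int gpu_result = 0;   // start the accumulation from zero

for (int i = 0; i < grid.x; i++)
{
    gpu_result += tmp[i];
}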

Once we fix that, the difference in kernel execution time comes down primarily to the use of the modulo operator in the first kernel, as @talonmies suggested in the first comment. If you profile each kernel, say with nvprof, and request metrics such as gld_efficiency, gst_efficiency, gld_transactions, and gst_transactions, you will find that they are essentially identical between the two kernels.
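
For example, an illustrative nvprof metrics run could look something like this (assuming the executable name t1878a from the listing below):

$ nvprof --metrics gld_efficiency,gst_efficiency,gld_transactions,gst_transactions ./t1878a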

However, if you replace the modulo operator with an equivalent but lower-cost calculation, the kernel durations become nearly identical (within about 10%):

$ cat t1878a.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}


__global__ void gpuReduce(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;

    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
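        // NOTE: for a power-of-two stride, (tid & (2*stride-1)) computes the same
        // value as (tid % (2*stride)), but avoids the cost of the modulo operation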
#ifdef USE_FAST
        if((tid&(2*stride-1)) == 0){
#else
        if((tid%(2*stride)) == 0){
#endif
            data[tid] += data[tid+stride];

        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

__global__ void gpuReduceOpt(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;

    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
        int index = 2*stride*tid;
        if(index < blockDim.x){
            data[index] += data[index+stride];
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

int main(int agrc, char **argv)
{
    int size = 1<<24;
    int blocksize = 1024;


    dim3 block(blocksize, 1);
    dim3 grid((size-1)/block.x+1, 1);
    int nBytes = sizeof(int)*size;

    int *a_h = (int*)malloc(nBytes);
    int *tmp = (int*)malloc(sizeof(int)*grid.x);
    int *tmp1 = (int*)malloc(sizeof(int)*grid.x);
    initData_int(a_h, size);

    int *a_d, *output;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&output, grid.x*sizeof(int));

    int *a_d1, *output1;
    cudaMalloc((int**)&a_d1, nBytes);
    cudaMalloc((int**)&output1, grid.x*sizeof(int));
    cudaMemcpy(a_d1, a_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);

    auto start2 = std::chrono::system_clock::now();
    gpuReduce<<<grid, block>>>(a_d, output, size);
    cudaMemcpy(tmp, output, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result = 0;

    for (int i =0; i < grid.x; i++)
    {
        gpu_result += tmp[i];
    }
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff2 = end2 - start2;
    printf("Gpu reduce take:%2f s\n", diff2.count());

    auto start3 = std::chrono::system_clock::now();
    gpuReduceOpt<<<grid, block>>>(a_d1, output1, size);
    cudaMemcpy(tmp1, output1, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result1 = 0;

    for (int i =0; i < grid.x; i++)
    {
        gpu_result1 += tmp1[i];
    }
    cudaDeviceSynchronize();
    auto end3 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff3 = end3 - start3;
    printf("Gpu reduce opt take:%2f s\n", diff3.count());
    printf("Result from gpuReduce and gpuReduceOpt are %6d and %6d\n", gpu_result, gpu_result1);


    cudaFree(a_d);
    cudaFree(output);
    free(a_h);
    free(tmp);
    cudaDeviceReset();
    return 0;
}
$ nvcc -o t1878a t1878a.cu -arch=sm_70 -lineinfo
$ nvprof ./t1878a
==14339== NVPROF is profiling process 14339, command: ./t1878a
Gpu reduce take:0.001021 s
Gpu reduce opt take:0.000543 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
==14339== Profiling application: ./t1878a
==14339== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.40%  43.743ms         2  21.872ms  21.280ms  22.463ms  [CUDA memcpy HtoD]
                    1.72%  770.61us         1  770.61us  770.61us  770.61us  gpuReduce(int*, int*, int)
                    0.86%  384.30us         1  384.30us  384.30us  384.30us  gpuReduceOpt(int*, int*, int)
                    0.03%  12.960us         2  6.4800us  6.4000us  6.5600us  [CUDA memcpy DtoH]
      API calls:   69.86%  350.40ms         4  87.601ms  8.0580us  349.79ms  cudaMalloc
                   19.33%  96.969ms         1  96.969ms  96.969ms  96.969ms  cudaDeviceReset
                    9.13%  45.770ms         4  11.442ms  451.76us  22.822ms  cudaMemcpy
                    1.00%  5.0119ms         4  1.2530ms  590.62us  3.2115ms  cuDeviceTotalMem
                    0.50%  2.5242ms       404  6.2470us     427ns  270.20us  cuDeviceGetAttribute
                    0.09%  449.28us         2  224.64us  10.437us  438.85us  cudaFree
                    0.06%  279.02us         4  69.755us  59.853us  94.003us  cuDeviceGetName
                    0.02%  101.11us         2  50.555us  23.936us  77.175us  cudaLaunchKernel
                    0.00%  22.146us         4  5.5360us  3.2730us  10.770us  cuDeviceGetPCIBusId
                    0.00%  14.686us         2  7.3430us  4.1300us  10.556us  cudaDeviceSynchronize
                    0.00%  11.444us         8  1.4300us     506ns  4.8200us  cuDeviceGet
                    0.00%  6.2180us         3  2.0720us     610ns  3.9200us  cuDeviceGetCount
                    0.00%  3.5570us         4     889ns     740ns  1.1270us  cuDeviceGetUuid
$ nvcc -o t1878a t1878a.cu -arch=sm_70 -lineinfo -DUSE_FAST
$ nvprof ./t1878a
==14375== NVPROF is profiling process 14375, command: ./t1878a
Gpu reduce take:0.000656 s
Gpu reduce opt take:0.000538 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
==14375== Profiling application: ./t1878a
==14375== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.92%  38.947ms         2  19.474ms  19.460ms  19.488ms  [CUDA memcpy HtoD]
                    1.08%  427.79us         1  427.79us  427.79us  427.79us  gpuReduce(int*, int*, int)
                    0.97%  385.99us         1  385.99us  385.99us  385.99us  gpuReduceOpt(int*, int*, int)
                    0.03%  13.216us         2  6.6080us  6.4320us  6.7840us  [CUDA memcpy DtoH]
      API calls:   67.47%  281.96ms         4  70.491ms  5.5820us  281.49ms  cudaMalloc
                   20.44%  85.428ms         1  85.428ms  85.428ms  85.428ms  cudaDeviceReset
                    9.70%  40.518ms         4  10.129ms  457.52us  19.781ms  cudaMemcpy
                    1.20%  5.0260ms         4  1.2565ms  601.24us  3.2163ms  cuDeviceTotalMem
                    0.94%  3.9413ms       404  9.7550us     270ns  1.7028ms  cuDeviceGetAttribute
                    0.10%  435.98us         2  217.99us  9.5230us  426.46us  cudaFree
                    0.10%  410.88us         4  102.72us  58.347us  225.92us  cuDeviceGetName
                    0.02%  94.871us         2  47.435us  20.952us  73.919us  cudaLaunchKernel
                    0.01%  21.734us         4  5.4330us  3.5080us  8.4130us  cuDeviceGetPCIBusId
                    0.00%  14.504us         2  7.2520us  3.8730us  10.631us  cudaDeviceSynchronize
                    0.00%  12.843us         8  1.6050us     460ns  5.3730us  cuDeviceGet
                    0.00%  9.7040us         3  3.2340us     804ns  6.9430us  cuDeviceGetCount
                    0.00%  2.5870us         4     646ns     517ns     957ns  cuDeviceGetUuid
$

Notes:

  • I am not suggesting that the above is a general-purpose replacement for modulo. It works in this case because stride only takes on powers of 2 (a short standalone sketch of this equivalence follows these notes).

  • I doubt this is doing what you think it is:

    if (tid >= size)
        return;
    

    but it is not particularly relevant here, because the problem size is a whole-number multiple of the block size. It is also not really a sensible construct if the remaining kernel code uses __syncthreads(), although that is not relevant for this problem size/choice.

  • Your code running about 5x slower on your 2080 Ti than on my V100 doesn't sound right to me. I wonder whether you are building a debug project. That does not change the observations here, but if you are building a debug project or compiling with the -G switch, I suggest never doing performance analysis on debug code.
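
A minimal standalone sketch of the power-of-two equivalence mentioned in the first note above (host-only illustration, not part of the original code; the loop bounds are just for demonstration):

#include <cstdio>
#include <cassert>

// For non-negative x and a power-of-two m, x % m equals x & (m - 1):
// the mask keeps exactly the low bits that the modulo would keep.
int main()
{
    for (int tid = 0; tid < 2048; tid++)
        for (int stride = 1; stride < 1024; stride *= 2)
            assert((tid % (2*stride)) == (tid & (2*stride - 1)));
    printf("mod/mask equivalence holds for power-of-two strides\n");
    return 0;
}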