Why is the performance of a CUDA kernel with divergence better than one without divergence?
Hi, I am writing CUDA kernels to understand the behavior of warp divergence.
These are the 3 kernels I have:
#include <cuda_runtime.h>
#include <stdio.h>
#include "util.h"
#include <chrono>
__global__ void wardUp(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if ((idx/warpSize)%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

__global__ void kernel1(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if ((idx/warpSize)%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

__global__ void kernel2(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

int main(int argc, char **argv)
{
    initDevice(0);
    int size = 64;
    int blocksize = 64;
    int nBytes = sizeof(float)*size;
    float *a_d;
    CHECK(cudaMalloc((float**)&a_d, nBytes));
    dim3 block(blocksize, 1);
    dim3 grid((blocksize-1)/block.x+1, 1);
    wardUp<<<grid, block>>>(a_d);
    float elapsed = 0;
    cudaEvent_t start1, stop1;
    CHECK(cudaEventCreate(&start1));
    CHECK(cudaEventCreate(&stop1));
    CHECK(cudaEventRecord(start1, 0));
    kernel1<<<grid, block>>>(a_d);
    CHECK(cudaEventRecord(stop1, 0));
    CHECK(cudaEventSynchronize(stop1));
    CHECK(cudaEventElapsedTime(&elapsed, start1, stop1));
    printf("kernel1 take:%2f ms\n", elapsed);
    float elapsed_1 = 0;
    cudaEvent_t start2, stop2;
    CHECK(cudaEventCreate(&start2));
    CHECK(cudaEventCreate(&stop2));
    CHECK(cudaEventRecord(start2, 0));
    kernel2<<<grid, block>>>(a_d);
    CHECK(cudaEventRecord(stop2, 0));
    CHECK(cudaEventSynchronize(stop2));
    CHECK(cudaEventElapsedTime(&elapsed_1, start2, stop2));
    printf("kernel2 take:%2f ms\n", elapsed_1);
    cudaFree(a_d);
    cudaEventDestroy(start1);
    cudaEventDestroy(stop1);
    cudaEventDestroy(start2);
    cudaEventDestroy(stop2);
    return 0;
}
If my understanding is correct, kernel1 has no divergence problem, because the if/else split falls on warp boundaries: with blocksize = 64, threads 0-31 (warp 0) all take the if branch and threads 32-63 (warp 1) all take the else branch, so every thread within a warp follows the same path. kernel2 should have a divergence problem, because the odd and even threads of the same warp take different branches and cannot execute them at the same time. But I observe that kernel1 is slower than kernel2. Why is that?
Using device: 0: NVIDIA GeForce RTX 2080 Ti
kernel1 take:0.008864 ms
kernel2 take:0.006752 ms
I switched to using cudaEventRecord to record the durations, but kernel1 still appears to be slower than kernel2.
There are quite a few problems with your methodology. I probably won't cover all of them:
- The problem size is far too small for benchmarking.
- Compiler optimization is working against you.
- The code is too simple; the compiler uses predication to mitigate the effect of warp divergence (see the sketch right after this list).
- Your method of measuring kernel duration is flawed (a cudaEvent-based sketch appears at the end of this answer).
- Building the grid from blocksize is not sensible (although it happens to work out when size == blocksize). It should be based on the problem size, e.g. (size + block.x - 1)/block.x blocks, not on blocksize.
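To illustrate the predication point (my sketch, not part of the original kernels): for a branch this simple, the compiler can lower the if/else to a conditional select, so all 32 threads of a warp execute the same instruction stream, roughly as if the kernel had been written branch-free:
__global__ void kernel2_select(float *c)  // hypothetical branch-free equivalent of kernel2
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    // A select replaces the branch: every thread in the warp executes the
    // same instructions, so no divergence penalty can show up.
    c[idx] = (idx % 2 == 0) ? 100.0f : 200.0f;
}
That is why the tiny original kernels do not expose a divergence penalty at all.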
The following code addresses these issues, and shows roughly a 2x increase in kernel duration when going from code that makes the if/then decision on warp boundaries to code that makes it for every other thread:
$ cat t1877.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#define CHECK(x) x
__global__ void kernel1(int *c, int y, int z, int l1, int l2)
{
    int a = 0;
    int b = 0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx&32){
        for (int i = 0; i < l1; i++){
            a = a&y; a = a|z;}
    }
    else{
        for (int i = 0; i < l2; i++){
            b = b|y; b = b&z;}
    }
    c[idx] = a+b;
}

__global__ void kernel2(int *c, int y, int z, int l1, int l2)
{
    int a = 0;
    int b = 0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx&1){
        for (int i = 0; i < l1; i++){
            a = a&y; a = a|z;}
    }
    else{
        for (int i = 0; i < l2; i++){
            b = b|y; b = b&z;}
    }
    c[idx] = a+b;
}

int main(int argc, char **argv)
{
    int blocksize = 64;
    int size = blocksize*1048576;
    int nBytes = sizeof(int)*size;
    int *a_d;
    CHECK(cudaMalloc((int**)&a_d, nBytes));
    dim3 block(blocksize, 1);
    dim3 grid(size/block.x, 1);
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto start1 = std::chrono::system_clock::now();
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto end1 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff1 = end1 - start1;
    printf("kernel1 take:%2f s\n", diff1.count());
    kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto start2 = std::chrono::system_clock::now();
    kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff2 = end2 - start2;
    printf("kernel2 take:%2f s\n", diff2.count());
    return 0;
}
$ nvcc -o t1877 t1877.cu -arch=sm_70
$ ./t1877
kernel1 take:0.205650 s
kernel2 take:0.406347 s
$
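As a side note on the flawed-measurement bullet: cudaEvent timing itself is fine once the other issues are fixed. A minimal sketch of an event-based measurement with a warm-up launch of the same kernel (my illustration, assuming the kernel1, grid, block and a_d definitions from t1877.cu above):
    cudaEvent_t start, stop;
    float ms = 0.0f;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);  // warm-up launch of the same kernel
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaEventRecord(start, 0));                   // record start on the default stream
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);  // the launch being measured
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));                  // wait until the stop event has completed
    CHECK(cudaEventElapsedTime(&ms, start, stop));      // elapsed time in milliseconds
    printf("kernel1 take:%f ms\n", ms);
In the original code the kernel under test ran only 64 threads, so launch overhead rather than divergence dominated whatever the events measured.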