Why is using register memory slower than shared memory when doing a reduction?

I evaluated the performance of the following two kernels:

#include <chrono>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>   // rand, malloc, free
#include <string.h>   // memset

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void reduceShfl(int *in, int* out, int size)
{
    extern __shared__ int smem[];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x*4;
    smem[tid] = 0;
    if (tid>=size) return;
    int tmp = 0; 
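    // 4x unrolled load: each thread sums four input elements spaced blockDim.x apart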
    if (idx + blockDim.x*3 <= size){
        int a = in[idx];
        int b = in[idx+blockDim.x];
        int c = in[idx+2*blockDim.x];
        int d = in[idx+3*blockDim.x];
        tmp = a + b + c + d;
    }
    smem[tid] = tmp;
    __syncthreads();

    if (blockDim.x >= 1024 && tid < 512){
        smem[tid] += smem[tid + 512];
    }
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256){
        smem[tid] += smem[tid + 256];
    }
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128){
        smem[tid] += smem[tid + 128];
    }
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64){
        smem[tid] += smem[tid + 64];
    }
    __syncthreads();

    if (blockDim.x >= 64 && tid < 32){
        smem[tid] += smem[tid + 32];
    }
    __syncthreads();
    
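    // final warp-level reduction using register shuffles (XOR/butterfly pattern)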
    int tmpsum = smem[tid]; 
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
    if (tid==0)
        out[blockIdx.x] = tmpsum;
}

__global__ void reduceShmUnroll(int *in, int *out, int num)
{
    extern __shared__ int smem[];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x*4;
    
    if (tid >= num) return;
    int tmp=0;
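    // 4x unrolled load, identical to reduceShfl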
    if(idx + blockDim.x*3 <= num)
    {
        int a = in[idx];
        int b = in[idx + blockDim.x];
        int c = in[idx + blockDim.x*2];
        int d = in[idx + blockDim.x*3];
        tmp = a + b + c + d;
    }   
    smem[tid] = tmp;
    __syncthreads();

    if (blockDim.x >= 1024 && tid < 512){
        smem[tid] += smem[tid + 512];
    }
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256){
        smem[tid] += smem[tid+256];
    }
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128){
        smem[tid] += smem[tid+128];
    }
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64){
        smem[tid] += smem[tid+64];
    }
    __syncthreads();
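    // final warp reduction kept within a single warp; volatile forces each load/store to shared memory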
    if (tid < 32){
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid+32];
        vsmem[tid] += vsmem[tid+16];
        vsmem[tid] += vsmem[tid+8];
        vsmem[tid] += vsmem[tid+4];
        vsmem[tid] += vsmem[tid+2];
        vsmem[tid] += vsmem[tid+1];
    }

    if (tid == 0) out[blockIdx.x] = smem[0];
}

int main(int argc, char **argv)
{
    int size = 1<<24;
    int nBytes = size*sizeof(int);
    int *a_h = (int*)malloc(nBytes);
    initData_int(a_h, size);

    int blocksize = 1024;
    int gridsize = (size-1)/blocksize+1;
    dim3 block(blocksize, 1);
    dim3 grid((size-1)/blocksize+1, 1);
    int *a_d, *b_d;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&b_d, grid.x*sizeof(int));
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    int *tmp = (int*)malloc(gridsize*sizeof(int));
    memset(tmp, 0, grid.x/4*sizeof(int));
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    auto s_0 = std::chrono::system_clock::now();
    reduceShfl<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaMemcpy(tmp, b_d, grid.x/4*sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int res_1 = 0;
    for (int i=0; i<grid.x/4; i++){
        res_1 += tmp[i];
    }
    auto e_0 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = e_0 - s_0;
    printf("Result from reduceShfl is: %d and time cost is %2f.\n", res_1, diff.count());

    memset(tmp, 0, grid.x/4*sizeof(int));
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    s_0 = std::chrono::system_clock::now();
    reduceShmUnroll<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaMemcpy(tmp, b_d, grid.x/4*sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int res_0 = 0;
    for (int i=0; i<grid.x/4; i++){
        res_0 += tmp[i];
    }
    e_0 = std::chrono::system_clock::now();
    diff = e_0 - s_0;
    printf("Result from reduceShmUnroll is: %d and time cost is %2f.\n", res_0, diff.count());

    cudaFree(a_d);
    cudaFree(b_d);
    free(a_h);
    free(tmp);
    return 0;
}

The main difference is in the last warp reduction: reduceShmUnroll uses shared memory, while reduceShfl does a warp shuffle, which uses register memory. However, I find that reduceShfl is slower than reduceShmUnroll.

Result from reduceShfl is: 2139353471 and time cost is 0.000533.
Result from reduceShmUnroll is: 2139353471 and time cost is 0.000485.

Is something wrong with my code?

Yes, I would say there is something wrong with your code.

The main problem I see is that you are making an invalid comparison. In your shared-memory kernel, you restrict the final warp-reduction work to a single warp. In your shuffle kernel, you do not:

Shared-memory kernel:

__syncthreads();
if (tid < 32){  // this is missing from your shuffle kernel
    volatile int *vsmem = smem;
    vsmem[tid] += vsmem[tid+32];
    vsmem[tid] += vsmem[tid+16];
    vsmem[tid] += vsmem[tid+8];
    vsmem[tid] += vsmem[tid+4];
    vsmem[tid] += vsmem[tid+2];
    vsmem[tid] += vsmem[tid+1];
}

Shuffle kernel:

__syncthreads();

int tmpsum = smem[tid]; 
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
if (tid==0)
    out[blockIdx.x] = tmpsum;

When I restrict your shuffle kernel in the same way your shared-memory kernel is restricted (so that the unneeded warps don't do unnecessary work), I observe approximately equal run times between the two kernels (about a 1% difference) when profiling with nvprof on a V100:

                0.38%  222.76us         1  222.76us  222.76us  222.76us  reduceShmUnroll(int*, int*, int)
                0.37%  220.55us         1  220.55us  220.55us  220.55us  reduceShfl(int*, int*, int)

That is what I would expect. For this limited usage, there is no reason to think that either the shared-memory path or the warp shuffle should be faster or slower.
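For reference, here is a minimal sketch of what that restriction could look like in the tail of your shuffle kernel. It reuses the variable names from your code; this is just one way to write the guard, not necessarily the exact modification profiled above:

__syncthreads();

if (tid < 32){  // restrict the final warp reduction to a single warp, matching the shared-memory kernel
    int tmpsum = smem[tid];
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
    if (tid==0)
        out[blockIdx.x] = tmpsum;
}

The result is unchanged, because only the values held by warp 0 (smem[0..31]) contribute to the block's partial sum; the guard simply keeps the other warps from issuing shuffle instructions whose results are discarded anyway.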

Both shared-memory activity and warp-shuffle activity have throughput limits. Trying to predict which will be faster is therefore difficult, because it depends on what else is going on in your code. If your code is bound by shared-memory throughput and you convert some of that activity to warp shuffles, you may see a benefit from the shuffles. The same statement can be made in the other direction. For this particular part of this particular code, written properly/comparably/equivalently, you are not bound differently by shared-memory throughput than by warp-shuffle throughput, so the correct expectation is that there is no performance difference from substituting one for the other.