Performance difference due to indexing during matrix multiplication

I am trying to understand the difference between a tiled and a naive implementation of matrix multiplication in CUDA C++. Because the tiled version reuses data through shared memory, I expected to see a clear performance gap between the two variants. However, the speedup is only about 2x (naive ~12 ms, tiled ~6 ms). Here are the code snippets:

#include <iostream>
#include <cstdio>
#include <assert.h>
using namespace std;

#define N 1024
#define THREADS 16
// arguments parenthesized so that compound expressions like IDX(blk + tx, y, n) index correctly
#define IDX(x, y, s) ((x) * (s) + (y))

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

void init_values(int *a, int *b, int sz) {
    for(int i=0; i<sz; i++) {
        a[i] = rand()%513 - 256;
        b[i] = rand()%513 - 256;
    }
}

__global__ 
void matmul(int *a, int *b, int *c, int n) {
    // perform parallel matmul: each thread computes one output element c[x][y]
    int x = blockIdx.x * blockDim.x + threadIdx.x; // row index
    int y = blockIdx.y * blockDim.y + threadIdx.y; // column index
    int t = 0;
    for(int i=0; i<n; i++) {
        t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
    }
    c[IDX(x, y, n)] = t;
}

void matmul_verify(int *a, int *b, int *c, int n) {
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            int t = 0;
            for(int k=0; k<n; k++)
                t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
            // cout << i << " " << j << " " << c[IDX(i, j, n)] <<  " " << t <<  endl;
            assert(c[IDX(i, j, n)] == t);
        }
    }
}

int main()
{
    int *a, *b, *c;
    int *da, *db, *dc;
    size_t sz = N * N * sizeof(int);

    a = (int*)malloc(sz);
    b = (int*)malloc(sz);
    c = (int*)malloc(sz);
    init_values(a, b, N*N);

    gpuErrchk(cudaMalloc((void**)&da, sz));
    gpuErrchk(cudaMalloc((void**)&db, sz));
    gpuErrchk(cudaMalloc((void**)&dc, sz));

    gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
    // init grid size
    dim3 grids(N/THREADS, N/THREADS);
    dim3 blocks(THREADS, THREADS);
    // time it
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matmul<<<grids, blocks>>>(da, db, dc, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cout << "Took " << milliseconds << " milliseconds.\n";

    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
    matmul_verify(a, b, c, N);

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(a);
    free(b);
    free(c);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

For the tiled implementation, I changed the kernel to

#define SHMEM_SIZE (THREADS * THREADS) // one 16x16 tile of ints per array

__global__ 
void matmul(int *a, int *b, int *c, int n) {
    // perform parallel matmul
    int ty = threadIdx.y, by = blockIdx.y;
    int tx = threadIdx.x, bx = blockIdx.x;
    int x = bx * blockDim.x + tx;
    int y = by * blockDim.y + ty;
    // block IDs tell us which block to solve for
    // (bx, by) --> (bx: bx + tx, by:by + ty)
    __shared__ int A[SHMEM_SIZE];
    __shared__ int B[SHMEM_SIZE];

    const int tile_size = THREADS;

    // to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
    int res = 0;

    for(int blk=0; blk < n; blk+=tile_size)  {
        // block index
        A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
        B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
        __syncthreads();
        for(int k=0; k<tile_size; k++) {
            res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
        }
        __syncthreads();
    }

    // for(int k=0; k<n; k++)
    //     res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
    c[IDX(x, y, n)] = res;
}

Nothing else really changed. However, in the tiled implementation, if I simply change

    int ty = threadIdx.x, by = blockIdx.x;
    int tx = threadIdx.y, bx = blockIdx.y;

for the initialization of the thread and block indices, I get a runtime of about 1 ms (a 12x speedup over the naive version). What is going on here? I read in the book "CUDA By Example" that thread and block indices in two dimensions exist purely for programmer convenience and do not reflect any performance difference. This seems to be wrong. Any clarification is much appreciated.

CUDA thread blocks are partitioned into warps of 32 threads. Ideally, neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing, and it allows for maximum memory bandwidth. In hardware, all coalesced loads from a warp are bundled into the minimal number of memory transactions.
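As a minimal sketch of the difference (the kernels below are hypothetical illustrations, not taken from the question's code):

// Adjacent lanes read adjacent ints: one wide transaction per warp.
__global__ void copy_coalesced(const int *in, int *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n * n)
        out[i] = in[i];
}

// Adjacent lanes read ints that are n elements apart (a column walk),
// so the warp's loads cannot be bundled into few transactions.
__global__ void copy_strided(const int *in, int *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int row = i % n, col = i / n;
    if (i < n * n)
        out[i] = in[row * n + col]; // reads walk down a column
}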

Other factors that can deteriorate memory bandwidth are the size of the individual loads (one can try to use the built-in vector types, e.g. int2, int4, float2, etc., to get bigger loads) and alignment.
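For example, a sketch of a vectorized copy using int4 (assuming the pointers come from cudaMalloc, which guarantees sufficient alignment, and that the element count is a multiple of 4):

// Each thread moves four ints with a single 16-byte load and store.
__global__ void copy_int4(const int *in, int *out, int num_elems) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_elems / 4) {
        int4 v = reinterpret_cast<const int4*>(in)[i];
        reinterpret_cast<int4*>(out)[i] = v;
    }
}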

The mapping from the 3D threadIdx to warp lanes always takes the first dimension, .x, as the continuous one, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1, where the lanes of each warp correspond to the values of threadIdx.x.
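Inside a kernel, this linearization can be made explicit (a small sketch; warp_of and lane_of are just illustrative output buffers sized to the block):

// Recover warp and lane from the 3D thread index: .x varies fastest.
__global__ void warp_lane_demo(int *warp_of, int *lane_of) {
    int linear = threadIdx.x
               + threadIdx.y * blockDim.x
               + threadIdx.z * blockDim.x * blockDim.y;
    warp_of[linear] = linear / warpSize; // neighbors in .x share a warp...
    lane_of[linear] = linear % warpSize; // ...and occupy consecutive lanes
}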

So, to allow coalescing, you have to access memory as

A[ty * s + tx] // coalesced access

instead of

A[tx * s + ty] // strided access

to achieve optimal performance. This is exactly what the index swap in the question does: with ty = threadIdx.x, the column index y varies with the warp lane, so the loads from a and b as well as the store to c all hit consecutive addresses; with the original assignment, consecutive lanes access elements that are n integers apart.

What the book you mention probably means is that launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually computing ty = threadIdx.x / 32 and tx = threadIdx.x % 32 should not make a performance difference. This splitting probably happens internally anyway when the block is not flat in the first place.
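A minimal sketch of that equivalence (the kernel name and buffers are hypothetical):

// A flat (64, 1, 1) block emulating a (32, 2, 1) block by splitting manually.
__global__ void flat_block_copy(const int *in, int *out, int n) {
    int tx = threadIdx.x % 32; // plays the role of threadIdx.x in (32, 2, 1)
    int ty = threadIdx.x / 32; // plays the role of threadIdx.y in (32, 2, 1)
    int col = blockIdx.x * 32 + tx;
    int row = blockIdx.y * 2 + ty;
    if (row < n && col < n)
        out[row * n + col] = in[row * n + col]; // still coalesced in tx
}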