CUDA performance issue
I honestly don't know what to call the problem I ran into, so mods, please feel free to rename the question if you see fit.
I have the following matrix-vector multiplication kernel:
__global__ void dmv_gpu_shmem(const value_t *a, const value_t *x, value_t *y,
                              size_t n)
{
    extern __shared__ value_t shmem_buf[];
    int ltid = threadIdx.x;
    int gtid = get_global_tid();
    value_t _y = 0.0;
    if (gtid > n)
        return;
    int last_id = n/blockDim.x;
    for (size_t j = 0; j < last_id; j++) {
        shmem_buf[ltid] = x[blockDim.x*j + ltid];
        __syncthreads();
        for (size_t i = 0; i < blockDim.x; i++) {
            _y += a[gtid + (i + j*blockDim.x)*n] * shmem_buf[i];
        }
        __syncthreads();
    }
    y[gtid] = _y;
}
I have to test this kernel against quite a few block sizes, but the timing results I was getting were not great. So I decided to sanity-check it by modifying it as follows, and fixing the block size to 32 when calling the kernel:
__global__ void dmv_gpu_shmem(const value_t *a, const value_t *x, value_t *y,
                              size_t n)
{
    extern __shared__ value_t shmem_buf[];
    int ltid = threadIdx.x;
    int gtid = get_global_tid();
    value_t _y = 0.0;
    if (gtid > n)
        return;
    int last_id = n/32;
    for (size_t j = 0; j < last_id; j++) {
        shmem_buf[ltid] = x[32*j + ltid];
        __syncthreads();
        for (size_t i = 0; i < 32; i++) {
            _y += a[gtid + (i + j*32)*n] * shmem_buf[i];
        }
        __syncthreads();
    }
    y[gtid] = _y;
}
Much to my surprise, the kernel became several times faster in execution time, and I have no idea why.
Can someone more experienced explain this?
Also, given this situation, how am I supposed to get the max out of my kernel using all the different block sizes I want? I can't do this for all the block sizes...
EDIT:
This should be a working reproduction case:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   /* for memcpy used in copy_2d */
#include <sys/time.h>
#include <cuda.h>
#include "cublas_v2.h" //CUBLAS LIBRARY
#ifndef VALUES_MAX
# define VALUES_MAX 1.
#endif
#ifndef EPS
# define EPS 1.e-6
#endif
#ifndef NR_ITER
# define NR_ITER 200
#endif
enum
{
GPU_NAIVE = 0,
GPU_COALESCED,
GPU_SHMEM,
GPU_KERNEL_END
};
void *gpu_alloc(size_t count)
{
void *ret;
if (cudaMalloc(&ret, count) != cudaSuccess) {
ret = NULL;
}
return ret;
}
int copy_to_gpu(const void *host, void *gpu, size_t count)
{
if (cudaMemcpy(gpu, host, count, cudaMemcpyHostToDevice) != cudaSuccess)
return -1;
return 0;
}
int copy_from_gpu(void *host, const void *gpu, size_t count)
{
if (cudaMemcpy(host, gpu, count, cudaMemcpyDeviceToHost) != cudaSuccess)
return -1;
return 0;
}
void mat_init_rand(float **a, size_t n, float max)
{
size_t i, j;
for (i = 0; i < n; ++i)
{
for (j = 0; j < n; ++j)
{
//printf("%d %d \n", i, j);
a[i][j] = 2 * (((float) drand48()) - 0.5) * max;
}
}
}
void vec_init(float *v, size_t n, float val)
{
size_t i;
for (i = 0; i < n; ++i)
{
v[i] = val;
}
}
void vec_init_rand(float *v, size_t n, float max)
{
size_t i;
for (i = 0; i < n; ++i)
{
v[i] = 2 * (((float) drand48()) - 0.5) * max;
}
}
void vec_print(const float *v, size_t n)
{
size_t i;
for (i = 0; i < n; ++i)
printf("%f \n", v[i]);
}
void **calloc_2d(size_t n, size_t m, size_t size)
{
char **ret = (char **) malloc(n*sizeof(char *));
if (ret) {
char *area = (char *) calloc(n*m, size);
if (area) {
for (size_t i = 0; i < n; ++i)
ret[i] = (char *) &area[i*m*size];
} else {
free(ret);
ret = NULL;
}
}
return (void **) ret;
}
void **copy_2d(void **dst, const void **src, size_t n, size_t m, size_t size)
{
memcpy(dst[0], src[0], n*m*size);
return dst;
}
void free_2d(void **array)
{
free(array[0]);
free(array);
}
__global__ void dmv_gpu_shmem(const float *a, const float *x, float *y,
size_t n)
{
extern __shared__ float shmem_buf[];
int ltid = threadIdx.x;
int gtid = blockIdx.x*blockDim.x+threadIdx.x;
float _y = 0.0;
if (gtid > n)
return;
int last_id = n/blockDim.x;
for(size_t j=0; j< last_id; j++) {
shmem_buf[ltid] = x[blockDim.x*j + ltid];
__syncthreads();
for(size_t i=0; i< blockDim.x; i++) {
_y += a[gtid + (i + j*blockDim.x)*n] * shmem_buf[i];
}
__syncthreads();
}
y[gtid] = _y;
}
__global__ void dmv_gpu_shmem_static(const float *a, const float *x, float *y,
size_t n)
{
extern __shared__ float shmem_buf[];
int ltid = threadIdx.x;
int gtid = blockIdx.x*blockDim.x+threadIdx.x;
float _y = 0.0;
if (gtid > n)
return;
int last_id = n/32;
for(size_t j=0; j< last_id; j++) {
shmem_buf[ltid] = x[32*j + ltid];
__syncthreads();
for(size_t i=0; i< 32; i++) {
_y += a[gtid + (i + j*32)*n] * shmem_buf[i];
}
__syncthreads();
}
y[gtid] = _y;
}
int main(int argc, char **argv)
{
if (argc < 2) {
printf("Wrong arguments \n");
return -1;
}
size_t orig_n = atoi(argv[1]);
/* Read block size and kernel to launch from the environment */
const char *env_gpu_kernel = getenv("GPU_KERNEL");
const char *env_gpu_block_size = getenv("GPU_BLOCK_SIZE");
int kernel = (env_gpu_kernel) ? atoi(env_gpu_kernel) : GPU_NAIVE;
int block_size = (env_gpu_block_size) ? atoi(env_gpu_block_size) : 256;
//Adjust Matrix to fit blocksize
size_t n = ((orig_n - 1)/block_size + 1)*block_size;
int grid_size = (n-1)/block_size + 1;
printf("Matrix size: %zd\n", orig_n);
printf("Input Block size: %d\n", block_size);
printf("Adjusted matrix size: %zd\n", n);
/*
* Allocate the structures.
*
* Initialization to zero is crucial if you adjusted the matrix
* size.
*/
float **A = (float **) calloc_2d(n, n, sizeof(**A));
float *x = (float *) calloc(n, sizeof(*x));
float *y = (float *) calloc(n, sizeof(*y));
/* Initialize */
srand48(0);
mat_init_rand(A, orig_n, VALUES_MAX);
vec_init_rand(x, orig_n, VALUES_MAX);
vec_init(y, orig_n, 0.0);
printf("Setup Complete\n");
/*
* FILLME: Set up the blocks, grid and shared memory depending on
* the kernel. Make any transformations to the input
* matrix here.
*/
//Transposing Matrix for Shared and Coalesced Matrices
float tmp;
for(size_t i=0;i<n;i++)
for(size_t j=i+1;j<n;j++) {
tmp=A[i][j];
A[i][j] = A[j][i];
A[j][i] = tmp;
}
dim3 gpu_block(block_size, 1); // Number of threads
dim3 gpu_grid(grid_size, 1); // Number of blocks
size_t shmem_size = 0; // Shared memory size
/* Set SHARED MEMORY size */
shmem_size = block_size * sizeof(float);
printf(">>>> Begin of record <<<<\n");
printf("Block size: %dx%d\n", gpu_block.x, gpu_block.y);
printf("Grid size : %dx%d\n", gpu_grid.x, gpu_grid.y);
printf("Shared memory size: %ld bytes\n", shmem_size);
/* GPU allocations */
float *gpu_A = (float *) gpu_alloc(n*n*sizeof(*gpu_A));
float *gpu_x = (float *) gpu_alloc(n*sizeof(*gpu_x));
float *gpu_y = (float *) gpu_alloc(n*sizeof(*gpu_y));
/* Copy data to GPU */
copy_to_gpu(A[0], gpu_A, n*n*sizeof(*gpu_A));
copy_to_gpu(x, gpu_x, n*sizeof(*gpu_x));
/* Reset y and copy it to GPU */
vec_init(y, n, 0.0);
copy_to_gpu(y, gpu_y, n*sizeof(*gpu_y));
dmv_gpu_shmem<<<gpu_grid,gpu_block,shmem_size>>>
(gpu_A, gpu_x, gpu_y, n);
if (cudaGetLastError() != cudaSuccess)
printf("gpu kernel failed to launch \n");
dmv_gpu_shmem_static<<<gpu_grid,gpu_block,shmem_size>>>
(gpu_A, gpu_x, gpu_y, n);
if (cudaGetLastError() != cudaSuccess)
printf("gpu kernel failed to launch \n");
cudaDeviceSynchronize();
/* Free resources on host */
free_2d((void **) A);
free(x);
free(y);
/* Free resources on GPU */
cudaFree(gpu_A);
cudaFree(gpu_x);
cudaFree(gpu_y);
return EXIT_SUCCESS;
}
Compile with:
nvcc dmv_test_case.cu
Run with:
GPU_KERNEL=2 GPU_BLOCK_SIZE=32 ./a.out 2048
The GPU_KERNEL variable does nothing in this case.
GPU_BLOCK_SIZE should be self-explanatory.
The numeric argument is the size of the vector (n) and of the matrix (n x n).
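For reference (this is not part of the repro above; it is just a minimal sketch using the same variable names), the two launches can be timed individually with CUDA events instead of a profiler:

/* Sketch: time each kernel launch with cudaEvent_t.
   Assumes gpu_grid, gpu_block, shmem_size, gpu_A, gpu_x, gpu_y and n
   are set up exactly as in the repro above. */
cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
dmv_gpu_shmem<<<gpu_grid, gpu_block, shmem_size>>>(gpu_A, gpu_x, gpu_y, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
printf("dmv_gpu_shmem:        %f ms\n", ms);

cudaEventRecord(start);
dmv_gpu_shmem_static<<<gpu_grid, gpu_block, shmem_size>>>(gpu_A, gpu_x, gpu_y, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
printf("dmv_gpu_shmem_static: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);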
Can someone more experienced explain this?
A full analysis is beyond what I'm prepared to offer, but I'll take a first cut at it. As @talonmies pointed out, this is at least partly due to "compiler optimization due to fixed trip counts".
When I run your code under nvprof --print-gpu-trace ..., I observe roughly a 3x difference in kernel execution time between the two kernels (on a cc2.0 device). This may be somewhat skewed, because the "faster" kernel is called after the "slower" one - but it operates on the same data, so the second one may get some caching benefit. Let's ignore that, though, and look at the SASS code emitted for your kernels by cuobjdump -sass:
The slower kernel:
Function : _Z13dmv_gpu_shmemPKfS0_Pfm
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
....
....
/*01f0*/ LD.E R18, [R2]; /* 0x8400000000249c85 */
/*01f8*/ IADD R19.CC, R19, 0x1; /* 0x4801c0000534dc03 */
/*0200*/ LDS R17, [R21]; /* 0xc100000001545c85 */
/*0208*/ IADD.X R20, R20, RZ; /* 0x48000000fd451c43 */
/*0210*/ ISUB RZ.CC, R19, c[0x0][0x8]; /* 0x48014000213fdd03 */
/*0218*/ IADD R21, R21, 0x4; /* 0x4800c00011555c03 */
/*0220*/ ISETP.LT.U32.X.AND P0, PT, R20, RZ, PT; /* 0x188e0000fd41dc43 */
/*0228*/ IADD R2.CC, R2, R15; /* 0x480100003c209c03 */
/*0230*/ IADD.X R3, R3, R16; /* 0x480000004030dc43 */
/*0238*/ FFMA R6, R18, R17, R6; /* 0x300c000045219c00 */
/*0240*/ @P0 BRA 0x1f0; /* 0x4003fffea00001e7 */
The faster "static" kernel:
Function : _Z20dmv_gpu_shmem_staticPKfS0_Pfm
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
....
....
/*0110*/ LD.E R10, [R2]; /* 0x8400000000229c85 */
/*0118*/ STS [R6], R10; /* 0xc900000000629c85 */
/*0120*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0128*/ LD.E R22, [R8]; /* 0x8400000000859c85 */
/*0130*/ IADD R10.CC, R8, R14; /* 0x4801000038829c03 */
/*0138*/ IADD.X R11, R9, R15; /* 0x480000003c92dc43 */
/*0140*/ IADD R18.CC, R10, R14; /* 0x4801000038a49c03 */
/*0148*/ LD.E R21, [R10]; /* 0x8400000000a55c85 */
/*0150*/ IADD.X R19, R11, R15; /* 0x480000003cb4dc43 */
/*0158*/ IADD R16.CC, R18, R14; /* 0x4801000039241c03 */
/*0160*/ LD.E R24, [R18]; /* 0x8400000001261c85 */
/*0168*/ IADD.X R17, R19, R15; /* 0x480000003d345c43 */
/*0170*/ LDS.128 R8, [RZ]; /* 0xc100000003f21cc5 */
/*0178*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*0180*/ IADD R16.CC, R16, R14; /* 0x4801000039041c03 */
/*0188*/ IADD.X R17, R17, R15; /* 0x480000003d145c43 */
/*0190*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*0198*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*01a0*/ LD.E R23, [R18]; /* 0x840000000125dc85 */
/*01a8*/ FFMA R8, R22, R8, R20; /* 0x3028000021621c00 */
/*01b0*/ LD.E R22, [R16]; /* 0x8400000001059c85 */
/*01b8*/ IADD R20.CC, R18, R14; /* 0x4801000039251c03 */
/*01c0*/ FFMA R8, R21, R9, R8; /* 0x3010000025521c00 */
/*01c8*/ IADD.X R21, R19, R15; /* 0x480000003d355c43 */
/*01d0*/ IADD R16.CC, R20, R14; /* 0x4801000039441c03 */
/*01d8*/ FFMA R8, R24, R10, R8; /* 0x3010000029821c00 */
/*01e0*/ LD.E R24, [R20]; /* 0x8400000001461c85 */
/*01e8*/ IADD.X R17, R21, R15; /* 0x480000003d545c43 */
/*01f0*/ FFMA R26, R25, R11, R8; /* 0x301000002d969c00 */
/*01f8*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*0200*/ LDS.128 R8, [0x10]; /* 0xc100000043f21cc5 */
/*0208*/ IADD R16.CC, R16, R14; /* 0x4801000039041c03 */
/*0210*/ IADD.X R17, R17, R15; /* 0x480000003d145c43 */
/*0218*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*0220*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*0228*/ IADD R20.CC, R18, R14; /* 0x4801000039251c03 */
/*0230*/ IADD.X R21, R19, R15; /* 0x480000003d355c43 */
/*0238*/ FFMA R26, R22, R8, R26; /* 0x3034000021669c00 */
/*0240*/ LD.E R22, [R16]; /* 0x8400000001059c85 */
/*0248*/ FFMA R8, R23, R9, R26; /* 0x3034000025721c00 */
/*0250*/ LD.E R23, [R18]; /* 0x840000000125dc85 */
/*0258*/ IADD R16.CC, R20, R14; /* 0x4801000039441c03 */
/*0260*/ IADD.X R17, R21, R15; /* 0x480000003d545c43 */
/*0268*/ FFMA R8, R24, R10, R8; /* 0x3010000029821c00 */
/*0270*/ LD.E R24, [R20]; /* 0x8400000001461c85 */
/*0278*/ FFMA R26, R25, R11, R8; /* 0x301000002d969c00 */
/*0280*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*0288*/ LDS.128 R8, [0x20]; /* 0xc100000083f21cc5 */
/*0290*/ IADD R16.CC, R16, R14; /* 0x4801000039041c03 */
/*0298*/ IADD.X R17, R17, R15; /* 0x480000003d145c43 */
/*02a0*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*02a8*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*02b0*/ IADD R20.CC, R18, R14; /* 0x4801000039251c03 */
/*02b8*/ IADD.X R21, R19, R15; /* 0x480000003d355c43 */
/*02c0*/ FFMA R26, R22, R8, R26; /* 0x3034000021669c00 */
/*02c8*/ LD.E R22, [R16]; /* 0x8400000001059c85 */
/*02d0*/ FFMA R8, R23, R9, R26; /* 0x3034000025721c00 */
/*02d8*/ LD.E R23, [R18]; /* 0x840000000125dc85 */
/*02e0*/ IADD R16.CC, R20, R14; /* 0x4801000039441c03 */
/*02e8*/ IADD.X R17, R21, R15; /* 0x480000003d545c43 */
/*02f0*/ FFMA R8, R24, R10, R8; /* 0x3010000029821c00 */
/*02f8*/ LD.E R24, [R20]; /* 0x8400000001461c85 */
/*0300*/ FFMA R26, R25, R11, R8; /* 0x301000002d969c00 */
/*0308*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*0310*/ LDS.128 R8, [0x30]; /* 0xc1000000c3f21cc5 */
/*0318*/ IADD R16.CC, R16, R14; /* 0x4801000039041c03 */
/*0320*/ IADD.X R17, R17, R15; /* 0x480000003d145c43 */
/*0328*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*0330*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*0338*/ IADD R20.CC, R18, R14; /* 0x4801000039251c03 */
/*0340*/ IADD.X R21, R19, R15; /* 0x480000003d355c43 */
/*0348*/ FFMA R26, R22, R8, R26; /* 0x3034000021669c00 */
/*0350*/ LD.E R22, [R16]; /* 0x8400000001059c85 */
/*0358*/ FFMA R8, R23, R9, R26; /* 0x3034000025721c00 */
/*0360*/ LD.E R23, [R18]; /* 0x840000000125dc85 */
/*0368*/ IADD R16.CC, R20, R14; /* 0x4801000039441c03 */
/*0370*/ IADD.X R17, R21, R15; /* 0x480000003d545c43 */
/*0378*/ FFMA R8, R24, R10, R8; /* 0x3010000029821c00 */
/*0380*/ LD.E R24, [R20]; /* 0x8400000001461c85 */
/*0388*/ FFMA R26, R25, R11, R8; /* 0x301000002d969c00 */
/*0390*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*0398*/ LDS.128 R8, [0x40]; /* 0xc100000103f21cc5 */
/*03a0*/ IADD R16.CC, R16, R14; /* 0x4801000039041c03 */
/*03a8*/ IADD.X R17, R17, R15; /* 0x480000003d145c43 */
/*03b0*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*03b8*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*03c0*/ IADD R20.CC, R18, R14; /* 0x4801000039251c03 */
/*03c8*/ IADD.X R21, R19, R15; /* 0x480000003d355c43 */
/*03d0*/ FFMA R26, R22, R8, R26; /* 0x3034000021669c00 */
/*03d8*/ LD.E R22, [R16]; /* 0x8400000001059c85 */
/*03e0*/ FFMA R8, R23, R9, R26; /* 0x3034000025721c00 */
/*03e8*/ LD.E R23, [R18]; /* 0x840000000125dc85 */
/*03f0*/ IADD R16.CC, R20, R14; /* 0x4801000039441c03 */
/*03f8*/ LD.E R20, [R20]; /* 0x8400000001451c85 */
/*0400*/ IADD.X R17, R21, R15; /* 0x480000003d545c43 */
/*0408*/ FFMA R8, R24, R10, R8; /* 0x3010000029821c00 */
/*0410*/ FFMA R24, R25, R11, R8; /* 0x301000002d961c00 */
/*0418*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*0420*/ LDS.128 R8, [0x50]; /* 0xc100000143f21cc5 */
/*0428*/ IADD R16.CC, R16, R14; /* 0x4801000039041c03 */
/*0430*/ IADD.X R17, R17, R15; /* 0x480000003d145c43 */
/*0438*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*0440*/ LD.E R21, [R16]; /* 0x8400000001055c85 */
/*0448*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*0450*/ IADD R16.CC, R18, R14; /* 0x4801000039241c03 */
/*0458*/ IADD.X R17, R19, R15; /* 0x480000003d345c43 */
/*0460*/ FFMA R8, R22, R8, R24; /* 0x3030000021621c00 */
/*0468*/ LD.E R24, [R18]; /* 0x8400000001261c85 */
/*0470*/ FFMA R8, R23, R9, R8; /* 0x3010000025721c00 */
/*0478*/ IADD R18.CC, R16, R14; /* 0x4801000039049c03 */
/*0480*/ FFMA R8, R20, R10, R8; /* 0x3010000029421c00 */
/*0488*/ IADD.X R19, R17, R15; /* 0x480000003d14dc43 */
/*0490*/ IADD R20.CC, R18, R14; /* 0x4801000039251c03 */
/*0498*/ LD.E R18, [R18]; /* 0x8400000001249c85 */
/*04a0*/ FFMA R22, R25, R11, R8; /* 0x301000002d959c00 */
/*04a8*/ LDS.128 R8, [0x60]; /* 0xc100000183f21cc5 */
/*04b0*/ LD.E R25, [R16]; /* 0x8400000001065c85 */
/*04b8*/ FFMA R16, R21, R8, R22; /* 0x302c000021541c00 */
/*04c0*/ IADD.X R21, R19, R15; /* 0x480000003d355c43 */
/*04c8*/ IADD R22.CC, R20, R14; /* 0x4801000039459c03 */
/*04d0*/ LD.E R20, [R20]; /* 0x8400000001451c85 */
/*04d8*/ IADD.X R23, R21, R15; /* 0x480000003d55dc43 */
/*04e0*/ IADD R8.CC, R22, R14; /* 0x4801000039621c03 */
/*04e8*/ LD.E R22, [R22]; /* 0x8400000001659c85 */
/*04f0*/ FFMA R24, R24, R9, R16; /* 0x3020000025861c00 */
/*04f8*/ IADD.X R9, R23, R15; /* 0x480000003d725c43 */
/*0500*/ IADD R16.CC, R8, R14; /* 0x4801000038841c03 */
/*0508*/ LD.E R19, [R8]; /* 0x840000000084dc85 */
/*0510*/ IADD.X R17, R9, R15; /* 0x480000003c945c43 */
/*0518*/ LD.E R21, [R16]; /* 0x8400000001055c85 */
/*0520*/ FFMA R24, R25, R10, R24; /* 0x3030000029961c00 */
/*0528*/ FFMA R18, R18, R11, R24; /* 0x303000002d249c00 */
/*0530*/ LDS.128 R8, [0x70]; /* 0xc1000001c3f21cc5 */
/*0538*/ FFMA R18, R20, R8, R18; /* 0x3024000021449c00 */
/*0540*/ IADD R8.CC, R16, R14; /* 0x4801000039021c03 */
/*0548*/ FFMA R9, R22, R9, R18; /* 0x3024000025625c00 */
/*0550*/ FFMA R10, R19, R10, R9; /* 0x3012000029329c00 */
/*0558*/ IADD.X R9, R17, R15; /* 0x480000003d125c43 */
/*0560*/ FFMA R20, R21, R11, R10; /* 0x301400002d551c00 */
/*0568*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0570*/ IADD R7.CC, R7, 0x1; /* 0x4801c0000471dc03 */
/*0578*/ IADD.X R13, R13, RZ; /* 0x48000000fcd35c43 */
/*0580*/ ISUB RZ.CC, R7, R4; /* 0x48010000107fdd03 */
/*0588*/ ISETP.LT.U32.X.AND P0, PT, R13, R5, PT; /* 0x188e000014d1dc43 */
/*0590*/ IADD R2.CC, R2, 0x80; /* 0x4801c00200209c03 */
/*0598*/ IADD.X R3, R3, RZ; /* 0x48000000fc30dc43 */
/*05a0*/ @P0 BRA 0x110; /* 0x4003ffeda00001e7 */
Because of the SO character limit, I've excerpted from each kernel just the "main loop" that does the actual matrix-vector multiply work. This work is performed by the FFMA instructions - floating-point fused multiply-add. If you look at the matrix-vector multiply operation, you'll see it is just a sequence of multiply-add operations.
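Concretely, for one 32-element tile, each thread's work boils down to a chain like the following (just a sketch, using the names from the kernel above; each line maps onto one FFMA):

/* Sketch: one tile of the dot product written out as explicit
   fused multiply-adds - this is what the inner loop computes. */
_y = fmaf(a[gtid + (0 + j*32)*n], shmem_buf[0], _y);
_y = fmaf(a[gtid + (1 + j*32)*n], shmem_buf[1], _y);
_y = fmaf(a[gtid + (2 + j*32)*n], shmem_buf[2], _y);
/* ... 28 more terms ... */
_y = fmaf(a[gtid + (31 + j*32)*n], shmem_buf[31], _y);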
Comparing the two cases above, we can make a few observations:

1. The slower kernel has only a single FFMA instruction in the entire kernel - it is in the loop I've shown. In order to perform 32 multiply-adds, that loop must execute 32 times. The faster kernel has 32 separate FFMA instructions. This is what is meant by "unrolled": the loop of the previous kernel has disappeared, so this instruction sequence only needs to execute once to perform all 32 necessary multiply-adds.
2. Consistent with the unrolling, the slower ("rolled up") code has roughly 11 instructions in its loop, whereas the unrolled code has ~150 instructions.
3. Since the rolled-up loop must execute 32 times, it ends up issuing 32x11, or roughly 350, instructions. Comparing that with the unrolled case, only about half as many instructions have to be executed.
That is probably a hand-waving explanation for at least a 2x performance difference between the two cases. Because the unrolled loop gives the compiler a better opportunity to combine intermediate steps, it can reduce the overall instruction count by optimizing over a larger chunk of code than the rolled loop affords. The fact that the unrolled code never has to branch during the execution of the 32 FFMA instructions probably provides some additional benefit.
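As an aside, a compile-time constant trip count plus an explicit unroll request gets you the same effect without hard-coding 32 throughout the kernel. This is only a sketch (the BLOCK_SIZE macro is not in your code):

/* Sketch: with a constant bound the compiler can fully unroll the
   inner loop, just like in the "static" kernel above. */
#define BLOCK_SIZE 32

#pragma unroll
for (int i = 0; i < BLOCK_SIZE; i++) {
    _y += a[gtid + (i + j*BLOCK_SIZE)*n] * shmem_buf[i];
}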
Also, given this situation, how am I supposed to get the max out of my kernel using all the different block sizes I want? I can't do this for all the block sizes...
Well, realistically, how many block sizes are actually of interest? The usual CUDA advice is block sizes that are multiples of 32, preferably powers of two, that are "not too small" and "not too large". For a real-world matrix-vector multiply you probably only need to worry about a handful of block sizes, say 64, 128, 256, and 512. You could write those out by hand, but templating is another approach, and it actually gives you plenty of flexibility here for this particular kind of substitution. Something like this:
template <int BS>
__global__ void dmv_gpu_shmem_templ(const float *a, const float *x, float *y,
                                    size_t n)
{
    extern __shared__ float shmem_buf[];
    int ltid = threadIdx.x;
    int gtid = blockIdx.x*blockDim.x + threadIdx.x;
    float _y = 0.0;
    if (gtid > n)
        return;
    int last_id = n/BS;
    for (size_t j = 0; j < last_id; j++) {
        shmem_buf[ltid] = x[BS*j + ltid];
        __syncthreads();
        for (size_t i = 0; i < BS; i++) {
            _y += a[gtid + (i + j*BS)*n] * shmem_buf[i];
        }
        __syncthreads();
    }
    y[gtid] = _y;
}
and:
if (block_size == 32)
    dmv_gpu_shmem_templ<32><<<gpu_grid,gpu_block,shmem_size>>>
        (gpu_A, gpu_x, gpu_y, n);
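Extending that to the handful of block sizes mentioned above, the host-side dispatch could be a simple switch (again just a sketch; block_size here is the integer read from GPU_BLOCK_SIZE in your repro):

/* Sketch: instantiate the templated kernel for each supported block
   size and pick the right one at run time. */
switch (block_size) {
    case 64:
        dmv_gpu_shmem_templ<64><<<gpu_grid,gpu_block,shmem_size>>>(gpu_A, gpu_x, gpu_y, n);
        break;
    case 128:
        dmv_gpu_shmem_templ<128><<<gpu_grid,gpu_block,shmem_size>>>(gpu_A, gpu_x, gpu_y, n);
        break;
    case 256:
        dmv_gpu_shmem_templ<256><<<gpu_grid,gpu_block,shmem_size>>>(gpu_A, gpu_x, gpu_y, n);
        break;
    case 512:
        dmv_gpu_shmem_templ<512><<<gpu_grid,gpu_block,shmem_size>>>(gpu_A, gpu_x, gpu_y, n);
        break;
    default:
        printf("unsupported block size %d\n", block_size);
}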