Why is using register memory slower than shared memory when doing reduction?
I evaluated the performance of the two kernels below:
#include <chrono>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>   // rand, malloc
#include <string.h>   // memset

// fill host array with small random ints
void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}
__global__ void reduceShfl(int *in, int* out, int size)
{
    extern __shared__ int smem[];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x*4;   // unroll factor 4
    smem[tid] = 0;
    if (tid>=size) return;

    // each thread loads and sums 4 elements from global memory
    int tmp = 0;
    if (idx + blockDim.x*3 <= size){
        int a = in[idx];
        int b = in[idx+blockDim.x];
        int c = in[idx+2*blockDim.x];
        int d = in[idx+3*blockDim.x];
        tmp = a + b + c + d;
    }
    smem[tid] = tmp;
    __syncthreads();

    // in-place tree reduction in shared memory down to 32 partial sums
    if (blockDim.x >= 1024 && tid < 512){
        smem[tid] += smem[tid + 512];
    }
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256){
        smem[tid] += smem[tid + 256];
    }
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128){
        smem[tid] += smem[tid + 128];
    }
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64){
        smem[tid] += smem[tid + 64];
    }
    __syncthreads();
    if (blockDim.x >= 64 && tid < 32){
        smem[tid] += smem[tid + 32];
    }
    __syncthreads();

    // final warp-level reduction using warp shuffle (registers)
    int tmpsum = smem[tid];
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
    if (tid==0)
        out[blockIdx.x] = tmpsum;
}
__global__ void reduceShmUnroll(int *in, int *out, int num)
{
    extern __shared__ int smem[];
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockIdx.x*blockDim.x*4;   // unroll factor 4
    if (tid >= num) return;

    // each thread loads and sums 4 elements from global memory
    int tmp=0;
    if(idx + blockDim.x*3 <= num)
    {
        int a = in[idx];
        int b = in[idx + blockDim.x];
        int c = in[idx + blockDim.x*2];
        int d = in[idx + blockDim.x*3];
        tmp = a + b + c + d;
    }
    smem[tid] = tmp;
    __syncthreads();

    // in-place tree reduction in shared memory down to 64 partial sums
    if (blockDim.x >= 1024 && tid < 512){
        smem[tid] += smem[tid + 512];
    }
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256){
        smem[tid] += smem[tid+256];
    }
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128){
        smem[tid] += smem[tid+128];
    }
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64){
        smem[tid] += smem[tid+64];
    }
    __syncthreads();

    // final warp reduction in shared memory, restricted to the first warp
    if (tid < 32){
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid+32];
        vsmem[tid] += vsmem[tid+16];
        vsmem[tid] += vsmem[tid+8];
        vsmem[tid] += vsmem[tid+4];
        vsmem[tid] += vsmem[tid+2];
        vsmem[tid] += vsmem[tid+1];
    }
    if (tid == 0) out[blockIdx.x] = smem[0];
}
int main(int argc, char **argv)
{
    int size = 1<<24;
    int nBytes = size*sizeof(int);
    int *a_h = (int*)malloc(nBytes);
    initData_int(a_h, size);

    int blocksize = 1024;
    int gridsize = (size-1)/blocksize+1;
    dim3 block(blocksize, 1);
    dim3 grid((size-1)/blocksize+1, 1);

    int *a_d, *b_d;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&b_d, grid.x*sizeof(int));
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);

    int *tmp = (int*)malloc(gridsize*sizeof(int));
    memset(tmp, 0, grid.x/4);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);

    // time reduceShfl: kernel launch + copy-back + host accumulation of block results
    auto s_0 = std::chrono::system_clock::now();
    reduceShfl<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaMemcpy(tmp, b_d, grid.x/4*sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int res_1 = 0;
    for (int i=0; i<grid.x/4; i++){
        res_1 += tmp[i];
    }
    auto e_0 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = e_0 - s_0;
    printf("Result from reduceShfl is: %d and time cost is %2f.\n", res_1, diff.count());

    // time reduceShmUnroll the same way
    memset(tmp, 0, grid.x/4);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    s_0 = std::chrono::system_clock::now();
    reduceShmUnroll<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaMemcpy(tmp, b_d, grid.x/4*sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int res_0 = 0;
    for (int i=0; i<grid.x/4; i++){
        res_0 += tmp[i];
    }
    e_0 = std::chrono::system_clock::now();
    diff = e_0 - s_0;
    printf("Result from reduceShmUnroll is: %d and time cost is %2f.\n", res_0, diff.count());

    cudaFree(a_d);
    cudaFree(b_d);
    free(a_h);
    free(tmp);
    return 0;
}
The main difference is in the final warp reduction: reduceShmUnroll uses shared memory, while reduceShfl does a warp shuffle, which uses register memory. However, I found that reduceShfl is slower than reduceShmUnroll:

    Result from reduceShfl is: 2139353471 and time cost is 0.000533.
    Result from reduceShmUnroll is: 2139353471 and time cost is 0.000485.

Is something wrong with my code?
Yes, I would say there is something wrong with your code.

The principal issue I see is that you are making an invalid comparison. In your shared-memory kernel, you restrict the final warp-reduction activity to the final warp (tid < 32). In your shuffle kernel, you do not:
Shared memory kernel:
    __syncthreads();
    if (tid < 32){ // this is missing from your shuffle kernel
        volatile int *vsmem = smem;
        vsmem[tid] += vsmem[tid+32];
        vsmem[tid] += vsmem[tid+16];
        vsmem[tid] += vsmem[tid+8];
        vsmem[tid] += vsmem[tid+4];
        vsmem[tid] += vsmem[tid+2];
        vsmem[tid] += vsmem[tid+1];
    }
Shuffle kernel:
    __syncthreads();
    int tmpsum = smem[tid];
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
    tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
    if (tid==0)
        out[blockIdx.x] = tmpsum;
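A minimal sketch of what that restriction could look like in the shuffle kernel's tail (this guarded variant is my illustration, not code from the question; it assumes the shared-memory stages above it are unchanged, so smem[0..31] hold the remaining partial sums):

    __syncthreads();
    if (tid < 32){   // restrict the final reduction to the first warp, as in the shared-memory kernel
        int tmpsum = smem[tid];
        // butterfly reduction across the warp's registers
        tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 16);
        tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 8);
        tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 4);
        tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 2);
        tmpsum += __shfl_xor_sync(0xffffffff, tmpsum, 1);
        if (tid == 0)
            out[blockIdx.x] = tmpsum;
    }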
When I restrict your shuffle kernel the same way the shared-memory kernel is restricted (so that the unnecessary warps are not doing unnecessary work), I observe approximately equal run times between the two kernels, about a 1% difference, when I profile with nvprof on a V100:

    0.38%  222.76us  1  222.76us  222.76us  222.76us  reduceShmUnroll(int*, int*, int)
    0.37%  220.55us  1  220.55us  220.55us  220.55us  reduceShfl(int*, int*, int)
That is what I would expect. For this limited usage, there is no reason to think that either the shared-memory approach or the shuffle approach would be faster or slower.

Both shared-memory activity and warp-shuffle activity have throughput limits, so trying to predict which will be quicker is difficult: it depends on what else is happening in the code. If your code were bound by shared-memory throughput and you converted some of that activity to warp shuffle, you might see a benefit from the warp shuffle; the same statement can be made in the other direction. For this particular piece of this particular code, when it is written properly/comparably/equivalently, you are not bound differently by shared-memory throughput or warp-shuffle throughput, so the correct expectation is no difference in performance when substituting one for the other.
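As an aside (not part of the original answer): if you want kernel-only timings from the host without a profiler, one option is to bracket just the kernel launch with CUDA events, rather than wrapping the launch, the device-to-host copy, and the host loop in std::chrono as the question does. A minimal sketch, assuming the same grid, block, blocksize, a_d and b_d as in the question's main():

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    reduceShfl<<<grid, block, blocksize*sizeof(int)>>>(a_d, b_d, size);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);              // wait until the kernel has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds
    printf("reduceShfl kernel time: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);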