Why is my vectorized access kernel so slow?
I am trying to understand vectorized memory access and implemented a simple example to evaluate its performance, but I found that the vectorized kernel is slower than the naive one. Why is that?
In the vectorized kernel, I reinterpret the int pointer as an int2 pointer and then perform the copy.
Here is the code I used:
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#include <iostream>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void naiveCopy(int *d_in, int *d_out, int size)
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for (int i = tid; i < size; i += blockDim.x*gridDim.x)
    {
        d_out[i] = d_in[i];
    }
}

__global__ void vecCopy(int *d_in, int *d_out, int size)
{
    int2* in = (int2*)d_in;
    int2* out = (int2*)d_out;
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for (int i = tid; i < size/2; i += blockDim.x*gridDim.x)
    {
        out[i] = in[i];
    }
    if(tid==size/2 && size%2==1)
        d_out[size-1] = d_in[size-1];
}

int main(int argc, char **argv)
{
    int size = 1<<24;
    //int size = 128;
    int nBytes = size*sizeof(int);
    int *d_h;
    cudaMallocHost((int**)&d_h, nBytes);
    initData_int(d_h, size);
    //printData(d_h, size);
    int *res = (int*)malloc(nBytes);
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    int *d_in, *d_out;
    dim3 block(128, 1);
    dim3 grid((size-1)/block.x+1, 1);
    cudaMalloc((int**)&d_in, nBytes);
    cudaMalloc((int**)&d_out, nBytes);
    cudaMemcpyAsync(d_in, d_h, nBytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
    auto s_0 = std::chrono::system_clock::now();
    naiveCopy<<<grid, block, 0, stream>>>(d_in, d_out, size);
    cudaStreamSynchronize(stream);
    auto e_0 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff = e_0 - s_0;
    printf("Naive Kernel time cost is: %2f.\n", diff.count());
    memset(res, 0, nBytes);
    cudaMemset(d_out, 0, nBytes);
    //vectorized access:
    cudaStreamSynchronize(stream);
    s_0 = std::chrono::system_clock::now();
    vecCopy<<<grid, block, 0, stream>>>(d_in, d_out, size);
    cudaStreamSynchronize(stream);
    e_0 = std::chrono::system_clock::now();
    diff = e_0 - s_0;
    printf("Vectorized kernel time cost is: %2f.\n", diff.count());
    cudaStreamDestroy(stream);
    cudaFree(d_h);
    cudaFree(d_in);
    cudaFree(d_out);
    free(res);
    return 0;
}
Here is the output from nvprof:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 89.28% 5.5024ms 1 5.5024ms 5.5024ms 5.5024ms [CUDA memcpy HtoD]
4.82% 296.94us 1 296.94us 296.94us 296.94us vecCopy(int*, int*, int)
3.99% 246.19us 1 246.19us 246.19us 246.19us naiveCopy(int*, int*, int)
Could you please explain what is causing the performance drop?
You are not sizing your grid well. Your grid dimensions may be sensible for the naive kernel:
dim3 grid((size-1)/block.x+1, 1);
but they are unnecessarily twice as large as needed for the vectorized copy kernel: each of its threads copies one int2 (two ints), so only size/2 work items, and therefore half as many threads, are required.
When I cut the grid size for the vectorized kernel in half (to match the methodology used for the naive kernel):
dim3 grid2((size/2+block.x-1)/block.x);
then, according to my testing, the vectorized copy kernel becomes faster:
3.88% 233.99us 1 233.99us 233.99us 233.99us naiveCopy(int*, int*, int)
2.84% 171.33us 1 171.33us 171.33us 171.33us vecCopy(int*, int*, int)
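For reference, a minimal sketch of the adjusted launch, assuming the same 128-thread blocks and the unmodified vecCopy kernel from the question:

dim3 block(128, 1);
// vecCopy moves one int2 (two ints) per thread, so the grid only needs to cover size/2 elements
dim3 grid2((size/2 + block.x - 1) / block.x);
vecCopy<<<grid2, block, 0, stream>>>(d_in, d_out, size);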
Notes:
cudaFree is not the correct API to use with cudaMallocHost; the correct API is cudaFreeHost.
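As a minimal illustration of that pairing (reusing the d_h and nBytes names from the question):

int *d_h;
cudaMallocHost((void**)&d_h, nBytes);  // pinned host allocation
// ... use d_h ...
cudaFreeHost(d_h);                     // pairs with cudaMallocHost; cudaFree is for cudaMalloc allocations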
As mentioned in the comments, we could do a better job of grid sizing by sizing the grid to match the GPU you are running on. However, that step is not needed to demonstrate the improvement here.
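If you did want to size the grid to the GPU, a sketch could look like the following; the blocksPerSM factor is an illustrative assumption, not a tuned value from this answer:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);                       // query the device in use
int blocksPerSM = 4;                                     // illustrative occupancy target
int numBlocks = prop.multiProcessorCount * blocksPerSM;  // enough blocks to fill the GPU
// The grid-stride loop in vecCopy then covers all size/2 int2 elements (size is even here)
vecCopy<<<numBlocks, 128, 0, stream>>>(d_in, d_out, size);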