Why do these two GPU kernels have a massive performance difference?
Hi, I am trying to understand some behavior of CUDA kernels. Below are two CUDA kernels I have. I found that gpuReduce takes about twice as long as gpuReduceOpt. Is this caused by warp divergence?
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void gpuReduce(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;
    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
        if((tid%(2*stride)) == 0){
            data[tid] += data[tid+stride];
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

__global__ void gpuReduceOpt(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;
    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
        int index = 2*stride*tid;
        if(index < blockDim.x){
            data[index] += data[index+stride];
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

int main(int agrc, char **argv)
{
    int size = 1<<24;
    int blocksize = 1024;
    dim3 block(blocksize, 1);
    dim3 grid((size-1)/block.x+1, 1);
    int nBytes = sizeof(int)*size;
    int *a_h = (int*)malloc(nBytes);
    int *tmp = (int*)malloc(sizeof(int)*grid.x);
    int *tmp1 = (int*)malloc(sizeof(int)*grid.x);
    initData_int(a_h, size);
    int *a_d, *output;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&output, grid.x*sizeof(int));
    int *a_d1, *output1;
    cudaMalloc((int**)&a_d1, nBytes);
    cudaMalloc((int**)&output1, grid.x*sizeof(int));
    cudaMemcpy(a_d1, a_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    auto start2 = std::chrono::system_clock::now();
    gpuReduce<<<grid, block>>>(a_d, output, size);
    cudaMemcpy(tmp, output, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result;
    for (int i =0; i < grid.x; i++)
    {
        gpu_result += tmp[i];
    }
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff2 = end2 - start2;
    printf("Gpu reduce take:%2f s\n", diff2.count());
    auto start3 = std::chrono::system_clock::now();
    gpuReduceOpt<<<grid, block>>>(a_d1, output1, size);
    cudaMemcpy(tmp1, output1, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result1;
    for (int i =0; i < grid.x; i++)
    {
        gpu_result1 += tmp1[i];
    }
    cudaDeviceSynchronize();
    auto end3 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff3 = end3 - start3;
    printf("Gpu reduce opt take:%2f s\n", diff3.count());
    printf("Result from gpuReduce and gpuReduceOpt are %6d and %6d\n", gpu_result, gpu_result1);
    cudaFree(a_d);
    cudaFree(output);
    free(a_h);
    free(tmp);
    cudaDeviceReset();
    return 0;
}
Here is the performance data I got:
Gpu reduce take:0.004238 s
Gpu reduce opt take:0.002606 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
In the code you have now posted, there is still an error in the host code. This construct is not correct:

int gpu_result; // not initialized
for (int i =0; i < grid.x; i++)
{
    gpu_result += tmp[i];
}

This is undefined behavior. There is no guarantee that the variable gpu_result above will start out at zero. gpu_result1 has the same problem.
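The fix, which the corrected listing below already applies, is simply to zero-initialize the accumulators before the loops:

int gpu_result = 0;   // start the host-side sum from a known value
for (int i = 0; i < grid.x; i++)
{
    gpu_result += tmp[i];
}

and likewise for gpu_result1.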
When we fix that, the difference in kernel execution time comes down mostly to the use of the modulo operator in the first kernel, as @talonmies suggested in the first comment. If you profile each kernel, say with nvprof, and request metrics such as gld_efficiency, gst_efficiency, gld_transactions, and gst_transactions, you will find that these are essentially the same between the two kernels.
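For example (my invocation, not from the original session; the metric values will vary by GPU, but the point is that the load/store metrics match between the two kernels):

$ nvprof --metrics gld_efficiency,gst_efficiency,gld_transactions,gst_transactions ./t1878a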
However, if you replace the modulo operator with an equivalent but lower-cost operation, the kernel durations become nearly identical (within about 10%):
$ cat t1878a.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>

void initData_int(int *p, int size){
    for (int t=0; t<size; t++){
        p[t] = (int)(rand()&0xff);
    }
}

__global__ void gpuReduce(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;
    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
#ifdef USE_FAST
        if((tid&(2*stride-1)) == 0){
#else
        if((tid%(2*stride)) == 0){
#endif
            data[tid] += data[tid+stride];
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

__global__ void gpuReduceOpt(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int* data = in + blockIdx.x*blockDim.x;
    if (tid >= size)
        return;
    for (int stride = 1; stride < blockDim.x; stride*=2)
    {
        int index = 2*stride*tid;
        if(index < blockDim.x){
            data[index] += data[index+stride];
        }
        __syncthreads();
    }
    if (tid == 0){
        out[blockIdx.x] = data[0];
    }
}

int main(int agrc, char **argv)
{
    int size = 1<<24;
    int blocksize = 1024;
    dim3 block(blocksize, 1);
    dim3 grid((size-1)/block.x+1, 1);
    int nBytes = sizeof(int)*size;
    int *a_h = (int*)malloc(nBytes);
    int *tmp = (int*)malloc(sizeof(int)*grid.x);
    int *tmp1 = (int*)malloc(sizeof(int)*grid.x);
    initData_int(a_h, size);
    int *a_d, *output;
    cudaMalloc((int**)&a_d, nBytes);
    cudaMalloc((int**)&output, grid.x*sizeof(int));
    int *a_d1, *output1;
    cudaMalloc((int**)&a_d1, nBytes);
    cudaMalloc((int**)&output1, grid.x*sizeof(int));
    cudaMemcpy(a_d1, a_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
    auto start2 = std::chrono::system_clock::now();
    gpuReduce<<<grid, block>>>(a_d, output, size);
    cudaMemcpy(tmp, output, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result = 0;
    for (int i =0; i < grid.x; i++)
    {
        gpu_result += tmp[i];
    }
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff2 = end2 - start2;
    printf("Gpu reduce take:%2f s\n", diff2.count());
    auto start3 = std::chrono::system_clock::now();
    gpuReduceOpt<<<grid, block>>>(a_d1, output1, size);
    cudaMemcpy(tmp1, output1, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
    int gpu_result1 = 0;
    for (int i =0; i < grid.x; i++)
    {
        gpu_result1 += tmp1[i];
    }
    cudaDeviceSynchronize();
    auto end3 = std::chrono::system_clock::now();
    std::chrono::duration<double>diff3 = end3 - start3;
    printf("Gpu reduce opt take:%2f s\n", diff3.count());
    printf("Result from gpuReduce and gpuReduceOpt are %6d and %6d\n", gpu_result, gpu_result1);
    cudaFree(a_d);
    cudaFree(output);
    free(a_h);
    free(tmp);
    cudaDeviceReset();
    return 0;
}
$ nvcc -o t1878a t1878a.cu -arch=sm_70 -lineinfo
$ nvprof ./t1878a
==14339== NVPROF is profiling process 14339, command: ./t1878a
Gpu reduce take:0.001021 s
Gpu reduce opt take:0.000543 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
==14339== Profiling application: ./t1878a
==14339== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.40%  43.743ms         2  21.872ms  21.280ms  22.463ms  [CUDA memcpy HtoD]
                    1.72%  770.61us         1  770.61us  770.61us  770.61us  gpuReduce(int*, int*, int)
                    0.86%  384.30us         1  384.30us  384.30us  384.30us  gpuReduceOpt(int*, int*, int)
                    0.03%  12.960us         2  6.4800us  6.4000us  6.5600us  [CUDA memcpy DtoH]
      API calls:   69.86%  350.40ms         4  87.601ms  8.0580us  349.79ms  cudaMalloc
                   19.33%  96.969ms         1  96.969ms  96.969ms  96.969ms  cudaDeviceReset
                    9.13%  45.770ms         4  11.442ms  451.76us  22.822ms  cudaMemcpy
                    1.00%  5.0119ms         4  1.2530ms  590.62us  3.2115ms  cuDeviceTotalMem
                    0.50%  2.5242ms       404  6.2470us     427ns  270.20us  cuDeviceGetAttribute
                    0.09%  449.28us         2  224.64us  10.437us  438.85us  cudaFree
                    0.06%  279.02us         4  69.755us  59.853us  94.003us  cuDeviceGetName
                    0.02%  101.11us         2  50.555us  23.936us  77.175us  cudaLaunchKernel
                    0.00%  22.146us         4  5.5360us  3.2730us  10.770us  cuDeviceGetPCIBusId
                    0.00%  14.686us         2  7.3430us  4.1300us  10.556us  cudaDeviceSynchronize
                    0.00%  11.444us         8  1.4300us     506ns  4.8200us  cuDeviceGet
                    0.00%  6.2180us         3  2.0720us     610ns  3.9200us  cuDeviceGetCount
                    0.00%  3.5570us         4     889ns     740ns  1.1270us  cuDeviceGetUuid
$ nvcc -o t1878a t1878a.cu -arch=sm_70 -lineinfo -DUSE_FAST
$ nvprof ./t1878a
==14375== NVPROF is profiling process 14375, command: ./t1878a
Gpu reduce take:0.000656 s
Gpu reduce opt take:0.000538 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
==14375== Profiling application: ./t1878a
==14375== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.92%  38.947ms         2  19.474ms  19.460ms  19.488ms  [CUDA memcpy HtoD]
                    1.08%  427.79us         1  427.79us  427.79us  427.79us  gpuReduce(int*, int*, int)
                    0.97%  385.99us         1  385.99us  385.99us  385.99us  gpuReduceOpt(int*, int*, int)
                    0.03%  13.216us         2  6.6080us  6.4320us  6.7840us  [CUDA memcpy DtoH]
      API calls:   67.47%  281.96ms         4  70.491ms  5.5820us  281.49ms  cudaMalloc
                   20.44%  85.428ms         1  85.428ms  85.428ms  85.428ms  cudaDeviceReset
                    9.70%  40.518ms         4  10.129ms  457.52us  19.781ms  cudaMemcpy
                    1.20%  5.0260ms         4  1.2565ms  601.24us  3.2163ms  cuDeviceTotalMem
                    0.94%  3.9413ms       404  9.7550us     270ns  1.7028ms  cuDeviceGetAttribute
                    0.10%  435.98us         2  217.99us  9.5230us  426.46us  cudaFree
                    0.10%  410.88us         4  102.72us  58.347us  225.92us  cuDeviceGetName
                    0.02%  94.871us         2  47.435us  20.952us  73.919us  cudaLaunchKernel
                    0.01%  21.734us         4  5.4330us  3.5080us  8.4130us  cuDeviceGetPCIBusId
                    0.00%  14.504us         2  7.2520us  3.8730us  10.631us  cudaDeviceSynchronize
                    0.00%  12.843us         8  1.6050us     460ns  5.3730us  cuDeviceGet
                    0.00%  9.7040us         3  3.2340us     804ns  6.9430us  cuDeviceGetCount
                    0.00%  2.5870us         4     646ns     517ns     957ns  cuDeviceGetUuid
$
Notes:
I am not suggesting the above is a general replacement for modulo. It works in this case because stride only takes on power-of-2 values.
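As a quick host-side sanity check of that identity (my addition, not part of the original code): for a power-of-two divisor d and non-negative x, x % d == x & (d - 1), which is exactly what the USE_FAST branch relies on, since 2*stride is always a power of 2:

#include <cassert>

int main() {
    // Verify x % d == x & (d - 1) for power-of-two d and non-negative x.
    for (int x = 0; x < 4096; x++)
        for (int d = 1; d <= 2048; d *= 2)
            assert((x % d) == (x & (d - 1)));
    return 0;
}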
I doubt this is doing what you think:

if (tid >= size)
    return;

but with the problem size here (a whole-number multiple of the block size) it is not particularly relevant: since tid is at most blockDim.x - 1 = 1023 and size is 1<<24, the condition is never true. An early return like this is also not an appropriate thing to do if the remainder of the kernel code uses __syncthreads(), but that is not at issue here for this problem size/choice.
Your code running about 5x slower on your 2080 Ti than on my V100 doesn't sound right to me. I wonder if you are building a debug project. That doesn't change the observations here, but if you are building a debug project or using the -G compile switch, I suggest never doing performance analysis on debug code.
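For reference, a release-mode build line for a 2080 Ti might look like the following (sm_75 is my assumption for that card's compute capability; the session above targeted sm_70 for a V100):

$ nvcc -O3 -arch=sm_75 -o t1878a t1878a.cu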