Why is the performance of a CUDA kernel with divergence better than one without divergence?
Hi, I am writing CUDA kernels to understand the behavior of warp divergence.
These are the 3 kernels I have:
#include <cuda_runtime.h>
#include <stdio.h>
#include "util.h"
#include <chrono>
__global__ void wardUp(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if ((idx/warpSize)%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

__global__ void kernel1(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if ((idx/warpSize)%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

__global__ void kernel2(float *c)
{
    float a = 0.0;
    float b = 0.0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx%2 == 0){
        a = 100.0f;
    }
    else{
        b = 200.0f;
    }
    c[idx] = a+b;
}

int main(int argc, char **argv)
{
    initDevice(0);
    int size = 64;
    int blocksize = 64;
    int nBytes = sizeof(float)*size;
    float *a_d;
    CHECK(cudaMalloc((float**)&a_d, nBytes));
    dim3 block(blocksize, 1);
    dim3 grid((blocksize-1)/block.x+1, 1);
    wardUp<<<grid, block>>>(a_d);
    float elapsed = 0;
    cudaEvent_t start1, stop1;
    CHECK(cudaEventCreate(&start1));
    CHECK(cudaEventCreate(&stop1));
    CHECK(cudaEventRecord(start1, 0));
    kernel1<<<grid, block>>>(a_d);
    CHECK(cudaEventRecord(stop1, 0));
    CHECK(cudaEventSynchronize(stop1));
    CHECK(cudaEventElapsedTime(&elapsed, start1, stop1));
    printf("kernel1 take:%2f ms\n", elapsed);
    float elapsed_1 = 0;
    cudaEvent_t start2, stop2;
    CHECK(cudaEventCreate(&start2));
    CHECK(cudaEventCreate(&stop2));
    CHECK(cudaEventRecord(start2, 0));
    kernel2<<<grid, block>>>(a_d);
    CHECK(cudaEventRecord(stop2, 0));
    CHECK(cudaEventSynchronize(stop2));
    CHECK(cudaEventElapsedTime(&elapsed_1, start2, stop2));
    printf("kernel2 take:%2f ms\n", elapsed_1);
    cudaFree(a_d);
    cudaEventDestroy(start1);
    cudaEventDestroy(stop1);
    cudaEventDestroy(start2);
    cudaEventDestroy(stop2);
    return 0;
}
If my understanding is correct, kernel1 has no divergence problem, because the if/else split falls on warp boundaries: with blocksize = 64, threads 0-31 (warp 0) all take the if branch and threads 32-63 (warp 1) all take the else branch, so every thread within a warp follows the same path. kernel2 should have a divergence problem, because the odd and even threads of the same warp take different branches and cannot execute them at the same time. But I observe that kernel1 is slower than kernel2. Why is that?
Using device: 0: NVIDIA GeForce RTX 2080 Ti
kernel1 take:0.008864 ms
kernel2 take:0.006752 ms
I switched to using cudaEventRecord to record the durations, but kernel1 still appears to be slower than kernel2.
There are quite a few problems with your methodology. I probably won't cover all of them:
- The problem size is far too small for benchmarking.
- Compiler optimization is working against you.
- The code is too simple; the compiler uses predication to mitigate the effect of warp divergence (see the sketch right after this list).
- Your method of measuring kernel duration is flawed (a cudaEvent-based sketch appears at the end of this answer).
- Building the grid from blocksize is not sensible (although it happens to work out when size == blocksize). It should be based on the problem size, e.g. (size + block.x - 1)/block.x blocks, not on blocksize.
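To illustrate the predication point (my sketch, not part of the original kernels): for a branch this simple, the compiler can lower the if/else to a conditional select, so all 32 threads of a warp execute the same instruction stream, roughly as if the kernel had been written branch-free:
__global__ void kernel2_select(float *c)  // hypothetical branch-free equivalent of kernel2
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    // A select replaces the branch: every thread in the warp executes the
    // same instructions, so no divergence penalty can show up.
    c[idx] = (idx % 2 == 0) ? 100.0f : 200.0f;
}
That is why the tiny original kernels do not expose a divergence penalty at all.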
The following code addresses these issues, and shows roughly a 2x increase in kernel duration when going from code that makes the if/then decision on warp boundaries to code that makes it for every other thread:
$ cat t1877.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#define CHECK(x) x
__global__ void kernel1(int *c, int y, int z, int l1, int l2)
{
    int a = 0;
    int b = 0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx&32){
        for (int i = 0; i < l1; i++){
            a = a&y; a = a|z;}
    }
    else{
        for (int i = 0; i < l2; i++){
            b = b|y; b = b&z;}
    }
    c[idx] = a+b;
}

__global__ void kernel2(int *c, int y, int z, int l1, int l2)
{
    int a = 0;
    int b = 0;
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx&1){
        for (int i = 0; i < l1; i++){
            a = a&y; a = a|z;}
    }
    else{
        for (int i = 0; i < l2; i++){
            b = b|y; b = b&z;}
    }
    c[idx] = a+b;
}

int main(int argc, char **argv)
{
    int blocksize = 64;
    int size = blocksize*1048576;
    int nBytes = sizeof(int)*size;
    int *a_d;
    CHECK(cudaMalloc((int**)&a_d, nBytes));
    dim3 block(blocksize, 1);
    dim3 grid(size/block.x, 1);
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto start1 = std::chrono::system_clock::now();
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto end1 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff1 = end1 - start1;
    printf("kernel1 take:%2f s\n", diff1.count());
    kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto start2 = std::chrono::system_clock::now();
    kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
    cudaDeviceSynchronize();
    auto end2 = std::chrono::system_clock::now();
    std::chrono::duration<double> diff2 = end2 - start2;
    printf("kernel2 take:%2f s\n", diff2.count());
    return 0;
}
$ nvcc -o t1877 t1877.cu -arch=sm_70
$ ./t1877
kernel1 take:0.205650 s
kernel2 take:0.406347 s
$
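As a side note on the flawed-measurement bullet: cudaEvent timing itself is fine once the other issues are fixed. A minimal sketch of an event-based measurement with a warm-up launch of the same kernel (my illustration, assuming the kernel1, grid, block and a_d definitions from t1877.cu above):
    cudaEvent_t start, stop;
    float ms = 0.0f;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);  // warm-up launch of the same kernel
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaEventRecord(start, 0));                   // record start on the default stream
    kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);  // the launch being measured
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));                  // wait until the stop event has completed
    CHECK(cudaEventElapsedTime(&ms, start, stop));      // elapsed time in milliseconds
    printf("kernel1 take:%f ms\n", ms);
In the original code the kernel under test ran only 64 threads, so launch overhead rather than divergence dominated whatever the events measured.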