如何优化这个 OpenCL 内核?
How to optimize this OpenCL kernel?
我正在做一个项目,这个 OpenCL 内核有一些问题:-(
/*
 * gemm_fast_5 - each work-item computes one element of cr.
 *
 * Work-item (i, j) = (get_global_id(0), get_global_id(1)) accumulates, for
 * k = 0 .. m-1, the product (bb + dd) * (cc + ee), where bb/cc are two
 * adjacent values read from ar and dd/ee are two values read from br. It then
 * stores
 *
 *     cr[i*c3 + j] = sum - pr[i*c4 + ks] - ur[j*c5 + ks]
 *
 * `ks` is a preprocessor symbol with an integer value; it is used both in
 * `#if` directives and inside index expressions, so it has to be defined
 * before this kernel is compiled. It selects the row offset into ar and the
 * element spacing used when walking ar and br.
 *
 * Parameters:
 *   ar, br      input buffers (only read here)
 *   cr          output buffer, one element written per work-item
 *   pr, ur      correction terms, indexed by i and j respectively
 *   c, c2       multipliers for the base offsets into ar and br
 *   c3, c4, c5  multipliers for the offsets into cr, pr and ur
 *   m           number of loop iterations
 *   com         not used by this kernel
 */
__kernel void gemm_fast_5(
__global double *ar, __global double *br, __global double *cr,
__global double *pr, __global double *ur,
unsigned long c, unsigned long c2,
unsigned long c3, unsigned long c4,
unsigned long c5, unsigned long m,
unsigned char com
){
    const unsigned long i = get_global_id(0);
    const unsigned long j = get_global_id(1);
    const unsigned long cur = i*c3 + j;

    /* Base offset of this work-item's data inside ar. */
#if ks == 2
    const unsigned long rl = (i << 1)*c;
#elif ks == 3
    const unsigned long rl = ((i << 1) + 1)*c;
#else
    const unsigned long rl = i*c;
#endif

    /*
     * Base offset inside br. The previous code read this offset without ever
     * assigning it, so every br access used an indeterminate index.
     * NOTE(review): j*c2 is assumed because c2 was otherwise unused and mirrors
     * the i*c offset used for ar -- confirm against the host-side layout,
     * especially for the ks variants that double the ar row index.
     */
    const unsigned long rl2 = j*c2;

    /*
     * Both correction terms depend on this work-item's own i / j, so they are
     * loaded into private variables. Sharing them through __local storage
     * would need a barrier before the read and would only hold the value of
     * whichever work-item wrote last.
     */
    const double pt = pr[i*c4 + ks];
    const double ut = ur[j*c5 + ks];

    double aa = 0.0;
    for (unsigned long k = 0; k < m; k++) {
#if ks == 1 || ks == 4
        /* ar advances 2 elements per k, br advances 4. */
        const unsigned long ia = rl + (k << 1);
        const unsigned long ib = rl2 + (k << 2);
        const double bb = ar[ia];
        const double cc = ar[ia + 1];
        const double dd = br[ib + 2];
        const double ee = br[ib];
#elif ks == 2 || ks == 3
        /* ar advances 4 elements per k, br advances 2. */
        const unsigned long ia = rl + (k << 2);
        const unsigned long ib = rl2 + (k << 1);
        const double bb = ar[ia];
        const double cc = ar[ia + 1];
        const double dd = br[ib + 1];
        const double ee = br[ib];
#else
        /* Both buffers advance 2 elements per k. */
        const unsigned long ia = rl + (k << 1);
        const unsigned long ib = rl2 + (k << 1);
        const double bb = ar[ia];
        const double cc = ar[ia + 1];
        const double dd = br[ib + 1];
        const double ee = br[ib];
#endif
        aa += (bb + dd)*(cc + ee);
    }

    /* The store is what keeps the loop alive: without a use of aa the
     * compiler is free to drop the whole accumulation. */
    cr[cur] = aa - pt - ut;
}
在调试过程中,我注意到:如果我删除最后一行,甚至只是把最后一行改为 cr[cur] = 5.0 - pt - ut;,内核的运行速度就会快约 6 倍。
两种情况下的运行时间不应该相同,或者至少相近吗?
为了寻找答案,我利用自己同时拥有 CPU 和 GPU 的条件,尝试了几种 OpenCL 运行时(PoCL 和 opencl-amd),结果都出现了同样的现象 :-/
如果有人能帮助我理解为什么会这样,我将不胜感激。我不明白:"v
循环内的所有操作都没有副作用:你只是从那些 __global 指针中读取数据,计算一些临时值,最后通过 aa += ... 把结果累加到 aa 中。换句话说,该循环的唯一目的就是计算 aa 的值。
因此,如果您从最后一行(循环外)中删除 aa,则循环内的所有操作都将完全无用:您最终得到的循环除了读取一些值、更新一些在函数返回时就会被丢弃的局部变量之外,什么也不做。在启用优化的情况下编译上面的代码(我假设你正在这样做,否则你的问题就没有多大意义),编译器很可能会直接删除整个循环。因此,最后一行不使用 aa 的代码运行得更快。
这里有一个 GCC 示例(改编时移除了 CUDA 注解),从中可以看到:即使是最低级别的优化(-O1)也会移除整个循环体,只留下比较和 i 的自增;而使用 -O2 时,整个循环都会被移除。
我正在做一个项目,这个 OpenCL 内核有一些问题:-(
/*
 * gemm_fast_5 - each work-item computes one element of cr.
 *
 * Work-item (i, j) = (get_global_id(0), get_global_id(1)) accumulates, for
 * k = 0 .. m-1, the product (bb + dd) * (cc + ee), where bb/cc are two
 * adjacent values read from ar and dd/ee are two values read from br. It then
 * stores
 *
 *     cr[i*c3 + j] = sum - pr[i*c4 + ks] - ur[j*c5 + ks]
 *
 * `ks` is a preprocessor symbol with an integer value; it is used both in
 * `#if` directives and inside index expressions, so it has to be defined
 * before this kernel is compiled. It selects the row offset into ar and the
 * element spacing used when walking ar and br.
 *
 * Parameters:
 *   ar, br      input buffers (only read here)
 *   cr          output buffer, one element written per work-item
 *   pr, ur      correction terms, indexed by i and j respectively
 *   c, c2       multipliers for the base offsets into ar and br
 *   c3, c4, c5  multipliers for the offsets into cr, pr and ur
 *   m           number of loop iterations
 *   com         not used by this kernel
 */
__kernel void gemm_fast_5(
__global double *ar, __global double *br, __global double *cr,
__global double *pr, __global double *ur,
unsigned long c, unsigned long c2,
unsigned long c3, unsigned long c4,
unsigned long c5, unsigned long m,
unsigned char com
){
    const unsigned long i = get_global_id(0);
    const unsigned long j = get_global_id(1);
    const unsigned long cur = i*c3 + j;

    /* Base offset of this work-item's data inside ar. */
#if ks == 2
    const unsigned long rl = (i << 1)*c;
#elif ks == 3
    const unsigned long rl = ((i << 1) + 1)*c;
#else
    const unsigned long rl = i*c;
#endif

    /*
     * Base offset inside br. The previous code read this offset without ever
     * assigning it, so every br access used an indeterminate index.
     * NOTE(review): j*c2 is assumed because c2 was otherwise unused and mirrors
     * the i*c offset used for ar -- confirm against the host-side layout,
     * especially for the ks variants that double the ar row index.
     */
    const unsigned long rl2 = j*c2;

    /*
     * Both correction terms depend on this work-item's own i / j, so they are
     * loaded into private variables. Sharing them through __local storage
     * would need a barrier before the read and would only hold the value of
     * whichever work-item wrote last.
     */
    const double pt = pr[i*c4 + ks];
    const double ut = ur[j*c5 + ks];

    double aa = 0.0;
    for (unsigned long k = 0; k < m; k++) {
#if ks == 1 || ks == 4
        /* ar advances 2 elements per k, br advances 4. */
        const unsigned long ia = rl + (k << 1);
        const unsigned long ib = rl2 + (k << 2);
        const double bb = ar[ia];
        const double cc = ar[ia + 1];
        const double dd = br[ib + 2];
        const double ee = br[ib];
#elif ks == 2 || ks == 3
        /* ar advances 4 elements per k, br advances 2. */
        const unsigned long ia = rl + (k << 2);
        const unsigned long ib = rl2 + (k << 1);
        const double bb = ar[ia];
        const double cc = ar[ia + 1];
        const double dd = br[ib + 1];
        const double ee = br[ib];
#else
        /* Both buffers advance 2 elements per k. */
        const unsigned long ia = rl + (k << 1);
        const unsigned long ib = rl2 + (k << 1);
        const double bb = ar[ia];
        const double cc = ar[ia + 1];
        const double dd = br[ib + 1];
        const double ee = br[ib];
#endif
        aa += (bb + dd)*(cc + ee);
    }

    /* The store is what keeps the loop alive: without a use of aa the
     * compiler is free to drop the whole accumulation. */
    cr[cur] = aa - pt - ut;
}
在调试过程中,我注意到:如果我删除最后一行,甚至只是把最后一行改为 cr[cur] = 5.0 - pt - ut;,内核的运行速度就会快约 6 倍。
两种情况下的运行时间不应该相同,或者至少相近吗?为了寻找答案,我利用自己同时拥有 CPU 和 GPU 的条件,尝试了几种 OpenCL 运行时(PoCL 和 opencl-amd),结果都出现了同样的现象 :-/
如果有人能帮助我理解为什么会这样,我将不胜感激。我不明白:"v
循环内的所有操作都没有副作用:你只是从那些 __global 指针中读取数据,计算一些临时值,最后通过 aa += ... 把结果累加到 aa 中。换句话说,该循环的唯一目的就是计算 aa 的值。
因此,如果您从最后一行(循环外)中删除 aa,则循环内的所有操作都将完全无用:您最终得到的循环除了读取一些值、更新一些在函数返回时就会被丢弃的局部变量之外,什么也不做。在启用优化的情况下编译上面的代码(我假设你正在这样做,否则你的问题就没有多大意义),编译器很可能会直接删除整个循环。因此,最后一行不使用 aa 的代码运行得更快。
这里有一个 GCC 示例(改编时移除了 CUDA 注解),从中可以看到:即使是最低级别的优化(-O1)也会移除整个循环体,只留下比较和 i 的自增;而使用 -O2 时,整个循环都会被移除。