如何减少 OpenCL 内核之间的代码重复?
How to reduce code duplication between OpenCL kernels?
我有几个类似的内核来生成随机数据并将其存储在全局内存中。我总是使用相同的算法来随机化,但由于可变范围问题(我需要跟踪数据),我无法避免严重的代码重复。
有什么方法可以避免这种情况吗?在 OpenCL 中生成随机数据似乎是一项相当标准的任务,但它有悖于任何良好的编码标准以具有这种级别的代码重复。例如,这是我的两个内核:
////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to AWGN channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterAWGN(__global double* d_Rand,
__global int* seeds,
__global long* inputcw,
int nPerRng, float sigma)
{
int globalID = get_global_id(0);
double c = 2.0/(sigma*sigma);
int iState, iState1, iStateM, iOut;
unsigned int mti, mti1, mtiM, x;
unsigned int mt[MT_NN];
//Initialize current state
mt[0] = seeds[globalID];
for (iState = 1; iState < MT_NN; iState++)
mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;
iState = 0;
mti1 = mt[0];
for (iOut = 0; iOut < nPerRng; iOut=iOut+2) {
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double u1 = ((double)x + 1.0f) / 4294967296.0f;
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double u2 = ((double)x + 1.0f) / 4294967296.0f;
double r = sqrt(-2.0f * log(u1));
double phi = 2 * PI * u2;
u1 = r * cos(phi);
u1 = inputcw[iOut]+sigma*u1;
u1=1/(1+exp(-c*u1));
d_Rand[globalID * nPerRng + iOut]=log((1-u1)/u1);
if (iOut!=nPerRng-1) {
u2 = r * sin(phi);
u2 = inputcw[iOut+1]+sigma*u2;
u2=1/(1+exp(-c*u2));
u2=log((1-u2)/u2);
d_Rand[globalID * nPerRng + iOut+1]=u2;
}
}
}
和
////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to BSC channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterBSC(__global double* d_Rand,
__global int* seeds,
__global long* inputcw,
int nPerRng, float flipProb)
{
int globalID = get_global_id(0);
int iState, iState1, iStateM, iOut;
unsigned int mti, mti1, mtiM, x;
unsigned int mt[MT_NN];
//Initialize current state
mt[0] = seeds[globalID];
for (iState = 1; iState < MT_NN; iState++)
mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;
iState = 0;
mti1 = mt[0];
for (iOut = 0; iOut < nPerRng; iOut=iOut+1) {
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double c = log((1-flipProb)/flipProb);
double u = ((double)x + 1.0f) / 4294967296.0f;
u = (2*isless(u,flipProb)-1)*inputcw[iOut]*c;
d_Rand[globalID * nPerRng + iOut]=u;
}
}
有什么方法、技巧或方法可以避免这种情况吗?子程序似乎无法正确使用变量(尤其是 mt
),所以我没有设法以其他语言允许的方式减少它。
或者我是否应该将其视为 OpenCL 中的必要之恶并继续以这种方式管理 10 个不同的内核?
在 Khronos 的网站上,它说
OpenCL programs may also contain auxiliary functions and constant data that can be used by __kernel functions.
每个线程生成介于 0.0f 和 1.0f 之间的随机数的示例:
迭代种子的核心函数:
uint wang_hash(uint seed)
{
seed = (seed ^ 61) ^ (seed >> 16);
seed *= 9;
seed = seed ^ (seed >> 4);
seed *= 0x27d4eb2d;
seed = seed ^ (seed >> 15);
return seed;
}
每个线程种子的初始化和迭代:
// id=thread id, rnd=seed array
void wang_rnd_init(__global unsigned int * rnd,int id)
{
uint maxint=0;
maxint--; // could be a 0xFFFFFFFF
uint rndint=wang_hash(id);
rnd[id]=rndint;
}
// id=thread id, rnd=seed array
float wang_rnd(__global unsigned int * rnd,int id)
{
uint maxint=0;
maxint--; // could be a 0xFFFFFFFF
uint rndint=wang_hash(rnd[id]);
rnd[id]=rndint;
return ((float)rndint)/(float)maxint;
}
随机灰度彩色像素生成器内核中的用法:
__kernel void rnd_1(__global unsigned int * rnd, __global int *rgba)
{
int id=get_global_id(0);
float rgba_register=wang_rnd(rnd,id);
rgba[id] = ((int)(rgba_register * 255) << 24) | ((int)(rgba_register * 255) << 16) | ((int)(rgba_register * 255) << 8) | ((int)(rgba_register * 255));
}
和wang_rnd()可以在其他内核中使用,如果它们在同一个编译上下文中,无需定义两次,就像将所有相关内核和函数放在同一个文件中进行编译。
辅助功能不限于寄存器和全局内存。它们也可以采用本地和常量内存参数。由于它们主要使用设备端内存,因此它们也可以使用 and return 结构。
我有几个类似的内核来生成随机数据并将其存储在全局内存中。我总是使用相同的算法来随机化,但由于可变范围问题(我需要跟踪数据),我无法避免严重的代码重复。
有什么方法可以避免这种情况吗?在 OpenCL 中生成随机数据似乎是一项相当标准的任务,但它有悖于任何良好的编码标准以具有这种级别的代码重复。例如,这是我的两个内核:
////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to AWGN channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterAWGN(__global double* d_Rand,
__global int* seeds,
__global long* inputcw,
int nPerRng, float sigma)
{
int globalID = get_global_id(0);
double c = 2.0/(sigma*sigma);
int iState, iState1, iStateM, iOut;
unsigned int mti, mti1, mtiM, x;
unsigned int mt[MT_NN];
//Initialize current state
mt[0] = seeds[globalID];
for (iState = 1; iState < MT_NN; iState++)
mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;
iState = 0;
mti1 = mt[0];
for (iOut = 0; iOut < nPerRng; iOut=iOut+2) {
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double u1 = ((double)x + 1.0f) / 4294967296.0f;
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double u2 = ((double)x + 1.0f) / 4294967296.0f;
double r = sqrt(-2.0f * log(u1));
double phi = 2 * PI * u2;
u1 = r * cos(phi);
u1 = inputcw[iOut]+sigma*u1;
u1=1/(1+exp(-c*u1));
d_Rand[globalID * nPerRng + iOut]=log((1-u1)/u1);
if (iOut!=nPerRng-1) {
u2 = r * sin(phi);
u2 = inputcw[iOut+1]+sigma*u2;
u2=1/(1+exp(-c*u2));
u2=log((1-u2)/u2);
d_Rand[globalID * nPerRng + iOut+1]=u2;
}
}
}
和
////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to BSC channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterBSC(__global double* d_Rand,
__global int* seeds,
__global long* inputcw,
int nPerRng, float flipProb)
{
int globalID = get_global_id(0);
int iState, iState1, iStateM, iOut;
unsigned int mti, mti1, mtiM, x;
unsigned int mt[MT_NN];
//Initialize current state
mt[0] = seeds[globalID];
for (iState = 1; iState < MT_NN; iState++)
mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;
iState = 0;
mti1 = mt[0];
for (iOut = 0; iOut < nPerRng; iOut=iOut+1) {
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
double c = log((1-flipProb)/flipProb);
double u = ((double)x + 1.0f) / 4294967296.0f;
u = (2*isless(u,flipProb)-1)*inputcw[iOut]*c;
d_Rand[globalID * nPerRng + iOut]=u;
}
}
有什么方法、技巧或方法可以避免这种情况吗?子程序似乎无法正确使用变量(尤其是 mt
),所以我没有设法以其他语言允许的方式减少它。
或者我是否应该将其视为 OpenCL 中的必要之恶并继续以这种方式管理 10 个不同的内核?
在 Khronos 的网站上,它说
OpenCL programs may also contain auxiliary functions and constant data that can be used by __kernel functions.
每个线程生成介于 0.0f 和 1.0f 之间的随机数的示例:
迭代种子的核心函数:
uint wang_hash(uint seed)
{
seed = (seed ^ 61) ^ (seed >> 16);
seed *= 9;
seed = seed ^ (seed >> 4);
seed *= 0x27d4eb2d;
seed = seed ^ (seed >> 15);
return seed;
}
每个线程种子的初始化和迭代:
// id=thread id, rnd=seed array
void wang_rnd_init(__global unsigned int * rnd,int id)
{
uint maxint=0;
maxint--; // could be a 0xFFFFFFFF
uint rndint=wang_hash(id);
rnd[id]=rndint;
}
// id=thread id, rnd=seed array
float wang_rnd(__global unsigned int * rnd,int id)
{
uint maxint=0;
maxint--; // could be a 0xFFFFFFFF
uint rndint=wang_hash(rnd[id]);
rnd[id]=rndint;
return ((float)rndint)/(float)maxint;
}
随机灰度彩色像素生成器内核中的用法:
__kernel void rnd_1(__global unsigned int * rnd, __global int *rgba)
{
int id=get_global_id(0);
float rgba_register=wang_rnd(rnd,id);
rgba[id] = ((int)(rgba_register * 255) << 24) | ((int)(rgba_register * 255) << 16) | ((int)(rgba_register * 255) << 8) | ((int)(rgba_register * 255));
}
和wang_rnd()可以在其他内核中使用,如果它们在同一个编译上下文中,无需定义两次,就像将所有相关内核和函数放在同一个文件中进行编译。
辅助功能不限于寄存器和全局内存。它们也可以采用本地和常量内存参数。由于它们主要使用设备端内存,因此它们也可以使用 and return 结构。