如何减少 OpenCL 内核之间的代码重复?

How to reduce code duplication between OpenCL kernels?

我有几个类似的内核来生成随机数据并将其存储在全局内存中。我总是使用相同的算法来随机化,但由于可变范围问题(我需要跟踪数据),我无法避免严重的代码重复。

有什么方法可以避免这种情况吗?在 OpenCL 中生成随机数据似乎是一项相当标准的任务,但它有悖于任何良好的编码标准以具有这种级别的代码重复。例如,这是我的两个内核:

////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to AWGN channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterAWGN(__global double* d_Rand, 
                  __global int* seeds,
                              __global long* inputcw,
                  int nPerRng, float sigma)
{
    int globalID = get_global_id(0);
    double c = 2.0/(sigma*sigma);

    int iState, iState1, iStateM, iOut;
    unsigned int mti, mti1, mtiM, x;
    unsigned int mt[MT_NN]; 

    //Initialize current state
    mt[0] = seeds[globalID];
    for (iState = 1; iState < MT_NN; iState++)
        mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;

    iState = 0;
    mti1 = mt[0];
    for (iOut = 0; iOut < nPerRng; iOut=iOut+2) {
        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        double u1 = ((double)x + 1.0f) / 4294967296.0f;

        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        double u2 = ((double)x + 1.0f) / 4294967296.0f;

        double r = sqrt(-2.0f * log(u1));
        double phi = 2 * PI * u2;

        u1 = r * cos(phi);
        u1 = inputcw[iOut]+sigma*u1;
        u1=1/(1+exp(-c*u1));
        d_Rand[globalID * nPerRng + iOut]=log((1-u1)/u1);
        if (iOut!=nPerRng-1) {
            u2 = r * sin(phi);
            u2 = inputcw[iOut+1]+sigma*u2;
            u2=1/(1+exp(-c*u2));
            u2=log((1-u2)/u2);
            d_Rand[globalID * nPerRng + iOut+1]=u2;
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to BSC channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterBSC(__global double* d_Rand, 
                  __global int* seeds,
                              __global long* inputcw,
                  int nPerRng, float flipProb)
{
    int globalID = get_global_id(0);

    int iState, iState1, iStateM, iOut;
    unsigned int mti, mti1, mtiM, x;
    unsigned int mt[MT_NN]; 

    //Initialize current state
    mt[0] = seeds[globalID];
    for (iState = 1; iState < MT_NN; iState++)
        mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;

    iState = 0;
    mti1 = mt[0];
    for (iOut = 0; iOut < nPerRng; iOut=iOut+1) {
        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        double c = log((1-flipProb)/flipProb);
        double u = ((double)x + 1.0f) / 4294967296.0f;
        u = (2*isless(u,flipProb)-1)*inputcw[iOut]*c;
        d_Rand[globalID * nPerRng + iOut]=u;
    }
}

有什么方法、技巧或方法可以避免这种情况吗?子程序似乎无法正确使用变量(尤其是 mt),所以我没有设法以其他语言允许的方式减少它。

或者我是否应该将其视为 OpenCL 中的必要之恶并继续以这种方式管理 10 个不同的内核?

在 Khronos 的网站上,它说

OpenCL programs may also contain auxiliary functions and constant data that can be used by __kernel functions.

每个线程生成介于 0.0f 和 1.0f 之间的随机数的示例:

迭代种子的核心函数:

uint wang_hash(uint seed)
{
   seed = (seed ^ 61) ^ (seed >> 16);
   seed *= 9;
   seed = seed ^ (seed >> 4);
   seed *= 0x27d4eb2d;
   seed = seed ^ (seed >> 15);
   return seed;
}

每个线程种子的初始化和迭代:

// id=thread id, rnd=seed array
void wang_rnd_init(__global unsigned int * rnd,int id)                
{
     uint maxint=0;
     maxint--;  // could be a 0xFFFFFFFF
     uint rndint=wang_hash(id);
     rnd[id]=rndint;
}

// id=thread id, rnd=seed array
float wang_rnd(__global unsigned int * rnd,int id)                
{
     uint maxint=0;
     maxint--;  // could be a 0xFFFFFFFF
     uint rndint=wang_hash(rnd[id]);
     rnd[id]=rndint;
     return ((float)rndint)/(float)maxint;
}

随机灰度彩色像素生成器内核中的用法:

__kernel void rnd_1(__global unsigned int * rnd, __global int *rgba)
{
      int id=get_global_id(0);
      float rgba_register=wang_rnd(rnd,id);
      rgba[id] = ((int)(rgba_register * 255) << 24) | ((int)(rgba_register * 255) << 16) | ((int)(rgba_register * 255) << 8) | ((int)(rgba_register * 255));
}

和wang_rnd()可以在其他内核中使用,如果它们在同一个编译上下文中,无需定义两次,就像将所有相关内核和函数放在同一个文件中进行编译。

辅助功能不限于寄存器和全局内存。它们也可以采用本地和常量内存参数。由于它们主要使用设备端内存,因此它们也可以使用 and return 结构。