OpenCL:可变长度数组的数组

OpenCL: array of arrays of variable lengths

我正在尝试使用 C++ 中的 OpenCL 1.2 处理可变长度数组。在每个实例(工作项?)中,我想处理一个子数组。

下面我尝试把数组的数组当作一维数组来处理,但它不起作用——数据中随机的某些部分没有被处理。

主机端:

// Host program (the failing version from the question): takes the first platform
// and its first GPU device, builds the kernel source read from DynMultiDimArr.cl,
// and launches "processSubArr" with a global size of 10 over a jagged array that
// has been flattened into one contiguous buffer.
vector<cl::Platform> platforms; cl::Platform::get(&platforms); _ASSERT(platforms.size() > 0); auto platform = platforms.front(); //get the platform
std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); _ASSERT(devices.size() > 0); auto device = devices.front(); // get the device
std::ifstream myFile("DynMultiDimArr.cl"); string src(istreambuf_iterator<char>(myFile), (istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1)); //create program from cl file

cl::Context context(device);
cl::Program program(context, sources);
// On a build failure only the build log is printed; execution continues.
auto err = program.build(); if (err!=0) printf("%s\n",program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
cl::CommandQueue queue(context, device);

// lens[i] = length of sub-array i; idx[i] = offset of sub-array i inside the flat
// array; totSize = sum of all lengths; c = running flat index.
int lens[10] = { 5,7,9,6,21,12,4,18,15,10 }, *idx=new int[10], totSize=0, c=0;
for (int i = 0; i < 10; i++) totSize += lens[i];
double *dat = new double[totSize], **myDat = new double *[10]; // array of arrays of different lengths
// myDat[i] points at the start of sub-array i inside dat; every element is
// initialised to its own flat index.
for (int i = 0; i < 10; i++) {
    idx[i] = c;
    myDat[i] = dat + c;
    for (int j = 0; j < lens[i]; j++) myDat[i][j] = c++;
}

// NOTE(review): iBuf is filled from lens and lBuf is filled from idx here, which is
// the opposite of what the buffer names suggest.
cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(double)*totSize, dat, &err);
cl::Buffer iBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, lens, &err);
cl::Buffer lBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, idx, &err);
cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(double)*totSize, nullptr, &err);

// NOTE(review): the kernel signature is (data, len, idx, outData). Argument 1 (len)
// receives lBuf, which holds the offsets, and argument 2 (idx) receives iBuf, which
// holds the lengths, so the kernel sees lengths and offsets swapped. This matches
// the "random parts are not processed" symptom.
cl::Kernel kernel(program, "processSubArr");
err = kernel.setArg(0, inBuf);
err = kernel.setArg(1, lBuf);
err = kernel.setArg(2, iBuf);
err = kernel.setArg(3, outBuf);

// Each err value is overwritten by the next call without being inspected.
err=queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(10));
// The read is enqueued with CL_FALSE (non-blocking), so dat is only valid once the
// command has completed.
err=queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(double)*totSize, dat);
// NOTE(review): cl::finish() appears to be the bindings' free function that acts on
// the default command queue rather than on `queue` - confirm. If so, nothing here
// waits for the non-blocking read above.
cl::finish();

内核:

// Copies one sub-array of the flattened input to the output, adding 1000 to each
// element. The work-item's global id selects the sub-array: idx[id] is its offset
// in data/outData and len[id] is its element count.
__kernel void processSubArr(__global double* data, __global int* len, __global int* idx, __global double* outData) {
    const size_t item = get_global_id(0);
    const int first = idx[item];
    const int count = len[item];

    for (int k = 0; k < count; ++k) {
        const int pos = first + k;
        outData[pos] = data[pos] + 1000;
    }
}

这只是测试代码。在我的实际问题中,我必须传递 8 个数组的数组(所有数组的维度都相同)。第一维的长度为 10^5 到 10^6,第二维的长度为 1 到 100。内核代码约有 100 行,使用 k-epsilon 方法计算每个子数组上的湍流涡粘度和扩散率。

这样做是正确的方法,还是我走错了方向?我是 OpenCL 的新手——任何帮助都将不胜感激。

更新了有效的代码。我不知道这种方法的性能。

vector<cl::Platform> platforms; cl::Platform::get(&platforms); _ASSERT(platforms.size() > 0); auto platform = platforms.front(); //get the platform
std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); _ASSERT(devices.size() > 0); auto device = devices.front(); // get the device
std::ifstream myFile("DynMultiDimArr.cl"); string src(istreambuf_iterator<char>(myFile), (istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1)); //create program from cl file

cl::Context context(device);
cl::Program program(context, sources);
auto err = program.build(); if (err!=0) printf("%s\n",program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
cl::CommandQueue queue(context, device);

int lens[10] = { 5,7,9,6,21,12,4,18,15,10 }, *idx=new int[10], totSize=0, c=0;
for (int i = 0; i < 10; i++) totSize += lens[i];
double *dat = new double[totSize], **myDat = new double *[10]; // array of arrays of different lengths 
for (int i = 0; i < 10; i++) {
    idx[i] = c;
    myDat[i] = dat + c;
    for (int j = 0; j < lens[i]; j++) myDat[i][j] = c++;
}

cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(double)*totSize, dat, &err);
cl::Buffer lBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, lens, &err);
cl::Buffer iBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, idx, &err);
cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(double)*totSize, nullptr, &err);

cl::Kernel kernel(program, "processSubArr");
err = kernel.setArg(0, inBuf);
err = kernel.setArg(1, lBuf);
err = kernel.setArg(2, iBuf);
err = kernel.setArg(3, outBuf);

err=queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(10));
err=queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(double)*totSize, dat); //queue.enqueueMapBuffer + memcpy faster?

queue.finish();
cl::finish();

for (int i = 0; i < 10; i++) {
    int j = 0;
    for (j = 0; j < lens[i]-1; j++)
        cout << myDat[i][j] << ",";
    cout << myDat[i][j] << endl;
}
delete[] dat;