ERROR: kernel parameter has non-trivially copy constructible class/struct type+sycl+tbb

ERROR: kernel parameter has non-trivially copy constructible class/struct type+sycl+tbb

我正在尝试实现一个“Map”骨架(skeleton)框架:它封装 oneAPI 调用,并通过一个指定目标类型(CPU 或 GPU/加速器)的参数来隐藏硬件选择的细节。我的 Map 骨架把一个函数、它的导数以及初始点传给牛顿法。但编译时出现了如下错误:

kernel parameter has non-trivially copy constructible class/struct type 'std::function<double (double)>'

我的代码是:

#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <functional>
#include <iostream>
#include <numeric>
#include <queue>
#include <random>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include <CL/sycl.hpp>
#include <tbb/tbb.h>
#include <tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>

#include "uTimer.cpp"

//#include <dos.h> //for delay
//#include <conio.h> //for getch()
//#include <complex>
// Convergence tolerance for Newton's method: the Newton loops in Map keep
// iterating while their stopping quantity is >= EPSILON.
#define EPSILON 0.000001

// Brings every oneTBB name into the global namespace. The code visible in this
// file already spells its TBB names with an explicit tbb:: prefix.
using namespace tbb;

/// Minimal stopwatch on top of std::chrono::high_resolution_clock.
///
/// The stopwatch is armed when it is constructed. start() re-arms it, and
/// stop() reports the time elapsed since the most recent arming without
/// modifying the stored start point, so it can be called repeatedly.
class Clock {
private:
    using clock = std::chrono::high_resolution_clock;

    /// Instant captured by the constructor or by the latest start() call.
    std::chrono::time_point<clock> t;

public:
    /// Arms the stopwatch at the moment of construction.
    Clock() : t(clock::now()) {}

    /// Re-arms the stopwatch at the current instant.
    void start() {
        t = clock::now();
    }

    /// @return seconds (as a double) elapsed since the stopwatch was armed.
    double stop() const {
        // A floating-point duration accepts the clock's native duration
        // implicitly, so no explicit duration_cast is required.
        const std::chrono::duration<double> elapsed = clock::now() - t;
        return elapsed.count();
    }
};


//std::complex<double> mycomplex(10.0, 2.0);

template<class Tin, class Tout>
class Map {
        private:
                std::function<Tout(Tin)> fun;
                std::function<Tout(Tin)> dfun;
        public:
                Map() {};
                Map(std::function<Tout(Tin)> f, std::function<Tout(Tin)> df) {
                    fun = f;
                    dfun = df;
                };


    void operator()(bool use_tbb, Tin &x1) {
        int iter=100;
        Tout x;
        if (use_tbb) {
            uTimer *timer = new uTimer("Executing Code On CPU");
            tbb::parallel_for(tbb::blocked_range < int > (0, iter),
                    [&](tbb::blocked_range<int> t) {
                        for (int index = t.begin(); index < t.end(); ++index) {
                            do
                            {
                                x = x1;
                                x1 = x - (fun(x) / dfun(x));
                            }while (std::abs(x1 - x) >= EPSILON);
                        }
                });
            timer->~uTimer();
        }else {
            sycl::buffer<Tin, 1> x1_buffer(&x1, iter);
            sycl::buffer<Tout, 1> x_buffer(&x, iter);
            //Profiling GPU

            // Initialize property list with profiling information
            sycl::property_list propList {
                    sycl::property::queue::enable_profiling() };
            // Build the command queue (constructed to handle event profling)
            sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),
                    propList);
            // print out the device information used for the kernel code
            std::cout << "Device: "
                    << gpuQueue.get_device().get_info<sycl::info::device::name>()
                    << std::endl;

            std::cout << "Compute Units: "
                    << gpuQueue.get_device().get_info<
                            sycl::info::device::max_compute_units>()
                    << std::endl;

            auto start_overall = std::chrono::system_clock::now();
            auto event = gpuQueue.submit([&](sycl::handler &h) {
                //local copy of fun
                auto f = fun;
                auto df = dfun;
                sycl::accessor x1_accessor(x1_buffer, h, sycl::read_write);
                sycl::accessor x_accessor(x_buffer, h, sycl::read_write);
                h.parallel_for(iter, [=](sycl::id<1> index) {
                    do
                    {
                        x_accessor[index] = x1_accessor[index];
                        x1_accessor[index] = x_accessor[index] - (f(x_accessor[index]) / df(x_accessor[index]));
                    }while (sycl::fabs(f(x1_accessor[index]))>= EPSILON);

                });
            });
            event.wait();
            auto end_overall = std::chrono::system_clock::now();
            cl_ulong submit_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_submit>();
            cl_ulong start_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_start>();
            cl_ulong end_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_end>();
            auto submission_time = (start_time - submit_time) / 1000000.0f;
            std::cout << "Submit Time: " << submission_time << " ms"
                    << std::endl;
            auto execution_time = (end_time - start_time) / 1000000.0f;
            std::cout << "Execution Time: " << execution_time << " ms"
                    << std::endl;
            auto execution_overall = std::chrono::duration_cast
                    < std::chrono::milliseconds > (end_overall - start_overall);
            std::cout << "Overall Execution Time: " << execution_overall.count()
                    << " ms" << std::endl;
        };
    };
};





/// Demo driver: runs the Map skeleton on the CPU to find the root of
/// f(x) = x^3 starting from x = 3, then prints the value it stopped at.
int main() {
    // The function whose root is sought.
    auto f = [](double x) { return x * x * x; };
    // Its derivative.
    auto df = [](double x) { return 3.0 * x * x; };
    // Skeleton instance operating on doubles.
    auto m1 = Map<double, double>(f, df);
    double x1 = 3;
    // true selects the CPU path; x1 is updated in place.
    m1(true, x1);
    // Report the result instead of silently discarding it.
    std::cout << "Root: " << x1 << std::endl;
    return 0;
}

此外,即使不考虑这个错误,我也觉得代码中还有些地方不太对,但我找不出问题出在哪里。

您想要的做法是行不通的。即使去掉 std::function 改用函数指针,也仍然不行(尽管函数指针是可平凡复制的)。在 SYCL 中,和其他同类语言(CUDA、HIP、OpenCL 等)一样,设备编译器必须能够编译内核所执行/调用的全部函数。所以答案是否定的:您不能把一个函数“传入”内核。这归根结底与您之前提出并已得到回答的一个问题相同。

您可以把这些 lambda 改写成定义在别处的普通函数,然后在内核中调用它们。如果希望在运行时从多个函数中进行选择,可以编写一个模板化内核(例如以枚举值作为模板参数),并在内核中通过 if constexpr 分派调用,从而避免运行时开销(以及代码重复)。最终会实例化出 n 个 SYCL 内核,每个内核调用其中一个函数,它们都能被设备编译器正确编译。