ERROR: kernel parameter has non-trivially copy constructible class/struct type+sycl+tbb
ERROR: kernel parameter has non-trivially copy constructible class/struct type+sycl+tbb
我尝试提供一种“Map(映射)”骨架,它通过一个指定目标类型(CPU 或 GPU/加速器)的参数来包装 oneAPI 调用,从而隐藏硬件选择的细节。我的 Map 骨架接收一个函数及其导数,以及牛顿法的初始点。
但我有一个错误是:
kernel parameter has non-trivially copy constructible class/struct type 'std::function<double (double)>'
我的代码是:
#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
#include "uTimer.cpp"
#include <cmath>
#include <random>
#include <ctime>
#include <numeric>
#include <cstdlib>
//#include <dos.h> //for delay
//#include <conio.h> //for getch()
//#include <complex>
#define EPSILON 0.000001 // The step size across the X and Y axis
using namespace tbb;
// Small stopwatch on top of std::chrono::high_resolution_clock.
// Constructing a Clock begins a measurement, start() begins a new one, and
// stop() reports the seconds elapsed since the latest start without
// resetting anything.
class Clock {
    using clock_type = std::chrono::high_resolution_clock;
    using seconds_f = std::chrono::duration<double>;

    // Instant at which the current measurement began.
    std::chrono::time_point<clock_type> began_;

public:
    // Starts measuring immediately.
    Clock() : began_(clock_type::now()) {}

    // Restarts the measurement from the current instant.
    void start() { began_ = clock_type::now(); }

    // Returns the time since the last start, in seconds.
    double stop() const {
        const seconds_f elapsed =
            std::chrono::duration_cast<seconds_f>(clock_type::now() - began_);
        return elapsed.count();
    }
};
//std::complex<double> mycomplex(10.0, 2.0);
template<class Tin, class Tout>
class Map {
private:
std::function<Tout(Tin)> fun;
std::function<Tout(Tin)> dfun;
public:
Map() {};
Map(std::function<Tout(Tin)> f, std::function<Tout(Tin)> df) {
fun = f;
dfun = df;
};
void operator()(bool use_tbb, Tin &x1) {
int iter=100;
Tout x;
if (use_tbb) {
uTimer *timer = new uTimer("Executing Code On CPU");
tbb::parallel_for(tbb::blocked_range < int > (0, iter),
[&](tbb::blocked_range<int> t) {
for (int index = t.begin(); index < t.end(); ++index) {
do
{
x = x1;
x1 = x - (fun(x) / dfun(x));
}while (std::abs(x1 - x) >= EPSILON);
}
});
timer->~uTimer();
}else {
sycl::buffer<Tin, 1> x1_buffer(&x1, iter);
sycl::buffer<Tout, 1> x_buffer(&x, iter);
//Profiling GPU
// Initialize property list with profiling information
sycl::property_list propList {
sycl::property::queue::enable_profiling() };
// Build the command queue (constructed to handle event profling)
sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),
propList);
// print out the device information used for the kernel code
std::cout << "Device: "
<< gpuQueue.get_device().get_info<sycl::info::device::name>()
<< std::endl;
std::cout << "Compute Units: "
<< gpuQueue.get_device().get_info<
sycl::info::device::max_compute_units>()
<< std::endl;
auto start_overall = std::chrono::system_clock::now();
auto event = gpuQueue.submit([&](sycl::handler &h) {
//local copy of fun
auto f = fun;
auto df = dfun;
sycl::accessor x1_accessor(x1_buffer, h, sycl::read_write);
sycl::accessor x_accessor(x_buffer, h, sycl::read_write);
h.parallel_for(iter, [=](sycl::id<1> index) {
do
{
x_accessor[index] = x1_accessor[index];
x1_accessor[index] = x_accessor[index] - (f(x_accessor[index]) / df(x_accessor[index]));
}while (sycl::fabs(f(x1_accessor[index]))>= EPSILON);
});
});
event.wait();
auto end_overall = std::chrono::system_clock::now();
cl_ulong submit_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_submit>();
cl_ulong start_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_start>();
cl_ulong end_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_end>();
auto submission_time = (start_time - submit_time) / 1000000.0f;
std::cout << "Submit Time: " << submission_time << " ms"
<< std::endl;
auto execution_time = (end_time - start_time) / 1000000.0f;
std::cout << "Execution Time: " << execution_time << " ms"
<< std::endl;
auto execution_overall = std::chrono::duration_cast
< std::chrono::milliseconds > (end_overall - start_overall);
std::cout << "Overall Execution Time: " << execution_overall.count()
<< " ms" << std::endl;
};
};
};
int main(int argc, char *argv[]) {
//Define a function
auto f = [](double x) {return pow(x,3);};
//Define the derivative of function
auto df = [](double x) {return pow(x, 2) *3;};
//Define an instance of Map class
auto m1 = Map<double, double>(f, df);
double x1 = 3;
m1(true, x1);
//print the result
//for (auto &e : r) {
//std::cout << e << " ";
//}
return 0;
}
此外,如果不考虑错误,我认为我的代码中有些地方似乎不正确,但我无法理解它是什么。
你想做的事情无法实现。即使去掉 std::function 改用函数指针,也仍然不行(尽管函数指针可以平凡复制)。在 SYCL 中,与其他同类语言(CUDA、HIP、OpenCL 等)一样,设备编译器必须能够编译内核中执行/调用的所有函数。所以,不行,你不能把一个函数“传入”内核。这可以归结为您之前已得到回答的一个问题。
您可以尝试把 lambda 改写为定义在别处的普通函数,然后在内核中调用它们。如果您希望在运行时从多个函数中进行选择,可以编写一个模板化的内核(例如以枚举值作为模板参数),并在内核中通过 if constexpr 分派调用,以避免运行时开销(并避免代码重复)。最终会实例化 n 个 SYCL 内核,每个内核调用其中一个函数,它们都能被设备编译器正确编译。
我尝试提供一种“Map(映射)”骨架,它通过一个指定目标类型(CPU 或 GPU/加速器)的参数来包装 oneAPI 调用,从而隐藏硬件选择的细节。我的 Map 骨架接收一个函数及其导数,以及牛顿法的初始点。但我遇到了如下错误:
kernel parameter has non-trivially copy constructible class/struct type 'std::function<double (double)>'
我的代码是:
#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
#include "uTimer.cpp"
#include <cmath>
#include <random>
#include <ctime>
#include <numeric>
#include <cstdlib>
//#include <dos.h> //for delay
//#include <conio.h> //for getch()
//#include <complex>
#define EPSILON 0.000001 // The step size across the X and Y axis
using namespace tbb;
// Small stopwatch on top of std::chrono::high_resolution_clock.
// Constructing a Clock begins a measurement, start() begins a new one, and
// stop() reports the seconds elapsed since the latest start without
// resetting anything.
class Clock {
    using clock_type = std::chrono::high_resolution_clock;
    using seconds_f = std::chrono::duration<double>;

    // Instant at which the current measurement began.
    std::chrono::time_point<clock_type> began_;

public:
    // Starts measuring immediately.
    Clock() : began_(clock_type::now()) {}

    // Restarts the measurement from the current instant.
    void start() { began_ = clock_type::now(); }

    // Returns the time since the last start, in seconds.
    double stop() const {
        const seconds_f elapsed =
            std::chrono::duration_cast<seconds_f>(clock_type::now() - began_);
        return elapsed.count();
    }
};
//std::complex<double> mycomplex(10.0, 2.0);
template<class Tin, class Tout>
class Map {
private:
std::function<Tout(Tin)> fun;
std::function<Tout(Tin)> dfun;
public:
Map() {};
Map(std::function<Tout(Tin)> f, std::function<Tout(Tin)> df) {
fun = f;
dfun = df;
};
void operator()(bool use_tbb, Tin &x1) {
int iter=100;
Tout x;
if (use_tbb) {
uTimer *timer = new uTimer("Executing Code On CPU");
tbb::parallel_for(tbb::blocked_range < int > (0, iter),
[&](tbb::blocked_range<int> t) {
for (int index = t.begin(); index < t.end(); ++index) {
do
{
x = x1;
x1 = x - (fun(x) / dfun(x));
}while (std::abs(x1 - x) >= EPSILON);
}
});
timer->~uTimer();
}else {
sycl::buffer<Tin, 1> x1_buffer(&x1, iter);
sycl::buffer<Tout, 1> x_buffer(&x, iter);
//Profiling GPU
// Initialize property list with profiling information
sycl::property_list propList {
sycl::property::queue::enable_profiling() };
// Build the command queue (constructed to handle event profling)
sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),
propList);
// print out the device information used for the kernel code
std::cout << "Device: "
<< gpuQueue.get_device().get_info<sycl::info::device::name>()
<< std::endl;
std::cout << "Compute Units: "
<< gpuQueue.get_device().get_info<
sycl::info::device::max_compute_units>()
<< std::endl;
auto start_overall = std::chrono::system_clock::now();
auto event = gpuQueue.submit([&](sycl::handler &h) {
//local copy of fun
auto f = fun;
auto df = dfun;
sycl::accessor x1_accessor(x1_buffer, h, sycl::read_write);
sycl::accessor x_accessor(x_buffer, h, sycl::read_write);
h.parallel_for(iter, [=](sycl::id<1> index) {
do
{
x_accessor[index] = x1_accessor[index];
x1_accessor[index] = x_accessor[index] - (f(x_accessor[index]) / df(x_accessor[index]));
}while (sycl::fabs(f(x1_accessor[index]))>= EPSILON);
});
});
event.wait();
auto end_overall = std::chrono::system_clock::now();
cl_ulong submit_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_submit>();
cl_ulong start_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_start>();
cl_ulong end_time = event.template get_profiling_info<
cl::sycl::info::event_profiling::command_end>();
auto submission_time = (start_time - submit_time) / 1000000.0f;
std::cout << "Submit Time: " << submission_time << " ms"
<< std::endl;
auto execution_time = (end_time - start_time) / 1000000.0f;
std::cout << "Execution Time: " << execution_time << " ms"
<< std::endl;
auto execution_overall = std::chrono::duration_cast
< std::chrono::milliseconds > (end_overall - start_overall);
std::cout << "Overall Execution Time: " << execution_overall.count()
<< " ms" << std::endl;
};
};
};
int main(int argc, char *argv[]) {
//Define a function
auto f = [](double x) {return pow(x,3);};
//Define the derivative of function
auto df = [](double x) {return pow(x, 2) *3;};
//Define an instance of Map class
auto m1 = Map<double, double>(f, df);
double x1 = 3;
m1(true, x1);
//print the result
//for (auto &e : r) {
//std::cout << e << " ";
//}
return 0;
}
此外,如果不考虑错误,我认为我的代码中有些地方似乎不正确,但我无法理解它是什么。
你想做的事情无法实现。即使去掉 std::function 改用函数指针,也仍然不行(尽管函数指针可以平凡复制)。在 SYCL 中,与其他同类语言(CUDA、HIP、OpenCL 等)一样,设备编译器必须能够编译内核中执行/调用的所有函数。所以,不行,你不能把一个函数“传入”内核。这可以归结为您之前已得到回答的一个问题。
您可以尝试把 lambda 改写为定义在别处的普通函数,然后在内核中调用它们。如果您希望在运行时从多个函数中进行选择,可以编写一个模板化的内核(例如以枚举值作为模板参数),并在内核中通过 if constexpr 分派调用,以避免运行时开销(并避免代码重复)。最终会实例化 n 个 SYCL 内核,每个内核调用其中一个函数,它们都能被设备编译器正确编译。