如何测量设备+OpenCL+GPU 中代码的执行时间
How to measure execution time of code in device+OpenCL+GPU
我尝试测量我的代码在 CPU 和 GPU 上的执行时间。
为了测量 CPU 上的时间,我使用了 std::chrono::high_resolution_clock::now() 记录起止时刻,再用 std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin) 计算耗时。
为了测量 GPU 设备上的时间,我阅读了这些链接:
1- https://github.com/intel/pti-gpu/blob/master/chapters/device_activity_tracing/OpenCL.md
2- https://docs.oneapi.com/versions/latest/dpcpp/iface/event.html
3- https://developer.codeplay.com/products/computecpp/ce/guides/computecpp-profiler/step-by-step-profiler-guide?version=2.2.1
等等……
问题是,我很困惑,我无法理解如何使用分析(profiling)来测量 GPU 上代码的执行时间。我甚至不知道我应该把这些代码放在哪里,我犯了很多错误。
我的代码是:
#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
#include <CL/opencl.h>
using namespace tbb;
template<class Tin, class Tout, class Function>
class Map {
private:
Function fun;
public:
Map() {}
Map(Function f):fun(f) {}
std::vector<Tout> operator()(bool use_tbb, std::vector<Tin>& v) {
std::vector<Tout> r(v.size());
if(use_tbb) {
// Start measuring time
auto begin = std::chrono::high_resolution_clock::now();
tbb::parallel_for(tbb::blocked_range<Tin>(0, v.size()),
[&](tbb::blocked_range<Tin> t) {
for(int index = t.begin(); index < t.end(); ++index) {
r[index] = fun(v[index]);
}
});
// Stop measuring time and calculate the elapsed time
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
return r;
} else {
sycl::queue gpuQueue { sycl::gpu_selector() };
sycl::range<1> n_item { v.size() };
sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);
sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);
//Try To use Profiling to measure the execution time of code on GPU device!
cl_queue_properties props[3] = { CL_QUEUE_PROPERTIES,
CL_QUEUE_PROFILING_ENABLE, 0 };
cl_int status = CL_SUCCESS;
cl_command_queue queue =clCreateCommandQueueWithProperties(context, device, props, &status);
assert(status == CL_SUCCESS);
cl_event event = nullptr;
status = clEnqueueNDRangeKernel(queue, kernel, dim, nullptr,
global_size, local_size, 0, nullptr, &event);
assert(status == CL_SUCESS);
status = clSetEventCallback(event, CL_COMPLETE, EventNotify, nullptr);
assert(status == CL_SUCESS);
void CL_CALLBACK EventNotify(cl_event event,
cl_int event_status,
void* user_data) {
cl_int status = CL_SUCCESS;
cl_ulong start = 0, end = 0;
assert(event_status == CL_COMPLETE);
status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start, nullptr);
assert(status == CL_SUCCESS);
status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end, nullptr);
assert(status == CL_SUCCESS);
}
gpuQueue.submit([&](sycl::handler& h) {
//local copy of fun
auto f = fun;
sycl::accessor in_accessor(in_buffer, h, sycl::read_only);
sycl::accessor out_accessor(out_buffer, h, sycl::write_only);
h.parallel_for(n_item, [=](sycl::id<1> index) {
out_accessor[index] = f(in_accessor[index]);
});
}).wait();
}
return r;
}
};
// Factory helper: deduces the functor type so callers only spell out
// the input/output element types, e.g. make_map<int, int>(lambda).
template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f) {
Map<Tin, Tout, Function> instance(f);
return instance;
}
//typedef int(*func)(int x);
//define different functions
/*
auto function = [](int x){ return x; };
auto functionTimesTwo = [](int x){ return (x*2); };
auto functionDivideByTwo = [](int x){ return (x/2); };
auto lambdaFunction = [](int x){return (++x);};
*/
int main(int argc, char *argv[]) {
// Sample input and the operation to map over it (increment each element).
std::vector<int> v = { 1,2,3,4,5,6,7,8,9 };
auto f = [](int x) {return (++x); };
// Build the map skeleton and run it on the CPU path (true => TBB).
auto m1 = make_map<int, int>(f);
std::vector<int> r = m1(true, v);
// Print the mapped results separated by spaces.
for(std::size_t i = 0; i < r.size(); ++i) {
std::cout << r[i] << " ";
}
return 0;
}
一个好的开始是格式化你的代码,使你有一致的缩进。我已经在这里为你做到了。如果您使用 Visual Studio Community,选中文本并依次按 Ctrl+K、Ctrl+F 即可自动格式化。
现在进行分析。这是一个简单的 Clock
class ,易于用于分析:
#include <chrono>
// Simple RAII-style stopwatch for profiling: starts on construction,
// stop() returns elapsed seconds without resetting the timer.
// Fix: the original used unqualified `chrono::...` with no
// `using namespace std;`, so it did not compile — now fully qualified.
class Clock {
private:
typedef std::chrono::high_resolution_clock clock;   // clock source alias
std::chrono::time_point<clock> t;                   // moment start() was called
public:
Clock() { start(); }                 // begin timing immediately
void start() { t = clock::now(); }   // (re)start the timer
// Elapsed seconds since the last start(); const, does not reset.
double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
};
准备好后,下面是您如何配置 GPU 内核:
// Host-side wall-clock timing of one kernel launch.
// NOTE(review): `queue`, `kernel`, `dim`, `global_size`, `local_size`,
// `status` and `event` are assumed to be created earlier — confirm setup.
Clock clock;
clock.start();
status = clEnqueueNDRangeKernel(queue, kernel, dim, nullptr, global_size, local_size, 0, nullptr, &event);
clFinish(queue); // important: this will block until kernel execution is finished
std::cout << "Execution time: " << clock.stop() << " seconds" << std::endl;
为了获得更准确的测量,您可以在循环中多次测量内核执行时间并计算平均值。
我尝试测量我的代码在 CPU 和 GPU 上的执行时间。 为了测量 CPU 上的时间,我使用了 std::chrono::high_resolution_clock::now() 和 std::chrono::high_resolution_clock::now(),std::chrono::duration_caststd::chrono::nanoseconds(end - begin) 为了测量 GPU 设备上的时间,我阅读了这些链接: 1- https://github.com/intel/pti-gpu/blob/master/chapters/device_activity_tracing/OpenCL.md 2- https://docs.oneapi.com/versions/latest/dpcpp/iface/event.html 3- https://developer.codeplay.com/products/computecpp/ce/guides/computecpp-profiler/step-by-step-profiler-guide?version=2.2.1 等等…… 问题是,我很困惑,我无法理解如何使用分析来测量 GPU 上代码的执行时间。我什至不知道我应该把我的代码放在哪里,我犯了很多错误。 我的代码是:
#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
#include <CL/opencl.h>
using namespace tbb;
template<class Tin, class Tout, class Function>
class Map {
private:
Function fun;
public:
Map() {}
Map(Function f):fun(f) {}
std::vector<Tout> operator()(bool use_tbb, std::vector<Tin>& v) {
std::vector<Tout> r(v.size());
if(use_tbb) {
// Start measuring time
auto begin = std::chrono::high_resolution_clock::now();
tbb::parallel_for(tbb::blocked_range<Tin>(0, v.size()),
[&](tbb::blocked_range<Tin> t) {
for(int index = t.begin(); index < t.end(); ++index) {
r[index] = fun(v[index]);
}
});
// Stop measuring time and calculate the elapsed time
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
return r;
} else {
sycl::queue gpuQueue { sycl::gpu_selector() };
sycl::range<1> n_item { v.size() };
sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);
sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);
//Try To use Profiling to measure the execution time of code on GPU device!
cl_queue_properties props[3] = { CL_QUEUE_PROPERTIES,
CL_QUEUE_PROFILING_ENABLE, 0 };
cl_int status = CL_SUCCESS;
cl_command_queue queue =clCreateCommandQueueWithProperties(context, device, props, &status);
assert(status == CL_SUCCESS);
cl_event event = nullptr;
status = clEnqueueNDRangeKernel(queue, kernel, dim, nullptr,
global_size, local_size, 0, nullptr, &event);
assert(status == CL_SUCESS);
status = clSetEventCallback(event, CL_COMPLETE, EventNotify, nullptr);
assert(status == CL_SUCESS);
void CL_CALLBACK EventNotify(cl_event event,
cl_int event_status,
void* user_data) {
cl_int status = CL_SUCCESS;
cl_ulong start = 0, end = 0;
assert(event_status == CL_COMPLETE);
status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &start, nullptr);
assert(status == CL_SUCCESS);
status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end, nullptr);
assert(status == CL_SUCCESS);
}
gpuQueue.submit([&](sycl::handler& h) {
//local copy of fun
auto f = fun;
sycl::accessor in_accessor(in_buffer, h, sycl::read_only);
sycl::accessor out_accessor(out_buffer, h, sycl::write_only);
h.parallel_for(n_item, [=](sycl::id<1> index) {
out_accessor[index] = f(in_accessor[index]);
});
}).wait();
}
return r;
}
};
// Factory helper: deduces the functor type so callers only spell out
// the input/output element types, e.g. make_map<int, int>(lambda).
template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f) {
Map<Tin, Tout, Function> instance(f);
return instance;
}
//typedef int(*func)(int x);
//define different functions
/*
auto function = [](int x){ return x; };
auto functionTimesTwo = [](int x){ return (x*2); };
auto functionDivideByTwo = [](int x){ return (x/2); };
auto lambdaFunction = [](int x){return (++x);};
*/
int main(int argc, char *argv[]) {
// Sample input and the operation to map over it (increment each element).
std::vector<int> v = { 1,2,3,4,5,6,7,8,9 };
auto f = [](int x) {return (++x); };
// Build the map skeleton and run it on the CPU path (true => TBB).
auto m1 = make_map<int, int>(f);
std::vector<int> r = m1(true, v);
// Print the mapped results separated by spaces.
for(std::size_t i = 0; i < r.size(); ++i) {
std::cout << r[i] << " ";
}
return 0;
}
一个好的开始是格式化你的代码,使你有一致的缩进。我已经在这里为你做到了。如果您使用 Visual Studio 社区,select 文本并按 Ctrl
+K
,然后按 Ctrl
+F
。
现在进行分析。这是一个简单的 Clock
class ,易于用于分析:
#include <chrono>
// Simple RAII-style stopwatch for profiling: starts on construction,
// stop() returns elapsed seconds without resetting the timer.
// Fix: the original used unqualified `chrono::...` with no
// `using namespace std;`, so it did not compile — now fully qualified.
class Clock {
private:
typedef std::chrono::high_resolution_clock clock;   // clock source alias
std::chrono::time_point<clock> t;                   // moment start() was called
public:
Clock() { start(); }                 // begin timing immediately
void start() { t = clock::now(); }   // (re)start the timer
// Elapsed seconds since the last start(); const, does not reset.
double stop() const { return std::chrono::duration_cast<std::chrono::duration<double>>(clock::now()-t).count(); }
};
准备好后,下面是您如何配置 GPU 内核:
// Host-side wall-clock timing of one kernel launch.
// NOTE(review): `queue`, `kernel`, `dim`, `global_size`, `local_size`,
// `status` and `event` are assumed to be created earlier — confirm setup.
Clock clock;
clock.start();
status = clEnqueueNDRangeKernel(queue, kernel, dim, nullptr, global_size, local_size, 0, nullptr, &event);
clFinish(queue); // important: this will block until kernel execution is finished
std::cout << "Execution time: " << clock.stop() << " seconds" << std::endl;
为了获得更准确的测量,您可以在循环中多次测量内核执行时间并计算平均值。