使用 OpenCL 计算 GPU 的最大触发器
Calculating GPU's maximum flops using OpenCL
我正在编写一个简单的 OpenCL 应用程序,它将计算目标 GPU 设备的最大实验 FLOPS。我决定让我的 cl 内核尽可能简单。这是我的 OpenCL 内核和主机代码。内核代码为:
__kernel void flops(__global float *data) {
int gid = get_global_id(0);
double s = data[gid];
data[gid] = s * 0.35;
}
主机代码为:
#include <iostream>
#include <sstream>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "support.h"
#include "Event.h"
#include "ResultDatabase.h"
#include "OptionParser.h"
#include "ProgressBar.h"
using namespace std;
std::string kernels_folder = "/home/users/saman/shoc/src/opencl/level3/FlopsFolder/";
std::string kernel_file = "flops.cl";
static const char *opts = "-cl-mad-enable -cl-no-signed-zeros "
"-cl-unsafe-math-optimizations -cl-finite-math-only";
cl_program createProgram (cl_context context,
cl_device_id device,
const char* fileName) {
cl_int errNum;
cl_program program;
std::ifstream kernelFile (fileName, std::ios::in);
if (!kernelFile.is_open()) {
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource (context, 1, (const char **)&srcStr,
NULL, &errNum);
CL_CHECK_ERROR(errNum);
errNum = clBuildProgram (program, 0, NULL, NULL, NULL, NULL);
CL_CHECK_ERROR (errNum);
return program;
}
bool createMemObjects (cl_context context, cl_command_queue queue,
cl_mem* memObject,
const int memFloatsSize, float *a) {
cl_int err;
*memObject = clCreateBuffer (context, CL_MEM_READ_WRITE,
memFloatsSize * sizeof(float), NULL, &err);
CL_CHECK_ERROR(err);
if (*memObject == NULL) {
std::cerr << "Error creating memory objects. " << std::endl;
return false;
}
Event evWrite("write");
err = clEnqueueWriteBuffer (queue, *memObject, CL_FALSE, 0, memFloatsSize * sizeof(float),
a, 0, NULL, &evWrite.CLEvent());
CL_CHECK_ERROR(err);
err = clWaitForEvents (1, &evWrite.CLEvent());
CL_CHECK_ERROR(err);
return true;
}
void cleanup (cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObject) {
if (memObject != NULL)
clReleaseMemObject (memObject);
if (kernel != NULL)
clReleaseKernel (kernel);
if (program != NULL)
clReleaseProgram (program);
}
void addBenchmarkSpecOptions(OptionParser &op) {
}
void RunBenchmark(cl_device_id id,
cl_context ctx,
cl_command_queue queue,
ResultDatabase &resultDB,
OptionParser &op)
{
for (float i = 0.1; i <= 0.2; i+=0.1 ) {
std::cout << "Deploying " << 100*i << "%" << std::endl;
bool verbose = false;
cl_int errNum;
cl_program program = 0;
cl_kernel kernel;
cl_mem memObject = 0;
char maxFloatsStr[128];
char testStr[128];
program = createProgram (ctx, id, (kernels_folder + kernel_file).c_str());
if (program == NULL) {
exit (0);
}
if (verbose) std::cout << "Program created successfully!" << std::endl;
kernel = clCreateKernel (program, "flops", &errNum);
CL_CHECK_ERROR(errNum);
if (verbose) std::cout << "Kernel created successfully!" << std::endl;
// Identify maximum size of the global memory on the device side
cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
clGetDeviceInfo (id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (id, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(cl_long), &maxWorkGroupSize, NULL);
// Let's use 80% of this memory for transferring data
cl_long maxFloatsUsageSize = ((maxAllocSizeBytes / 4) * 0.8);
if (verbose) std::cout << "Max floats usage size is " << maxFloatsUsageSize << std::endl;
if (verbose) std::cout << "Max compute unit is " << maxComputeUnits << std::endl;
if (verbose) std::cout << "Max Work Group size is " << maxWorkGroupSize << std::endl;
// Prepare buffer on the host side
float *a = new float[maxFloatsUsageSize];
for (int j = 0; j < maxFloatsUsageSize; j++) {
a[j] = (float) (j % 77);
}
if (verbose) std::cout << "Host buffer been prepared!" << std::endl;
// Creating buffer on the device side
if (!createMemObjects(ctx, queue, &memObject, maxFloatsUsageSize, a)) {
exit (0);
}
errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject);
CL_CHECK_ERROR(errNum);
size_t wg_size, wg_multiple;
cl_ulong local_mem, private_usage, local_usage;
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof (wg_size), &wg_size, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof (wg_multiple), &wg_multiple, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_LOCAL_MEM_SIZE,
sizeof (local_usage), &local_usage, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof (private_usage), &private_usage, NULL);
CL_CHECK_ERROR (errNum);
if (verbose) std::cout << "Work Group size is " << wg_size << std::endl;
if (verbose) std::cout << "Preferred Work Group size is " << wg_multiple << std::endl;
if (verbose) std::cout << "Local memory size is " << local_usage << std::endl;
if (verbose) std::cout << "Private memory size is " << private_usage << std::endl;
size_t globalWorkSize[1] = {maxFloatsUsageSize};
size_t localWorkSize[1] = {1};
Event evKernel("flops");
errNum = clEnqueueNDRangeKernel (queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &evKernel.CLEvent());
CL_CHECK_ERROR (errNum);
if (verbose) cout << "Waiting for execution to finish ";
errNum = clWaitForEvents(1, &evKernel.CLEvent());
CL_CHECK_ERROR(errNum);
evKernel.FillTimingInfo();
if (verbose) cout << "Kernel execution terminated successfully!" << std::endl;
delete[] a;
sprintf (maxFloatsStr, "Size: %d", maxFloatsUsageSize);
sprintf (testStr, "Flops: %f\% Memory", 100*i);
double flopCount = maxFloatsUsageSize * 16000;
double gflop = flopCount / (double)(evKernel.SubmitEndRuntime());
resultDB.AddResult (testStr, maxFloatsStr, "GFLOPS", gflop);
// Now it's time to read back the data
a = new float[maxFloatsUsageSize];
errNum = clEnqueueReadBuffer(queue, memObject, CL_TRUE, 0, maxFloatsUsageSize*sizeof(float), a, 0, NULL, NULL);
CL_CHECK_ERROR(errNum);
if (verbose) {
for (int j = 0; j < 10; j++) {
std::cout << a[j] << " ";
}
}
delete[] a;
if (memObject != NULL)
clReleaseMemObject (memObject);
if (program != NULL)
clReleaseProgram (program);
if (kernel != NULL)
clReleaseKernel (kernel);
}
std::cout << "Program executed successfully!" << std::endl;
}
解释一下代码,在内核代码中我实际上做了一个浮点运算,这意味着每个任务都会在 FOPS 上执行。在主机代码中,我首先检索 GPU 的最大全局内存大小,分配其中的一部分(for 循环定义其中的多少),然后将数据和内核执行推送到其中。我将测量 clEnqueueNDRangeKernel 的执行时间,然后计算应用程序的 GFLOPS。在我当前的实现中,无论 cl_mem 的大小是多少,我都获得了大约 0.28 GFLOPS 的性能,这比宣传的功率要低得多。我假设我在这里效率低下地做特定的事情。或者总的来说,我计算 GPU 性能的方法是不正确的。有谁能告诉我应该对代码进行哪些更改?
本地组大小为 1 时,您浪费了 31/32 的资源(因此您最多只能拥有峰值性能的 1/32)。您需要至少 32(并且是 32 的倍数)的本地组大小才能充分利用计算资源,需要 64 个才能实现 100% 的占用率(尽管 100% 的占用率不是必需的)。
内存访问具有高延迟和低带宽。如果其他事情是正确的,你的内核将一直等待内存控制器。你需要做更多的算术运算才能使 ALU 忙碌。
您需要先阅读文档并使用 Visual Profiler。在前两部分中,我只想告诉你,事情比你想象的更奇怪。但更奇怪的事情还在等着我们。
您可以使用汇编语言在 CPU 上轻松实现最佳性能(通过仅执行独立的算术运算。如果您用 C 编写此类代码,它只会被编译器丢弃)。 NVidia 只为我们提供了一个名为 PTX 的 IL 接口,我不确定编译器是否会优化它。而且我认为你只能在 CUDA 中使用 PTX。
编辑:编译器似乎会优化未使用的 PTX 代码,至少在内联汇编中是这样。
我正在编写一个简单的 OpenCL 应用程序,它将计算目标 GPU 设备的最大实验 FLOPS。我决定让我的 cl 内核尽可能简单。这是我的 OpenCL 内核和主机代码。内核代码为:
__kernel void flops(__global float *data) {
int gid = get_global_id(0);
double s = data[gid];
data[gid] = s * 0.35;
}
主机代码为:
#include <iostream>
#include <sstream>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "support.h"
#include "Event.h"
#include "ResultDatabase.h"
#include "OptionParser.h"
#include "ProgressBar.h"
using namespace std;
std::string kernels_folder = "/home/users/saman/shoc/src/opencl/level3/FlopsFolder/";
std::string kernel_file = "flops.cl";
static const char *opts = "-cl-mad-enable -cl-no-signed-zeros "
"-cl-unsafe-math-optimizations -cl-finite-math-only";
cl_program createProgram (cl_context context,
cl_device_id device,
const char* fileName) {
cl_int errNum;
cl_program program;
std::ifstream kernelFile (fileName, std::ios::in);
if (!kernelFile.is_open()) {
std::cerr << "Failed to open file for reading: " << fileName << std::endl;
}
std::ostringstream oss;
oss << kernelFile.rdbuf();
std::string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();
program = clCreateProgramWithSource (context, 1, (const char **)&srcStr,
NULL, &errNum);
CL_CHECK_ERROR(errNum);
errNum = clBuildProgram (program, 0, NULL, NULL, NULL, NULL);
CL_CHECK_ERROR (errNum);
return program;
}
bool createMemObjects (cl_context context, cl_command_queue queue,
cl_mem* memObject,
const int memFloatsSize, float *a) {
cl_int err;
*memObject = clCreateBuffer (context, CL_MEM_READ_WRITE,
memFloatsSize * sizeof(float), NULL, &err);
CL_CHECK_ERROR(err);
if (*memObject == NULL) {
std::cerr << "Error creating memory objects. " << std::endl;
return false;
}
Event evWrite("write");
err = clEnqueueWriteBuffer (queue, *memObject, CL_FALSE, 0, memFloatsSize * sizeof(float),
a, 0, NULL, &evWrite.CLEvent());
CL_CHECK_ERROR(err);
err = clWaitForEvents (1, &evWrite.CLEvent());
CL_CHECK_ERROR(err);
return true;
}
void cleanup (cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObject) {
if (memObject != NULL)
clReleaseMemObject (memObject);
if (kernel != NULL)
clReleaseKernel (kernel);
if (program != NULL)
clReleaseProgram (program);
}
void addBenchmarkSpecOptions(OptionParser &op) {
}
void RunBenchmark(cl_device_id id,
cl_context ctx,
cl_command_queue queue,
ResultDatabase &resultDB,
OptionParser &op)
{
for (float i = 0.1; i <= 0.2; i+=0.1 ) {
std::cout << "Deploying " << 100*i << "%" << std::endl;
bool verbose = false;
cl_int errNum;
cl_program program = 0;
cl_kernel kernel;
cl_mem memObject = 0;
char maxFloatsStr[128];
char testStr[128];
program = createProgram (ctx, id, (kernels_folder + kernel_file).c_str());
if (program == NULL) {
exit (0);
}
if (verbose) std::cout << "Program created successfully!" << std::endl;
kernel = clCreateKernel (program, "flops", &errNum);
CL_CHECK_ERROR(errNum);
if (verbose) std::cout << "Kernel created successfully!" << std::endl;
// Identify maximum size of the global memory on the device side
cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
clGetDeviceInfo (id, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (id, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(cl_long), &maxWorkGroupSize, NULL);
// Let's use 80% of this memory for transferring data
cl_long maxFloatsUsageSize = ((maxAllocSizeBytes / 4) * 0.8);
if (verbose) std::cout << "Max floats usage size is " << maxFloatsUsageSize << std::endl;
if (verbose) std::cout << "Max compute unit is " << maxComputeUnits << std::endl;
if (verbose) std::cout << "Max Work Group size is " << maxWorkGroupSize << std::endl;
// Prepare buffer on the host side
float *a = new float[maxFloatsUsageSize];
for (int j = 0; j < maxFloatsUsageSize; j++) {
a[j] = (float) (j % 77);
}
if (verbose) std::cout << "Host buffer been prepared!" << std::endl;
// Creating buffer on the device side
if (!createMemObjects(ctx, queue, &memObject, maxFloatsUsageSize, a)) {
exit (0);
}
errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject);
CL_CHECK_ERROR(errNum);
size_t wg_size, wg_multiple;
cl_ulong local_mem, private_usage, local_usage;
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof (wg_size), &wg_size, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof (wg_multiple), &wg_multiple, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_LOCAL_MEM_SIZE,
sizeof (local_usage), &local_usage, NULL);
CL_CHECK_ERROR (errNum);
errNum = clGetKernelWorkGroupInfo (kernel, id,
CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof (private_usage), &private_usage, NULL);
CL_CHECK_ERROR (errNum);
if (verbose) std::cout << "Work Group size is " << wg_size << std::endl;
if (verbose) std::cout << "Preferred Work Group size is " << wg_multiple << std::endl;
if (verbose) std::cout << "Local memory size is " << local_usage << std::endl;
if (verbose) std::cout << "Private memory size is " << private_usage << std::endl;
size_t globalWorkSize[1] = {maxFloatsUsageSize};
size_t localWorkSize[1] = {1};
Event evKernel("flops");
errNum = clEnqueueNDRangeKernel (queue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &evKernel.CLEvent());
CL_CHECK_ERROR (errNum);
if (verbose) cout << "Waiting for execution to finish ";
errNum = clWaitForEvents(1, &evKernel.CLEvent());
CL_CHECK_ERROR(errNum);
evKernel.FillTimingInfo();
if (verbose) cout << "Kernel execution terminated successfully!" << std::endl;
delete[] a;
sprintf (maxFloatsStr, "Size: %d", maxFloatsUsageSize);
sprintf (testStr, "Flops: %f\% Memory", 100*i);
double flopCount = maxFloatsUsageSize * 16000;
double gflop = flopCount / (double)(evKernel.SubmitEndRuntime());
resultDB.AddResult (testStr, maxFloatsStr, "GFLOPS", gflop);
// Now it's time to read back the data
a = new float[maxFloatsUsageSize];
errNum = clEnqueueReadBuffer(queue, memObject, CL_TRUE, 0, maxFloatsUsageSize*sizeof(float), a, 0, NULL, NULL);
CL_CHECK_ERROR(errNum);
if (verbose) {
for (int j = 0; j < 10; j++) {
std::cout << a[j] << " ";
}
}
delete[] a;
if (memObject != NULL)
clReleaseMemObject (memObject);
if (program != NULL)
clReleaseProgram (program);
if (kernel != NULL)
clReleaseKernel (kernel);
}
std::cout << "Program executed successfully!" << std::endl;
}
解释一下代码,在内核代码中我实际上做了一个浮点运算,这意味着每个任务都会在 FOPS 上执行。在主机代码中,我首先检索 GPU 的最大全局内存大小,分配其中的一部分(for 循环定义其中的多少),然后将数据和内核执行推送到其中。我将测量 clEnqueueNDRangeKernel 的执行时间,然后计算应用程序的 GFLOPS。在我当前的实现中,无论 cl_mem 的大小是多少,我都获得了大约 0.28 GFLOPS 的性能,这比宣传的功率要低得多。我假设我在这里效率低下地做特定的事情。或者总的来说,我计算 GPU 性能的方法是不正确的。有谁能告诉我应该对代码进行哪些更改?
本地组大小为 1 时,您浪费了 31/32 的资源(因此您最多只能拥有峰值性能的 1/32)。您需要至少 32(并且是 32 的倍数)的本地组大小才能充分利用计算资源,需要 64 个才能实现 100% 的占用率(尽管 100% 的占用率不是必需的)。
内存访问具有高延迟和低带宽。如果其他事情是正确的,你的内核将一直等待内存控制器。你需要做更多的算术运算才能使 ALU 忙碌。
您需要先阅读文档并使用 Visual Profiler。在前两部分中,我只想告诉你,事情比你想象的更奇怪。但更奇怪的事情还在等着我们。
您可以使用汇编语言在 CPU 上轻松实现最佳性能(通过仅执行独立的算术运算。如果您用 C 编写此类代码,它只会被编译器丢弃)。 NVidia 只为我们提供了一个名为 PTX 的 IL 接口,我不确定编译器是否会优化它。而且我认为你只能在 CUDA 中使用 PTX。
编辑:编译器似乎会优化未使用的 PTX 代码,至少在内联汇编中是这样。