OpenCL 矩阵乘法运行,但答案始终为零
OpenCL Matrix Multiply runs, but answer is always zero
我正在自学 OpenCL,并从编写一个执行矩阵乘法的程序开始。无论我怎么做,最终得到的答案都是零。
我知道 1x3 矩阵和 3x1 矩阵相乘应该得到 1x1 的结果,而且按照我生成随机浮点数的方式,这个结果应该是非零的。下面是我的主程序和内核。除了一些警告之外没有其他提示;我到底漏掉了什么?我已经研究了好几个小时,还是看不出问题所在。
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include "./cl.hpp"
// Host program posted with the question: builds the mCrossProd kernel from
// mCrossProd.cl, fills host matrices with random values, launches the kernel
// on the first GPU of the first platform and prints the contents of C.
// NOTE(review): the listing is reproduced as posted (it has no closing brace
// for main); the review notes below point at lines that look unintended.
int main()
{
// Matrix extents: A is allocated as nX*nY floats, B as nY*nZ, C as nX*nZ.
int nX = 1;
int nY = 3;
int nZ = 1;
// Get all platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
// Use the first platform and its first GPU device.
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execution context
cl::Context context(device);
// Create a command queue
cl::CommandQueue queue(context, device);
// Load the kernel source text from mCrossProd.cl
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()) {
throw std::runtime_error("Can not open kernel source file.");
}
std::stringstream ss;
ss << fs.rdbuf();
std::string code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
// Build the kernel
cl::Program program(context, sources);
// NOTE(review): the return value of build() is ignored; the build log is only
// reported if build() throws — confirm exceptions are enabled in the wrapper.
try{
program.build({device});
} catch(std::exception &err){
throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
// Create matrix arrays and fill them with random float values in [1, 11]
// NOTE(review): these new[] allocations are never released with delete[].
float *A = new float[nX*nY];
float *B = new float[nY*nZ];
float *C = new float[nX*nZ];
for(int i =0; i < nX; i++){
for(int j = 0; j < nY; j++)
{
A[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1;
std::cout << " A[" << std::to_string(j + i * nY) << "] = ";
std::cout << A[j + i*nY] << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
// NOTE(review): B holds nY*nZ floats but is indexed with row stride nY; with
// nY = 3 and nZ = 1 the indices reach 6, past the 3 allocated elements.
for(int i =0; i < nY; i++){
for(int j = 0; j < nZ; j++)
{
B[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1 ;
std::cout << " B[" + std::to_string(j + i * nY) + "] = " ;
std::cout << B[j + i * nY] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
// Fill matrix C with random values
// NOTE(review): the value printed under the "C[...]" label is read from B.
for(int i =0; i < nX; i++){
for(int j = 0; j < nZ; j++)
{
C[j + i*nX] = rand()/(float)RAND_MAX * 10 + 1 ;
std::cout << " C[" + std::to_string(j + i * nX) + "] = " ;
std::cout << B[j + i * nX] << " ";
}
std::cout << std::endl;
}
// Create device memory buffers and enqueue blocking writes of the host data
cl::Buffer bufA(context, CL_MEM_READ_ONLY, sizeof(float) * nX * nY);
cl::Buffer bufB(context, CL_MEM_READ_ONLY, sizeof(float) * nY * nZ);
cl::Buffer bufC(context, CL_MEM_READ_WRITE, sizeof(float) * nX * nZ);
// NOTE(review): both writes target bufA (the second one copies B into bufA);
// bufB and bufC are never written from the host.
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nX * nY, A);
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nY * nZ, B);
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, nX);
kernel.setArg(1, nY);
kernel.setArg(2, nZ);
kernel.setArg(3, bufA);
kernel.setArg(4, bufB);
kernel.setArg(5, bufC);
// Execute the kernel
// NOTE(review): global range is (nX, nY) = (1, 3) while the local range is
// (16, 16), and the returned status is not checked — see the
// clEnqueueNDRangeKernel rules on global/local work sizes.
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
// Retrieve results from global memory (blocking read into C)
queue.enqueueReadBuffer(bufC, CL_TRUE,0, sizeof(float) * nX * nZ, C);
queue.finish();
fs.close();
// Print the nX x nZ result.
std::cout << "\nThe solution is" << std::endl;
for(int i = 0; i < nX; i++){
for(int j = 0; j < nZ; j++)
{
std::cout << "C[" + std::to_string(j*nZ+i) + "] = " ;
std::cout << C[j*nZ+i] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
这是我的内核函数:
// Kernel posted with the question. Work-item (i, j), taken from global ids 0
// and 1, accumulates products of elements of A and B into C[j*nY+i].
// NOTE(review): the loop runs k over [0, nX) and the result is added to the
// existing contents of C with +=; the listing is reproduced as posted.
__kernel void mCrossProd(const int nX, const int nY, const int nZ, __global float* A, __global float* B, __global float* C) {
int i = get_global_id(0);
int j = get_global_id(1);
for(int k = 0; k < nX; k++){
C[j*nY+i] += A[j*nX+k] * B[k*nY+i];
}
}
问题出在下面这行代码中:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
试试这个:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M,N), cl::NDRange(1,1));
我认为 enqueueNDRangeKernel 的这些参数指定了运行该内核的工作项(workers/threads)数量(建议阅读文档)。在 OpenCL 1.x 中,全局大小必须能被本地工作组大小整除,而原代码的全局大小 (nX, nY) = (1, 3) 无法被 (16, 16) 整除。可以尝试不同的数值并进行基准测试。此外,您还可以检查返回的错误码,这会让调试更容易。
if( queue.enqueueNDRangeKernel() != CL_SUCCESS ) { throw error; }
加上这个检查后,您会发现目前的代码连内核都无法成功入队,根本没有执行矩阵乘法。
您访问数组的方式也存在错误。您只为矩阵 A 和矩阵 B 各分配了 3 个浮点数,却向超出范围的索引赋值,例如 float *A = new float[3]; A[5] = 10.0f; 。这是未定义行为,所以它不一定会崩溃,但非常危险。您也没有释放内存:如果使用了 new 关键字,请记住在用完内存后调用与之匹配的 delete,或者像本例这样调用 delete[],否则会发生内存泄漏。或者,您可以考虑使用 vector、智能指针、静态大小的数组等。
此外,
try{
program.build({device});
} catch(std::exception &err){
throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
可能应该替换为
if(program.build({device}!=CL_SUCCESS))
{
std::cerr << "Failed to compile kernel code" << std::endl;
exit(1);
}
或者至少使用 catch(...) 而不是捕获特定的异常。这是因为我认为 program.build 不会抛出异常(在 cl.hpp 中,只有定义了 __CL_ENABLE_EXCEPTIONS 时它才会抛出 cl::Error)。文档指出它返回一个错误码,您应该检查这个返回值。
最后一件事,尽量让代码更简单,这样更容易调试。例如,您可以尝试创建一个简单的内核,它只是将两个数组相加而不是相乘。
无论如何,我已经修改了你的代码。希望它有意义:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include <CL/cl.hpp>
int main()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execusion context
cl::Context context(device);
// Load the kernel sources, use global memory
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()){
throw std::runtime_error("Cannot open kernel source file.");
}
// Extract kernel code
std::stringstream ss;
ss << fs.rdbuf();
auto code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
fs.close();
// Build the kernel
cl::Program program(context, sources);
if(program.build({device})!=CL_SUCCESS){
std::cout<<" Error building: "<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)<<"\n";
exit(1);
}
// Output matrix dimensions
int M = 4, N = 3, K = 6;
int A_dims = M * K;
int B_dims = N * K;
int C_dims = M * N;
// Create buffers for device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);
float A[] = {2.0f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f,
4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f,
3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f};
float B[] = {3.0f, 2.0f, 4.0f,
1.0f, 1.0f, 2.0f,
4.0f, 2.0f, 1.0f,
0.0f, 0.0f, 1.0f,
9.0f, 2.0f, 1.0f,
2.0f, 1.0f, 0.0f};
float C[] = {0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f};
cl::CommandQueue queue(context,device);
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(float)*A_dims,A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(float)*B_dims,B);
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, M);
kernel.setArg(1, N);
kernel.setArg(2, K);
kernel.setArg(3, buffer_A);
kernel.setArg(4, buffer_B);
kernel.setArg(5, buffer_C);
// Execute kernel
if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
{
std::cout << "Failed to launch kernel" << std::endl;
exit(1);
}
queue.finish();
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(float)*C_dims,C);
std::cout << sizeof(C) / sizeof(float) << std::endl;
std::cout << C_dims << std::endl;
std::cout << M << " " << N << std::endl;
std::cout << "\nThe solution is" << std::endl;
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
std::cout << "C[" + std::to_string(i*N+j) + "] = ";
std::cout << C[i*N+j] << " ";
}
std::cout << std::endl;
}
}
内核源码:
// Computes C = A * B for row-major matrices: A is M x K, B is K x N and C is
// M x N. Launch with a 2-D global range covering (M, N); work-item (i, j)
// produces the single element C[i*N+j].
__kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
    int const i = get_global_id(0);
    int const j = get_global_id(1);
    // Ignore work-items outside the matrix (global range rounded up by the host).
    if (i >= M || j >= N) {
        return;
    }
    int const debug_elem_id = 3; // purely for debug purposes.
    // Accumulate in a private variable and store once, so the result does not
    // depend on whatever buffer C contained before the launch.
    float acc = 0.0f;
    for (int k = 0; k < K; k++) {
        float const prod = A[i*K+k] * B[N*k+j];
        acc += prod;
        if ((i*N+j) == debug_elem_id)
        {
            printf("PROD, %.2f\n", prod);
        }
    }
    C[i*N+j] = acc;
    if ((i*N+j) == debug_elem_id)
    {
        printf("SUM: %.2f\n", acc);
    }
}
编辑:更正了示例代码和说明。感谢@mogu
我正在自学 OpenCL,并从编写一个执行矩阵乘法的程序开始。无论我怎么做,最终得到的答案都是零。
我知道 1x3 矩阵和 3x1 矩阵相乘应该得到 1x1 的结果,而且按照我生成随机浮点数的方式,这个结果应该是非零的。下面是我的主程序和内核。除了一些警告之外没有其他提示;我到底漏掉了什么?我已经研究了好几个小时,还是看不出问题所在。
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include "./cl.hpp"
// Host program posted with the question: builds the mCrossProd kernel from
// mCrossProd.cl, fills host matrices with random values, launches the kernel
// on the first GPU of the first platform and prints the contents of C.
// NOTE(review): the listing is reproduced as posted (it has no closing brace
// for main); the review notes below point at lines that look unintended.
int main()
{
// Matrix extents: A is allocated as nX*nY floats, B as nY*nZ, C as nX*nZ.
int nX = 1;
int nY = 3;
int nZ = 1;
// Get all platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
// Use the first platform and its first GPU device.
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execution context
cl::Context context(device);
// Create a command queue
cl::CommandQueue queue(context, device);
// Load the kernel source text from mCrossProd.cl
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()) {
throw std::runtime_error("Can not open kernel source file.");
}
std::stringstream ss;
ss << fs.rdbuf();
std::string code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
// Build the kernel
cl::Program program(context, sources);
// NOTE(review): the return value of build() is ignored; the build log is only
// reported if build() throws — confirm exceptions are enabled in the wrapper.
try{
program.build({device});
} catch(std::exception &err){
throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
// Create matrix arrays and fill them with random float values in [1, 11]
// NOTE(review): these new[] allocations are never released with delete[].
float *A = new float[nX*nY];
float *B = new float[nY*nZ];
float *C = new float[nX*nZ];
for(int i =0; i < nX; i++){
for(int j = 0; j < nY; j++)
{
A[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1;
std::cout << " A[" << std::to_string(j + i * nY) << "] = ";
std::cout << A[j + i*nY] << ' ';
}
std::cout << std::endl;
}
std::cout << std::endl;
// NOTE(review): B holds nY*nZ floats but is indexed with row stride nY; with
// nY = 3 and nZ = 1 the indices reach 6, past the 3 allocated elements.
for(int i =0; i < nY; i++){
for(int j = 0; j < nZ; j++)
{
B[j + i*nY] = rand()/(float)RAND_MAX * 10 + 1 ;
std::cout << " B[" + std::to_string(j + i * nY) + "] = " ;
std::cout << B[j + i * nY] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
// Fill matrix C with random values
// NOTE(review): the value printed under the "C[...]" label is read from B.
for(int i =0; i < nX; i++){
for(int j = 0; j < nZ; j++)
{
C[j + i*nX] = rand()/(float)RAND_MAX * 10 + 1 ;
std::cout << " C[" + std::to_string(j + i * nX) + "] = " ;
std::cout << B[j + i * nX] << " ";
}
std::cout << std::endl;
}
// Create device memory buffers and enqueue blocking writes of the host data
cl::Buffer bufA(context, CL_MEM_READ_ONLY, sizeof(float) * nX * nY);
cl::Buffer bufB(context, CL_MEM_READ_ONLY, sizeof(float) * nY * nZ);
cl::Buffer bufC(context, CL_MEM_READ_WRITE, sizeof(float) * nX * nZ);
// NOTE(review): both writes target bufA (the second one copies B into bufA);
// bufB and bufC are never written from the host.
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nX * nY, A);
queue.enqueueWriteBuffer(bufA, CL_TRUE, 0, sizeof(float) * nY * nZ, B);
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, nX);
kernel.setArg(1, nY);
kernel.setArg(2, nZ);
kernel.setArg(3, bufA);
kernel.setArg(4, bufB);
kernel.setArg(5, bufC);
// Execute the kernel
// NOTE(review): global range is (nX, nY) = (1, 3) while the local range is
// (16, 16), and the returned status is not checked — see the
// clEnqueueNDRangeKernel rules on global/local work sizes.
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
// Retrieve results from global memory (blocking read into C)
queue.enqueueReadBuffer(bufC, CL_TRUE,0, sizeof(float) * nX * nZ, C);
queue.finish();
fs.close();
// Print the nX x nZ result.
std::cout << "\nThe solution is" << std::endl;
for(int i = 0; i < nX; i++){
for(int j = 0; j < nZ; j++)
{
std::cout << "C[" + std::to_string(j*nZ+i) + "] = " ;
std::cout << C[j*nZ+i] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
这是我的内核函数:
// Kernel posted with the question. Work-item (i, j), taken from global ids 0
// and 1, accumulates products of elements of A and B into C[j*nY+i].
// NOTE(review): the loop runs k over [0, nX) and the result is added to the
// existing contents of C with +=; the listing is reproduced as posted.
__kernel void mCrossProd(const int nX, const int nY, const int nZ, __global float* A, __global float* B, __global float* C) {
int i = get_global_id(0);
int j = get_global_id(1);
for(int k = 0; k < nX; k++){
C[j*nY+i] += A[j*nX+k] * B[k*nY+i];
}
}
问题出在下面这行代码中:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(nX, nY), cl::NDRange(16,16));
试试这个:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(M,N), cl::NDRange(1,1));
我认为 enqueueNDRangeKernel 的这些参数指定了运行该内核的工作项(workers/threads)数量(建议阅读文档)。在 OpenCL 1.x 中,全局大小必须能被本地工作组大小整除,而原代码的全局大小 (nX, nY) = (1, 3) 无法被 (16, 16) 整除。可以尝试不同的数值并进行基准测试。此外,您还可以检查返回的错误码,这会让调试更容易。
if( queue.enqueueNDRangeKernel() != CL_SUCCESS ) { throw error; }
加上这个检查后,您会发现目前的代码连内核都无法成功入队,根本没有执行矩阵乘法。
您访问数组的方式也存在错误。您只为矩阵 A 和矩阵 B 各分配了 3 个浮点数,却向超出范围的索引赋值,例如 float *A = new float[3]; A[5] = 10.0f; 。这是未定义行为,所以它不一定会崩溃,但非常危险。您也没有释放内存:如果使用了 new 关键字,请记住在用完内存后调用与之匹配的 delete,或者像本例这样调用 delete[],否则会发生内存泄漏。或者,您可以考虑使用 vector、智能指针、静态大小的数组等。
此外,
try{
program.build({device});
} catch(std::exception &err){
throw std::runtime_error(program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
}
可能应该替换为
if(program.build({device}!=CL_SUCCESS))
{
std::cerr << "Failed to compile kernel code" << std::endl;
exit(1);
}
或者至少使用 catch(...) 而不是捕获特定的异常。这是因为我认为 program.build 不会抛出异常(在 cl.hpp 中,只有定义了 __CL_ENABLE_EXCEPTIONS 时它才会抛出 cl::Error)。文档指出它返回一个错误码,您应该检查这个返回值。
最后一件事,尽量让代码更简单,这样更容易调试。例如,您可以尝试创建一个简单的内核,它只是将两个数组相加而不是相乘。
无论如何,我已经修改了你的代码。希望它有意义:
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <iostream>
#include <fstream>
#include <sstream>
#include <CL/cl.hpp>
int main()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execusion context
cl::Context context(device);
// Load the kernel sources, use global memory
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()){
throw std::runtime_error("Cannot open kernel source file.");
}
// Extract kernel code
std::stringstream ss;
ss << fs.rdbuf();
auto code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
fs.close();
// Build the kernel
cl::Program program(context, sources);
if(program.build({device})!=CL_SUCCESS){
std::cout<<" Error building: "<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device)<<"\n";
exit(1);
}
// Output matrix dimensions
int M = 4, N = 3, K = 6;
int A_dims = M * K;
int B_dims = N * K;
int C_dims = M * N;
// Create buffers for device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);
float A[] = {2.0f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f,
4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f,
3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f};
float B[] = {3.0f, 2.0f, 4.0f,
1.0f, 1.0f, 2.0f,
4.0f, 2.0f, 1.0f,
0.0f, 0.0f, 1.0f,
9.0f, 2.0f, 1.0f,
2.0f, 1.0f, 0.0f};
float C[] = {0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f};
cl::CommandQueue queue(context,device);
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(float)*A_dims,A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(float)*B_dims,B);
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, M);
kernel.setArg(1, N);
kernel.setArg(2, K);
kernel.setArg(3, buffer_A);
kernel.setArg(4, buffer_B);
kernel.setArg(5, buffer_C);
// Execute kernel
if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
{
std::cout << "Failed to launch kernel" << std::endl;
exit(1);
}
queue.finish();
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(float)*C_dims,C);
std::cout << sizeof(C) / sizeof(float) << std::endl;
std::cout << C_dims << std::endl;
std::cout << M << " " << N << std::endl;
std::cout << "\nThe solution is" << std::endl;
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
std::cout << "C[" + std::to_string(i*N+j) + "] = ";
std::cout << C[i*N+j] << " ";
}
std::cout << std::endl;
}
}
内核源码:
// Computes C = A * B for row-major matrices: A is M x K, B is K x N and C is
// M x N. Launch with a 2-D global range covering (M, N); work-item (i, j)
// produces the single element C[i*N+j].
__kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
    int const i = get_global_id(0);
    int const j = get_global_id(1);
    // Ignore work-items outside the matrix (global range rounded up by the host).
    if (i >= M || j >= N) {
        return;
    }
    int const debug_elem_id = 3; // purely for debug purposes.
    // Accumulate in a private variable and store once, so the result does not
    // depend on whatever buffer C contained before the launch.
    float acc = 0.0f;
    for (int k = 0; k < K; k++) {
        float const prod = A[i*K+k] * B[N*k+j];
        acc += prod;
        if ((i*N+j) == debug_elem_id)
        {
            printf("PROD, %.2f\n", prod);
        }
    }
    C[i*N+j] = acc;
    if ((i*N+j) == debug_elem_id)
    {
        printf("SUM: %.2f\n", acc);
    }
}
编辑:更正了示例代码和说明。感谢@mogu