OpenACC - C++ 'new' 运算符问题
OpenACC - The C++ 'new' operator issue
C++ 的 new 运算符似乎在 OpenACC routine 区域中被禁止。
我想知道为什么(我已经查阅了 routine 指令的规范,但没有找到任何相关说明)。
下面是我用 OpenACC 实现的代码:它使用我自己编写的复数类(class)实现了一个基本的复数矩阵乘法(为了便于阅读,我对代码做了精简):
// Complex number whose two components live in a heap-allocated array:
// c[0] holds the real part and c[1] the imaginary part.
class Complex {
private:
    double* c;  // owning pointer to the two components

public:
    // Default constructor: allocates the storage with new[], issues the
    // `enter data` directives for the object and its storage, then sets the
    // value to zero.
    #pragma acc routine seq
    Complex ( )
    {
        c = new double[2];
        #pragma acc enter data copyin(this)
        #pragma acc enter data create(c[:2])
        c[0] = c[1] = 0.0;
    }

    // Copy constructor: allocates fresh storage and copies both components
    // of z into it (deep copy).
    Complex ( Complex const& z )
    {
        c = new double[2];
        #pragma acc enter data copyin(this)
        #pragma acc enter data create(c[:2])
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
    }

    // Destructor: issues the matching `exit data` directives, then releases
    // the host storage.
    ~Complex ( )
    {
        #pragma acc exit data delete(c[:2])
        #pragma acc exit data delete(this)
        delete[] c;
    }

    // Assignment: copies the two components of z into the existing storage.
    #pragma acc routine seq
    Complex& operator= ( Complex const z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
        return *this;
    }

    // Component-wise addition in place.
    #pragma acc routine seq
    Complex& operator+= ( Complex const z )
    {
        c[0] = c[0] + z.c[0];
        c[1] = c[1] + z.c[1];
        return *this;
    }

    // Complex multiplication in place: (a + bi)(x + yi) = (ax - by) + (bx + ay)i.
    #pragma acc routine seq
    Complex& operator*= ( Complex const z )
    {
        double const re = c[0];
        double const im = c[1];
        c[0] = re*z.c[0] - im*z.c[1];
        c[1] = im*z.c[0] + re*z.c[1];
        return *this;
    }
};
#pragma acc routine seq
inline Complexe operator* ( Complex z1, Complex const z2 )
{
z1 *= z2;
return z1;
}
// Computes the N x N complex matrix product C = A * B inside an OpenACC
// data region: the i and j loops are offloaded, the k reduction loop runs
// sequentially per (i, j) element.
int main ( )
{
    Complex A[N][N];
    Complex B[N][N];
    // initialisation of A and B
    Complex C[N][N];
    #pragma acc data copyout(C[:N]) copyin(A[:N],B[:N])
    {
        #pragma acc parallel loop
        for (unsigned int i = 0; i < N; i++)
        {
            #pragma acc loop
            for (unsigned int j = 0; j < N; j++)
            {
                Complex accum;
                // The reduction index is k. The loop previously redeclared j,
                // which shadowed the column index and left k undeclared.
                #pragma acc loop seq
                for (unsigned int k = 0; k < N; k++)
                {
                    accum += A[i][k]*B[k][j];
                }
                C[i][j] = accum;
            }
        }
    }
}
我知道为复数使用动态数组远不是最好的主意,但这只是一个例子。
当我用 pgc++ 编译时,我得到这个错误(来自我的 Complex::Complex() 构造函数):
PGCC-S-1000-Call in OpenACC region to procedure '_Znam' which has no acc routine information
我了解到 _Znam 过程是由 new 调用的。
所以我想知道:为什么无法在 OpenACC 区域内使用 new,以及如何修改我的代码来避免这个问题?
在大多数情况下,OpenACC 标准并未规定对特定语言功能的支持,这由具体实现决定,并取决于目标设备。对于 PGI 面向 NVIDIA GPU 的 OpenACC 实现,答案是否定的:OpenACC 计算区域内不支持 new。设备代码中支持 "malloc",但我强烈建议不要在设备代码中动态分配数据。除了堆相对较小(目前默认为 8MB,但可以使用环境变量 PGI_ACC_CUDA_HEAPSIZE 增加到 32MB)之外,让数千个线程同时分配数据还会导致严重的性能下降。
下面我使用固定大小的数据成员和动态数据成员更新了您的示例。除了修复一些拼写错误外,我还从 constructor/destructor 中删除了 "data" 指令,因为 "data" 指令只能在主机代码中使用。当使用固定大小的数据成员时,代码很简单。对于动态数据成员,每个单独的数据成员都需要 "attached"(即成员的设备地址需要在设备对象中设置)。 OpenACC 标准委员会正在研究一种自动执行此操作的方法,但目前需要在程序本身中完成。下面使用的方法,也称为手动深拷贝,是一个 PGI 扩展,将在下一个 OpenACC 标准 2.6 中采用。
测试 1 固定大小数据成员:
#include <iostream>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef N
#define N 32
#endif
// Complex number stored inline as a fixed two-element array:
// c[0] is the real part, c[1] the imaginary part. No dynamic allocation.
class Complex {
private:
    double c[2];

public:
    // Default constructor: the value zero.
    #pragma acc routine seq
    Complex ( )
    {
        c[0] = c[1] = 0.0;
    }

    // Copy constructor: duplicates both components of z.
    Complex ( Complex const& z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
    }

    // Nothing to release: the storage is part of the object.
    ~Complex ( )
    {
    }

    // Assignment: copies both components of z.
    #pragma acc routine seq
    Complex& operator= ( Complex const z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
        return *this;
    }

    // Component-wise addition in place.
    #pragma acc routine seq
    Complex& operator+= ( Complex const z )
    {
        c[0] = c[0] + z.c[0];
        c[1] = c[1] + z.c[1];
        return *this;
    }

    // Complex multiplication in place: (a + bi)(x + yi) = (ax - by) + (bx + ay)i.
    #pragma acc routine seq
    Complex& operator*= ( Complex const z )
    {
        double const re = c[0];
        double const im = c[1];
        c[0] = re*z.c[0] - im*z.c[1];
        c[1] = im*z.c[0] + re*z.c[1];
        return *this;
    }

    // Writes "<real>:<imaginary>" followed by a newline (and flush) to std::cout.
    void printme() {
        std::cout << c[0] << ":" << c[1];
        std::cout << std::endl;
    }
};
// Binary complex product: multiplies the by-value copy z1 in place by z2
// and returns a copy of the result.
#pragma acc routine seq
inline Complex operator* ( Complex z1, Complex const z2 )
{
    return z1 *= z2;
}
int main ( )
{
Complex A[N][N];
Complex B[N][N];
// initialisation of A and B
Complex C[N][N];
#pragma acc data copyout(C[:N]) copyin(A[:N],B[:N])
{
#pragma acc parallel loop
for (unsigned int i = 0; i < N; i++)
{
#pragma acc loop
for (unsigned int j = 0; j < N; j++)
{
Complex accum;
#pragma acc loop seq
for (unsigned int k = 0; k < N; k++)
{
accum += A[i][k]*B[k][j];
}
C[i][j] = accum;
}
}
}
C[0][0].printme();
}
测试 2 动态数据成员:
#include <iostream>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef N
#define N 32
#endif
// Complex number whose two components live in a malloc'ed array:
// c[0] is the real part, c[1] the imaginary part.
class Complex {
private:
    double *c;  // owning pointer to the two components

public:
    // Default constructor: allocates the storage with malloc and sets the
    // value to zero.
    #pragma acc routine seq
    Complex ( )
    {
        c = static_cast<double*>(malloc(2 * sizeof(double)));
        c[0] = c[1] = 0.0;
    }

    // Copy constructor: allocates fresh storage and copies both components
    // of z into it (deep copy).
    Complex ( Complex const& z )
    {
        c = static_cast<double*>(malloc(2 * sizeof(double)));
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
    }

    // Destructor: releases the storage obtained from malloc.
    ~Complex ( )
    {
        free(c);
    }

    // Assignment: copies the two components of z into the existing storage.
    #pragma acc routine seq
    Complex& operator= ( Complex const z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
        return *this;
    }

    // Component-wise addition in place.
    #pragma acc routine seq
    Complex& operator+= ( Complex const z )
    {
        c[0] = c[0] + z.c[0];
        c[1] = c[1] + z.c[1];
        return *this;
    }

    // Complex multiplication in place: (a + bi)(x + yi) = (ax - by) + (bx + ay)i.
    #pragma acc routine seq
    Complex& operator*= ( Complex const z )
    {
        double const re = c[0];
        double const im = c[1];
        c[0] = re*z.c[0] - im*z.c[1];
        c[1] = im*z.c[0] + re*z.c[1];
        return *this;
    }

    // Writes "<real>:<imaginary>" followed by a newline (and flush) to std::cout.
    void printme() {
        std::cout << c[0] << ":" << c[1];
        std::cout << std::endl;
    }

#ifdef _OPENACC
    // Per-object data-management helpers, compiled only for OpenACC builds.
    // Each one issues a single unstructured data directive for the
    // two-element array behind c.

    // `enter data create` for c[0:2].
    void acc_create() {
        #pragma acc enter data create(c[0:2])
    }

    // `enter data copyin` for c[0:2].
    void acc_copyin() {
        #pragma acc enter data copyin(c[0:2])
    }

    // `exit data delete` for c.
    void acc_delete() {
        #pragma acc exit data delete(c)
    }

    // `exit data copyout` for c[0:2].
    void acc_copyout() {
        #pragma acc exit data copyout(c[0:2])
    }
#endif
};
// Binary complex product: multiplies the by-value copy z1 in place by z2
// and returns a copy of the result.
#pragma acc routine seq
inline Complex operator* ( Complex z1, Complex const z2 )
{
    return z1 *= z2;
}
int main ( )
{
Complex A[N][N];
Complex B[N][N];
// initialisation of A and B
Complex C[N][N];
#ifdef _OPENACC
#pragma acc enter data create(A[0:N][0:N],B[0:N][0:N],C[0:N][0:N])
for (unsigned int i = 0; i < N; i++)
{
for (unsigned int j = 0; j < N; j++) {
A[i][j].acc_copyin();
B[i][j].acc_copyin();
C[i][j].acc_create();
}
}
#endif
#pragma acc parallel loop present(A,B,C)
for (unsigned int i = 0; i < N; i++)
{
#pragma acc loop
for (unsigned int j = 0; j < N; j++)
{
Complex accum;
#pragma acc loop seq
for (unsigned int k = 0; k < N; k++)
{
accum += A[i][k]*B[k][j];
}
C[i][j] = accum;
}
}
#ifdef _OPENACC
for (unsigned int i = 0; i < N; i++)
{
for (unsigned int j = 0; j < N; j++) {
A[i][j].acc_delete();
B[i][j].acc_delete();
C[i][j].acc_copyout();
}
}
#pragma acc exit data delete(A[0:N][0:N],B[0:N][0:N],C[0:N][0:N])
#endif
C[0][0].printme();
}
C++ 的 new 运算符似乎在 OpenACC routine 区域中被禁止。
我想知道为什么(我已经查阅了 routine 指令的规范,但没有找到任何相关说明)。
下面是我用 OpenACC 实现的代码:它使用我自己编写的复数类(class)实现了一个基本的复数矩阵乘法(为了便于阅读,我对代码做了精简):
// Complex number whose two components live in a heap-allocated array:
// c[0] holds the real part and c[1] the imaginary part.
class Complex {
private:
    double* c;  // owning pointer to the two components

public:
    // Default constructor: allocates the storage with new[], issues the
    // `enter data` directives for the object and its storage, then sets the
    // value to zero.
    #pragma acc routine seq
    Complex ( )
    {
        c = new double[2];
        #pragma acc enter data copyin(this)
        #pragma acc enter data create(c[:2])
        c[0] = c[1] = 0.0;
    }

    // Copy constructor: allocates fresh storage and copies both components
    // of z into it (deep copy).
    Complex ( Complex const& z )
    {
        c = new double[2];
        #pragma acc enter data copyin(this)
        #pragma acc enter data create(c[:2])
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
    }

    // Destructor: issues the matching `exit data` directives, then releases
    // the host storage.
    ~Complex ( )
    {
        #pragma acc exit data delete(c[:2])
        #pragma acc exit data delete(this)
        delete[] c;
    }

    // Assignment: copies the two components of z into the existing storage.
    #pragma acc routine seq
    Complex& operator= ( Complex const z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
        return *this;
    }

    // Component-wise addition in place.
    #pragma acc routine seq
    Complex& operator+= ( Complex const z )
    {
        c[0] = c[0] + z.c[0];
        c[1] = c[1] + z.c[1];
        return *this;
    }

    // Complex multiplication in place: (a + bi)(x + yi) = (ax - by) + (bx + ay)i.
    #pragma acc routine seq
    Complex& operator*= ( Complex const z )
    {
        double const re = c[0];
        double const im = c[1];
        c[0] = re*z.c[0] - im*z.c[1];
        c[1] = im*z.c[0] + re*z.c[1];
        return *this;
    }
};
#pragma acc routine seq
inline Complexe operator* ( Complex z1, Complex const z2 )
{
z1 *= z2;
return z1;
}
// Computes the N x N complex matrix product C = A * B inside an OpenACC
// data region: the i and j loops are offloaded, the k reduction loop runs
// sequentially per (i, j) element.
int main ( )
{
    Complex A[N][N];
    Complex B[N][N];
    // initialisation of A and B
    Complex C[N][N];
    #pragma acc data copyout(C[:N]) copyin(A[:N],B[:N])
    {
        #pragma acc parallel loop
        for (unsigned int i = 0; i < N; i++)
        {
            #pragma acc loop
            for (unsigned int j = 0; j < N; j++)
            {
                Complex accum;
                // The reduction index is k. The loop previously redeclared j,
                // which shadowed the column index and left k undeclared.
                #pragma acc loop seq
                for (unsigned int k = 0; k < N; k++)
                {
                    accum += A[i][k]*B[k][j];
                }
                C[i][j] = accum;
            }
        }
    }
}
我知道为复数使用动态数组远不是最好的主意,但这只是一个例子。
当我用 pgc++ 编译时,我得到这个错误(来自我的 Complex::Complex() 构造函数):
PGCC-S-1000-Call in OpenACC region to procedure '_Znam' which has no acc routine information
我了解到 _Znam 过程是由 new 调用的。
所以我想知道:为什么无法在 OpenACC 区域内使用 new,以及如何修改我的代码来避免这个问题?
在大多数情况下,OpenACC 标准并未规定对特定语言功能的支持,这由具体实现决定,并取决于目标设备。对于 PGI 面向 NVIDIA GPU 的 OpenACC 实现,答案是否定的:OpenACC 计算区域内不支持 new。设备代码中支持 "malloc",但我强烈建议不要在设备代码中动态分配数据。除了堆相对较小(目前默认为 8MB,但可以使用环境变量 PGI_ACC_CUDA_HEAPSIZE 增加到 32MB)之外,让数千个线程同时分配数据还会导致严重的性能下降。
下面我使用固定大小的数据成员和动态数据成员更新了您的示例。除了修复一些拼写错误外,我还从 constructor/destructor 中删除了 "data" 指令,因为 "data" 指令只能在主机代码中使用。当使用固定大小的数据成员时,代码很简单。对于动态数据成员,每个单独的数据成员都需要 "attached"(即成员的设备地址需要在设备对象中设置)。 OpenACC 标准委员会正在研究一种自动执行此操作的方法,但目前需要在程序本身中完成。下面使用的方法,也称为手动深拷贝,是一个 PGI 扩展,将在下一个 OpenACC 标准 2.6 中采用。
测试 1 固定大小数据成员:
#include <iostream>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef N
#define N 32
#endif
// Complex number stored inline as a fixed two-element array:
// c[0] is the real part, c[1] the imaginary part. No dynamic allocation.
class Complex {
private:
    double c[2];

public:
    // Default constructor: the value zero.
    #pragma acc routine seq
    Complex ( )
    {
        c[0] = c[1] = 0.0;
    }

    // Copy constructor: duplicates both components of z.
    Complex ( Complex const& z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
    }

    // Nothing to release: the storage is part of the object.
    ~Complex ( )
    {
    }

    // Assignment: copies both components of z.
    #pragma acc routine seq
    Complex& operator= ( Complex const z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
        return *this;
    }

    // Component-wise addition in place.
    #pragma acc routine seq
    Complex& operator+= ( Complex const z )
    {
        c[0] = c[0] + z.c[0];
        c[1] = c[1] + z.c[1];
        return *this;
    }

    // Complex multiplication in place: (a + bi)(x + yi) = (ax - by) + (bx + ay)i.
    #pragma acc routine seq
    Complex& operator*= ( Complex const z )
    {
        double const re = c[0];
        double const im = c[1];
        c[0] = re*z.c[0] - im*z.c[1];
        c[1] = im*z.c[0] + re*z.c[1];
        return *this;
    }

    // Writes "<real>:<imaginary>" followed by a newline (and flush) to std::cout.
    void printme() {
        std::cout << c[0] << ":" << c[1];
        std::cout << std::endl;
    }
};
// Binary complex product: multiplies the by-value copy z1 in place by z2
// and returns a copy of the result.
#pragma acc routine seq
inline Complex operator* ( Complex z1, Complex const z2 )
{
    return z1 *= z2;
}
int main ( )
{
Complex A[N][N];
Complex B[N][N];
// initialisation of A and B
Complex C[N][N];
#pragma acc data copyout(C[:N]) copyin(A[:N],B[:N])
{
#pragma acc parallel loop
for (unsigned int i = 0; i < N; i++)
{
#pragma acc loop
for (unsigned int j = 0; j < N; j++)
{
Complex accum;
#pragma acc loop seq
for (unsigned int k = 0; k < N; k++)
{
accum += A[i][k]*B[k][j];
}
C[i][j] = accum;
}
}
}
C[0][0].printme();
}
测试 2 动态数据成员:
#include <iostream>
#ifdef _OPENACC
#include <openacc.h>
#endif
#ifndef N
#define N 32
#endif
// Complex number whose two components live in a malloc'ed array:
// c[0] is the real part, c[1] the imaginary part.
class Complex {
private:
    double *c;  // owning pointer to the two components

public:
    // Default constructor: allocates the storage with malloc and sets the
    // value to zero.
    #pragma acc routine seq
    Complex ( )
    {
        c = static_cast<double*>(malloc(2 * sizeof(double)));
        c[0] = c[1] = 0.0;
    }

    // Copy constructor: allocates fresh storage and copies both components
    // of z into it (deep copy).
    Complex ( Complex const& z )
    {
        c = static_cast<double*>(malloc(2 * sizeof(double)));
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
    }

    // Destructor: releases the storage obtained from malloc.
    ~Complex ( )
    {
        free(c);
    }

    // Assignment: copies the two components of z into the existing storage.
    #pragma acc routine seq
    Complex& operator= ( Complex const z )
    {
        double const* const src = z.c;
        c[0] = src[0];
        c[1] = src[1];
        return *this;
    }

    // Component-wise addition in place.
    #pragma acc routine seq
    Complex& operator+= ( Complex const z )
    {
        c[0] = c[0] + z.c[0];
        c[1] = c[1] + z.c[1];
        return *this;
    }

    // Complex multiplication in place: (a + bi)(x + yi) = (ax - by) + (bx + ay)i.
    #pragma acc routine seq
    Complex& operator*= ( Complex const z )
    {
        double const re = c[0];
        double const im = c[1];
        c[0] = re*z.c[0] - im*z.c[1];
        c[1] = im*z.c[0] + re*z.c[1];
        return *this;
    }

    // Writes "<real>:<imaginary>" followed by a newline (and flush) to std::cout.
    void printme() {
        std::cout << c[0] << ":" << c[1];
        std::cout << std::endl;
    }

#ifdef _OPENACC
    // Per-object data-management helpers, compiled only for OpenACC builds.
    // Each one issues a single unstructured data directive for the
    // two-element array behind c.

    // `enter data create` for c[0:2].
    void acc_create() {
        #pragma acc enter data create(c[0:2])
    }

    // `enter data copyin` for c[0:2].
    void acc_copyin() {
        #pragma acc enter data copyin(c[0:2])
    }

    // `exit data delete` for c.
    void acc_delete() {
        #pragma acc exit data delete(c)
    }

    // `exit data copyout` for c[0:2].
    void acc_copyout() {
        #pragma acc exit data copyout(c[0:2])
    }
#endif
};
// Binary complex product: multiplies the by-value copy z1 in place by z2
// and returns a copy of the result.
#pragma acc routine seq
inline Complex operator* ( Complex z1, Complex const z2 )
{
    return z1 *= z2;
}
int main ( )
{
Complex A[N][N];
Complex B[N][N];
// initialisation of A and B
Complex C[N][N];
#ifdef _OPENACC
#pragma acc enter data create(A[0:N][0:N],B[0:N][0:N],C[0:N][0:N])
for (unsigned int i = 0; i < N; i++)
{
for (unsigned int j = 0; j < N; j++) {
A[i][j].acc_copyin();
B[i][j].acc_copyin();
C[i][j].acc_create();
}
}
#endif
#pragma acc parallel loop present(A,B,C)
for (unsigned int i = 0; i < N; i++)
{
#pragma acc loop
for (unsigned int j = 0; j < N; j++)
{
Complex accum;
#pragma acc loop seq
for (unsigned int k = 0; k < N; k++)
{
accum += A[i][k]*B[k][j];
}
C[i][j] = accum;
}
}
#ifdef _OPENACC
for (unsigned int i = 0; i < N; i++)
{
for (unsigned int j = 0; j < N; j++) {
A[i][j].acc_delete();
B[i][j].acc_delete();
C[i][j].acc_copyout();
}
}
#pragma acc exit data delete(A[0:N][0:N],B[0:N][0:N],C[0:N][0:N])
#endif
C[0][0].printme();
}