如何直接(不使用指针作为函数参数)访问 OpenAcc 计算区域中 GPU 上的数组?

How to directly (not using a pointer as a function parameter) access arrays on GPU in OpenAcc compute regions?

我有以下简单的代码,由 4 个文件组成:

//Data.h:
#ifndef DATA_H
#define DATA_H

// Number of elements in each of the global arrays below.
constexpr int N=10000000;
namespace data{
  // Global, statically allocated arrays of N floats each. They are defined
  // (not merely declared) in this header, so every translation unit that
  // includes it provides its own definition of both arrays.
  // NOTE(review): no OpenACC declare directive is attached to these globals
  // in this version of the header.
  float particles[N];
  float outputArray[N];
}
#endif

//Process.h:
#ifndef PROCESS_H
#define PROCESS_H

/// Runs the per-element operations of a ProcessImpl object over an index
/// range inside OpenACC parallel loops.
///
/// ProcessImpl is expected to provide
///   float Get1(int ind, float * outputArray) const;
///   void  Get2(int ind) const;
template <typename ProcessImpl>
class Process{
public:
  using Base_t = ProcessImpl;
  /// Stores a copy of arg; every call is forwarded to that copy.
  Process(ProcessImpl arg):fProcessImpl(arg){}
  /// For each ind in [0, N): outputArray[ind] = fProcessImpl.Get1(ind, outputArray).
  void Get1(int N, float * outputArray) const;
  /// For each ind in [0, N): calls fProcessImpl.Get2(ind).
  void Get2(int N) const;
private:
  ProcessImpl fProcessImpl;
};

template <class ProcessImpl>
void Process<ProcessImpl>::Get1(int N, float * outputArray) const
{
  // present(): no transfer is requested here; outputArray is expected to
  // already have a device copy when this function is called.
  // The implementation's Get1 takes the output buffer as its second
  // argument, so it is forwarded together with the index.
#pragma acc parallel loop gang vector present(outputArray)
  for(int ind=0; ind < N; ++ind){
    outputArray[ind]=fProcessImpl.Get1(ind, outputArray);
  }
}

template <class ProcessImpl>
void Process<ProcessImpl>::Get2(int N) const
{
  // Get2 returns nothing; whatever it does to the data it touches is the
  // only effect of this loop.
#pragma acc parallel loop gang vector
  for (int ind = 0; ind < N; ++ind){
    fProcessImpl.Get2(ind);
  }
}
#endif

//ProcessImpl.h:
#ifndef PROCESSIMPL_H
#define PROCESSIMPL_H

#include "Data.h"
using namespace data;
// Per-element operations on the global arrays of namespace data.
// Both members are declared inline here and defined out of class below.
class ProcessImpl
{
public:
  // Stores particles[ind] into outputArray[ind] and returns particles[ind+1].
  inline float Get1(int ind, float * outputArray) const;
  // Doubles particles[ind] in place.
  inline void Get2(int ind) const;
};
// Stores particles[ind] into outputArray[ind], then returns the element that
// follows it in particles.
// NOTE(review): the returned element is particles[ind+1]; for ind == N-1 this
// is one past the end of particles - confirm callers never pass the last index.
float ProcessImpl::Get1(int ind, float * outputArray) const
{
  const float current = particles[ind];
  outputArray[ind] = current;
  const int next = ind + 1;
  return particles[next];
}
// Doubles the element of the global particles array at index ind, in place.
void ProcessImpl::Get2(int ind) const
{
  particles[ind] *= 2;
}
#endif

//main.cpp:
#include <iostream>
#include "Data.h"
#include "Process.h"
#include "ProcessImpl.h"

#include <accelmath.h>
#include <openacc.h>

using namespace data;
using Process_t = Process<ProcessImpl>;
Process_t process = Process_t(typename Process_t::Base_t());

// Fills particles with 0, 1, 2, ..., prints its first 100 values, doubles
// every element through process.Get2, and prints the first 100 values again.
int main(int argc, char **argv)
{
// Structured data region: the create clause requests device storage for both
// arrays without copying the host contents to the device.
#pragma acc data create(particles,outputArray)
  {
  // Initialise particles inside an OpenACC parallel loop.
  #pragma acc parallel loop gang vector present(particles)
    for(int i=0; i<N; ++i) particles[i]=static_cast<float>(i);
  // Refresh the host copy so the values printed below are the computed ones.
  #pragma acc update host(particles)
    for(int i=0; i<100; ++i) std::cout<<particles[i]<<" ";
    std::cout<<std::endl;

    // Calls ProcessImpl::Get2 for every index in [0, N).
    process.Get2(N);

  // Refresh the host copy again before printing the doubled values.
  #pragma acc update host(particles)
    for(int i=0; i<100; ++i) std::cout<<particles[i]<<" ";
    std::cout<<std::endl;  
  }
  return 0;
}

使用 PGI 19.4 编译器在 CPU 上可以正常工作。 但我的任务是在 GPU 上启动代码。我使用 PGI 19.4 + OpenAcc。 使用简单的 CMakeLists.txt 文件和编译行(GPU Nvidia GeForce 650 Ti,计算能力 3.0):

cmake . -DCMAKE_C_COMPILER=pgcc -DCMAKE_CXX_COMPILER=pgc++
-DCMAKE_C_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"
-DCMAKE_CXX_FLAGS="-acc -Minfo=acc -mcmodel=medium -ta=tesla:cc30"

编译失败:

> Scanning dependencies of target Test
[ 50%] Building CXX object CMakeFiles/Test.dir/main.cpp.o
main:
     16, Generating create(_ZN4data11outputArrayE[:])
         Generating present(_ZN4data9particlesE[:])
         Generating create(_ZN4data9particlesE[:])
         Generating Tesla code
         18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     18, Generating update self(_ZN4data9particlesE[:])
     23, Generating update self(_ZN4data9particlesE[:])
         PGCC-W-0155-External and Static variables are not supported in acc routine - 
         _ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)
         ProcessImpl::Get2(int) const:
      4, include "ProcessImpl.h"
         18, Generating implicit acc routine seq
             Process<ProcessImpl>::Get2(int) const:
      3, include "Process.h"
         25, Generating Tesla code
             27, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
         25, Generating implicit copyin(this[:])
PGCC/x86-64 Linux 19.4-0: compilation completed with warnings
[100%] Linking CXX executable Test
nvlink error   : Undefined reference to '_ZNK11ProcessImpl4Get2Ei' in 
'CMakeFiles/Test.dir/main.cpp.o'
pgacclnk: child process exit status 2: /opt/pgi/linux86-64-llvm/19.4/bin/pgnvd
CMakeFiles/Test.dir/build.make:83: recipe for target 'Test' failed
make[2]: *** [Test] Error 2
CMakeFiles/Makefile2:72: recipe for target 'CMakeFiles/Test.dir/all' failed
make[1]: *** [CMakeFiles/Test.dir/all] Error 2
Makefile:83: recipe for target 'all' failed
make: *** [all] Error 2

使用 pggdecode 解码后发现,“_ZNK11ProcessImpl4Get2Ei”是 ProcessImpl::Get2(int) const 的修饰名(mangled name)。我从 ProcessImpl.h 中删除了 inline 关键字,并尝试在 main() 中的 #pragma acc data create(particles,outputArray) 指令上添加 copyin(process),但这没有帮助。使用 gcc 5.3.1 在 Fedora 23 上工作。

在完整代码中,我把数组 particles 和 outputArray 只包含在单个 .cpp 文件中,从而避免了它们的多重定义问题,因为 OpenAcc 不允许使用 extern 关键字。这种做法可能不是很好(如果你知道更好的做法,请提出建议),但它有效。

问题是:

如何正确地将数组 particles 和 outputArray 的 GPU 版本传递给 Process.h 中的 Get1() 和 Get2(),并使 ProcessImpl.h 中的 Get1() 和 Get2() 使用在 GPU 上分配的数组?以及如何编译这段代码?

在 OpenAcc 计算区域内的代码中,如何直接访问已复制到 GPU 的全局数组,而不必将指针作为参数传递给被调用的函数?

谢谢。

出现未定义的引用,是因为下面这条诊断信息所指出的问题导致没有生成 Get2 的设备版本:

PGCC-W-0155-External and Static variables are not supported in acc routine _ZN4data9particlesE (/home/70-gaa/source/13OpenAccTest/main.cpp: 19)

问题在于:在设备例程中直接访问的全局变量,需要在链接时存在一个已定义的设备版本,这样链接器才能把主机端变量和设备端变量关联起来。一种选择是将 "particles" 作为参数传递,但更简单的做法是把 "particles" 放进 "declare create" 指令中。

"declare" 指令会创建一个数据区域,其作用域与该指令所在的作用域单元相同。因此,把它用于全局作用域的变量时,设备上对应的副本同样具有全局作用域。

% cat Data.h
//Data.h:
#ifndef DATA_H
#define DATA_H

// Number of elements in each of the global arrays below.
constexpr int N=10000000;
namespace data{
  // Global, statically allocated arrays of N floats each.
  float particles[N];
  float outputArray[N];
// Requests a device copy of particles whose scope matches this namespace
// scope, so device routines can refer to the global directly.
// Only particles is listed here; outputArray is not part of this directive.
#pragma acc declare create(particles[:N])
}
#endif

% pgc++ -I. main.cpp -ta=tesla -Minfo=accel
main:
     17, Generating create(_ZN4data11outputArrayE[:]) [if not already present]
         Generating Tesla code
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     19, Generating update self(_ZN4data9particlesE[:])
     24, Generating update self(_ZN4data9particlesE[:])
ProcessImpl::Get2(int) const:
      5, include "ProcessImpl.h"
          19, Generating implicit acc routine seq
              Generating acc routine seq
              Generating Tesla code
Process<ProcessImpl>::Get2(int) const:
      4, include "Process.h"
          23, Generating Tesla code
              25, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
          23, Generating implicit copyin(this[:]) [if not already present]
% a.out
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198