如何使用 OpenACC 在 C 中执行结构的二维动态数组的手动深度复制

How to perform manual deep copy of 2D dynamic array of struct in C using OpenACC

我正在尝试使用 OpenACC 修改现有的粒子方法代码,使其能够在 GPU 上运行。现有代码使用了 C 语言中 struct 的二维动态数组。我需要将这些结构复制到 GPU 以便进一步计算。代码示例如下:

/* A single bucket and the particles assigned to it. */
typedef struct {
  int *list;  /* list of the particles in this bucket */
  int count;  /* total number of particles in this bucket */
} structBucket;


/* The grid of buckets together with its extent in each dimension. */
typedef struct {
  structBucket **bucket;   /* buckets, addressed as bucket[iX][iY] */
  int numberOfBuckets[2];  /* number of buckets in the x- and y-dimensions */
} structDomain;

/* Host-side set-up of the domain: the bucket grid first, then each bucket's particle list. */
structDomain domain;

// Allocate memory for **bucket
// NOTE: XDIM and YDIM are not defined in this snippet; because they index
// numberOfBuckets[2] they have to evaluate to 0 and 1.
  domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
  int iX,iY, capacity;

  // First level: one row pointer per bucket in the x-dimension.
  domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );

   // Second level: each row holds one structBucket per bucket in the y-dimension.
   for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++) 
      domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);

// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        // The list is sized from the count computed above.
        capacity = domain.bucket[iX][iY].count;

        // Buckets with no particles get no allocation; their list pointer
        // is not assigned anywhere in this snippet.
        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
        }
    }
  }

在查阅了互联网上的各种资料后,我想出了以下解决方案(可能完全错误):

// It is needed to create the memory for **bucket and *list on GPU. 
// Step 1: copy the domain struct itself to the device.
#pragma acc enter data copyin(domain)
// Step 2: a separate copyin of the bucket pointer member.
#pragma acc enter data copyin(domain.bucket)
// Step 3: create device storage for the full 2D bucket array; "create" does
// not transfer the host contents (e.g. the count values).
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
  // Step 4: create device storage for every bucket's particle list.
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        // NOTE(review): there is no "count > 0" guard here, although the host
        // only allocates list for buckets with count > 0 -- confirm that
        // creating a zero-length section of an unassigned pointer is intended.
        #pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
    }
  }

请就如何将 **bucket 和 *list 手动深度复制到 GPU 内存给出建议。我的解决方案正确吗?有人能针对上述结构的手动深度复制提出改进意见或更好的方案吗?

我在 Windows 10 上使用 PGI 19.4 编译器。 非常感谢

很接近。我唯一不同的做法是:不单独为 "domain.bucket" 指针本身建立设备副本(即省略 `copyin(domain.bucket)`),并且更新每个存储桶的 count,以便设备端拥有这一信息。此外,由于 update/copy 操作都是浅层的,请确保只更新结构中的 list 数组或标量成员,否则可能会覆盖设备端/主机端的指针。下面是一个示例。我使用的是 Linux,不过除了可执行文件名称外,代码应该相同。

% cat test.c

#include <stdio.h>
#include <stdlib.h>

/* A single bucket and the particles assigned to it. */
typedef struct {
  int *list;  /* list of the particles in this bucket */
  int count;  /* total number of particles in this bucket */
} structBucket;


/* The grid of buckets together with its extent in each dimension. */
typedef struct {
  structBucket **bucket;   /* buckets, addressed as bucket[iX][iY] */
  int numberOfBuckets[2];  /* number of buckets in the x- and y-dimensions */
} structDomain;

#define XDIM 0 /* index of the x-dimension entry in numberOfBuckets[2]; must stay within [0, 1] */
#define YDIM 1 /* index of the y-dimension entry in numberOfBuckets[2]; must differ from XDIM */

/*
 * Example of a manual deep copy of a 2D dynamic array of structs with OpenACC.
 *
 * Builds the bucket grid on the host, mirrors it on the device level by level
 * (domain struct -> 2D bucket array -> per-bucket list), fills every list in a
 * device kernel, then copies some of the lists back and prints them.
 */
int main() {

  structDomain domain;
  int iX,iY, capacity;

// Allocate memory for **bucket
// XDIM and YDIM index numberOfBuckets[2], so they must evaluate to 0 and 1.
  domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;

  // First level: one row pointer per bucket in the x-dimension.
  domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );

   // Second level: each row holds one structBucket per bucket in the y-dimension.
   for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
      domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);


// Calculate domain.bucket[iX][iY].count here using some logic
// (this example simply uses the bucket's linear index, so bucket [0][0] gets count 0)
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
       domain.bucket[iX][iY].count = iX*domain.numberOfBuckets[YDIM]+iY;
  }}
// Copy the domain struct itself to the device. The copy is shallow: it does
// not bring the bucket array along.
#pragma acc enter data copyin(domain)
// Create device storage for the whole 2D bucket array. "create" does not
// transfer host contents, which is why each count is pushed explicitly below.
#pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
// Allocate memory for *list
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
// Update only the scalar member so the device copy of this bucket knows its
// count; updating the whole struct would also overwrite its list pointer.
#pragma acc update device(domain.bucket[iX][iY].count)
        // Buckets with count 0 get no list on either side; their list pointer
        // stays unassigned.
        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
// Create the matching device list. The parent bucket is already present on
// the device, so the device-side list pointer is expected to be attached to
// this new storage (OpenACC pointer attach -- confirm for your compiler).
#pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
        }
    }
  }

// Device kernel: the two bucket loops are collapsed and distributed across
// gangs; the entries of each list are filled by the inner vector loop.
#pragma acc parallel loop gang collapse(2) present(domain)
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
        if (capacity > 0) {
#pragma acc loop vector
           for (int i = 0; i < capacity; ++i) {
                domain.bucket[iX][iY].list[i] = i;
           }
        }
   }}

  // Print results. Only the first 5 x 5 buckets are shown (hard-coded bounds),
  // not the full numberOfBuckets[XDIM] x numberOfBuckets[YDIM] grid.
  for (iX = 0; iX < 5; iX++)
  {
    for (iY = 0; iY < 5; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
        if (capacity > 0) {
// Copy just this bucket's list contents back to the host before reading them.
#pragma acc update host(domain.bucket[iX][iY].list[:capacity])
           printf("iX=%d iY=%d Cnt=%d\n\t",iX,iY,capacity);
           for (int i = 0; i < capacity; ++i) {
                printf("%d ",domain.bucket[iX][iY].list[i]);
           }
           printf("\n");
        }
   }}

  // The program ends here without freeing host memory or issuing
  // "exit data" for the device copies.
  exit(0);
}
% pgcc test.c -ta=tesla -Minfo=accel -V19.4
main:
     40, Generating enter data copyin(domain)
     41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
     49, Generating update device(domain.bucket->->count)
     52, Generating enter data create(domain.bucket->->list[:capacity])
     57, Generating present(domain)
         Generating Tesla code
         58, #pragma acc loop gang collapse(2) /* blockIdx.x */
         60,   /* blockIdx.x collapsed */
         65, #pragma acc loop vector(128) /* threadIdx.x */
     65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
         Loop is parallelizable
     78, Generating update self(domain.bucket->->list[:capacity])
% a.out
iX=0 iY=1 Cnt=1
        0
iX=0 iY=2 Cnt=2
        0 1
iX=0 iY=3 Cnt=3
        0 1 2
iX=0 iY=4 Cnt=4
        0 1 2 3
iX=1 iY=0 Cnt=5
        0 1 2 3 4
iX=1 iY=1 Cnt=6
        0 1 2 3 4 5
iX=1 iY=2 Cnt=7
        0 1 2 3 4 5 6
iX=1 iY=3 Cnt=8
        0 1 2 3 4 5 6 7
iX=1 iY=4 Cnt=9
        0 1 2 3 4 5 6 7 8
iX=2 iY=0 Cnt=10
        0 1 2 3 4 5 6 7 8 9
iX=2 iY=1 Cnt=11
        0 1 2 3 4 5 6 7 8 9 10
iX=2 iY=2 Cnt=12
        0 1 2 3 4 5 6 7 8 9 10 11
iX=2 iY=3 Cnt=13
        0 1 2 3 4 5 6 7 8 9 10 11 12
iX=2 iY=4 Cnt=14
        0 1 2 3 4 5 6 7 8 9 10 11 12 13
iX=3 iY=0 Cnt=15
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
iX=3 iY=1 Cnt=16
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
iX=3 iY=2 Cnt=17
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
iX=3 iY=3 Cnt=18
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
iX=3 iY=4 Cnt=19
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iX=4 iY=0 Cnt=20
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
iX=4 iY=1 Cnt=21
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
iX=4 iY=2 Cnt=22
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
iX=4 iY=3 Cnt=23
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
iX=4 iY=4 Cnt=24
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23