如何使用 OpenACC 在 C 中执行结构的二维动态数组的手动深度复制
How to perform manual deep copy of 2D dynamic array of struct in C using OpenACC
我正在尝试使用 OpenACC 修改现有的粒子方法代码,使其能在 GPU 上运行。现有代码使用了 C 语言中由 struct 构成的二维动态数组。我需要将这些结构复制到 GPU 以进行后续计算。代码示例如下:
/* One bucket: its particle list together with the number of entries. */
typedef struct
{
    /* list of the particles in this bucket */
    int *list;
    /* total number of particles in this bucket */
    int count;
} structBucket;
/* Domain descriptor: a 2D arrangement of buckets plus its extents. */
typedef struct
{
    /* array of row pointers; elements are addressed as bucket[iX][iY] */
    structBucket **bucket;
    /* number of buckets in the x- and y-dimensions */
    int numberOfBuckets[2];
} structDomain;
// Host-side setup of the bucket grid (excerpt from a larger program).
// XDIM and YDIM index numberOfBuckets; their definitions are not part of this excerpt.
structDomain domain;
// Allocate memory for **bucket
domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
int iX,iY, capacity;
// Outer array: one row pointer per bucket in the x-dimension.
domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );
// Each row is a separate allocation holding numberOfBuckets[YDIM] bucket structs.
for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);
// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list
// Every bucket gets a list sized by its own count, so lists differ in length.
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
capacity = domain.bucket[iX][iY].count;
// Buckets whose count is not positive get no allocation; their list
// pointer is not assigned anywhere in this excerpt.
if (capacity > 0)
{
domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
}
}
}
在查阅了互联网上的各种资料后,我提出了以下解决方案(可能完全是错误的):
// It is needed to create the memory for **bucket and *list on GPU.
// Step 1: copy the top-level struct (its pointer member and the extents) to the device.
#pragma acc enter data copyin(domain)
// Step 2: a separate copyin of the bucket pointer member itself.
#pragma acc enter data copyin(domain.bucket)
// Step 3: create device storage for the whole 2D bucket array; "create"
// allocates on the device without transferring host values.
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
// Step 4: create device storage for each bucket's list, sized by the host-side count.
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
// NOTE(review): this runs for every bucket, including those whose count is
// not positive, and no directive here sends the count values to the device.
#pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
}
}
请就如何将 **bucket 和 *list 手动深度复制到 GPU 内存给出建议。我的解决方案正确吗?有人可以针对上述结构的手动深度复制提出改进意见或更好的解决方案吗?
我在 Windows 10 上使用 PGI 19.4 编译器。
非常感谢
很接近了。我的做法只有两点不同:不单独创建 "domain.bucket"(即去掉对它的那条 copyin),并且更新每个存储桶的 count,以便设备端也拥有这一信息。此外,由于 update/copy 操作都是浅拷贝,请确保只更新结构中的 list 数组或标量成员,否则可能会覆盖设备端或主机端的指针。下面是一个例子。虽然我使用的是 Linux,但除了可执行文件名称外,代码应该是一样的。
% cat test.c
#include <stdio.h>
#include <stdlib.h>
/* One bucket: its particle list together with the number of entries. */
typedef struct {
    int *list;  /* list of the particles in this bucket */
    int count;  /* total number of particles in this bucket */
} structBucket;

/* Domain descriptor: a 2D arrangement of buckets, addressed as bucket[iX][iY]. */
typedef struct {
    structBucket **bucket;
    int numberOfBuckets[2];  /* number of buckets in the x- and y-dimensions */
} structDomain;

/*
 * Indices into structDomain.numberOfBuckets. That array has exactly two
 * elements, so 0 and 1 are the only valid indices; a larger value would read
 * and write outside the array, and using the same value for both macros
 * would make the two dimensions share a single slot.
 */
#define XDIM 0
#define YDIM 1

/* Allocate `size` bytes or terminate the program with a diagnostic. */
static void *allocOrDie(size_t size)
{
    void *p = malloc(size);
    if (p == NULL) {
        fprintf(stderr, "test: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Manual deep copy of a 2D dynamic array of structs to the device.
 *
 * Order of operations:
 *   1. build the bucket grid on the host and fill in every count;
 *   2. copy the top-level struct to the device, then create the bucket grid
 *      on the device;
 *   3. per bucket, push the scalar count to the device and create the
 *      device-side list;
 *   4. fill the lists in a device kernel;
 *   5. copy a few lists back and print them;
 *   6. release device data children-first, then free host memory.
 *
 * Returns 0 on success; terminates with EXIT_FAILURE if a host allocation
 * fails.
 */
int main(void)
{
    /* At most this many buckets per dimension are copied back and printed. */
    enum { PRINT_LIMIT = 5 };

    structDomain domain;
    int iX, iY, capacity;

    /* Allocate memory for **bucket */
    domain.numberOfBuckets[XDIM] = 10;
    domain.numberOfBuckets[YDIM] = 5;

    domain.bucket = allocOrDie(sizeof *domain.bucket * (size_t)domain.numberOfBuckets[XDIM]);
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        domain.bucket[iX] = allocOrDie(sizeof *domain.bucket[iX] * (size_t)domain.numberOfBuckets[YDIM]);
    }

    /* Calculate domain.bucket[iX][iY].count here using some logic */
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            domain.bucket[iX][iY].count = iX * domain.numberOfBuckets[YDIM] + iY;
            /* Empty buckets never get a list; keep their pointer well defined. */
            domain.bucket[iX][iY].list = NULL;
        }
    }

    /* Parent first: the struct itself, then device storage for the bucket grid. */
    #pragma acc enter data copyin(domain)
    #pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])

    /* Allocate memory for *list */
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            capacity = domain.bucket[iX][iY].count;
            /*
             * Updates are shallow, so only the scalar member is sent. Updating
             * a whole bucket would overwrite the device-side list pointer with
             * the host address.
             */
            #pragma acc update device(domain.bucket[iX][iY].count)
            if (capacity > 0) {
                domain.bucket[iX][iY].list = allocOrDie(sizeof *domain.bucket[iX][iY].list * (size_t)capacity);
                #pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
            }
        }
    }

    /* Fill every list on the device: entry i of each list receives the value i. */
    #pragma acc parallel loop gang collapse(2) present(domain)
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
                #pragma acc loop vector
                for (int i = 0; i < capacity; ++i) {
                    domain.bucket[iX][iY].list[i] = i;
                }
            }
        }
    }

    /* Never print past the end of the grid, whatever its extents are. */
    const int printX = domain.numberOfBuckets[XDIM] < PRINT_LIMIT ? domain.numberOfBuckets[XDIM] : PRINT_LIMIT;
    const int printY = domain.numberOfBuckets[YDIM] < PRINT_LIMIT ? domain.numberOfBuckets[YDIM] : PRINT_LIMIT;

    for (iX = 0; iX < printX; iX++) {
        for (iY = 0; iY < printY; iY++) {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
                /* Bring back only the list contents, not the enclosing struct. */
                #pragma acc update host(domain.bucket[iX][iY].list[:capacity])
                printf("iX=%d iY=%d Cnt=%d\n\t", iX, iY, capacity);
                for (int i = 0; i < capacity; ++i) {
                    printf("%d ", domain.bucket[iX][iY].list[i]);
                }
                printf("\n");
            }
        }
    }

    /* Tear down in the reverse order of creation: lists, bucket grid, struct. */
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
                #pragma acc exit data delete(domain.bucket[iX][iY].list[:capacity])
                free(domain.bucket[iX][iY].list);
                domain.bucket[iX][iY].list = NULL;
            }
        }
    }
    #pragma acc exit data delete(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
    #pragma acc exit data delete(domain)

    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        free(domain.bucket[iX]);
    }
    free(domain.bucket);

    return 0;
}
% pgcc test.c -ta=tesla -Minfo=accel -V19.4
main:
40, Generating enter data copyin(domain)
41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
49, Generating update device(domain.bucket->->count)
52, Generating enter data create(domain.bucket->->list[:capacity])
57, Generating present(domain)
Generating Tesla code
58, #pragma acc loop gang collapse(2) /* blockIdx.x */
60, /* blockIdx.x collapsed */
65, #pragma acc loop vector(128) /* threadIdx.x */
65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
Loop is parallelizable
78, Generating update self(domain.bucket->->list[:capacity])
% a.out
iX=0 iY=1 Cnt=1
0
iX=0 iY=2 Cnt=2
0 1
iX=0 iY=3 Cnt=3
0 1 2
iX=0 iY=4 Cnt=4
0 1 2 3
iX=1 iY=0 Cnt=5
0 1 2 3 4
iX=1 iY=1 Cnt=6
0 1 2 3 4 5
iX=1 iY=2 Cnt=7
0 1 2 3 4 5 6
iX=1 iY=3 Cnt=8
0 1 2 3 4 5 6 7
iX=1 iY=4 Cnt=9
0 1 2 3 4 5 6 7 8
iX=2 iY=0 Cnt=10
0 1 2 3 4 5 6 7 8 9
iX=2 iY=1 Cnt=11
0 1 2 3 4 5 6 7 8 9 10
iX=2 iY=2 Cnt=12
0 1 2 3 4 5 6 7 8 9 10 11
iX=2 iY=3 Cnt=13
0 1 2 3 4 5 6 7 8 9 10 11 12
iX=2 iY=4 Cnt=14
0 1 2 3 4 5 6 7 8 9 10 11 12 13
iX=3 iY=0 Cnt=15
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
iX=3 iY=1 Cnt=16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
iX=3 iY=2 Cnt=17
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
iX=3 iY=3 Cnt=18
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
iX=3 iY=4 Cnt=19
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iX=4 iY=0 Cnt=20
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
iX=4 iY=1 Cnt=21
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
iX=4 iY=2 Cnt=22
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
iX=4 iY=3 Cnt=23
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
iX=4 iY=4 Cnt=24
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
我正在尝试使用 OpenACC 修改现有的粒子方法代码,使其能在 GPU 上运行。现有代码使用了 C 语言中由 struct 构成的二维动态数组。我需要将这些结构复制到 GPU 以进行后续计算。代码示例如下:
/* One bucket: its particle list together with the number of entries. */
typedef struct
{
    /* list of the particles in this bucket */
    int *list;
    /* total number of particles in this bucket */
    int count;
} structBucket;
/* Domain descriptor: a 2D arrangement of buckets plus its extents. */
typedef struct
{
    /* array of row pointers; elements are addressed as bucket[iX][iY] */
    structBucket **bucket;
    /* number of buckets in the x- and y-dimensions */
    int numberOfBuckets[2];
} structDomain;
// Host-side setup of the bucket grid (excerpt from a larger program).
// XDIM and YDIM index numberOfBuckets; their definitions are not part of this excerpt.
structDomain domain;
// Allocate memory for **bucket
domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;
int iX,iY, capacity;
// Outer array: one row pointer per bucket in the x-dimension.
domain.bucket = (structBucket**)malloc( sizeof(structBucket*) * domain.numberOfBuckets[XDIM] );
// Each row is a separate allocation holding numberOfBuckets[YDIM] bucket structs.
for (iX=0 ; iX < domain.numberOfBuckets[XDIM] ; iX++)
domain.bucket[iX] = (structBucket*)malloc( sizeof(structBucket) * domain.numberOfBuckets[YDIM]);
// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list
// Every bucket gets a list sized by its own count, so lists differ in length.
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
capacity = domain.bucket[iX][iY].count;
// Buckets whose count is not positive get no allocation; their list
// pointer is not assigned anywhere in this excerpt.
if (capacity > 0)
{
domain.bucket[iX][iY].list = (int *)malloc(sizeof(int) * capacity);
}
}
}
在查阅了互联网上的各种资料后,我提出了以下解决方案(可能完全是错误的):
// It is needed to create the memory for **bucket and *list on GPU.
// Step 1: copy the top-level struct (its pointer member and the extents) to the device.
#pragma acc enter data copyin(domain)
// Step 2: a separate copyin of the bucket pointer member itself.
#pragma acc enter data copyin(domain.bucket)
// Step 3: create device storage for the whole 2D bucket array; "create"
// allocates on the device without transferring host values.
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
// Step 4: create device storage for each bucket's list, sized by the host-side count.
for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
{
for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
{
// NOTE(review): this runs for every bucket, including those whose count is
// not positive, and no directive here sends the count values to the device.
#pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
}
}
请就如何将 **bucket 和 *list 手动深度复制到 GPU 内存给出建议。我的解决方案正确吗?有人可以针对上述结构的手动深度复制提出改进意见或更好的解决方案吗?
我在 Windows 10 上使用 PGI 19.4 编译器。 非常感谢
很接近了。我的做法只有两点不同:不单独创建 "domain.bucket"(即去掉对它的那条 copyin),并且更新每个存储桶的 count,以便设备端也拥有这一信息。此外,由于 update/copy 操作都是浅拷贝,请确保只更新结构中的 list 数组或标量成员,否则可能会覆盖设备端或主机端的指针。下面是一个例子。虽然我使用的是 Linux,但除了可执行文件名称外,代码应该是一样的。
% cat test.c
#include <stdio.h>
#include <stdlib.h>
/* One bucket: its particle list together with the number of entries. */
typedef struct {
    int *list;  /* list of the particles in this bucket */
    int count;  /* total number of particles in this bucket */
} structBucket;

/* Domain descriptor: a 2D arrangement of buckets, addressed as bucket[iX][iY]. */
typedef struct {
    structBucket **bucket;
    int numberOfBuckets[2];  /* number of buckets in the x- and y-dimensions */
} structDomain;

/*
 * Indices into structDomain.numberOfBuckets. That array has exactly two
 * elements, so 0 and 1 are the only valid indices; a larger value would read
 * and write outside the array, and using the same value for both macros
 * would make the two dimensions share a single slot.
 */
#define XDIM 0
#define YDIM 1

/* Allocate `size` bytes or terminate the program with a diagnostic. */
static void *allocOrDie(size_t size)
{
    void *p = malloc(size);
    if (p == NULL) {
        fprintf(stderr, "test: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Manual deep copy of a 2D dynamic array of structs to the device.
 *
 * Order of operations:
 *   1. build the bucket grid on the host and fill in every count;
 *   2. copy the top-level struct to the device, then create the bucket grid
 *      on the device;
 *   3. per bucket, push the scalar count to the device and create the
 *      device-side list;
 *   4. fill the lists in a device kernel;
 *   5. copy a few lists back and print them;
 *   6. release device data children-first, then free host memory.
 *
 * Returns 0 on success; terminates with EXIT_FAILURE if a host allocation
 * fails.
 */
int main(void)
{
    /* At most this many buckets per dimension are copied back and printed. */
    enum { PRINT_LIMIT = 5 };

    structDomain domain;
    int iX, iY, capacity;

    /* Allocate memory for **bucket */
    domain.numberOfBuckets[XDIM] = 10;
    domain.numberOfBuckets[YDIM] = 5;

    domain.bucket = allocOrDie(sizeof *domain.bucket * (size_t)domain.numberOfBuckets[XDIM]);
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        domain.bucket[iX] = allocOrDie(sizeof *domain.bucket[iX] * (size_t)domain.numberOfBuckets[YDIM]);
    }

    /* Calculate domain.bucket[iX][iY].count here using some logic */
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            domain.bucket[iX][iY].count = iX * domain.numberOfBuckets[YDIM] + iY;
            /* Empty buckets never get a list; keep their pointer well defined. */
            domain.bucket[iX][iY].list = NULL;
        }
    }

    /* Parent first: the struct itself, then device storage for the bucket grid. */
    #pragma acc enter data copyin(domain)
    #pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])

    /* Allocate memory for *list */
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            capacity = domain.bucket[iX][iY].count;
            /*
             * Updates are shallow, so only the scalar member is sent. Updating
             * a whole bucket would overwrite the device-side list pointer with
             * the host address.
             */
            #pragma acc update device(domain.bucket[iX][iY].count)
            if (capacity > 0) {
                domain.bucket[iX][iY].list = allocOrDie(sizeof *domain.bucket[iX][iY].list * (size_t)capacity);
                #pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
            }
        }
    }

    /* Fill every list on the device: entry i of each list receives the value i. */
    #pragma acc parallel loop gang collapse(2) present(domain)
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
                #pragma acc loop vector
                for (int i = 0; i < capacity; ++i) {
                    domain.bucket[iX][iY].list[i] = i;
                }
            }
        }
    }

    /* Never print past the end of the grid, whatever its extents are. */
    const int printX = domain.numberOfBuckets[XDIM] < PRINT_LIMIT ? domain.numberOfBuckets[XDIM] : PRINT_LIMIT;
    const int printY = domain.numberOfBuckets[YDIM] < PRINT_LIMIT ? domain.numberOfBuckets[YDIM] : PRINT_LIMIT;

    for (iX = 0; iX < printX; iX++) {
        for (iY = 0; iY < printY; iY++) {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
                /* Bring back only the list contents, not the enclosing struct. */
                #pragma acc update host(domain.bucket[iX][iY].list[:capacity])
                printf("iX=%d iY=%d Cnt=%d\n\t", iX, iY, capacity);
                for (int i = 0; i < capacity; ++i) {
                    printf("%d ", domain.bucket[iX][iY].list[i]);
                }
                printf("\n");
            }
        }
    }

    /* Tear down in the reverse order of creation: lists, bucket grid, struct. */
    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++) {
            capacity = domain.bucket[iX][iY].count;
            if (capacity > 0) {
                #pragma acc exit data delete(domain.bucket[iX][iY].list[:capacity])
                free(domain.bucket[iX][iY].list);
                domain.bucket[iX][iY].list = NULL;
            }
        }
    }
    #pragma acc exit data delete(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
    #pragma acc exit data delete(domain)

    for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
        free(domain.bucket[iX]);
    }
    free(domain.bucket);

    return 0;
}
% pgcc test.c -ta=tesla -Minfo=accel -V19.4
main:
40, Generating enter data copyin(domain)
41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
49, Generating update device(domain.bucket->->count)
52, Generating enter data create(domain.bucket->->list[:capacity])
57, Generating present(domain)
Generating Tesla code
58, #pragma acc loop gang collapse(2) /* blockIdx.x */
60, /* blockIdx.x collapsed */
65, #pragma acc loop vector(128) /* threadIdx.x */
65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
Loop is parallelizable
78, Generating update self(domain.bucket->->list[:capacity])
% a.out
iX=0 iY=1 Cnt=1
0
iX=0 iY=2 Cnt=2
0 1
iX=0 iY=3 Cnt=3
0 1 2
iX=0 iY=4 Cnt=4
0 1 2 3
iX=1 iY=0 Cnt=5
0 1 2 3 4
iX=1 iY=1 Cnt=6
0 1 2 3 4 5
iX=1 iY=2 Cnt=7
0 1 2 3 4 5 6
iX=1 iY=3 Cnt=8
0 1 2 3 4 5 6 7
iX=1 iY=4 Cnt=9
0 1 2 3 4 5 6 7 8
iX=2 iY=0 Cnt=10
0 1 2 3 4 5 6 7 8 9
iX=2 iY=1 Cnt=11
0 1 2 3 4 5 6 7 8 9 10
iX=2 iY=2 Cnt=12
0 1 2 3 4 5 6 7 8 9 10 11
iX=2 iY=3 Cnt=13
0 1 2 3 4 5 6 7 8 9 10 11 12
iX=2 iY=4 Cnt=14
0 1 2 3 4 5 6 7 8 9 10 11 12 13
iX=3 iY=0 Cnt=15
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
iX=3 iY=1 Cnt=16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
iX=3 iY=2 Cnt=17
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
iX=3 iY=3 Cnt=18
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
iX=3 iY=4 Cnt=19
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iX=4 iY=0 Cnt=20
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
iX=4 iY=1 Cnt=21
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
iX=4 iY=2 Cnt=22
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
iX=4 iY=3 Cnt=23
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
iX=4 iY=4 Cnt=24
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23