在 OpenACC 中合并数组结果
Join array results in OpenACC
我正在编写存在数组依赖的 OpenACC 代码。内层循环的每次迭代都可能更新数组的同一位置。代码如下:
// Serial reference version.
// For every i in 1..n, run schoolbook long division of 1 by i and add each
// quotient produced at step k into digits[k]. Step 0 yields the integer part,
// later steps yield successive decimal places. Entries are plain sums: no
// carry propagation happens here, so a slot may hold a value above 9.
long unsigned int digits[d + 11];
// Zero the accumulator before summing into it.
for (long unsigned int digit = 0; digit < d + 11; ++digit)
digits[digit] = 0;
for (long unsigned int i = 1; i <= n; ++i) {
// Long division of 1 by i starts with a remainder of 1.
long unsigned int remainder = 1;
// Stop early once the remainder reaches zero: all further quotients would be 0.
for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
long unsigned int div = remainder / i;
long unsigned int mod = remainder % i;
// Different values of i add into the same digits[digit]; this is the
// cross-iteration dependency that has to be handled once the i loop is
// parallelized.
digits[digit] += div; // here
// Bring down the next decimal place.
remainder = mod * 10;
}
}
OpenMP版本写法如下:
// OpenMP version: every thread accumulates into its own digit_local array and
// the per-thread arrays are then added into the shared digits array.
#pragma omp parallel private(i)
{
// Declared inside the parallel region, so each thread has its own copy.
long unsigned int digit_local[d+11];
// Clear this thread's accumulator.
for(i=0;i<d+11;i++)
digit_local[i] = 0;
// The iterations of the i loop are divided among the threads of the team.
#pragma omp for
for (i = 1; i <= n; ++i) {
// Long division of 1 by i starts with a remainder of 1.
long unsigned int remainder = 1;
// Stop early once the remainder reaches zero.
for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
long unsigned int div = remainder / i;
long unsigned int mod = remainder % i;
// Thread-local accumulation: no synchronization is needed here.
digit_local[digit] += div;
remainder = mod * 10;
}
}
// Merge step: the critical construct lets only one thread at a time run the
// loop below, so updates to the shared digits array cannot interleave.
#pragma omp critical
for(long unsigned int digit = 0; digit < d+11; ++digit)
digits[digit] += digit_local[digit];
}
在 OpenACC 中,private 子句可以用于数组,但我不知道如何将私有数组合并到全局数组中。
谢谢。
您可以使用 OpenACC 的 "atomic update" 指令。
// Perform the read-modify-write of digits[digit] atomically so that
// concurrent iterations updating the same slot cannot lose an update.
#pragma acc atomic update
digits[digit] += div; // here
或者,您可以执行与您的 OpenMP 版本类似的操作。
// Alternative without atomics: every outer iteration i writes into its own
// column of a scratch matrix, and a second kernel sums each row into digits[].
//
// Column (i - 1) belongs to iteration i, so the n columns cover i = 1..n
// exactly. Indexing the column with i itself would touch column n, which is
// one past the end of the array.
//
// NOTE(review): the scratch matrix holds (d + 11) * n elements of automatic
// storage; for large n this may not fit on the host stack -- confirm the
// problem size or switch to heap allocation in production code.
long unsigned int digit_local[d + 11][n];
#pragma acc data create(digit_local) copyout(digits)
{
    // Phase 1: one long division per value of i, each confined to its own column.
    #pragma acc parallel loop gang vector
    for (long unsigned int i = 1; i <= n; ++i) {
        // Clear the whole column first: the division loop below may stop
        // early, and the untouched slots must contribute 0 to the sums.
        for (long unsigned int j = 0; j < d + 11; ++j) {
            digit_local[j][i - 1] = 0;
        }

        // Long division of 1 by i starts with a remainder of 1.
        long unsigned int remainder = 1;
        for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
            long unsigned int div = remainder / i;
            long unsigned int mod = remainder % i;
            digit_local[digit][i - 1] += div;
            // Bring down the next decimal place.
            remainder = mod * 10;
        }
    }

    // Phase 2: sum each row of the scratch matrix into digits[].
    #pragma acc parallel loop gang
    for (long unsigned int digit = 0; digit < d + 11; ++digit) {
        long unsigned int dsum = 0;
        #pragma acc loop vector reduction(+:dsum)
        for (long unsigned int col = 0; col < n; ++col) {
            dsum += digit_local[digit][col];
        }
        // Every element of digits is assigned here, which is why copyout
        // (no copy-in) is sufficient on the data construct.
        digits[digit] = dsum;
    }
}
不过,我不确定这是否会加速。
希望这对您有所帮助,
Mat
我正在编写存在数组依赖的 OpenACC 代码。内层循环的每次迭代都可能更新数组的同一位置。代码如下:
// Serial reference version.
// For every i in 1..n, run schoolbook long division of 1 by i and add each
// quotient produced at step k into digits[k]. Step 0 yields the integer part,
// later steps yield successive decimal places. Entries are plain sums: no
// carry propagation happens here, so a slot may hold a value above 9.
long unsigned int digits[d + 11];
// Zero the accumulator before summing into it.
for (long unsigned int digit = 0; digit < d + 11; ++digit)
digits[digit] = 0;
for (long unsigned int i = 1; i <= n; ++i) {
// Long division of 1 by i starts with a remainder of 1.
long unsigned int remainder = 1;
// Stop early once the remainder reaches zero: all further quotients would be 0.
for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
long unsigned int div = remainder / i;
long unsigned int mod = remainder % i;
// Different values of i add into the same digits[digit]; this is the
// cross-iteration dependency that has to be handled once the i loop is
// parallelized.
digits[digit] += div; // here
// Bring down the next decimal place.
remainder = mod * 10;
}
}
OpenMP版本写法如下:
// OpenMP version: every thread accumulates into its own digit_local array and
// the per-thread arrays are then added into the shared digits array.
#pragma omp parallel private(i)
{
// Declared inside the parallel region, so each thread has its own copy.
long unsigned int digit_local[d+11];
// Clear this thread's accumulator.
for(i=0;i<d+11;i++)
digit_local[i] = 0;
// The iterations of the i loop are divided among the threads of the team.
#pragma omp for
for (i = 1; i <= n; ++i) {
// Long division of 1 by i starts with a remainder of 1.
long unsigned int remainder = 1;
// Stop early once the remainder reaches zero.
for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
long unsigned int div = remainder / i;
long unsigned int mod = remainder % i;
// Thread-local accumulation: no synchronization is needed here.
digit_local[digit] += div;
remainder = mod * 10;
}
}
// Merge step: the critical construct lets only one thread at a time run the
// loop below, so updates to the shared digits array cannot interleave.
#pragma omp critical
for(long unsigned int digit = 0; digit < d+11; ++digit)
digits[digit] += digit_local[digit];
}
在 OpenACC 中,private 子句可以用于数组,但我不知道如何将私有数组合并到全局数组中。
谢谢。
您可以使用 OpenACC 的 "atomic update" 指令。
// Perform the read-modify-write of digits[digit] atomically so that
// concurrent iterations updating the same slot cannot lose an update.
#pragma acc atomic update
digits[digit] += div; // here
或者,您可以执行与您的 OpenMP 版本类似的操作。
// Alternative without atomics: every outer iteration i writes into its own
// column of a scratch matrix, and a second kernel sums each row into digits[].
//
// Column (i - 1) belongs to iteration i, so the n columns cover i = 1..n
// exactly. Indexing the column with i itself would touch column n, which is
// one past the end of the array.
//
// NOTE(review): the scratch matrix holds (d + 11) * n elements of automatic
// storage; for large n this may not fit on the host stack -- confirm the
// problem size or switch to heap allocation in production code.
long unsigned int digit_local[d + 11][n];
#pragma acc data create(digit_local) copyout(digits)
{
    // Phase 1: one long division per value of i, each confined to its own column.
    #pragma acc parallel loop gang vector
    for (long unsigned int i = 1; i <= n; ++i) {
        // Clear the whole column first: the division loop below may stop
        // early, and the untouched slots must contribute 0 to the sums.
        for (long unsigned int j = 0; j < d + 11; ++j) {
            digit_local[j][i - 1] = 0;
        }

        // Long division of 1 by i starts with a remainder of 1.
        long unsigned int remainder = 1;
        for (long unsigned int digit = 0; digit < d + 11 && remainder; ++digit) {
            long unsigned int div = remainder / i;
            long unsigned int mod = remainder % i;
            digit_local[digit][i - 1] += div;
            // Bring down the next decimal place.
            remainder = mod * 10;
        }
    }

    // Phase 2: sum each row of the scratch matrix into digits[].
    #pragma acc parallel loop gang
    for (long unsigned int digit = 0; digit < d + 11; ++digit) {
        long unsigned int dsum = 0;
        #pragma acc loop vector reduction(+:dsum)
        for (long unsigned int col = 0; col < n; ++col) {
            dsum += digit_local[digit][col];
        }
        // Every element of digits is assigned here, which is why copyout
        // (no copy-in) is sufficient on the data construct.
        digits[digit] = dsum;
    }
}
不过,我不确定这是否会加速。
希望这对您有所帮助, Mat