`->` 的循环携带依赖(loop-carried dependence)阻止了并行化
Loop carried dependence of `->` prevents parallelization
我有一个模型 class,它保存模型的数据并在该数据上运行多个函数。细节可能不太重要,除了它有以下设计:
- 变量存储在 class 命名空间中。
- 变量由 class 的方法之一初始化和释放。
- 变量被其他几种方法使用。
class 的 MWE 显示如下:
#include <cstdlib>
//Minimal working example: a width x height grid of elevations stored as a flat
//row-major array (cell index c = y*width + x), plus a per-cell record of which
//of its 8 neighbours is the steepest downhill one. The per-cell search is
//annotated with OpenACC directives.
//NOTE(review): the class owns `h` via new[]/delete[] but declares no copy
//operations, so copying a Model would leave two objects deleting the same array.
class Model {
private:
//Number of columns in the grid
int width;
//Number of rows in the grid
int height;
//Total number of cells (width*height)
int size;
//Offsets from a focal cell's flat index to its 8 neighbours, in the order
//-1, -width-1, -width, -width+1, +1, +width+1, +width, +width-1
//(filled in the constructor's initializer list).
int nshift[8]; //Offset from a focal cell's index to its neighbours
//NOTE(review): `restrict` is not a standard C++ keyword; presumably accepted as
//an extension by the compiler this example targets -- confirm before porting.
double *restrict h; //Digital elevation model (height), `size` entries
//Per cell: index n (0..7) into nshift/dr of the steepest downhill neighbour, or
//NO_FLOW. This is a neighbour index, not a cell index. Allocated and freed
//inside run(), not by the constructor/destructor.
int *restrict rec; //Neighbour index of the receiving cell
//Value stored in rec when no neighbour gives a positive slope
const int NO_FLOW = -1;
const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
//Divisor applied to the height difference for each neighbour, in the same order
//as nshift: 1 for the +-1 and +-width offsets, SQRT2 for the other four.
const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};
private:
//Fills every cell of h with rand()/RAND_MAX. The srand() call is commented out,
//so this class never seeds rand().
void GenerateRandomTerrain(){
//srand(std::random_device()());
for(int y=0;y<height;y++)
for(int x=0;x<width;x++){
//Flat index of cell (x,y)
const int c = y*width+x;
h[c] = rand()/(double)RAND_MAX;
}
}
public:
//Stores the grid dimensions, allocates h (width0*height0 doubles) and fills it
//with random values. nshift is filled in the initializer list from width0.
//rec is left unset until run() is called.
Model(const int width0, const int height0)
: nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
{
width = width0;
height = height0;
size = width*height;
h = new double[size];
GenerateRandomTerrain();
}
//Releases h. rec is released by run() itself.
~Model(){
delete[] h;
}
private:
//For every cell with 2 <= y < height-2 and 2 <= x < width-2, finds the neighbour
//with the largest positive slope (h[c]-h[neighbour])/dr[n] and stores its index
//n in rec[c]; cells with no positive slope get NO_FLOW. Cells in the 2-wide
//border are never written. Ties keep the lowest n because the comparison is
//strict.
//The present(...) clause expects the listed data to be on the device already;
//run() places it there with `enter data`.
//NOTE(review): the loop bounds are data members read through `this`, which the
//compiler may be unable to treat as loop-invariant -- copying them to local
//variables before the directive is the usual remedy.
void FindDownstream(){
//! computing receiver array
#pragma acc parallel loop collapse(2) independent present(h,rec,width,height)
for(int y=2;y<height-2;y++)
for(int x=2;x<width-2;x++){
//Flat index of the focal cell
const int c = y*width+x;
//The slope must be greater than zero for there to be downhill flow;
//otherwise, the cell is marked NO_FLOW
double max_slope = 0;
int max_n = NO_FLOW;
//The 8-neighbour scan carries max_slope/max_n between iterations, so it is
//explicitly kept sequential.
#pragma acc loop seq
for(int n=0;n<8;n++){
double slope = (h[c] - h[c+nshift[n]])/dr[n];
if(slope>max_slope){
max_slope = slope;
max_n = n;
}
}
rec[c] = max_n;
}
}
public:
//Allocates rec, creates device copies of the object and its arrays, calls
//FindDownstream() once for each step in 0..nstep inclusive, then copies h back
//and releases the device and host storage for rec.
//rec is deleted from the device without a copyout and then freed on the host,
//so the computed values are not kept after run() returns.
void run(const int nstep){
rec = new int[size];
#pragma acc enter data copyin(h[0:size],nshift[0:8],height,width,this) create(rec[0:size])
for(int step=0;step<=nstep;step++)
FindDownstream();
#pragma acc exit data copyout(h[0:size]) delete(this,rec)
delete[] rec;
}
};
int main(int argc, char **argv){
Model model(300,300);
model.run(100);
return 0;
}
当我编译时:
pgc++ -acc -ta=tesla,pinned,cc60 -Minfo=accel -fast test.cpp -std=c++11
我收到以下警告:
51, Loop without integer trip count will be executed in sequential mode
Complex loop carried dependence of rec->,nshift prevents parallelization
Loop carried dependence of rec-> prevents parallelization
Loop carried backward dependence of rec-> prevents vectorization
互联网上的一些挖掘表明,一个典型的原因是指针别名可能导致依赖性。
我曾尝试使用 `*restrict` 和 `independent`(如上面的代码所示)告诉编译器一切正常,但它忽略了这些提示,并没有并行化循环。
如果把指针作为参数传递给函数,并适当使用 `restrict`,就可以消除这些错误提示;但出于审美上的偏好,我更希望保留当前这种(变量作为 class 成员的)设计。或者,也可以把所有方法(每个方法本质上都是一个内核)串在一起写进 `run()` 函数,但这同样不理想。
如果我在内层循环上使用 `independent`,会得到:
PGCC-W-0155-inner loop of tiled/collapsed loop nest should not have another loop directive (actual_code.cpp: 227)
但是循环似乎是并行化的。
我正在使用 PGI 17.9 进行编译。
这里的问题在于 "height" 和 "width" 是 class 的数据成员。因此,编译器必须假定外部可能存在对它们的引用,它们的值有可能在这些循环执行期间被修改。
解决方案是将值复制到局部变量,然后使用局部变量作为循环边界。
请注意,由于外层循环上有 "collapse(2)","independent" 子句已经同时作用于这两个循环。(不过 "independent" 本来就是 "parallel" 计算区域的默认行为,因此并不需要显式指定。)对多个循环进行折叠时,不允许在被折叠的循环上再使用第二个 "loop" 构造。
% cat test.cpp
#include <cstdlib>
//Minimal working example: a width x height grid of elevations stored as a flat
//row-major array (cell index c = y*width + x), plus a per-cell record of which
//of its 8 neighbours is the steepest downhill one. The per-cell search is
//annotated with OpenACC directives.
//NOTE(review): the class owns `h` via new[]/delete[] but declares no copy
//operations, so copying a Model would leave two objects deleting the same array.
class Model {
private:
//Number of columns in the grid
int width;
//Number of rows in the grid
int height;
//Total number of cells (width*height)
int size;
//Offsets from a focal cell's flat index to its 8 neighbours, in the order
//-1, -width-1, -width, -width+1, +1, +width+1, +width, +width-1
//(filled in the constructor's initializer list).
int nshift[8]; //Offset from a focal cell's index to its neighbours
//NOTE(review): `restrict` is not a standard C++ keyword; presumably accepted as
//an extension by the compiler this example targets -- confirm before porting.
double *restrict h; //Digital elevation model (height), `size` entries
//Per cell: index n (0..7) into nshift/dr of the steepest downhill neighbour, or
//NO_FLOW. This is a neighbour index, not a cell index. Allocated and freed
//inside run(), not by the constructor/destructor.
int *restrict rec; //Neighbour index of the receiving cell
//Value stored in rec when no neighbour gives a positive slope
const int NO_FLOW = -1;
const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
//Divisor applied to the height difference for each neighbour, in the same order
//as nshift: 1 for the +-1 and +-width offsets, SQRT2 for the other four.
const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};
private:
//Fills every cell of h with rand()/RAND_MAX. The srand() call is commented out,
//so this class never seeds rand().
void GenerateRandomTerrain(){
//srand(std::random_device()());
for(int y=0;y<height;y++)
for(int x=0;x<width;x++){
//Flat index of cell (x,y)
const int c = y*width+x;
h[c] = rand()/(double)RAND_MAX;
}
}
public:
//Stores the grid dimensions, allocates h (width0*height0 doubles) and fills it
//with random values. nshift is filled in the initializer list from width0.
//rec is left unset until run() is called.
Model(const int width0, const int height0) : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
{
width = width0;
height = height0;
size = width*height;
h = new double[size];
GenerateRandomTerrain();
}
//Releases h. rec is released by run() itself.
~Model(){
delete[] h;
}
private:
//For every cell with 2 <= y < height-2 and 2 <= x < width-2, finds the neighbour
//with the largest positive slope (h[c]-h[neighbour])/dr[n] and stores its index
//n in rec[c]; cells with no positive slope get NO_FLOW. Cells in the 2-wide
//border are never written. Ties keep the lowest n because the comparison is
//strict.
//The present(...) clause expects h and rec to be on the device already; run()
//places them there with `enter data`.
void FindDownstream(){
//! computing receiver array
//height and width are copied into locals so that the collapsed loops' bounds
//and the index computation use plain local scalars instead of data members
//read through `this`.
int hgt = height;
int wdt = width;
#pragma acc parallel loop collapse(2) present(h,rec)
for(int y=2;y<hgt-2;y++)
for(int x=2;x<wdt-2;x++){
//Flat index of the focal cell
const int c = y*wdt+x;
//The slope must be greater than zero for there to be downhill flow;
//otherwise, the cell is marked NO_FLOW
double max_slope = 0;
int max_n = NO_FLOW;
//The 8-neighbour scan carries max_slope/max_n between iterations, so it is
//explicitly kept sequential.
#pragma acc loop seq
for(int n=0;n<8;n++){
double slope = (h[c] - h[c+nshift[n]])/dr[n];
if(slope>max_slope){
max_slope = slope;
max_n = n;
}
}
rec[c] = max_n;
}
}
public:
//Allocates rec, creates device copies of the object and its arrays, calls
//FindDownstream() once for each step in 0..nstep inclusive, then copies h back
//and releases the device and host storage for rec.
//rec is deleted from the device without a copyout and then freed on the host,
//so the computed values are not kept after run() returns.
void run(const int nstep){
rec = new int[size];
#pragma acc enter data copyin(this,h[0:size],nshift[0:8]) create(rec[0:size])
for(int step=0;step<=nstep;step++)
FindDownstream();
#pragma acc exit data copyout(h[0:size]) delete(rec,nshift,this)
delete[] rec;
}
};
int main(int argc, char **argv){
Model model(300,300);
model.run(100);
return 0;
}
% pgc++ test.cpp -w --c++11 -Minfo=accel -ta=tesla:cc60 -V17.10; a.out
Model::FindDownstream():
49, Generating present(h[:])
Accelerator kernel generated
Generating Tesla code
51, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
52, /* blockIdx.x threadIdx.x collapsed */
61, #pragma acc loop seq
49, Generating implicit copy(this[:])
Generating present(rec[:])
61, Loop carried scalar dependence for max_slope at line 63
Model::run(int):
74, Generating enter data copyin(nshift[:],h[:size])
Generating enter data create(rec[:size])
Generating enter data copyin(this[:1])
83, Generating exit data delete(this[:1],rec[:1])
Generating exit data copyout(h[:size])
Generating exit data delete(nshift[:])
我有一个模型 class,它保存模型的数据并在该数据上运行多个函数。细节可能不太重要,除了它有以下设计:
- 变量存储在 class 命名空间中。
- 变量由 class 的方法之一初始化和释放。
- 变量被其他几种方法使用。
class 的 MWE 显示如下:
#include <cstdlib>
//Minimal working example: a width x height grid of elevations stored as a flat
//row-major array (cell index c = y*width + x), plus a per-cell record of which
//of its 8 neighbours is the steepest downhill one. The per-cell search is
//annotated with OpenACC directives.
//NOTE(review): the class owns `h` via new[]/delete[] but declares no copy
//operations, so copying a Model would leave two objects deleting the same array.
class Model {
private:
//Number of columns in the grid
int width;
//Number of rows in the grid
int height;
//Total number of cells (width*height)
int size;
//Offsets from a focal cell's flat index to its 8 neighbours, in the order
//-1, -width-1, -width, -width+1, +1, +width+1, +width, +width-1
//(filled in the constructor's initializer list).
int nshift[8]; //Offset from a focal cell's index to its neighbours
//NOTE(review): `restrict` is not a standard C++ keyword; presumably accepted as
//an extension by the compiler this example targets -- confirm before porting.
double *restrict h; //Digital elevation model (height), `size` entries
//Per cell: index n (0..7) into nshift/dr of the steepest downhill neighbour, or
//NO_FLOW. This is a neighbour index, not a cell index. Allocated and freed
//inside run(), not by the constructor/destructor.
int *restrict rec; //Neighbour index of the receiving cell
//Value stored in rec when no neighbour gives a positive slope
const int NO_FLOW = -1;
const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
//Divisor applied to the height difference for each neighbour, in the same order
//as nshift: 1 for the +-1 and +-width offsets, SQRT2 for the other four.
const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};
private:
//Fills every cell of h with rand()/RAND_MAX. The srand() call is commented out,
//so this class never seeds rand().
void GenerateRandomTerrain(){
//srand(std::random_device()());
for(int y=0;y<height;y++)
for(int x=0;x<width;x++){
//Flat index of cell (x,y)
const int c = y*width+x;
h[c] = rand()/(double)RAND_MAX;
}
}
public:
//Stores the grid dimensions, allocates h (width0*height0 doubles) and fills it
//with random values. nshift is filled in the initializer list from width0.
//rec is left unset until run() is called.
Model(const int width0, const int height0)
: nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
{
width = width0;
height = height0;
size = width*height;
h = new double[size];
GenerateRandomTerrain();
}
//Releases h. rec is released by run() itself.
~Model(){
delete[] h;
}
private:
//For every cell with 2 <= y < height-2 and 2 <= x < width-2, finds the neighbour
//with the largest positive slope (h[c]-h[neighbour])/dr[n] and stores its index
//n in rec[c]; cells with no positive slope get NO_FLOW. Cells in the 2-wide
//border are never written. Ties keep the lowest n because the comparison is
//strict.
//The present(...) clause expects the listed data to be on the device already;
//run() places it there with `enter data`.
//NOTE(review): the loop bounds are data members read through `this`, which the
//compiler may be unable to treat as loop-invariant -- copying them to local
//variables before the directive is the usual remedy.
void FindDownstream(){
//! computing receiver array
#pragma acc parallel loop collapse(2) independent present(h,rec,width,height)
for(int y=2;y<height-2;y++)
for(int x=2;x<width-2;x++){
//Flat index of the focal cell
const int c = y*width+x;
//The slope must be greater than zero for there to be downhill flow;
//otherwise, the cell is marked NO_FLOW
double max_slope = 0;
int max_n = NO_FLOW;
//The 8-neighbour scan carries max_slope/max_n between iterations, so it is
//explicitly kept sequential.
#pragma acc loop seq
for(int n=0;n<8;n++){
double slope = (h[c] - h[c+nshift[n]])/dr[n];
if(slope>max_slope){
max_slope = slope;
max_n = n;
}
}
rec[c] = max_n;
}
}
public:
//Allocates rec, creates device copies of the object and its arrays, calls
//FindDownstream() once for each step in 0..nstep inclusive, then copies h back
//and releases the device and host storage for rec.
//rec is deleted from the device without a copyout and then freed on the host,
//so the computed values are not kept after run() returns.
void run(const int nstep){
rec = new int[size];
#pragma acc enter data copyin(h[0:size],nshift[0:8],height,width,this) create(rec[0:size])
for(int step=0;step<=nstep;step++)
FindDownstream();
#pragma acc exit data copyout(h[0:size]) delete(this,rec)
delete[] rec;
}
};
int main(int argc, char **argv){
Model model(300,300);
model.run(100);
return 0;
}
当我编译时:
pgc++ -acc -ta=tesla,pinned,cc60 -Minfo=accel -fast test.cpp -std=c++11
我收到以下警告:
51, Loop without integer trip count will be executed in sequential mode
Complex loop carried dependence of rec->,nshift prevents parallelization
Loop carried dependence of rec-> prevents parallelization
Loop carried backward dependence of rec-> prevents vectorization
互联网上的一些挖掘表明,一个典型的原因是指针别名可能导致依赖性。
我曾尝试使用 `*restrict` 和 `independent`(如上面的代码所示)告诉编译器一切正常,但它忽略了这些提示,并没有并行化循环。
如果把指针作为参数传递给函数,并适当使用 `restrict`,就可以消除这些错误提示;但出于审美上的偏好,我更希望保留当前这种(变量作为 class 成员的)设计。或者,也可以把所有方法(每个方法本质上都是一个内核)串在一起写进 `run()` 函数,但这同样不理想。
如果我在内层循环上使用 `independent`,会得到:
PGCC-W-0155-inner loop of tiled/collapsed loop nest should not have another loop directive (actual_code.cpp: 227)
但是循环似乎是并行化的。
我正在使用 PGI 17.9 进行编译。
这里的问题在于 "height" 和 "width" 是 class 的数据成员。因此,编译器必须假定外部可能存在对它们的引用,它们的值有可能在这些循环执行期间被修改。
解决方案是将值复制到局部变量,然后使用局部变量作为循环边界。
请注意,由于外层循环上有 "collapse(2)","independent" 子句已经同时作用于这两个循环。(不过 "independent" 本来就是 "parallel" 计算区域的默认行为,因此并不需要显式指定。)对多个循环进行折叠时,不允许在被折叠的循环上再使用第二个 "loop" 构造。
% cat test.cpp
#include <cstdlib>
//Minimal working example: a width x height grid of elevations stored as a flat
//row-major array (cell index c = y*width + x), plus a per-cell record of which
//of its 8 neighbours is the steepest downhill one. The per-cell search is
//annotated with OpenACC directives.
//NOTE(review): the class owns `h` via new[]/delete[] but declares no copy
//operations, so copying a Model would leave two objects deleting the same array.
class Model {
private:
//Number of columns in the grid
int width;
//Number of rows in the grid
int height;
//Total number of cells (width*height)
int size;
//Offsets from a focal cell's flat index to its 8 neighbours, in the order
//-1, -width-1, -width, -width+1, +1, +width+1, +width, +width-1
//(filled in the constructor's initializer list).
int nshift[8]; //Offset from a focal cell's index to its neighbours
//NOTE(review): `restrict` is not a standard C++ keyword; presumably accepted as
//an extension by the compiler this example targets -- confirm before porting.
double *restrict h; //Digital elevation model (height), `size` entries
//Per cell: index n (0..7) into nshift/dr of the steepest downhill neighbour, or
//NO_FLOW. This is a neighbour index, not a cell index. Allocated and freed
//inside run(), not by the constructor/destructor.
int *restrict rec; //Neighbour index of the receiving cell
//Value stored in rec when no neighbour gives a positive slope
const int NO_FLOW = -1;
const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
//Divisor applied to the height difference for each neighbour, in the same order
//as nshift: 1 for the +-1 and +-width offsets, SQRT2 for the other four.
const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};
private:
//Fills every cell of h with rand()/RAND_MAX. The srand() call is commented out,
//so this class never seeds rand().
void GenerateRandomTerrain(){
//srand(std::random_device()());
for(int y=0;y<height;y++)
for(int x=0;x<width;x++){
//Flat index of cell (x,y)
const int c = y*width+x;
h[c] = rand()/(double)RAND_MAX;
}
}
public:
//Stores the grid dimensions, allocates h (width0*height0 doubles) and fills it
//with random values. nshift is filled in the initializer list from width0.
//rec is left unset until run() is called.
Model(const int width0, const int height0) : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}
{
width = width0;
height = height0;
size = width*height;
h = new double[size];
GenerateRandomTerrain();
}
//Releases h. rec is released by run() itself.
~Model(){
delete[] h;
}
private:
//For every cell with 2 <= y < height-2 and 2 <= x < width-2, finds the neighbour
//with the largest positive slope (h[c]-h[neighbour])/dr[n] and stores its index
//n in rec[c]; cells with no positive slope get NO_FLOW. Cells in the 2-wide
//border are never written. Ties keep the lowest n because the comparison is
//strict.
//The present(...) clause expects h and rec to be on the device already; run()
//places them there with `enter data`.
void FindDownstream(){
//! computing receiver array
//height and width are copied into locals so that the collapsed loops' bounds
//and the index computation use plain local scalars instead of data members
//read through `this`.
int hgt = height;
int wdt = width;
#pragma acc parallel loop collapse(2) present(h,rec)
for(int y=2;y<hgt-2;y++)
for(int x=2;x<wdt-2;x++){
//Flat index of the focal cell
const int c = y*wdt+x;
//The slope must be greater than zero for there to be downhill flow;
//otherwise, the cell is marked NO_FLOW
double max_slope = 0;
int max_n = NO_FLOW;
//The 8-neighbour scan carries max_slope/max_n between iterations, so it is
//explicitly kept sequential.
#pragma acc loop seq
for(int n=0;n<8;n++){
double slope = (h[c] - h[c+nshift[n]])/dr[n];
if(slope>max_slope){
max_slope = slope;
max_n = n;
}
}
rec[c] = max_n;
}
}
public:
//Allocates rec, creates device copies of the object and its arrays, calls
//FindDownstream() once for each step in 0..nstep inclusive, then copies h back
//and releases the device and host storage for rec.
//rec is deleted from the device without a copyout and then freed on the host,
//so the computed values are not kept after run() returns.
void run(const int nstep){
rec = new int[size];
#pragma acc enter data copyin(this,h[0:size],nshift[0:8]) create(rec[0:size])
for(int step=0;step<=nstep;step++)
FindDownstream();
#pragma acc exit data copyout(h[0:size]) delete(rec,nshift,this)
delete[] rec;
}
};
int main(int argc, char **argv){
Model model(300,300);
model.run(100);
return 0;
}
% pgc++ test.cpp -w --c++11 -Minfo=accel -ta=tesla:cc60 -V17.10; a.out
Model::FindDownstream():
49, Generating present(h[:])
Accelerator kernel generated
Generating Tesla code
51, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
52, /* blockIdx.x threadIdx.x collapsed */
61, #pragma acc loop seq
49, Generating implicit copy(this[:])
Generating present(rec[:])
61, Loop carried scalar dependence for max_slope at line 63
Model::run(int):
74, Generating enter data copyin(nshift[:],h[:size])
Generating enter data create(rec[:size])
Generating enter data copyin(this[:1])
83, Generating exit data delete(this[:1],rec[:1])
Generating exit data copyout(h[:size])
Generating exit data delete(nshift[:])