Hybrid OpenMP+MPI: I need an explanation of this example
I found this example online, but I don't understand what exactly the master node sends when it passes A[5], for example, to the other worker nodes. Is it row 5, all elements before row 5, all elements of row 5, or something else?
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <omp.h>
#define TAG 13
int main(int argc, char *argv[]) {
//double **A, **B, **C, *tmp;
double **A, **B, **C, *tmp;
double startTime, endTime;
int numElements, offset, stripSize, myrank, numnodes, N, i, j, k;
int numThreads, chunkSize = 10;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &numnodes);
N = atoi(argv[1]);
numThreads = atoi(argv[2]); // difference from MPI: how many threads/rank?
omp_set_num_threads(numThreads); // OpenMP call to set threads per rank
if (myrank == 0) {
tmp = (double *) malloc (sizeof(double ) * N * N);
A = (double **) malloc (sizeof(double *) * N);
for (i = 0; i < N; i++)
A[i] = &tmp[i * N];
}
else {
tmp = (double *) malloc (sizeof(double ) * N * N / numnodes);
A = (double **) malloc (sizeof(double *) * N / numnodes);
for (i = 0; i < N / numnodes; i++)
A[i] = &tmp[i * N];
}
tmp = (double *) malloc (sizeof(double ) * N * N);
B = (double **) malloc (sizeof(double *) * N);
for (i = 0; i < N; i++)
B[i] = &tmp[i * N];
if (myrank == 0) {
tmp = (double *) malloc (sizeof(double ) * N * N);
C = (double **) malloc (sizeof(double *) * N);
for (i = 0; i < N; i++)
C[i] = &tmp[i * N];
}
else {
tmp = (double *) malloc (sizeof(double ) * N * N / numnodes);
C = (double **) malloc (sizeof(double *) * N / numnodes);
for (i = 0; i < N / numnodes; i++)
C[i] = &tmp[i * N];
}
if (myrank == 0) {
// initialize A and B
for (i=0; i<N; i++) {
for (j=0; j<N; j++) {
A[i][j] = 1.0;
B[i][j] = 1.0;
}
}
}
// start timer
if (myrank == 0) {
startTime = MPI_Wtime();
}
stripSize = N/numnodes;
// send each node its piece of A -- note could be done via MPI_Scatter
if (myrank == 0) {
offset = stripSize;
numElements = stripSize * N;
for (i=1; i<numnodes; i++) {
// I can't understand what the following send actually transmits:
MPI_Send(A[offset], numElements, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD);
offset += stripSize;
}
}
else { // receive my part of A
// And the same here:
MPI_Recv(A[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
// Same question for the broadcast below: what exactly of B will be sent?
// everyone gets B
MPI_Bcast(B[0], N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Let each process initialize C to zero
for (i=0; i<stripSize; i++) {
for (j=0; j<N; j++) {
C[i][j] = 0.0;
}
}
// do the work---this is the primary difference from the pure MPI program
#pragma omp parallel for shared(A,B,C,numThreads) private(i,j,k) schedule (static, chunkSize)
for (i=0; i<stripSize; i++) {
for (j=0; j<N; j++) {
for (k=0; k<N; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
// master receives from workers -- note could be done via MPI_Gather
if (myrank == 0) {
offset = stripSize;
numElements = stripSize * N;
for (i=1; i<numnodes; i++) {
MPI_Recv(C[offset], numElements, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
offset += stripSize;
}
}
else { // send my contribution to C
MPI_Send(C[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD);
}
MPI_Finalize();
return 0;
}
A, B and C are dynamic 2D arrays built with the pointer-to-pointer technique. Indexing is A[row][col]. When the last index is omitted, you get the address of the first element of that row (column zero). That is useful, because you can pass a single row using just that address plus the "width" of the matrix. Each 2D array is stored in memory as one contiguous block, row after row.
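A minimal sketch (my own illustration, not from the original post) of that layout, showing that the row pointers all point into one contiguous block:

#include <assert.h>
#include <stdlib.h>

int main(void) {
    int N = 4;                                      /* small example size */
    double *tmp = malloc(sizeof(double) * N * N);   /* one contiguous block of N*N doubles */
    double **A  = malloc(sizeof(double *) * N);     /* one pointer per row */
    for (int i = 0; i < N; i++)
        A[i] = &tmp[i * N];                         /* row i starts at offset i*N */
    A[2][3] = 7.0;
    assert(tmp[2 * N + 3] == 7.0);   /* A[i][j] is the same element as tmp[i*N + j] */
    assert(A[2] == &A[2][0]);        /* omitting the last index gives the row's start address */
    free(A);
    free(tmp);
    return 0;
}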
To send row 5 of a matrix A that has num_cols columns:
MPI_Send(A[5], num_cols, MPI_DOUBLE, ...);
Equivalently, you could write &A[5][0] to get the same address, but that is messier.
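Applied to the question's code, A[offset] is the address of row offset, and because stripSize consecutive rows form one contiguous run of stripSize * N doubles, the single send ships rows offset through offset + stripSize - 1 to worker i. An annotated sketch of that send/receive pair (same variable names as in the code above):

/* rank 0: rows offset .. offset+stripSize-1, i.e. stripSize * N
   contiguous doubles starting at the address of row `offset`        */
MPI_Send(A[offset], stripSize * N, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD);

/* worker rank i: its local A holds only stripSize rows, so the strip
   lands at the start of its own contiguous block, A[0]              */
MPI_Recv(A[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD,
         MPI_STATUS_IGNORE);

So the master never sends "row 5 plus everything before it": each worker receives exactly its own strip of stripSize full rows.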
Also, if you want to send the full 2D matrix, that is easy to do because every row is stored contiguously in memory. Just use the first row, B[0] (which also points to the first column), with N*N as the linearized length (assuming a square matrix).
To broadcast the complete N*N square matrix B:
MPI_Bcast(B[0], N*N, MPI_DOUBLE, ...);
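For completeness: the comments in the code already note that the point-to-point loops for A and C could be replaced by collectives. A sketch of what that might look like (my own variant, assuming the same buffer layout, with MPI_IN_PLACE keeping rank 0's own strip where it already is):

if (myrank == 0) {
    /* root sends stripSize*N doubles to every rank; its own strip
       (rows 0..stripSize-1 of A) stays in place                     */
    MPI_Scatter(A[0], stripSize * N, MPI_DOUBLE,
                MPI_IN_PLACE, stripSize * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
} else {
    /* workers receive their strip at the start of their smaller A   */
    MPI_Scatter(NULL, stripSize * N, MPI_DOUBLE,
                A[0], stripSize * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}

/* ... each rank computes its strip of C ... */

if (myrank == 0) {
    /* root collects the result strips; its own strip of C is already
       in the right rows, hence MPI_IN_PLACE as the send buffer       */
    MPI_Gather(MPI_IN_PLACE, stripSize * N, MPI_DOUBLE,
               C[0], stripSize * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
} else {
    MPI_Gather(C[0], stripSize * N, MPI_DOUBLE,
               NULL, stripSize * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}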