OpenMP:在 -march=native(-march=skylake-avx512)和 -O3 下,gcc 产生异常的求和结果
OpenMP: gcc causes weird summation in case of -march=native (-march=skylake-avx512) and -O3
以下代码的行为会有所不同,具体取决于 gcc 应用的优化和目标架构:
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
/*
 * Allocates one row of num_threads ints for each of the first num_tests
 * entries of testvals.
 *
 * testvals    - array with room for at least num_tests row pointers; every
 *               entry in [0, num_tests) is overwritten
 * num_tests   - number of rows to allocate
 * num_threads - number of ints per row; a non-positive count requests
 *               zero-sized rows
 *
 * The rows are left uninitialized. Aborts the program if a non-empty row
 * cannot be allocated, so callers never receive a NULL row they would go on
 * to dereference. Release the rows with free_testvals().
 */
static void malloc_testvals(int **testvals, int num_tests, int num_threads) {
    /* Convert once so a negative count cannot wrap to a huge size_t. */
    const size_t row_len = num_threads > 0 ? (size_t) num_threads : 0;

    for (int i = 0; i < num_tests; i++) {
        testvals[i] = malloc(row_len * sizeof *testvals[i]);
        /* malloc(0) may legitimately return NULL, so only a failed
         * non-empty request is treated as an error. */
        if (testvals[i] == NULL && row_len > 0) {
            fprintf(stderr, "malloc_testvals: out of memory\n");
            abort();
        }
    }
}
/*
 * Releases the first num_tests rows of testvals, as allocated by
 * malloc_testvals().
 *
 * testvals  - array of row pointers; NULL entries are skipped by free()
 * num_tests - number of rows to release
 *
 * Every released entry is reset to NULL so the caller is not left holding
 * dangling pointers; calling this function a second time is therefore
 * harmless.
 */
static void free_testvals(int **testvals, int num_tests) {
    for (int i = 0; i < num_tests; i++) {
        free(testvals[i]);
        testvals[i] = NULL;
    }
}
/*
 * Fills each of the num_tests rows of testvals with num_threads
 * pseudo-random integers in [-100, 100] and stores the total of row i in
 * sums[i].
 *
 * The rand() generator is reseeded from the current time on every call.
 */
static void make_random_testvals(int **testvals, int *sums, int num_tests, int num_threads) {
    srand(time(NULL));

    int test = 0;
    while (test < num_tests) {
        int *row = testvals[test];
        int total = 0;

        for (int slot = 0; slot < num_threads; slot++) {
            const int value = rand() % 201 - 100;
            row[slot] = value;
            total += value;
        }

        sums[test] = total;
        test++;
    }
}
/* Forward declaration so Communicator can hold pointers to the per-thread slots. */
typedef struct ThreadCommunicator_s ThreadCommunicator;

/* State shared by all threads: the table of per-thread slots. */
typedef struct {
    long num_threads;             /* number of entries in threads */
    ThreadCommunicator **threads; /* one separately allocated slot per thread */
} Communicator;

/*
 * Per-thread slot.
 *
 * Defined through its tag only: the ThreadCommunicator alias already exists
 * from the forward declaration above, and repeating the typedef is a
 * constraint violation before C11 (this file is built with -std=c99).
 */
struct ThreadCommunicator_s {
    Communicator *parent; /* communicator this slot belongs to */
    long omp_longval;     /* value this thread publishes for the sum reduction */
};
/*
 * Puts *self into its initial state: attached to parent, with the published
 * value cleared to 0.
 */
static void ThreadCommunicator_init(ThreadCommunicator* self, Communicator* parent) {
    self->parent = parent;
    self->omp_longval = 0;
}
/*
 * Sizes the communicator for omp_get_max_threads() threads and allocates one
 * initialized ThreadCommunicator slot per thread.
 *
 * self - communicator to set up; its previous contents are overwritten
 *
 * Aborts the program if any allocation fails, so the slot table never
 * contains NULL entries. Release the storage with Communicator_deinit().
 */
static void Communicator_init(Communicator* self) {
    self->num_threads = omp_get_max_threads();

    self->threads = malloc(sizeof *self->threads * self->num_threads);
    if (self->threads == NULL) {
        fprintf(stderr, "Communicator_init: out of memory\n");
        abort();
    }

    /* The index is a long to match the type of num_threads. */
    for (long rank = 0; rank < self->num_threads; rank++) {
        self->threads[rank] = malloc(sizeof *self->threads[rank]);
        if (self->threads[rank] == NULL) {
            fprintf(stderr, "Communicator_init: out of memory\n");
            abort();
        }
        ThreadCommunicator_init(self->threads[rank], self);
    }
}
/*
 * Frees every per-thread slot and then the slot table itself.
 *
 * The fields of *self are not modified, so self->threads must not be used
 * after this call.
 */
static void Communicator_deinit(Communicator* self) {
    ThreadCommunicator **slots = self->threads;
    int rank = 0;

    while (rank < self->num_threads) {
        free(slots[rank]);
        rank++;
    }
    free(slots);
}
//Sum all-reduce over the values contributed by the threads of a communicator.
//
//self  - the calling thread's own slot; its omp_longval is overwritten
//myval - the value this thread contributes to the sum
//
//Returns the sum of omp_longval over the first self->parent->num_threads
//entries of self->parent->threads.
//
//The function contains barrier and single directives without an enclosing
//parallel construct of its own, so it is meant to be called from inside a
//parallel region by every thread of the team.
static long ThreadCommunicator_allreduce_sum_l(ThreadCommunicator* self, long myval) {
//publish this thread's contribution in its own slot
self->omp_longval = myval;
//wait until every thread has published before any slot is read
#pragma omp barrier
//diagnostic output from one thread only: stored thread count vs. actual team size
#pragma omp single
{
printf("self->parent->num_threads = %ld\n", self->parent->num_threads);
printf("omp_get_num_threads() = %d\n", omp_get_num_threads());
}
//------------------------------------------------------------------------------------------------------------------
//NOTE: the wrong sums this reproducer reports disappear when self->parent->num_threads
//in the loop condition below is replaced by omp_get_num_threads().
//------------------------------------------------------------------------------------------------------------------
long sum = 0;
//every thread adds up the published values of all slots
for (int rank = 0; rank < self->parent->num_threads; rank++) {
sum += self->parent->threads[rank]->omp_longval;
}
//presumably keeps a thread from overwriting its slot in a later call while
//other threads are still summing -- TODO confirm
#pragma omp barrier
return sum;
}
#define NUM_TESTS 1
/*
 * Test driver: sets up the communicator and random per-thread inputs, lets
 * every thread of a parallel region run the sum all-reduce, and accumulates
 * the absolute deviation of each thread's result from the precomputed sum.
 * A message is printed if the accumulated deviation is non-zero.
 */
int main() {
    Communicator communicator;
    int *testvals[NUM_TESTS];
    int sums[NUM_TESTS];      /* expected sum for each test */
    unsigned long error = 0;  /* accumulated deviation over all threads */

    Communicator_init(&communicator);
    malloc_testvals(testvals, NUM_TESTS, communicator.num_threads);
    make_random_testvals(testvals, sums, NUM_TESTS, communicator.num_threads);

    #pragma omp parallel
    {
        /* The inputs were generated for communicator.num_threads threads,
         * but the all-reduce only fills the slots of the threads that
         * actually run, so a smaller team cannot be tested here. */
        if (omp_get_num_threads() != communicator.num_threads) {
            printf("This is not supported in this test.\n");
            abort();
        }

        const int tid = omp_get_thread_num();
        ThreadCommunicator *thread_comm = communicator.threads[tid];

        for (int test = 0; test < NUM_TESTS; test++) {
            long reduced = ThreadCommunicator_allreduce_sum_l(thread_comm, testvals[test][tid]);
            unsigned long deviation = (unsigned long) labs(reduced - sums[test]);
            #pragma omp atomic
            error += deviation;
        }
    }

    if (error != 0) {
        printf("Error occurred (error = %lu)!\n", error);
    }

    free_testvals(testvals, NUM_TESTS);
    Communicator_deinit(&communicator);
}
由
编译
gcc -Wall -std=c99 -fopenmp -O3 -march=skylake-avx512
或
gcc -Wall -std=c99 -fopenmp -O3 -march=native
在
Intel(R) Xeon(R) Gold 6230 CPU
与
gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5)
会产生类似如下的输出:
self->parent->num_threads = 16
omp_get_num_threads() = 16
Error occurred (error = 8070309797393041808)!
有趣的是,如果应用以下更改之一,此错误就会消失:
- 将
-O3
替换为-O2
- 从选项列表中删除
-march=...
- 将
self->parent->num_threads
替换为 omp_get_num_threads()
,如代码中所示。
- 使用 -march=native 在另一台机器上编译(当然,我并不完全清楚哪些系统会受影响、哪些不会)
我想问的是:这究竟是编译器错误,还是我的代码不符合 C 或 OpenMP 规范(例如存在数据竞争)?非常感谢任何帮助!
编辑:
根据评论 (@Laci) 更新了代码。
事实证明,更新 binutils 后问题就消失了。因此,该问题被认定为编译器错误(compiler bug)。
以下代码的行为会有所不同,具体取决于 gcc 应用的优化和目标架构:
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
/*
 * Allocates one row of num_threads ints for each of the first num_tests
 * entries of testvals.
 *
 * testvals    - array with room for at least num_tests row pointers; every
 *               entry in [0, num_tests) is overwritten
 * num_tests   - number of rows to allocate
 * num_threads - number of ints per row; a non-positive count requests
 *               zero-sized rows
 *
 * The rows are left uninitialized. Aborts the program if a non-empty row
 * cannot be allocated, so callers never receive a NULL row they would go on
 * to dereference. Release the rows with free_testvals().
 */
static void malloc_testvals(int **testvals, int num_tests, int num_threads) {
    /* Convert once so a negative count cannot wrap to a huge size_t. */
    const size_t row_len = num_threads > 0 ? (size_t) num_threads : 0;

    for (int i = 0; i < num_tests; i++) {
        testvals[i] = malloc(row_len * sizeof *testvals[i]);
        /* malloc(0) may legitimately return NULL, so only a failed
         * non-empty request is treated as an error. */
        if (testvals[i] == NULL && row_len > 0) {
            fprintf(stderr, "malloc_testvals: out of memory\n");
            abort();
        }
    }
}
/*
 * Releases the first num_tests rows of testvals, as allocated by
 * malloc_testvals().
 *
 * testvals  - array of row pointers; NULL entries are skipped by free()
 * num_tests - number of rows to release
 *
 * Every released entry is reset to NULL so the caller is not left holding
 * dangling pointers; calling this function a second time is therefore
 * harmless.
 */
static void free_testvals(int **testvals, int num_tests) {
    for (int i = 0; i < num_tests; i++) {
        free(testvals[i]);
        testvals[i] = NULL;
    }
}
/*
 * Fills each of the num_tests rows of testvals with num_threads
 * pseudo-random integers in [-100, 100] and stores the total of row i in
 * sums[i].
 *
 * The rand() generator is reseeded from the current time on every call.
 */
static void make_random_testvals(int **testvals, int *sums, int num_tests, int num_threads) {
    srand(time(NULL));

    int test = 0;
    while (test < num_tests) {
        int *row = testvals[test];
        int total = 0;

        for (int slot = 0; slot < num_threads; slot++) {
            const int value = rand() % 201 - 100;
            row[slot] = value;
            total += value;
        }

        sums[test] = total;
        test++;
    }
}
/* Forward declaration so Communicator can hold pointers to the per-thread slots. */
typedef struct ThreadCommunicator_s ThreadCommunicator;

/* State shared by all threads: the table of per-thread slots. */
typedef struct {
    long num_threads;             /* number of entries in threads */
    ThreadCommunicator **threads; /* one separately allocated slot per thread */
} Communicator;

/*
 * Per-thread slot.
 *
 * Defined through its tag only: the ThreadCommunicator alias already exists
 * from the forward declaration above, and repeating the typedef is a
 * constraint violation before C11 (this file is built with -std=c99).
 */
struct ThreadCommunicator_s {
    Communicator *parent; /* communicator this slot belongs to */
    long omp_longval;     /* value this thread publishes for the sum reduction */
};
/*
 * Puts *self into its initial state: attached to parent, with the published
 * value cleared to 0.
 */
static void ThreadCommunicator_init(ThreadCommunicator* self, Communicator* parent) {
    self->parent = parent;
    self->omp_longval = 0;
}
/*
 * Sizes the communicator for omp_get_max_threads() threads and allocates one
 * initialized ThreadCommunicator slot per thread.
 *
 * self - communicator to set up; its previous contents are overwritten
 *
 * Aborts the program if any allocation fails, so the slot table never
 * contains NULL entries. Release the storage with Communicator_deinit().
 */
static void Communicator_init(Communicator* self) {
    self->num_threads = omp_get_max_threads();

    self->threads = malloc(sizeof *self->threads * self->num_threads);
    if (self->threads == NULL) {
        fprintf(stderr, "Communicator_init: out of memory\n");
        abort();
    }

    /* The index is a long to match the type of num_threads. */
    for (long rank = 0; rank < self->num_threads; rank++) {
        self->threads[rank] = malloc(sizeof *self->threads[rank]);
        if (self->threads[rank] == NULL) {
            fprintf(stderr, "Communicator_init: out of memory\n");
            abort();
        }
        ThreadCommunicator_init(self->threads[rank], self);
    }
}
/*
 * Frees every per-thread slot and then the slot table itself.
 *
 * The fields of *self are not modified, so self->threads must not be used
 * after this call.
 */
static void Communicator_deinit(Communicator* self) {
    ThreadCommunicator **slots = self->threads;
    int rank = 0;

    while (rank < self->num_threads) {
        free(slots[rank]);
        rank++;
    }
    free(slots);
}
//Sum all-reduce over the values contributed by the threads of a communicator.
//
//self  - the calling thread's own slot; its omp_longval is overwritten
//myval - the value this thread contributes to the sum
//
//Returns the sum of omp_longval over the first self->parent->num_threads
//entries of self->parent->threads.
//
//The function contains barrier and single directives without an enclosing
//parallel construct of its own, so it is meant to be called from inside a
//parallel region by every thread of the team.
static long ThreadCommunicator_allreduce_sum_l(ThreadCommunicator* self, long myval) {
//publish this thread's contribution in its own slot
self->omp_longval = myval;
//wait until every thread has published before any slot is read
#pragma omp barrier
//diagnostic output from one thread only: stored thread count vs. actual team size
#pragma omp single
{
printf("self->parent->num_threads = %ld\n", self->parent->num_threads);
printf("omp_get_num_threads() = %d\n", omp_get_num_threads());
}
//------------------------------------------------------------------------------------------------------------------
//NOTE: the wrong sums this reproducer reports disappear when self->parent->num_threads
//in the loop condition below is replaced by omp_get_num_threads().
//------------------------------------------------------------------------------------------------------------------
long sum = 0;
//every thread adds up the published values of all slots
for (int rank = 0; rank < self->parent->num_threads; rank++) {
sum += self->parent->threads[rank]->omp_longval;
}
//presumably keeps a thread from overwriting its slot in a later call while
//other threads are still summing -- TODO confirm
#pragma omp barrier
return sum;
}
#define NUM_TESTS 1
/*
 * Test driver: sets up the communicator and random per-thread inputs, lets
 * every thread of a parallel region run the sum all-reduce, and accumulates
 * the absolute deviation of each thread's result from the precomputed sum.
 * A message is printed if the accumulated deviation is non-zero.
 */
int main() {
    Communicator communicator;
    int *testvals[NUM_TESTS];
    int sums[NUM_TESTS];      /* expected sum for each test */
    unsigned long error = 0;  /* accumulated deviation over all threads */

    Communicator_init(&communicator);
    malloc_testvals(testvals, NUM_TESTS, communicator.num_threads);
    make_random_testvals(testvals, sums, NUM_TESTS, communicator.num_threads);

    #pragma omp parallel
    {
        /* The inputs were generated for communicator.num_threads threads,
         * but the all-reduce only fills the slots of the threads that
         * actually run, so a smaller team cannot be tested here. */
        if (omp_get_num_threads() != communicator.num_threads) {
            printf("This is not supported in this test.\n");
            abort();
        }

        const int tid = omp_get_thread_num();
        ThreadCommunicator *thread_comm = communicator.threads[tid];

        for (int test = 0; test < NUM_TESTS; test++) {
            long reduced = ThreadCommunicator_allreduce_sum_l(thread_comm, testvals[test][tid]);
            unsigned long deviation = (unsigned long) labs(reduced - sums[test]);
            #pragma omp atomic
            error += deviation;
        }
    }

    if (error != 0) {
        printf("Error occurred (error = %lu)!\n", error);
    }

    free_testvals(testvals, NUM_TESTS);
    Communicator_deinit(&communicator);
}
由
编译gcc -Wall -std=c99 -fopenmp -O3 -march=skylake-avx512
或
gcc -Wall -std=c99 -fopenmp -O3 -march=native
在
Intel(R) Xeon(R) Gold 6230 CPU 与 gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5)
将示例性地产生此输出:
self->parent->num_threads = 16
omp_get_num_threads() = 16
Error occurred (error = 8070309797393041808)!
有趣的是,如果应用以下更改之一,此错误就会消失:
- 将
-O3
替换为-O2
- 从选项列表中删除
-march=...
- 将
self->parent->num_threads
替换为omp_get_num_threads()
,如代码中所示。
- 使用 -march=native 在另一台机器上编译(当然,我并不完全清楚哪些系统会受影响、哪些不会)
我想问的是:这究竟是编译器错误,还是我的代码不符合 C 或 OpenMP 规范(例如存在数据竞争)?非常感谢任何帮助!
编辑: 根据评论 (@Laci) 更新了代码。
事实证明,更新 binutils 后问题就消失了。因此,该问题被认定为编译器错误(compiler bug)。