如何在 OpenMP 中实现 MPI_MINLOC?
How to implement MPI_MINLOC in OpenMP?
如何使用 C 语言在 OpenMP 中计算全局最小值和附加到最小值的索引?
使用较少的线程也能得到真正的好处吗?
// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
// bottleneck parallel part
#pragma omp for reduction(min:gmin)
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
gmin = v[i];
myloc = i;
}
}
if(gmin == mymin) {
// find global result
#pragma omp single
{
gloc = myloc;
}
}
是的,您可以在此处使用 OpenMP 获得解决大型问题的好处。当输入足够大以致计算受内存带宽限制,并且多线程能够提供更高的内存带宽时,就会体现出好处。这几乎肯定仅限于无法装入最后一级缓存的输入。
我提供了一个关于如何执行此操作的简单示例程序。此实现使用较旧且广泛可用的功能。也可以使用 OpenMP 5.0 的用户定义归约(user-defined reduction),但我怀疑这是否会在性能上带来明显的好处,而且您可能会发现许多实现并不支持此功能。
请注意,我的示例程序不会对重复测试进行计时,也不会对计时进行任何统计。因此,您不应将其视为高度科学的测试。但是,正如预期的那样,它在我的双核笔记本电脑上展示了 100M 双精度阵列的明显优势。
$ make minloc && ./minloc 100000000
icc -O3 -qopenmp -std=c11 minloc.c -o minloc
MINLOC of 100000000 elements with 4 threads
OpenMP: dt=0.076681, gmin=0.000000, gloc=4022958
Sequential: dt=0.157333, gmin=0.000000, gloc=4022958
SUCCESS
相关摘录
// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
// bottleneck parallel part
#pragma omp for
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
myloc = i;
}
}
// write thread-private results to shared
tmin[me] = mymin;
tloc[me] = myloc;
#pragma omp barrier
// find global result
#pragma omp master
{
for (int i=0; i<nt; ++i) {
if (tmin[i] < gmin) {
gmin = tmin[i];
gloc = tloc[i];
}
}
}
完整的示例程序
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main(int argc, char* argv[])
{
size_t n = (argc > 1) ? atol(argv[1]) : 100;
double * v = malloc(n * sizeof(double));
if (v==NULL) abort();
// g = global
double gmin = DBL_MAX;
size_t gloc = SIZE_MAX;
const int mt = omp_get_max_threads();
printf("MINLOC of %zu elements with %d threads\n", n, mt);
// t = thread
double * tmin = malloc(mt * sizeof(double));
size_t * tloc = malloc(mt * sizeof(size_t));
if (tmin==NULL || tloc==NULL) abort();
for (int i=0; i<mt; ++i) {
tmin[i] = DBL_MAX;
tloc[i] = SIZE_MAX;
}
double dt = 0.0;
#pragma omp parallel firstprivate(n) shared(v, tmin, tloc, gmin, gloc, dt)
{
const int me = omp_get_thread_num();
const int nt = omp_get_num_threads();
unsigned int seed = (unsigned int)me;
srand(seed);
#pragma omp for
for (size_t i=0; i<n; ++i) {
// this is not a _good_ random number generator, but it does not matter for this use case
double r = (double)rand_r(&seed) / (double)RAND_MAX;
v[i] = r;
}
double t0 = 0.0;
#pragma omp barrier
#pragma omp master
{
t0 = omp_get_wtime();
}
// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
// bottleneck parallel part
#pragma omp for
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
myloc = i;
}
}
// write thread-private results to shared
tmin[me] = mymin;
tloc[me] = myloc;
#pragma omp barrier
// find global result
#pragma omp master
{
for (int i=0; i<nt; ++i) {
if (tmin[i] < gmin) {
gmin = tmin[i];
gloc = tloc[i];
}
}
}
#pragma omp barrier
#pragma omp master
{
double t1 = omp_get_wtime();
dt = t1 - t0;
}
#if 0
#pragma omp critical
{
printf("%d: mymin=%f, myloc=%zu\n", me, mymin, myloc);
fflush(stdout);
}
#endif
}
printf("OpenMP: dt=%f, gmin=%f, gloc=%zu\n", dt, gmin, gloc);
fflush(stdout);
// sequential reference timing
{
double t0 = omp_get_wtime();
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
myloc = i;
}
}
gmin = mymin;
gloc = myloc;
double t1 = omp_get_wtime();
dt = t1 - t0;
}
printf("Sequential: dt=%f, gmin=%f, gloc=%zu\n", dt, gmin, gloc);
fflush(stdout);
// debug printing for toy inputs
if (n<100) {
for (size_t i=0; i<n; ++i) {
printf("v[%zu]=%f\n", i , v[i]);
}
fflush(stdout);
}
free(v);
printf("SUCCESS\n");
return 0;
}
如何使用 C 语言在 OpenMP 中计算全局最小值和附加到最小值的索引?
使用较少的线程也能得到真正的好处吗?
// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
// bottleneck parallel part
#pragma omp for reduction(min:gmin)
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
gmin = v[i];
myloc = i;
}
}
if(gmin == mymin) {
// find global result
#pragma omp single
{
gloc = myloc;
}
}
是的,您可以在此处使用 OpenMP 获得解决大型问题的好处。当输入足够大以致计算受内存带宽限制,并且多线程能够提供更高的内存带宽时,就会体现出好处。这几乎肯定仅限于无法装入最后一级缓存的输入。
我提供了一个关于如何执行此操作的简单示例程序。此实现使用较旧且广泛可用的功能。也可以使用 OpenMP 5.0 的用户定义归约(user-defined reduction),但我怀疑这是否会在性能上带来明显的好处,而且您可能会发现许多实现并不支持此功能。
请注意,我的示例程序不会对重复测试进行计时,也不会对计时进行任何统计。因此,您不应将其视为高度科学的测试。但是,正如预期的那样,它在我的双核笔记本电脑上展示了 100M 双精度阵列的明显优势。
$ make minloc && ./minloc 100000000
icc -O3 -qopenmp -std=c11 minloc.c -o minloc
MINLOC of 100000000 elements with 4 threads
OpenMP: dt=0.076681, gmin=0.000000, gloc=4022958
Sequential: dt=0.157333, gmin=0.000000, gloc=4022958
SUCCESS
相关摘录
// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
// bottleneck parallel part
#pragma omp for
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
myloc = i;
}
}
// write thread-private results to shared
tmin[me] = mymin;
tloc[me] = myloc;
#pragma omp barrier
// find global result
#pragma omp master
{
for (int i=0; i<nt; ++i) {
if (tmin[i] < gmin) {
gmin = tmin[i];
gloc = tloc[i];
}
}
}
完整的示例程序
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main(int argc, char* argv[])
{
size_t n = (argc > 1) ? atol(argv[1]) : 100;
double * v = malloc(n * sizeof(double));
if (v==NULL) abort();
// g = global
double gmin = DBL_MAX;
size_t gloc = SIZE_MAX;
const int mt = omp_get_max_threads();
printf("MINLOC of %zu elements with %d threads\n", n, mt);
// t = thread
double * tmin = malloc(mt * sizeof(double));
size_t * tloc = malloc(mt * sizeof(size_t));
if (tmin==NULL || tloc==NULL) abort();
for (int i=0; i<mt; ++i) {
tmin[i] = DBL_MAX;
tloc[i] = SIZE_MAX;
}
double dt = 0.0;
#pragma omp parallel firstprivate(n) shared(v, tmin, tloc, gmin, gloc, dt)
{
const int me = omp_get_thread_num();
const int nt = omp_get_num_threads();
unsigned int seed = (unsigned int)me;
srand(seed);
#pragma omp for
for (size_t i=0; i<n; ++i) {
// this is not a _good_ random number generator, but it does not matter for this use case
double r = (double)rand_r(&seed) / (double)RAND_MAX;
v[i] = r;
}
double t0 = 0.0;
#pragma omp barrier
#pragma omp master
{
t0 = omp_get_wtime();
}
// thread-private result
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
// bottleneck parallel part
#pragma omp for
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
myloc = i;
}
}
// write thread-private results to shared
tmin[me] = mymin;
tloc[me] = myloc;
#pragma omp barrier
// find global result
#pragma omp master
{
for (int i=0; i<nt; ++i) {
if (tmin[i] < gmin) {
gmin = tmin[i];
gloc = tloc[i];
}
}
}
#pragma omp barrier
#pragma omp master
{
double t1 = omp_get_wtime();
dt = t1 - t0;
}
#if 0
#pragma omp critical
{
printf("%d: mymin=%f, myloc=%zu\n", me, mymin, myloc);
fflush(stdout);
}
#endif
}
printf("OpenMP: dt=%f, gmin=%f, gloc=%zu\n", dt, gmin, gloc);
fflush(stdout);
// sequential reference timing
{
double t0 = omp_get_wtime();
double mymin = DBL_MAX;
double myloc = SIZE_MAX;
for (size_t i=0; i<n; ++i) {
if (v[i] < mymin) {
mymin = v[i];
myloc = i;
}
}
gmin = mymin;
gloc = myloc;
double t1 = omp_get_wtime();
dt = t1 - t0;
}
printf("Sequential: dt=%f, gmin=%f, gloc=%zu\n", dt, gmin, gloc);
fflush(stdout);
// debug printing for toy inputs
if (n<100) {
for (size_t i=0; i<n; ++i) {
printf("v[%zu]=%f\n", i , v[i]);
}
fflush(stdout);
}
free(v);
printf("SUCCESS\n");
return 0;
}