使用 OpenMP 的 10 维 Monte Carlo 积分

Question

我正在尝试通过 OpenMP 学习并行化。我写了一个 C++ 程序，它用 Monte Carlo（MC）方法计算函数 $F = x_1 + x_2 + x_3 + \dots + x_{10}$ 的 10 维积分。

现在我正在尝试将其改写为使用 4 个线程的 OpenMP 版本。我的串行代码给出了合理的输出，所以我确信它工作正常。下面是我的串行代码：我希望每当样本点数达到 4^k 时输出一次结果，N 为通过命令行给出的样本点数。

/* compile with 

               $ g++ -o monte ND_MonteCarlo.cpp 
               $ ./monte N

   unsigned long long int for i, N
   Maximum value for UNSIGNED LONG LONG INT 18446744073709551615
*/


#include <climits>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>

using namespace std;


// Integrand F(x1, ..., xn) = x1 + x2 + ... + xn.
// Returns the sum of the first n entries of x (0.0 when n <= 0).
double f(double x[], int n)
{
    double total = 0.0;
    for (int k = 0; k < n; ++k)
        total += x[k];
    return total;
}

// Monte Carlo estimate of the integral of fn over the n-dimensional box
// [a[0], b[0]] x ... x [a[n-1], b[n-1]].
//
//   fn    integrand, called as fn(point, n)
//   a, b  lower / upper limits of the box, n entries each
//   n     number of dimensions
//   m     number of random sample points
//
// Returns (box volume) * (sum of fn over the m samples) / m.
double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)
{
    double point[n];   // current sample point (variable-length array, a compiler extension)
    double volume = 1.0;
    double sum = 0.0;

    // Box volume: product of the side lengths.
    for (int d = 0; d < n; ++d)
        volume *= b[d] - a[d];

    // Evaluate fn at m points whose coordinates are drawn with rand().
    for (int s = 1; s <= m; ++s)
    {
        for (int d = 0; d < n; ++d)
        {
            const double width = b[d] - a[d];
            point[d] = a[d] + std::rand() / (RAND_MAX / width);
        }
        sum += fn(point, n);
    }

    return sum * volume / m;
}




// Prototypes of the two functions used by main(). Both are already fully
// defined earlier in this listing, so these redeclarations are redundant.
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int); 



// Driver for the serial Monte Carlo integration.
//
// Usage: ./monte N
//
// Estimates the integral of f over the box [-5, 5]^10 repeatedly, using
// m = 4, 16, 64, ... sample points, for as long as i <= N / 4^i (i being the
// zero-based run index). Each run prints one row: m, the integral estimate,
// and the estimate divided by 10^10 (the volume of the box). The elapsed
// wall-clock time in whole seconds is printed at the end.
//
// Returns 0 on success, 1 when N is missing or is not a non-negative integer.
int main(int argc, char **argv)
{
    // argv[1] is mandatory; reading it without this check would hand a null
    // pointer to the number parser when the program is started without arguments.
    if (argc < 2)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "monte") << " N" << std::endl;
        return 1;
    }

    // Parse N with strtoull so that the full unsigned long long range is
    // available; atoi only yields an int. A leading '-' is rejected explicitly
    // because strtoull would otherwise wrap it to a huge positive value.
    char *parse_end = NULL;
    const unsigned long long N = std::strtoull(argv[1], &parse_end, 10);
    if (argv[1][0] == '-' || parse_end == argv[1] || *parse_end != '\0')
    {
        std::cerr << "N must be a non-negative integer, got \"" << argv[1] << "\"" << std::endl;
        return 1;
    }

    /* number of dimensions of the integral */
    const int n = 10;

    double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0};
    double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0};

    // initial seed value (use system time)
    std::srand(std::time(NULL));

    std::cout.precision(6);
    std::cout.setf(std::ios::fixed | std::ios::showpoint);

    // current time in seconds (begin calculations)
    const std::time_t seconds_i = std::time(NULL);

    int m = 4;  // number of sample points for the current run

    for (unsigned long long i = 0; i <= N / std::pow(4.0, static_cast<double>(i)); i++)
    {
        const double result = int_mcnd(f, a, b, n, m);
        const double mean = result / std::pow(10.0, 10);
        std::cout << std::setw(30) << m << std::setw(30) << result << std::setw(30) << mean << std::endl;

        // m is an int because int_mcnd takes an int sample count; stop before
        // m * 4 would overflow it (signed overflow is undefined behaviour).
        if (m > INT_MAX / 4)
            break;
        m = m * 4;
    }

    // current time in seconds (end of calculations)
    const std::time_t seconds_f = std::time(NULL);
    std::cout << std::endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << std::endl << std::endl;

    return 0;
}

并输出：

N            integral                                mean_integral
 4            62061079725.185936                      6.206108
 16            33459275100.477665                      3.345928
 64            -2204654740.788784                     -0.220465
 256             4347440045.990804                      0.434744
 1024            -1265056243.116922                     -0.126506
 4096              681660387.953380                      0.068166
 16384             -799507050.896809                     -0.079951
 65536             -462592561.594820                     -0.046259
 262144               50902035.836772                      0.005090
 1048576              -91104861.129695                     -0.009110
 4194304                3746742.588701                      0.000375
 16777216              -32967862.853915                     -0.003297
 67108864               17730924.602974                      0.001773
 268435456                -416824.977687                     -0.00004
 1073741824                2843188.477219                      0.000284

但我的并行代码似乎根本不能正常工作。我知道我肯定做了一些愚蠢的事情。由于线程数是 4，我尝试将结果除以 4，但得到的结果很荒谬。

这是相同代码的并行版本：

/* compile with 

               $ g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm
               $ ./mcOMP N

   unsigned long long int for i, N
   Maximum value for UNSIGNED LONG LONG INT 18446744073709551615
*/


#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <omp.h>

using namespace std;


// Integrand F(x1, ..., xn) = x1 + x2 + ... + xn.
// Returns the sum of the first n entries of x (0.0 when n <= 0).
double f(double x[], int n)
{
    double total = 0.0;
    for (int k = 0; k < n; ++k)
        total += x[k];
    return total;
}

// Monte Carlo multidimensional integration (the asker's OpenMP attempt).
//
//   fn    integrand, called as fn(x, n)
//   a, b  lower / upper integration limits, n entries each
//   n     number of dimensions
//   m     number of sample points
//
// NOTE(review): main() in this listing calls this function from inside its
// `#pragma omp parallel` region, so every thread of the team runs this body
// and r, v and x are separate locals in each thread's call. The `omp for`
// directives below divide the loop iterations among those threads, and there
// is no reduction that combines the per-thread values afterwards.

double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)

{
    double r, x[n], v;
    int i, j;
    r = 0.0;
    v = 1.0;


    // step 1: calculate the common factor V
    // NOTE(review): with the iterations shared out, each thread's v only
    // receives the factors of the j values that thread was given.
    #pragma omp for
    for (j = 0; j < n; j = j+1)
      {
         v = v*(b[j]-a[j]);
      } 

    // step 2: integration
    // NOTE(review): likewise, each thread's r only accumulates the samples of
    // the i values that thread was given. rand() is called by all threads.
    #pragma omp for
    for (i = 1; i <= m; i=i+1)
    {
        // calculate random x[] points

        for (j = 0; j < n; j = j+1)
        {
            x[j] = a[j] +  (rand()) /( (RAND_MAX/(b[j]-a[j])));
        }         
        r = r + fn(x,n);
    }
    // Scales this call's own r by this call's own v, divided by the full m.
    r = r*v/m;

    return r;
}




// Prototypes of the two functions used by main(). Both are already fully
// defined earlier in this listing, so these redeclarations are redundant.
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int); 



// Driver for the asker's OpenMP attempt.
//
// Usage: ./mcOMP N
//
// Calls int_mcnd on the box [-5, 5]^10 in a loop, multiplying m by 4 after
// every call, and prints m/4, result/4 and mean/4 from the master thread.
// argv[1] is read without checking argc.
int main(int argc, char **argv)
{    



    /* number of dimensions of the integral */
    const int n = 10;       

    double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0};                    
    double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};  

    double result, mean;
    int m;

    unsigned long long int i, N;
    int NumThreads = 4;


    // initial seed value (use system time) 
    srand(time(NULL));  


    cout.precision(6);
    cout.setf(ios::fixed | ios::showpoint); 

    // current time in seconds (begin calculations)
    time_t seconds_i;
    seconds_i = time (NULL);

    m = 4;                // initial value of m, passed to int_mcnd as the sample count

    // convert command-line input to N = number of points
    N = atoi( argv[1] );

    // NOTE(review): this is `omp parallel` without `for`, so the loop is not
    // divided among the threads; each of the NumThreads threads executes the
    // whole loop. result and mean are private per thread, but m is declared
    // shared and i appears in no clause, so all threads read and update the
    // same i and m without synchronization.
    #pragma omp parallel private(result, mean) shared(N, m) num_threads(NumThreads)
    for (i=0; i <=N/pow(4,i); i++)
    {
        result = int_mcnd(f, a, b, n, m);
        mean = result/(pow(10,10));
        // Only the master thread prints; it prints its own private result and
        // mean. The divisions by 4 are the asker's attempt to compensate for
        // running with 4 threads.
        #pragma omp master
        cout << setw(30)  << m/4 << setw(30) << result/4 << setw(30) << mean/4 <<endl;
        m = m*4; 
    }



// current time in seconds (end of calculations)
    time_t seconds_f;
    seconds_f = time (NULL);
    cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;

    return 0;
}

我只想让主线程输出值。我编译了：

g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm

非常感谢您对修复代码的帮助和建议。非常感谢。

Answer 1

我不会详细介绍，但会提供一些参考点

以这部分代码为例：

// step 1: calculate the common factor V
#pragma omp for
for (j = 0; j < n; j = j+1)
  {
     v = v*(b[j]-a[j]);
  }

如果您查看变量 v，就会发现存在明显的竞争条件。也就是说，您必须把 v 声明为线程私有的变量（可以将其称为 local_v），然后通过归约（reduction）操作将所有线程的值合并到一个 global_v 中。

一般来说，我建议您了解 OpenMP 中竞争条件、临界区、共享内存和私有内存这些概念。

Answer 2

让我们看看您的程序做了什么。在 omp parallel，您的线程已生成，它们将并行执行剩余的代码。像这样的操作：

m = m * 4;

的行为是未定义的（而且通常没有意义，因为每次迭代它们会被执行四次）。

此外，当这些线程遇到 omp for 时，它们将共享循环的工作，即每次迭代只会由某个线程执行一次。由于 int_mcnd 在 parallel 区域内执行，因此它的所有局部变量都是私有的。您的代码中没有构造来实际收集那些私有结果（result 和 mean 也是私有的）。

正确的做法是使用带有 reduction 子句的并行 for 循环，表明有一个变量（r/v）在整个循环的执行过程中被聚合。

为此，归约变量需要在并行区域的作用域之外声明为共享变量。最简单的解决方案是将并行区域移动到 int_mcnd 内部。这也避免了 m 的竞争条件。

还有一个障碍：rand 使用的是全局状态，至少在我使用的实现中它是加锁的。由于大部分时间都花在 rand 上，您的代码的可扩展性会非常差。解决方案是通过 rand_r 使用显式的线程私有状态。（另请参阅相关问题。）

拼凑起来，修改后的代码如下所示：

// Monte Carlo estimate of the integral of fn over the n-dimensional box
// [a[0], b[0]] x ... x [a[n-1], b[n-1]], with the sample evaluations shared
// among the threads of an OpenMP parallel region.
//
//   fn    integrand, called as fn(x, n) with x holding one sample point
//   a, b  lower / upper integration limits, n entries each
//   n     number of dimensions
//   m     number of sample points
//
// Returns (box volume) * (sum of fn over the m samples) / m.
double int_mcnd(double (*fn)(double[], int), double a[], double b[], int n, int m)
{
    // Reduction variables need to be shared
    double r = 0.0;
    double v = 1.0;

    // Draw a fresh base seed per call, before any threads are spawned.
    // Seeding rand_r with the thread number alone would replay the same
    // random streams on every call and every program run, and would ignore
    // the seed the caller installed with srand().
    const unsigned int base_seed = static_cast<unsigned int>(rand());

    #pragma omp parallel
    // All variables declared inside are private
    {
        // step 1: calculate the common factor V
        #pragma omp for reduction(* : v)
        for (int j = 0; j < n; j = j + 1)
        {
            v = v * (b[j] - a[j]);
        }

        // step 2: integration
        // Per-thread rand_r state: the base seed mixed with the thread number,
        // so that threads of the same call start from different states.
        const unsigned int thread_id = static_cast<unsigned int>(omp_get_thread_num());
        unsigned int private_seed = base_seed ^ (0x9E3779B9u * (thread_id + 1u));
        #pragma omp for reduction(+ : r)
        for (int i = 1; i <= m; i = i + 1)
        {
            // Note: X MUST be private, otherwise, you have race-conditions again
            double x[n];
            // calculate random x[] points
            for (int j = 0; j < n; j = j + 1)
            {
                x[j] = a[j] + (rand_r(&private_seed)) / ((RAND_MAX / (b[j] - a[j])));
            }
            r = r + fn(x, n);
        }
    }
    r = r * v / m;

    return r;
}

// Prototypes of the two functions used by main(). int_mcnd is already defined
// above in this listing; the definition of f is not shown in this listing
// (presumably unchanged from the question - TODO confirm).
double f(double[], int);
double int_mcnd(double (*)(double[], int), double[], double[], int, int);

// Driver for the OpenMP Monte Carlo integration.
//
// Usage: ./mcOMP N
//
// Estimates the integral of f over the box [-5, 5]^10 repeatedly, using
// m = 4, 16, 64, ... sample points, for as long as i <= N / 4^i (i being the
// zero-based run index). Each run prints one row: m, the integral estimate,
// and the estimate divided by 10^10 (the volume of the box). All output is
// done here, outside any parallel region. The elapsed wall-clock time in
// whole seconds is printed at the end.
//
// No thread count is requested here, so the size of the team is left to the
// OpenMP runtime.
//
// Returns 0 on success, 1 when N is missing or negative.
int main(int argc, char** argv)
{
    // argv[1] is mandatory; reading it without this check would hand a null
    // pointer to atoi when the program is started without arguments.
    if (argc < 2)
    {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "mcOMP") << " N" << std::endl;
        return 1;
    }

    // convert command-line input to N = number of points; a negative value
    // is rejected because it would wrap to a huge unsigned count.
    const int parsed_n = std::atoi(argv[1]);
    if (parsed_n < 0)
    {
        std::cerr << "N must be a non-negative integer, got \"" << argv[1] << "\"" << std::endl;
        return 1;
    }
    const unsigned long long N = static_cast<unsigned long long>(parsed_n);

    /* number of dimensions of the integral */
    const int n = 10;

    double b[n] = { 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0 };
    double a[n] = { -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0 };

    // initial seed value (use system time)
    std::srand(std::time(NULL));

    std::cout.precision(6);
    std::cout.setf(std::ios::fixed | std::ios::showpoint);

    // current time in seconds (begin calculations)
    const std::time_t seconds_i = std::time(NULL);

    int m = 4; // number of sample points for the current run

    for (unsigned long long i = 0; i <= N / std::pow(4.0, static_cast<double>(i)); i++)
    {
        const double result = int_mcnd(f, a, b, n, m);
        const double mean = result / std::pow(10.0, 10);
        std::cout << std::setw(30) << m << std::setw(30) << result << std::setw(30) << mean << std::endl;
        m = m * 4;
    }

    // current time in seconds (end of calculations)
    const std::time_t seconds_f = std::time(NULL);
    std::cout << std::endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << std::endl << std::endl;

    return 0;
}

请注意，我删除了除以四的部分，而且输出是在并行区域之外完成的。结果应该与串行版本相似（当然随机性除外）。

我在 16 核系统上使用 -O3 编译，观察到了完美的 16 倍加速。

补充几点：

尽可能在本地声明变量。

如果线程开销是个问题，您可以将并行区域移到外面，但您需要更仔细地考虑并行执行，并为共享归约变量找到解决方案。鉴于 Monte Carlo 代码“易并行”（embarrassingly parallel）的性质，您可以通过删除 omp for 指令来更贴近您最初的方案——这意味着每个线程都执行所有的循环迭代。然后您可以手动汇总各线程的结果变量并打印出来。但我实在看不出这样做的意义。

使用 OpenMP 的 10 维 Monte Carlo 积分

10 dimensional Monte Carlo integration with openmp

c++

parallel-processing

multithreading

openmp

montecarlo