OpenMP 实现比串行实现慢

OpenMP implementation slower than serial implementation

我目前正在尝试熟悉 OpenMP。为了练习,我用 OpenMP 实现了一个贪婪的 "learning" 算法,然后用下面的命令测量了运行时间:

time ./a.out

我与我的串行实现进行了比较,无论我的程序执行了多少次迭代,OpenMP 总是慢得多。

这是我的代码,其中的注释应该能解释一切:

#include <omp.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <stdio.h>
#include <ctime>

// Number of OpenMP threads requested via num_threads(THREADS) in iterate().
#define THREADS 4

// Makes every std name visible unqualified; the definitions below rely on
// this for names such as vector.
using namespace std;

// One training sample: an input value and the output recorded for it.
struct TrainData {
    double input;   // x value of the sample
    double output;  // target value the fitted polynomial should produce for input
};

// Long Term Memory: one candidate parameter set (a, b, c) for the polynomial
// a*x^2 + b*x + c, together with the score that was assigned to it.
struct LTM {
    double a;     // coefficient of x^2
    double b;     // coefficient of x
    double c;     // constant term
    double score; // score to be minimized!

    // All parameters and the score start at zero.
    LTM() : a(0), b(0), c(0), score(0) {}

    // Random LTM whose parameters are integers drawn from [low, high], both
    // bounds included. The bounds are swapped if passed in the wrong order.
    //
    // The draw uses rand(), so the sequence is controlled by srand(). rand()
    // keeps hidden process-wide state and is not required to be thread-safe;
    // callers constructing candidates from several threads at once should
    // expect contention or races on that state.
    LTM(int low, int high) : a(0), b(0), c(0), score(0)
    {
        if (high < low)
        {
            const int tmp = low;
            low = high;
            high = tmp;
        }

        // Number of distinct values in [low, high]; computed in a wider type
        // so extreme bounds cannot overflow, and always >= 1 so the modulo
        // below is well defined.
        const long long span = static_cast<long long>(high) - low + 1;

        a = static_cast<double>(low + std::rand() % span);
        b = static_cast<double>(low + std::rand() % span);
        c = static_cast<double>(low + std::rand() % span);
    }

    // LTM with explicitly chosen parameters; the score starts at zero so it is
    // never read uninitialized.
    LTM(double _a, double _b, double _c) : a(_a), b(_b), c(_c), score(0) {}

    // Writes the score and the three parameters to standard output.
    void print() const
    {
        std::cout<<"Score: "<<score<<std::endl;
        std::cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<std::endl;
    }
};

// Evaluates the polynomial a*x^2 + b*x + c described by ltm at the point x.
inline double evaluate(LTM &ltm, const double &x)
{
    return ltm.a*x*x + ltm.b*x + ltm.c;
}


// Scores a candidate: the root mean square (RMS) of the differences between
// each sample's recorded output and the polynomial ltmnew evaluated at the
// sample's input. The sum is divided by the sample count, so td is expected
// to be non-empty.
inline double score_function(LTM &ltmnew, vector<TrainData> &td)
{
    const int count = td.size();
    double sum_of_squares = 0;

    for(int idx = 0; idx < count; ++idx)
    {
        TrainData &sample = td.at(idx);
        const double error = sample.output - evaluate(ltmnew, sample.input);
        sum_of_squares += error * error;
    }

    const double mean_square = sum_of_squares / static_cast<double>(count);
    return sqrt(mean_square);
}

// Random search for the polynomial parameters that best fit the training data.
//
// Draws `iterations` random candidates with LTM(low, high), scores each one
// with score_function() against td, and returns the candidate with the lowest
// score seen, the initial random guess included. The candidate loop is divided
// among the threads of an OpenMP team of at most THREADS threads.
//
// Side effects: prints the initial guess and the size of the thread team.
LTM iterate(int iterations, vector<TrainData> td, int low, int high)
{
    // Initial guess; it is also the baseline every thread has to beat.
    LTM fav = LTM(low, high);
    fav.score = score_function(fav, td);
    fav.print();

    // Overall winner, shared by all threads and only written inside the
    // critical section below. Starting from the initial guess (instead of
    // collecting into a default-constructed per-thread array) means a team
    // smaller than THREADS cannot leave behind a never-written entry whose
    // default score of 0 would beat every real candidate.
    LTM best = fav;

    // Each thread works on its own copy of fav; td, low and high are only read
    // inside the region, so they stay shared.
    #pragma omp parallel num_threads(THREADS) firstprivate(fav)
    {
        #pragma omp master
        printf("Threads: %d\n", omp_get_num_threads());

        // NOTE(review): LTM(low, high) draws from rand(), whose hidden state is
        // shared by all threads. Concurrent calls contend on (or race over)
        // that state, which is why this loop does not speed up with more
        // threads; a per-thread generator handed to the constructor removes
        // the bottleneck.
        #pragma omp for
        for(int i=0; i<iterations; i++)
        {
            LTM cand = LTM(low, high);
            cand.score = score_function(cand, td);

            if(cand.score < fav.score)
                fav = cand;
        }

        // Fold this thread's favorite into the shared result, one thread at a
        // time.
        #pragma omp critical
        {
            if(fav.score < best.score)
                best = fav;
        }
    }

    return best;
}

// Appends 100 training samples to *td: for every integer x from -50 up to and
// including 49, the pair (x, evaluate(train, x)). Existing elements of *td are
// kept. Does nothing when td is null.
//
// The loop is deliberately serial: each iteration is a handful of arithmetic
// operations followed by a push_back that would have to be serialized anyway,
// so a thread team only adds start-up and locking cost and makes the order of
// the samples depend on thread scheduling.
void generateTrainData(vector<TrainData> *td, LTM train)
{
    if(!td)
        return;

    const int first = -50;
    const int end = 50; // one past the last generated input

    // One allocation up front instead of repeated growth while appending.
    td->reserve(td->size() + (end - first));

    for(int i=first; i<end; i++)
    {
        TrainData d;
        d.input = i;
        d.output = evaluate(train, static_cast<double>(i));
        td->push_back(d);
    }
}

int main(int argc, char *argv[])
{

    int its= 10000000; //number of iterations 
    int a=2;
    int b=4;
    int c=6;

    srand(time(NULL));
    LTM pol = LTM(a,b,c); //original polynom parameters
    vector<TrainData> td;

    //first genarte some training data and save it to td
    generateTrainData(&td, pol); 

    //try to find the best solution
    LTM fav = iterate( its, td, 1, 6);


    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);

    return 0;
}

在我家的 PC 上,这个 OpenMP 实现花了 12 秒,而串行版本只需要 6 秒。如果我将迭代次数增加 10 倍,则大约为 2 分钟/1 分钟(omp / serial)。

谁能帮帮我?

好的,多亏了我最初问题下面的评论,我解决了性能问题。

就像评论中所说的那样,问题出在我使用的 rand() 函数上。我把这些调用替换成了线程安全的 drand48_r(),

例如:

...
// Random LTM whose three parameters are each low plus a fresh drand48_r draw
// scaled by (high - low). The generator state lives in the caller-supplied
// buff, so every thread can pass its own buffer.
// NOTE(review): drand48_r is documented to yield values in [0, 1), which puts
// the parameters in [low, high) - confirm against the library in use.
LTM(double low, double high, struct drand48_data *buff)
{
    const double width = high - low;
    double draw;

    drand48_r(buff, &draw);
    a = low + draw * width;

    drand48_r(buff, &draw);
    b = low + draw * width;

    drand48_r(buff, &draw);
    c = low + draw * width;

    score = 0;
}
...

现在我得到的时间不到一秒! 谢谢! :)