OpenMP 实现比串行实现慢
OpenMP implementation slower than serial implementation
我目前正在尝试熟悉 OpenMP。为了练习,我用 OpenMP 实现了一个贪婪的 "learning" 算法。然后我用
测量了时间
time ./a.out
我与我的串行实现进行了比较,无论我的程序执行了多少次迭代,OpenMP 总是慢得多。
这是我的代码,评论应该能解释一切:
#include <omp.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <stdio.h>
#include <ctime>
#define THREADS 4
using namespace std;
struct TrainData {
double input;
double output;
};
//Long Term Memory struct
struct LTM {
double a; //paramter a of the polynom
double b;
double c;
double score; //score to be minimized!
LTM()
{
a=0;
b=0;
c=0;
score=0;
}
//random LTM with paramters from low to high (including low and high)
LTM(int low, int high)
{
score=0;
a= rand() % high + low;
b= rand() % high + low;
c= rand() % high + low;
}
LTM(double _a, double _b, double _c)
{
a=_a;
b=_b;
c=_c;
}
void print()
{
cout<<"Score: "<<score<<endl;
cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<endl;
}
};
//the acutal polynom function evaluating with passed LTM
inline double evaluate(LTM <m, const double &x)
{
double ret;
ret = ltm.a*x*x + ltm.b*x + ltm.c;
return ret;
}
//scoring function calculates the Root Mean Square error (RMS)
inline double score_function(LTM <mnew, vector<TrainData> &td)
{
double score;
double val;
int tdsize=td.size();
score=0;
for(int i=0; i< tdsize; i++)
{
val = (td.at(i)).output - evaluate(ltmnew, (td.at(i)).input);
val *= val;
score += val;
}
score /= (double)tdsize;
score = sqrt(score);
return score;
}
LTM iterate(int iterations, vector<TrainData> td, int low, int high)
{
LTM fav = LTM(low,high);
fav.score = score_function(fav, td);
fav.print();
LTM favs[THREADS]; // array for collecting the favorites of each thread
#pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
{
#pragma omp master
printf("Threads: %d\n", omp_get_num_threads());
LTM cand;
#pragma omp for private(cand)
for(int i=0; i<iterations; i++)
{
cand = LTM(low, high);
cand.score = score_function(cand, td);
if(cand.score < fav.score)
fav = cand;
}
//save the favorite before ending the parallel section
#pragma omp critical
favs[omp_get_thread_num()] = fav;
}
//search for the best one in the array
for(int i=0; i<THREADS; i++)
{
if(favs[i].score < fav.score)
fav=favs[i];
}
return fav;
}
//generate training data from -50 up to 50 with the train LTM
void generateTrainData(vector<TrainData> *td, LTM train)
{
#pragma omp parallel for schedule(dynamic, 25)
for(int i=-50; i< 50; i++)
{
struct TrainData d;
d.input = i;
d.output = evaluate(train, (double)i);
#pragma omp critical
td->push_back(d);
//cout<<"input: "<<d.input<<" -> "<<d.output<<endl;
}
}
int main(int argc, char *argv[])
{
int its= 10000000; //number of iterations
int a=2;
int b=4;
int c=6;
srand(time(NULL));
LTM pol = LTM(a,b,c); //original polynom parameters
vector<TrainData> td;
//first genarte some training data and save it to td
generateTrainData(&td, pol);
//try to find the best solution
LTM fav = iterate( its, td, 1, 6);
printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);
return 0;
}
在我家的 PC 上,这个实现花了 12 秒。序列号只有6s。
如果我将迭代次数增加 10 倍,则大约为 2 分钟/1 分钟(omp / serial)。
谁能帮帮我?
好的,感谢我最初问题的评论,我可以解决性能问题。
就像评论中所说的那样,问题出在我使用的 rand() 函数上。
我用适当的线程安全 drand48_r().
替换了它们
喜欢:
...
LTM(double low, double high, struct drand48_data *buff)
{
score=0;
double x;
drand48_r(buff,&x);
a= low + x * (high - low);
drand48_r(buff,&x);
b= low + x * (high - low);
drand48_r(buff,&x);
c= low + x * (high - low);
}
...
现在我得到的时间不到一秒!
谢谢! :)
我目前正在尝试熟悉 OpenMP。为了练习,我用 OpenMP 实现了一个贪婪的 "learning" 算法。然后我用
测量了时间time ./a.out
我与我的串行实现进行了比较,无论我的程序执行了多少次迭代,OpenMP 总是慢得多。
这是我的代码,评论应该能解释一切:
#include <omp.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <stdio.h>
#include <ctime>
#define THREADS 4
using namespace std;
struct TrainData {
double input;
double output;
};
//Long Term Memory struct
struct LTM {
double a; //paramter a of the polynom
double b;
double c;
double score; //score to be minimized!
LTM()
{
a=0;
b=0;
c=0;
score=0;
}
//random LTM with paramters from low to high (including low and high)
LTM(int low, int high)
{
score=0;
a= rand() % high + low;
b= rand() % high + low;
c= rand() % high + low;
}
LTM(double _a, double _b, double _c)
{
a=_a;
b=_b;
c=_c;
}
void print()
{
cout<<"Score: "<<score<<endl;
cout<<"a: "<<a<<" b: "<<b<<" c: "<<c<<endl;
}
};
//the acutal polynom function evaluating with passed LTM
inline double evaluate(LTM <m, const double &x)
{
double ret;
ret = ltm.a*x*x + ltm.b*x + ltm.c;
return ret;
}
//scoring function calculates the Root Mean Square error (RMS)
inline double score_function(LTM <mnew, vector<TrainData> &td)
{
double score;
double val;
int tdsize=td.size();
score=0;
for(int i=0; i< tdsize; i++)
{
val = (td.at(i)).output - evaluate(ltmnew, (td.at(i)).input);
val *= val;
score += val;
}
score /= (double)tdsize;
score = sqrt(score);
return score;
}
LTM iterate(int iterations, vector<TrainData> td, int low, int high)
{
LTM fav = LTM(low,high);
fav.score = score_function(fav, td);
fav.print();
LTM favs[THREADS]; // array for collecting the favorites of each thread
#pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
{
#pragma omp master
printf("Threads: %d\n", omp_get_num_threads());
LTM cand;
#pragma omp for private(cand)
for(int i=0; i<iterations; i++)
{
cand = LTM(low, high);
cand.score = score_function(cand, td);
if(cand.score < fav.score)
fav = cand;
}
//save the favorite before ending the parallel section
#pragma omp critical
favs[omp_get_thread_num()] = fav;
}
//search for the best one in the array
for(int i=0; i<THREADS; i++)
{
if(favs[i].score < fav.score)
fav=favs[i];
}
return fav;
}
//generate training data from -50 up to 50 with the train LTM
void generateTrainData(vector<TrainData> *td, LTM train)
{
#pragma omp parallel for schedule(dynamic, 25)
for(int i=-50; i< 50; i++)
{
struct TrainData d;
d.input = i;
d.output = evaluate(train, (double)i);
#pragma omp critical
td->push_back(d);
//cout<<"input: "<<d.input<<" -> "<<d.output<<endl;
}
}
int main(int argc, char *argv[])
{
int its= 10000000; //number of iterations
int a=2;
int b=4;
int c=6;
srand(time(NULL));
LTM pol = LTM(a,b,c); //original polynom parameters
vector<TrainData> td;
//first genarte some training data and save it to td
generateTrainData(&td, pol);
//try to find the best solution
LTM fav = iterate( its, td, 1, 6);
printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);
return 0;
}
在我家的 PC 上,这个实现花了 12 秒。序列号只有6s。 如果我将迭代次数增加 10 倍,则大约为 2 分钟/1 分钟(omp / serial)。
谁能帮帮我?
好的,感谢我最初问题的评论,我可以解决性能问题。
就像评论中所说的那样,问题出在我使用的 rand() 函数上。 我用适当的线程安全 drand48_r().
替换了它们喜欢:
...
LTM(double low, double high, struct drand48_data *buff)
{
score=0;
double x;
drand48_r(buff,&x);
a= low + x * (high - low);
drand48_r(buff,&x);
b= low + x * (high - low);
drand48_r(buff,&x);
c= low + x * (high - low);
}
...
现在我得到的时间不到一秒! 谢谢! :)