使用长动态数组优化 c++ Monte Carlo 模拟
Optimize c++ Monte Carlo simulation with long dynamic arrays
第一次post来这里,经验不多,请见谅。
我正在为我的博士学位构建一个 Monte Carlo C++ 模拟,我需要帮助来优化它的计算时间和性能。我有一个在每个坐标中重复的 3d 立方体作为模拟体积,并且在每个立方体内部以簇的形式生成磁性粒子。然后,在中央立方体中,创建并移动质子环,并在每一步中计算它们感受到的所有粒子(除其他外)的总磁场。
此时我在主函数中定义了所有内容,因为我需要粒子的位置来进行计算(我计算粒子在放置期间以及质子运动期间的距离),我将它们存储在动态数组。我还没有使用任何 class 或功能。这使得我的模拟非常慢,因为我最终必须使用数百万个粒子和数千个质子。即使有数百个,也需要几天时间。我还使用了很多 for 和 while 循环以及 reading/writing 到 .dat 文件。
我真的需要你的帮助。我花了数周时间尝试优化我的代码,但我的项目落后于计划。你有什么建议吗?我需要数组来存储粒子的位置。你认为 classes 还是函数会更有效率?一般来说,任何建议都是有帮助的。对不起,如果那太长了,但我很绝望...
好的,我编辑了我的原始 post 并分享了我的完整脚本。我希望这能让您对我的模拟有所了解。谢谢。
另外我添加了两个输入文件
parametersDiffusion_spher_shel.txt
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <math.h>
#include <iomanip> //precision to output
//#include <time.h>
#include <ctime>
#include <cstdlib>
#include <algorithm>
#include <string>
//#include <complex>
#include <chrono> //random generator
#include <random>
using namespace std;
#define PI 3.14159265
#define tN 500000 //# of timepoints (steps) to define the arrays ONLY
#define D_const 3.0E-9 //diffusion constant (m^2/s)
#define Beq 0.16 // Tesla
#define gI 2.6752218744E8 //(sT)^-1
int main(){
//Mersenne Twister random engine
mt19937 rng(chrono::steady_clock::now().time_since_epoch().count());
//uniform_int_distribution<int> intDist(0,1);
uniform_real_distribution<double> realDist(0.,1.);
//for(int i=1; i<100; i++){
//cout<<"R max: "<<Ragg-Rspm<<" r_spm: "<<(Ragg-Rspm)*sqrt(realDist(rng))<<endl;
//}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
//input files
double Rionp=1.0E-8, Ragg=2.0E-7, t_tot=2.0E-2, l_tot = 3.0E-4;
int ionpN=10, aggN=10,cubAxN=10, parN=1E5;
int temp_ionpN, temp_aggN, temp_cubAxN, temp_parN;
ifstream inIONP;
inIONP.open("parametersIONP_spher_shel.txt");
if(!inIONP){
cout << "Unable to open IONP parameters file";
exit(1); // terminate with error
}
while (inIONP>>Rionp>>Ragg>>temp_ionpN>>temp_aggN>>l_tot>>temp_cubAxN) {
ionpN = (int)temp_ionpN;
aggN = (int)temp_aggN;
cubAxN = (int)temp_cubAxN;
}
inIONP.close();
cout<<"Rionp: "<<Rionp<<" ionpN: "<<ionpN <<" aggN: "<<aggN<<endl;
cout<<"l_tot: "<<l_tot<<" cubAxN: "<<cubAxN<<endl;
ifstream indiff;
indiff.open("parametersDiffusion_spher_shel.txt");
if(!indiff){
cout << "Unable to open diffusion parameters file";
exit(1); // terminate with error
}
while (indiff>>temp_parN>>t_tot) {
parN = (int)temp_parN;
}
indiff.close();
cout<<"parN: "<<parN<<" t_tot: "<<t_tot<<endl;
/////////////////////////////////////////////////////////////////////////////////////////////////
int cubN = pow(cubAxN,3.); // total cubes
int Nionp_tot = ionpN*aggN*cubN; //total IONP
double f_tot = (double)Nionp_tot*(4.*PI*pow(Rionp,3.)/3.)/pow(l_tot,3.);//volume density
//central cube
double l_c = l_tot/(double)cubAxN;
int Nionp_c = ionpN*aggN; //SPM in central cube
double f_c = (double)Nionp_c*(4.*PI*pow(Rionp,3.)/3.)/pow(l_c,3.);
cout<<"f_tot: "<<f_tot<<" Nionp_tot: "<<Nionp_tot<<" l_tot "<<l_tot<<endl;
cout<<"f_c: "<<f_c<<" Nionp_c: "<<Nionp_c<<" l_c "<<l_c<<endl;
cout<<"Now IONP are generated..."<<endl;
//position of aggregate (spherical distribution IONP)
double *x1_ionp, *x2_ionp, *x3_ionp, *theta_ionp, *phi_ionp, *r_ionp, *x1_agg, *x2_agg, *x3_agg;
x1_ionp = new double [Nionp_tot];
x2_ionp = new double [Nionp_tot];
x3_ionp = new double [Nionp_tot];
theta_ionp = new double [Nionp_tot];
phi_ionp = new double [Nionp_tot];
r_ionp = new double [Nionp_tot];
x1_agg = new double [Nionp_tot];
x2_agg = new double [Nionp_tot];
x3_agg = new double [Nionp_tot];
int ionpCounter = 0;
int aggCounter = 0;
double x1_aggTemp=0., x2_aggTemp=0., x3_aggTemp=0.;
double ionpDist = 0.; //distance SPM-SPM
for(int a=0; a<cubAxN; a++){ //x1-filling cubes
for(int b=0; b<cubAxN; b++){ //x2-
for(int c=0; c<cubAxN; c++){ //x3-
bool far_ionp = true;
cout<<"cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
for(int i=0; i<aggN; i++){ //aggregate iterations
x1_aggTemp=realDist(rng)*l_c + l_c*a - l_tot/2.; //from neg to pos filling
x2_aggTemp=realDist(rng)*l_c + l_c*b - l_tot/2.;
x3_aggTemp=realDist(rng)*l_c + l_c*c - l_tot/2.;
for(int j=0; j<ionpN; j++){ //SPM iterations
// cout<<"SPM: "<<j<<" aggregate: "<<i<<" cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
x1_agg[ionpCounter]=x1_aggTemp;
x2_agg[ionpCounter]=x2_aggTemp;
x3_agg[ionpCounter]=x3_aggTemp;
//uniform 4pi distribution in sphere
while(true){
far_ionp = true; //must be updated!
theta_ionp[ionpCounter] = 2.*PI*realDist(rng);
phi_ionp[ionpCounter] = acos(1. - 2.*realDist(rng));
r_ionp[ionpCounter] = (Ragg-Rionp)*sqrt(realDist(rng)); // to have uniform distribution sqrt
x1_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*cos(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x1_agg[ionpCounter];
x2_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*sin(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x2_agg[ionpCounter];
x3_ionp[ionpCounter] = cos(phi_ionp[ionpCounter])*r_ionp[ionpCounter] + x3_agg[ionpCounter];
for(int m=0; m<ionpCounter; m++){ //impenetrable IONP to each other
ionpDist = sqrt(pow(x1_ionp[m]-x1_ionp[ionpCounter],2.)+pow(x2_ionp[m]-x2_ionp[ionpCounter],2.)+pow(x3_ionp[m]-x3_ionp[ionpCounter],2.));
//cout<<"spmDist: "<<spmDist<<endl;
if((j>0) && (ionpDist <= 2*Rionp)){
far_ionp = false;
cout<<"CLOSE ionp-ionp! Distanse ionp-ionp: "<<ionpDist<<endl;
}
}
if(far_ionp){
cout<<"IONP can break now! ionpCounter: "<<ionpCounter<<endl;
break;
}
}
cout<<"r_ionp: "<<r_ionp[ionpCounter]<<" x1_ionp: "<<x1_ionp[ionpCounter]<<" x2_ionp: "<<x2_ionp[ionpCounter]<<" x3_ionp: "<<x3_ionp[ionpCounter]<<endl;
cout<<"x1_agg: "<<x1_agg[ionpCounter]<<" x2_agg: "<<x2_agg[ionpCounter]<<" x3_agg: "<<x3_agg[ionpCounter]<<endl;
ionpCounter++;
}
aggCounter++;
}
}
}
}
cout<<"ionpCounter: "<<ionpCounter<<" aggCounter: "<<aggCounter<<endl;
//=====proton diffusion=============//
//outfile
//proton diffusion time-positionSPM_uniform
FILE *outP_tPos;
outP_tPos = fopen("V3_MAT_positionProtons_spherical.dat","wb+");
if(!outP_tPos){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
//proton diffusion time-positionSPM_uniform
FILE *outP_tB;
outP_tB = fopen("V3_MAT_positionB_spherical.dat","wb+");
if(!outP_tB){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
double *cosPhase_S, *sinPhase_S, *m_tot;
cosPhase_S = new double [tN];
sinPhase_S = new double [tN];
m_tot = new double [tN];
double tstep = 0.; // time of each step
int stepCounter = 0; // counter for the steps for each proton
int cnt_stpMin=0; //, cnt_stpMax=0; //counters for the step length conditions
for (int i=0; i<parN; i++){// repetition for all the protons in the sample
stepCounter = 0; //reset
cout<<"Now diffusion calculated for proton: "<<i<<endl;
double x0[3]={0.}, xt[3]={0.}, vt[3]={0.};
double tt=0.;
double stepL_min = Rionp/8.; //min step length
double stepL_max = Rionp; //max step length
double stepL = 0.;
double extraL = 0.; //extra length beyond central cube
bool hit_ionp = false;
double pIONPDist = 0.; // proton-IONP Distance (vector ||)
double pIONPCosTheta = 0.; //proton_IONP vector cosTheta with Z axis
double Bloc = 0.; //B 1 IONP
double Btot = 0.; //SUM B all IONP
double Dphase = 0.; //Delta phase for step 1p
double phase = 0.; //phase 1p
double theta_p=0., phi_p=0.;
//randomized initial position of the particle;
x0[0] = realDist(rng)*l_c - l_c/2.;
x0[1] = realDist(rng)*l_c - l_c/2.;
x0[2] = realDist(rng)*l_c - l_c/2.;
//for (int j=0; j<tN; j++){ //steps
bool diffTime = true; // flag protons are allowed to diffuse (tt<10ms)
while(diffTime){ // steps loop
//unit vector for 4p direction
theta_p = 2.*PI*realDist(rng);
phi_p = acos(1. - 2.*realDist(rng));
vt[0] = sin(phi_p)*cos(theta_p);
vt[1] = sin(phi_p)*sin(theta_p);
vt[2] = cos(phi_p);;
//determine length of step
for(int k=0; k<ionpCounter; k++){
if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) <= 8*Rionp){
//spm closer than 8R
stepL = Rionp/8;
cnt_stpMin ++;
break;
}
else if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) > 8*Rionp){
stepL = Rionp;
}
else{
cout<<"sth wrong with the proton-IONP distance!"<<endl;
}
}
//determine Dt step duration
tstep = pow(stepL,2.)/(6.*D_const);
tt += tstep;
if(tt>t_tot){
diffTime = false; //proton is not allowed to diffuse any longer
cout<<"Proton id: "<<i<<" has reached diffusion time! -> Move to next one!"<<endl;
cout<<"stepCounter: "<<stepCounter<<" cnt_stpMin: "<<cnt_stpMin<<endl;
}
while(true){
xt[0]=x0[0]+vt[0]*stepL;
xt[1]=x0[1]+vt[1]*stepL;
xt[2]=x0[2]+vt[2]*stepL;
for(int m=0; m<3; m++){
if(abs(xt[m]) > l_c/2.){ //particle outside central cube,// reflected, elastic collision(no!)
//particle enters fron the other way, periodicity
// hit_cx[m] = true; //I don't need it yet
extraL = abs(xt[m]) - l_c/2.;
// xt[m]=-x0[m];
cout<<"proton outside! xt[m]: "<<xt[m]<<" extra lenght: "<<extraL<<endl;
xt[m] = xt[m]-l_c;
cout<<"Relocating => new x[t]: "<<xt[m]<<endl;
}
}
for(int k=0; k<ionpCounter; k++){//check if proton inside SPM
pIONPDist = sqrt(pow((x1_ionp[k]-xt[0]),2.)+pow((x2_ionp[k]-xt[1]),2.)+pow((x3_ionp[k]-xt[2]),2.)) - Rionp;
if(pIONPDist <= 0.){
cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
hit_ionp = true;
}
else if(pIONPDist > 0.){
hit_ionp=false; //with this I don't have to reset flag in the end
//calculations of Bloc for this position
pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
Bloc = pow(Rionp,3.)*Beq*(3.*pIONPCosTheta - 1.)/pow(pIONPDist,3.);
Btot += Bloc;
//cout<<"pSPMDist: "<<pSPMDist<<" pSPMCosTheta: "<<pSPMCosTheta<<" Bloc: "<<Bloc<<" Btot: "<<Btot<<endl;
}
else{
cout<<"Something wrong with the calculation of pIONPDist! "<<pIONPDist<<endl;
hit_ionp = true;
}
}
if(!hit_ionp){
// hit_spm=false; //reset flag (unnessesary alreaty false)
break;
}
}// end of while for new position -> the new position is determined, Btot calculated
// Dphase, phase
Dphase = gI*Btot*tstep;
phase += Dphase;
//store phase for this step
//filled for each proton at this timepoint (step)
cosPhase_S[stepCounter] += cos(phase);
sinPhase_S[stepCounter] += sin(phase);
//reset Btot
Btot = 0.;
stepCounter++;
} //end of for loop step
} //end of for loop particles
//-----calculate the <m> the total magnetization
for(int t=0; t<tN; t++){
m_tot[t] = sqrt(pow(cosPhase_S[t],2.) + pow(sinPhase_S[t],2.))/(double)parN;
//cout<<"m_tot[t]: "<<m_tot[t]<<endl;
}
fclose(outP_tPos); //proton time-position
fclose(outP_tB); //proton time-B
//====== outfile data=============//
//----- output data of SPM position---------//
FILE *outP_S;
outP_S = fopen("V3_MAT_positionSPM_spherical.dat","wb+");
if(!outP_S){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
for (int i=0; i<ionpCounter; ++i){
fprintf(outP_S,"%.10f \t %.10f \t %.10f\n",x1_ionp[i],x2_ionp[i],x3_ionp[i]);
}
fclose(outP_S);
FILE *outP_agg;
outP_agg = fopen("V3_MAT_positionAggreg_spherical.dat","wb+");
if(!outP_agg){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
for (int j=0; j<ionpCounter; ++j){
fprintf(outP_agg,"%.10f \t %.10f \t %.10f\n",x1_agg[j],x2_agg[j],x3_agg[j]);
}
fclose(outP_agg);
FILE *outSngl;
outSngl = fopen("V3_MAT_positionSingle_spherical.dat","wb+");
if(!outSngl){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
int findAgg = (int)(realDist(rng)*aggN);
int idxMin = findAgg*ionpN;
int idxMax = idxMin + ionpN;
for (int k=idxMin; k<idxMax; ++k){
fprintf(outSngl,"%.10f\t%.10f\t%.10f\t%.10f\t%.10f\t%.10f\n",x1_agg[k],x2_agg[k],x3_agg[k],x1_ionp[k],x2_ionp[k],x3_ionp[k]);
}
fclose(outSngl);
//delete new arrays
delete[] x1_ionp;
delete[] x2_ionp;
delete[] x3_ionp;
delete[] theta_ionp;
delete[] phi_ionp;
delete[] r_ionp;
delete[] x1_agg;
delete[] x2_agg;
delete[] x3_agg;
delete[] cosPhase_S;
delete[] sinPhase_S;
delete[] m_tot;
}
我在更多步骤中讨论了这个问题,首先我使 运行 可重现:
mt19937 rng(127386261); //I want a deterministic seed
然后我创建一个脚本来比较程序生成的三个输出文件:
#!/bin/bash
diff V3_MAT_positionAggreg_spherical.dat V3_MAT_positionAggreg_spherical2.dat
diff V3_MAT_positionSingle_spherical.dat V3_MAT_positionSingle_spherical2.dat
diff V3_MAT_positionSPM_spherical.dat V3_MAT_positionSPM_spherical2.dat
以两个结尾的文件由优化代码创建,另一个由您的版本创建。
我 运行 你的版本用 O3 标志编译并标记了时间(对于 20 个磁性粒子和 10 个质子,在我的盒子上需要 79 秒,我的架构并不那么重要,因为我们只是要比较差异)。
然后我开始逐步重构,运行比较输出文件和时间的每一个小变化,这里是所有迭代:
如果获得 5 秒(总共 运行 74.0 秒)则删除多余的 else
if(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.)) <= 7*Rionp){
//spm closer than 8R
stepL = Rionp/8;
cnt_stpMin ++;
break;
}
else { //this was an else if and an else for error that will never happen
stepL = Rionp;
}
至此,我运行它在profiler和pow函数下脱颖而出
用方形和立方体替换 pow 获得 61 秒(总计 运行 13.2 秒)
只需将 pow(x,2.) 替换为 square(x) 并将 pow(x,3.) 替换为 cube(x) 即可将 运行 时间减少约 600%
double square(double d)
{
return d*d;
}
double cube(double d)
{
return d*d*d;
}
现在每次改进都会减少很多增益,但仍然如此。
去除多余的sqrt增益(总计运行 12.9 s)
double ionpDist = square(x1_ionp[m]-x1_ionp[ionpCounter])+square(x2_ionp[m]-x2_ionp[ionpCounter])+square(x3_ionp[m]-x3_ionp[ionpCounter]);
//cout<<"spmDist: "<<spmDist<<endl;
if((j>0) && (ionpDist <= 4*square_Rionp)){
引入常量变量square_Rionp和cube_Rionp(共运行12.7秒)
const double square_Rionp = square(Rionp);
const double cube_Rionp = cube(Rionp);
//replaced in the code like this
if((j>0) && (ionpDist <= 4*square_Rionp)){
为 pi 引入变量(总 运行 12.6 秒)
const double Two_PI = PI*2.0;
const double FourThird_PI = PI*4.0/3.0;
删除一个(另一个)多余的else if(总运行 11.9s)
if(pIONPDist <= 0.){
cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
hit_ionp = true;
}
else { //this was an else if without any reason
hit_ionp=false; //with this I don't have to reset flag in the end
//calculations of Bloc for this position
pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
...
}
再去掉一个多余的平方根(共运行11.2秒)
const double Seven_Rionp_squared =square(7*Rionp);
...
for(int k=0; k<ionpCounter; k++){
if(square(x1_ionp[k]-x0[0])+square(x2_ionp[k]-x0[1])+square(x3_ionp[k]-x0[2]) <= Seven_Rionp_squared){
//spm closer than 8R
stepL = stepL_min;
cnt_stpMin ++;
break;
}
我没有看到更多明显的东西可以从中挤出更多的性能。进一步的优化可能需要一些思考。
我用 50 个磁性粒子和 10 个质子进行了另一次比较 运行,我发现我的版本比你的快 7 倍,而且它生成的文件完全相同。
我会在源代码管理的帮助下完成这个练习。
您的代码可以简单地并行化,但我会在您优化单线程版本后才走这条路。
编辑
用=运算符改变+=(共运行 6.23秒)
我注意到 += 运算符被无缘无故地使用,替换为运算符 = 是一个巨大的收获:
cosPhase_S[stepCounter] = cos(phase);
sinPhase_S[stepCounter] = sin(phase);
第一次post来这里,经验不多,请见谅。
我正在为我的博士学位构建一个 Monte Carlo C++ 模拟,我需要帮助来优化它的计算时间和性能。我有一个在每个坐标中重复的 3d 立方体作为模拟体积,并且在每个立方体内部以簇的形式生成磁性粒子。然后,在中央立方体中,创建并移动质子环,并在每一步中计算它们感受到的所有粒子(除其他外)的总磁场。
此时我在主函数中定义了所有内容,因为我需要粒子的位置来进行计算(我计算粒子在放置期间以及质子运动期间的距离),我将它们存储在动态数组。我还没有使用任何 class 或功能。这使得我的模拟非常慢,因为我最终必须使用数百万个粒子和数千个质子。即使有数百个,也需要几天时间。我还使用了很多 for 和 while 循环以及 reading/writing 到 .dat 文件。
我真的需要你的帮助。我花了数周时间尝试优化我的代码,但我的项目落后于计划。你有什么建议吗?我需要数组来存储粒子的位置。你认为 classes 还是函数会更有效率?一般来说,任何建议都是有帮助的。对不起,如果那太长了,但我很绝望...
好的,我编辑了我的原始 post 并分享了我的完整脚本。我希望这能让您对我的模拟有所了解。谢谢。
另外我添加了两个输入文件
parametersDiffusion_spher_shel.txt
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <math.h>
#include <iomanip> //precision to output
//#include <time.h>
#include <ctime>
#include <cstdlib>
#include <algorithm>
#include <string>
//#include <complex>
#include <chrono> //random generator
#include <random>
using namespace std;
#define PI 3.14159265
#define tN 500000 //# of timepoints (steps) to define the arrays ONLY
#define D_const 3.0E-9 //diffusion constant (m^2/s)
#define Beq 0.16 // Tesla
#define gI 2.6752218744E8 //(sT)^-1
int main(){
//Mersenne Twister random engine
mt19937 rng(chrono::steady_clock::now().time_since_epoch().count());
//uniform_int_distribution<int> intDist(0,1);
uniform_real_distribution<double> realDist(0.,1.);
//for(int i=1; i<100; i++){
//cout<<"R max: "<<Ragg-Rspm<<" r_spm: "<<(Ragg-Rspm)*sqrt(realDist(rng))<<endl;
//}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
//input files
double Rionp=1.0E-8, Ragg=2.0E-7, t_tot=2.0E-2, l_tot = 3.0E-4;
int ionpN=10, aggN=10,cubAxN=10, parN=1E5;
int temp_ionpN, temp_aggN, temp_cubAxN, temp_parN;
ifstream inIONP;
inIONP.open("parametersIONP_spher_shel.txt");
if(!inIONP){
cout << "Unable to open IONP parameters file";
exit(1); // terminate with error
}
while (inIONP>>Rionp>>Ragg>>temp_ionpN>>temp_aggN>>l_tot>>temp_cubAxN) {
ionpN = (int)temp_ionpN;
aggN = (int)temp_aggN;
cubAxN = (int)temp_cubAxN;
}
inIONP.close();
cout<<"Rionp: "<<Rionp<<" ionpN: "<<ionpN <<" aggN: "<<aggN<<endl;
cout<<"l_tot: "<<l_tot<<" cubAxN: "<<cubAxN<<endl;
ifstream indiff;
indiff.open("parametersDiffusion_spher_shel.txt");
if(!indiff){
cout << "Unable to open diffusion parameters file";
exit(1); // terminate with error
}
while (indiff>>temp_parN>>t_tot) {
parN = (int)temp_parN;
}
indiff.close();
cout<<"parN: "<<parN<<" t_tot: "<<t_tot<<endl;
/////////////////////////////////////////////////////////////////////////////////////////////////
int cubN = pow(cubAxN,3.); // total cubes
int Nionp_tot = ionpN*aggN*cubN; //total IONP
double f_tot = (double)Nionp_tot*(4.*PI*pow(Rionp,3.)/3.)/pow(l_tot,3.);//volume density
//central cube
double l_c = l_tot/(double)cubAxN;
int Nionp_c = ionpN*aggN; //SPM in central cube
double f_c = (double)Nionp_c*(4.*PI*pow(Rionp,3.)/3.)/pow(l_c,3.);
cout<<"f_tot: "<<f_tot<<" Nionp_tot: "<<Nionp_tot<<" l_tot "<<l_tot<<endl;
cout<<"f_c: "<<f_c<<" Nionp_c: "<<Nionp_c<<" l_c "<<l_c<<endl;
cout<<"Now IONP are generated..."<<endl;
//position of aggregate (spherical distribution IONP)
double *x1_ionp, *x2_ionp, *x3_ionp, *theta_ionp, *phi_ionp, *r_ionp, *x1_agg, *x2_agg, *x3_agg;
x1_ionp = new double [Nionp_tot];
x2_ionp = new double [Nionp_tot];
x3_ionp = new double [Nionp_tot];
theta_ionp = new double [Nionp_tot];
phi_ionp = new double [Nionp_tot];
r_ionp = new double [Nionp_tot];
x1_agg = new double [Nionp_tot];
x2_agg = new double [Nionp_tot];
x3_agg = new double [Nionp_tot];
int ionpCounter = 0;
int aggCounter = 0;
double x1_aggTemp=0., x2_aggTemp=0., x3_aggTemp=0.;
double ionpDist = 0.; //distance SPM-SPM
for(int a=0; a<cubAxN; a++){ //x1-filling cubes
for(int b=0; b<cubAxN; b++){ //x2-
for(int c=0; c<cubAxN; c++){ //x3-
bool far_ionp = true;
cout<<"cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
for(int i=0; i<aggN; i++){ //aggregate iterations
x1_aggTemp=realDist(rng)*l_c + l_c*a - l_tot/2.; //from neg to pos filling
x2_aggTemp=realDist(rng)*l_c + l_c*b - l_tot/2.;
x3_aggTemp=realDist(rng)*l_c + l_c*c - l_tot/2.;
for(int j=0; j<ionpN; j++){ //SPM iterations
// cout<<"SPM: "<<j<<" aggregate: "<<i<<" cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
x1_agg[ionpCounter]=x1_aggTemp;
x2_agg[ionpCounter]=x2_aggTemp;
x3_agg[ionpCounter]=x3_aggTemp;
//uniform 4pi distribution in sphere
while(true){
far_ionp = true; //must be updated!
theta_ionp[ionpCounter] = 2.*PI*realDist(rng);
phi_ionp[ionpCounter] = acos(1. - 2.*realDist(rng));
r_ionp[ionpCounter] = (Ragg-Rionp)*sqrt(realDist(rng)); // to have uniform distribution sqrt
x1_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*cos(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x1_agg[ionpCounter];
x2_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*sin(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x2_agg[ionpCounter];
x3_ionp[ionpCounter] = cos(phi_ionp[ionpCounter])*r_ionp[ionpCounter] + x3_agg[ionpCounter];
for(int m=0; m<ionpCounter; m++){ //impenetrable IONP to each other
ionpDist = sqrt(pow(x1_ionp[m]-x1_ionp[ionpCounter],2.)+pow(x2_ionp[m]-x2_ionp[ionpCounter],2.)+pow(x3_ionp[m]-x3_ionp[ionpCounter],2.));
//cout<<"spmDist: "<<spmDist<<endl;
if((j>0) && (ionpDist <= 2*Rionp)){
far_ionp = false;
cout<<"CLOSE ionp-ionp! Distanse ionp-ionp: "<<ionpDist<<endl;
}
}
if(far_ionp){
cout<<"IONP can break now! ionpCounter: "<<ionpCounter<<endl;
break;
}
}
cout<<"r_ionp: "<<r_ionp[ionpCounter]<<" x1_ionp: "<<x1_ionp[ionpCounter]<<" x2_ionp: "<<x2_ionp[ionpCounter]<<" x3_ionp: "<<x3_ionp[ionpCounter]<<endl;
cout<<"x1_agg: "<<x1_agg[ionpCounter]<<" x2_agg: "<<x2_agg[ionpCounter]<<" x3_agg: "<<x3_agg[ionpCounter]<<endl;
ionpCounter++;
}
aggCounter++;
}
}
}
}
cout<<"ionpCounter: "<<ionpCounter<<" aggCounter: "<<aggCounter<<endl;
//=====proton diffusion=============//
//outfile
//proton diffusion time-positionSPM_uniform
FILE *outP_tPos;
outP_tPos = fopen("V3_MAT_positionProtons_spherical.dat","wb+");
if(!outP_tPos){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
//proton diffusion time-positionSPM_uniform
FILE *outP_tB;
outP_tB = fopen("V3_MAT_positionB_spherical.dat","wb+");
if(!outP_tB){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
double *cosPhase_S, *sinPhase_S, *m_tot;
cosPhase_S = new double [tN];
sinPhase_S = new double [tN];
m_tot = new double [tN];
double tstep = 0.; // time of each step
int stepCounter = 0; // counter for the steps for each proton
int cnt_stpMin=0; //, cnt_stpMax=0; //counters for the step length conditions
for (int i=0; i<parN; i++){// repetition for all the protons in the sample
stepCounter = 0; //reset
cout<<"Now diffusion calculated for proton: "<<i<<endl;
double x0[3]={0.}, xt[3]={0.}, vt[3]={0.};
double tt=0.;
double stepL_min = Rionp/8.; //min step length
double stepL_max = Rionp; //max step length
double stepL = 0.;
double extraL = 0.; //extra length beyond central cube
bool hit_ionp = false;
double pIONPDist = 0.; // proton-IONP Distance (vector ||)
double pIONPCosTheta = 0.; //proton_IONP vector cosTheta with Z axis
double Bloc = 0.; //B 1 IONP
double Btot = 0.; //SUM B all IONP
double Dphase = 0.; //Delta phase for step 1p
double phase = 0.; //phase 1p
double theta_p=0., phi_p=0.;
//randomized initial position of the particle;
x0[0] = realDist(rng)*l_c - l_c/2.;
x0[1] = realDist(rng)*l_c - l_c/2.;
x0[2] = realDist(rng)*l_c - l_c/2.;
//for (int j=0; j<tN; j++){ //steps
bool diffTime = true; // flag protons are allowed to diffuse (tt<10ms)
while(diffTime){ // steps loop
//unit vector for 4p direction
theta_p = 2.*PI*realDist(rng);
phi_p = acos(1. - 2.*realDist(rng));
vt[0] = sin(phi_p)*cos(theta_p);
vt[1] = sin(phi_p)*sin(theta_p);
vt[2] = cos(phi_p);;
//determine length of step
for(int k=0; k<ionpCounter; k++){
if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) <= 8*Rionp){
//spm closer than 8R
stepL = Rionp/8;
cnt_stpMin ++;
break;
}
else if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) > 8*Rionp){
stepL = Rionp;
}
else{
cout<<"sth wrong with the proton-IONP distance!"<<endl;
}
}
//determine Dt step duration
tstep = pow(stepL,2.)/(6.*D_const);
tt += tstep;
if(tt>t_tot){
diffTime = false; //proton is not allowed to diffuse any longer
cout<<"Proton id: "<<i<<" has reached diffusion time! -> Move to next one!"<<endl;
cout<<"stepCounter: "<<stepCounter<<" cnt_stpMin: "<<cnt_stpMin<<endl;
}
while(true){
xt[0]=x0[0]+vt[0]*stepL;
xt[1]=x0[1]+vt[1]*stepL;
xt[2]=x0[2]+vt[2]*stepL;
for(int m=0; m<3; m++){
if(abs(xt[m]) > l_c/2.){ //particle outside central cube,// reflected, elastic collision(no!)
//particle enters fron the other way, periodicity
// hit_cx[m] = true; //I don't need it yet
extraL = abs(xt[m]) - l_c/2.;
// xt[m]=-x0[m];
cout<<"proton outside! xt[m]: "<<xt[m]<<" extra lenght: "<<extraL<<endl;
xt[m] = xt[m]-l_c;
cout<<"Relocating => new x[t]: "<<xt[m]<<endl;
}
}
for(int k=0; k<ionpCounter; k++){//check if proton inside SPM
pIONPDist = sqrt(pow((x1_ionp[k]-xt[0]),2.)+pow((x2_ionp[k]-xt[1]),2.)+pow((x3_ionp[k]-xt[2]),2.)) - Rionp;
if(pIONPDist <= 0.){
cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
hit_ionp = true;
}
else if(pIONPDist > 0.){
hit_ionp=false; //with this I don't have to reset flag in the end
//calculations of Bloc for this position
pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
Bloc = pow(Rionp,3.)*Beq*(3.*pIONPCosTheta - 1.)/pow(pIONPDist,3.);
Btot += Bloc;
//cout<<"pSPMDist: "<<pSPMDist<<" pSPMCosTheta: "<<pSPMCosTheta<<" Bloc: "<<Bloc<<" Btot: "<<Btot<<endl;
}
else{
cout<<"Something wrong with the calculation of pIONPDist! "<<pIONPDist<<endl;
hit_ionp = true;
}
}
if(!hit_ionp){
// hit_spm=false; //reset flag (unnessesary alreaty false)
break;
}
}// end of while for new position -> the new position is determined, Btot calculated
// Dphase, phase
Dphase = gI*Btot*tstep;
phase += Dphase;
//store phase for this step
//filled for each proton at this timepoint (step)
cosPhase_S[stepCounter] += cos(phase);
sinPhase_S[stepCounter] += sin(phase);
//reset Btot
Btot = 0.;
stepCounter++;
} //end of for loop step
} //end of for loop particles
//-----calculate the <m> the total magnetization
for(int t=0; t<tN; t++){
m_tot[t] = sqrt(pow(cosPhase_S[t],2.) + pow(sinPhase_S[t],2.))/(double)parN;
//cout<<"m_tot[t]: "<<m_tot[t]<<endl;
}
fclose(outP_tPos); //proton time-position
fclose(outP_tB); //proton time-B
//====== outfile data=============//
//----- output data of SPM position---------//
FILE *outP_S;
outP_S = fopen("V3_MAT_positionSPM_spherical.dat","wb+");
if(!outP_S){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
for (int i=0; i<ionpCounter; ++i){
fprintf(outP_S,"%.10f \t %.10f \t %.10f\n",x1_ionp[i],x2_ionp[i],x3_ionp[i]);
}
fclose(outP_S);
FILE *outP_agg;
outP_agg = fopen("V3_MAT_positionAggreg_spherical.dat","wb+");
if(!outP_agg){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
for (int j=0; j<ionpCounter; ++j){
fprintf(outP_agg,"%.10f \t %.10f \t %.10f\n",x1_agg[j],x2_agg[j],x3_agg[j]);
}
fclose(outP_agg);
FILE *outSngl;
outSngl = fopen("V3_MAT_positionSingle_spherical.dat","wb+");
if(!outSngl){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
int findAgg = (int)(realDist(rng)*aggN);
int idxMin = findAgg*ionpN;
int idxMax = idxMin + ionpN;
for (int k=idxMin; k<idxMax; ++k){
fprintf(outSngl,"%.10f\t%.10f\t%.10f\t%.10f\t%.10f\t%.10f\n",x1_agg[k],x2_agg[k],x3_agg[k],x1_ionp[k],x2_ionp[k],x3_ionp[k]);
}
fclose(outSngl);
//delete new arrays
delete[] x1_ionp;
delete[] x2_ionp;
delete[] x3_ionp;
delete[] theta_ionp;
delete[] phi_ionp;
delete[] r_ionp;
delete[] x1_agg;
delete[] x2_agg;
delete[] x3_agg;
delete[] cosPhase_S;
delete[] sinPhase_S;
delete[] m_tot;
}
我在更多步骤中讨论了这个问题,首先我使 运行 可重现:
mt19937 rng(127386261); //I want a deterministic seed
然后我创建一个脚本来比较程序生成的三个输出文件:
#!/bin/bash
diff V3_MAT_positionAggreg_spherical.dat V3_MAT_positionAggreg_spherical2.dat
diff V3_MAT_positionSingle_spherical.dat V3_MAT_positionSingle_spherical2.dat
diff V3_MAT_positionSPM_spherical.dat V3_MAT_positionSPM_spherical2.dat
以两个结尾的文件由优化代码创建,另一个由您的版本创建。
我 运行 你的版本用 O3 标志编译并标记了时间(对于 20 个磁性粒子和 10 个质子,在我的盒子上需要 79 秒,我的架构并不那么重要,因为我们只是要比较差异)。
然后我开始逐步重构,运行比较输出文件和时间的每一个小变化,这里是所有迭代:
如果获得 5 秒(总共 运行 74.0 秒)则删除多余的 else
if(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.)) <= 7*Rionp){
//spm closer than 8R
stepL = Rionp/8;
cnt_stpMin ++;
break;
}
else { //this was an else if and an else for error that will never happen
stepL = Rionp;
}
至此,我运行它在profiler和pow函数下脱颖而出
用方形和立方体替换 pow 获得 61 秒(总计 运行 13.2 秒)
只需将 pow(x,2.) 替换为 square(x) 并将 pow(x,3.) 替换为 cube(x) 即可将 运行 时间减少约 600%
double square(double d)
{
return d*d;
}
double cube(double d)
{
return d*d*d;
}
现在每次改进都会减少很多增益,但仍然如此。
去除多余的sqrt增益(总计运行 12.9 s)
double ionpDist = square(x1_ionp[m]-x1_ionp[ionpCounter])+square(x2_ionp[m]-x2_ionp[ionpCounter])+square(x3_ionp[m]-x3_ionp[ionpCounter]);
//cout<<"spmDist: "<<spmDist<<endl;
if((j>0) && (ionpDist <= 4*square_Rionp)){
引入常量变量square_Rionp和cube_Rionp(共运行12.7秒)
const double square_Rionp = square(Rionp);
const double cube_Rionp = cube(Rionp);
//replaced in the code like this
if((j>0) && (ionpDist <= 4*square_Rionp)){
为 pi 引入变量(总 运行 12.6 秒)
const double Two_PI = PI*2.0;
const double FourThird_PI = PI*4.0/3.0;
删除一个(另一个)多余的else if(总运行 11.9s)
if(pIONPDist <= 0.){
cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
hit_ionp = true;
}
else { //this was an else if without any reason
hit_ionp=false; //with this I don't have to reset flag in the end
//calculations of Bloc for this position
pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
...
}
再去掉一个多余的平方根(共运行11.2秒)
const double Seven_Rionp_squared =square(7*Rionp);
...
for(int k=0; k<ionpCounter; k++){
if(square(x1_ionp[k]-x0[0])+square(x2_ionp[k]-x0[1])+square(x3_ionp[k]-x0[2]) <= Seven_Rionp_squared){
//spm closer than 8R
stepL = stepL_min;
cnt_stpMin ++;
break;
}
我没有看到更多明显的东西可以从中挤出更多的性能。进一步的优化可能需要一些思考。
我用 50 个磁性粒子和 10 个质子进行了另一次比较 运行,我发现我的版本比你的快 7 倍,而且它生成的文件完全相同。
我会在源代码管理的帮助下完成这个练习。
您的代码可以简单地并行化,但我会在您优化单线程版本后才走这条路。
编辑
用=运算符改变+=(共运行 6.23秒)
我注意到 += 运算符被无缘无故地使用,替换为运算符 = 是一个巨大的收获:
cosPhase_S[stepCounter] = cos(phase);
sinPhase_S[stepCounter] = sin(phase);