使用长动态数组优化 c++ Monte Carlo 模拟

Optimize c++ Monte Carlo simulation with long dynamic arrays

第一次post来这里,经验不多,请见谅。

我正在为我的博士学位构建一个 Monte Carlo C++ 模拟,我需要帮助来优化它的计算时间和性能。我有一个在每个坐标中重复的 3d 立方体作为模拟体积,并且在每个立方体内部以簇的形式生成磁性粒子。然后,在中央立方体中,创建并移动质子环,并在每一步中计算它们感受到的所有粒子(除其他外)的总磁场。

此时我在主函数中定义了所有内容,因为我需要粒子的位置来进行计算(我计算粒子在放置期间以及质子运动期间的距离),我将它们存储在动态数组。我还没有使用任何 class 或功能。这使得我的模拟非常慢,因为我最终必须使用数百万个粒子和数千个质子。即使有数百个,也需要几天时间。我还使用了很多 for 和 while 循环以及 reading/writing 到 .dat 文件。

我真的需要你的帮助。我花了数周时间尝试优化我的代码,但我的项目落后于计划。你有什么建议吗?我需要数组来存储粒子的位置。你认为 classes 还是函数会更有效率?一般来说,任何建议都是有帮助的。对不起,如果那太长了,但我很绝望...

好的,我编辑了我的原始 post 并分享了我的完整脚本。我希望这能让您对我的模拟有所了解。谢谢。

另外我添加了两个输入文件

parametersDiffusion_spher_shel.txt

parametersIONP_spher_shel.txt

#include <stdio.h> 
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <math.h>
#include <iomanip> //precision to output
//#include <time.h>
#include <ctime>
#include <cstdlib>
#include <algorithm>
#include <string>
//#include <complex>
#include <chrono> //random generator
#include <random>

using namespace std;

#define PI 3.14159265
#define tN 500000 //# of timepoints (steps) to define the arrays ONLY
#define D_const 3.0E-9 //diffusion constant (m^2/s)
#define Beq 0.16 // Tesla
#define gI 2.6752218744E8 //(sT)^-1

int main(){

  //Mersenne Twister random engine
  mt19937 rng(chrono::steady_clock::now().time_since_epoch().count());
  //uniform_int_distribution<int> intDist(0,1);
  uniform_real_distribution<double> realDist(0.,1.);

  //for(int i=1; i<100; i++){
    //cout<<"R max: "<<Ragg-Rspm<<" r_spm: "<<(Ragg-Rspm)*sqrt(realDist(rng))<<endl;
  //}

  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  //input files
  double Rionp=1.0E-8, Ragg=2.0E-7, t_tot=2.0E-2, l_tot = 3.0E-4;
  int ionpN=10, aggN=10,cubAxN=10, parN=1E5;
  int temp_ionpN, temp_aggN, temp_cubAxN, temp_parN;

  ifstream inIONP;

  inIONP.open("parametersIONP_spher_shel.txt");
  if(!inIONP){
    cout << "Unable to open IONP parameters file";
    exit(1); // terminate with error
  }

  while (inIONP>>Rionp>>Ragg>>temp_ionpN>>temp_aggN>>l_tot>>temp_cubAxN) {

  ionpN = (int)temp_ionpN;
  aggN = (int)temp_aggN;
  cubAxN = (int)temp_cubAxN;

  }

  inIONP.close();

  cout<<"Rionp: "<<Rionp<<" ionpN: "<<ionpN <<" aggN: "<<aggN<<endl;
  cout<<"l_tot: "<<l_tot<<" cubAxN: "<<cubAxN<<endl;

  ifstream indiff;

  indiff.open("parametersDiffusion_spher_shel.txt");
  if(!indiff){
    cout << "Unable to open diffusion parameters file";
    exit(1); // terminate with error
  }

  while (indiff>>temp_parN>>t_tot) {

    parN = (int)temp_parN;

  }
  indiff.close();

  cout<<"parN: "<<parN<<" t_tot: "<<t_tot<<endl;

  /////////////////////////////////////////////////////////////////////////////////////////////////
  int cubN = pow(cubAxN,3.); // total cubes
  int Nionp_tot = ionpN*aggN*cubN; //total IONP
  double f_tot = (double)Nionp_tot*(4.*PI*pow(Rionp,3.)/3.)/pow(l_tot,3.);//volume density

  //central cube
  double l_c = l_tot/(double)cubAxN;
  int Nionp_c = ionpN*aggN; //SPM in central cube
  double f_c = (double)Nionp_c*(4.*PI*pow(Rionp,3.)/3.)/pow(l_c,3.);

  cout<<"f_tot: "<<f_tot<<" Nionp_tot: "<<Nionp_tot<<" l_tot "<<l_tot<<endl;
  cout<<"f_c: "<<f_c<<" Nionp_c: "<<Nionp_c<<" l_c "<<l_c<<endl;
  cout<<"Now IONP are generated..."<<endl;

  //position of aggregate (spherical distribution IONP)
  double *x1_ionp, *x2_ionp, *x3_ionp, *theta_ionp, *phi_ionp, *r_ionp, *x1_agg, *x2_agg, *x3_agg;
  x1_ionp = new double [Nionp_tot];
  x2_ionp = new double [Nionp_tot];
  x3_ionp = new double [Nionp_tot];
  theta_ionp = new double [Nionp_tot];
  phi_ionp = new double [Nionp_tot];
  r_ionp = new double [Nionp_tot];
  x1_agg = new double [Nionp_tot];
  x2_agg = new double [Nionp_tot];
  x3_agg = new double [Nionp_tot];

  int ionpCounter = 0;
  int aggCounter = 0;
  double x1_aggTemp=0., x2_aggTemp=0., x3_aggTemp=0.;
  double ionpDist = 0.; //distance SPM-SPM

  for(int a=0; a<cubAxN; a++){ //x1-filling cubes
    for(int b=0; b<cubAxN; b++){ //x2-
      for(int c=0; c<cubAxN; c++){ //x3-

        bool far_ionp = true;

        cout<<"cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;

        for(int i=0; i<aggN; i++){ //aggregate iterations
          x1_aggTemp=realDist(rng)*l_c + l_c*a - l_tot/2.; //from neg to pos filling
          x2_aggTemp=realDist(rng)*l_c + l_c*b - l_tot/2.;
          x3_aggTemp=realDist(rng)*l_c + l_c*c - l_tot/2.;

          for(int j=0; j<ionpN; j++){ //SPM iterations
          //  cout<<"SPM: "<<j<<" aggregate: "<<i<<" cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
            x1_agg[ionpCounter]=x1_aggTemp;
            x2_agg[ionpCounter]=x2_aggTemp;
            x3_agg[ionpCounter]=x3_aggTemp;

            //uniform 4pi distribution in sphere
            while(true){
              far_ionp = true; //must be updated!
              theta_ionp[ionpCounter] = 2.*PI*realDist(rng);
              phi_ionp[ionpCounter] = acos(1. - 2.*realDist(rng));
              r_ionp[ionpCounter] = (Ragg-Rionp)*sqrt(realDist(rng)); // to have uniform distribution sqrt
              x1_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*cos(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x1_agg[ionpCounter];
              x2_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*sin(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x2_agg[ionpCounter];
              x3_ionp[ionpCounter] = cos(phi_ionp[ionpCounter])*r_ionp[ionpCounter] + x3_agg[ionpCounter];

              for(int m=0; m<ionpCounter; m++){ //impenetrable IONP to each other
                ionpDist = sqrt(pow(x1_ionp[m]-x1_ionp[ionpCounter],2.)+pow(x2_ionp[m]-x2_ionp[ionpCounter],2.)+pow(x3_ionp[m]-x3_ionp[ionpCounter],2.));
          //cout<<"spmDist: "<<spmDist<<endl;
                if((j>0) && (ionpDist <= 2*Rionp)){
                  far_ionp = false;
                  cout<<"CLOSE ionp-ionp! Distanse ionp-ionp: "<<ionpDist<<endl;
                }
              }
             if(far_ionp){
               cout<<"IONP can break now! ionpCounter: "<<ionpCounter<<endl;
              break;
             }
           }

           cout<<"r_ionp: "<<r_ionp[ionpCounter]<<" x1_ionp: "<<x1_ionp[ionpCounter]<<" x2_ionp: "<<x2_ionp[ionpCounter]<<" x3_ionp: "<<x3_ionp[ionpCounter]<<endl;
           cout<<"x1_agg: "<<x1_agg[ionpCounter]<<" x2_agg: "<<x2_agg[ionpCounter]<<" x3_agg: "<<x3_agg[ionpCounter]<<endl;

           ionpCounter++;

          }
          aggCounter++;
        }
      }
    }
  }
  cout<<"ionpCounter: "<<ionpCounter<<" aggCounter: "<<aggCounter<<endl;

  //=====proton diffusion=============//

  //outfile
  //proton diffusion time-positionSPM_uniform
  FILE *outP_tPos;
  outP_tPos = fopen("V3_MAT_positionProtons_spherical.dat","wb+");
  if(!outP_tPos){// file couldn't be opened
    cerr << "Error: file could not be opened" << endl;
    exit(1);
  }

  //proton diffusion time-positionSPM_uniform
  FILE *outP_tB;
  outP_tB = fopen("V3_MAT_positionB_spherical.dat","wb+");
  if(!outP_tB){// file couldn't be opened
      cerr << "Error: file could not be opened" << endl;
      exit(1);
    }


  double *cosPhase_S, *sinPhase_S, *m_tot;
  cosPhase_S = new double [tN];
  sinPhase_S = new double [tN];
  m_tot = new double [tN];

  double tstep = 0.; // time of each step
  int stepCounter = 0; // counter for the steps for each proton
  int cnt_stpMin=0; //, cnt_stpMax=0; //counters for the step length conditions


  for (int i=0; i<parN; i++){// repetition for all the protons in the sample
    stepCounter = 0; //reset

    cout<<"Now diffusion calculated for proton: "<<i<<endl;

    double x0[3]={0.}, xt[3]={0.}, vt[3]={0.};
    double tt=0.;
    double stepL_min = Rionp/8.; //min step length
    double stepL_max = Rionp; //max step length
    double stepL = 0.;
    double extraL = 0.; //extra length beyond central cube
    bool hit_ionp = false;
    double pIONPDist = 0.; // proton-IONP Distance (vector ||)
    double pIONPCosTheta = 0.; //proton_IONP vector cosTheta with Z axis
    double Bloc = 0.; //B 1 IONP
    double Btot = 0.; //SUM B all IONP
    double Dphase = 0.; //Delta phase for step 1p
    double phase = 0.; //phase 1p
    double theta_p=0., phi_p=0.;

    //randomized initial position of the particle;
    x0[0] = realDist(rng)*l_c - l_c/2.;
    x0[1] = realDist(rng)*l_c - l_c/2.;
    x0[2] = realDist(rng)*l_c - l_c/2.;

    //for (int j=0; j<tN; j++){ //steps
    bool diffTime = true; // flag protons are allowed to diffuse (tt<10ms)
    while(diffTime){ // steps loop

     //unit vector for 4p direction
     theta_p = 2.*PI*realDist(rng);
     phi_p = acos(1. - 2.*realDist(rng));

     vt[0] = sin(phi_p)*cos(theta_p);
     vt[1] = sin(phi_p)*sin(theta_p);
     vt[2] = cos(phi_p);;

     //determine length of step
     for(int k=0; k<ionpCounter; k++){
      if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) <= 8*Rionp){
        //spm closer than 8R
        stepL = Rionp/8;
        cnt_stpMin ++;
        break;
      }
      else if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) > 8*Rionp){
        stepL = Rionp;
      }
      else{
        cout<<"sth wrong with the proton-IONP distance!"<<endl;
      }
    }

    //determine Dt step duration
    tstep = pow(stepL,2.)/(6.*D_const);
    tt += tstep;
    if(tt>t_tot){
      diffTime = false; //proton is not allowed to diffuse any longer
      cout<<"Proton id: "<<i<<" has reached diffusion time! -> Move to next one!"<<endl;
      cout<<"stepCounter: "<<stepCounter<<" cnt_stpMin: "<<cnt_stpMin<<endl;
    }

    while(true){
      xt[0]=x0[0]+vt[0]*stepL;
      xt[1]=x0[1]+vt[1]*stepL;
      xt[2]=x0[2]+vt[2]*stepL;
      for(int m=0; m<3; m++){
        if(abs(xt[m]) > l_c/2.){ //particle outside central cube,// reflected, elastic collision(no!)
          //particle enters fron the other way, periodicity
        //  hit_cx[m] = true; //I don't need it yet
          extraL = abs(xt[m]) - l_c/2.;
        //  xt[m]=-x0[m];
          cout<<"proton outside! xt[m]: "<<xt[m]<<" extra lenght: "<<extraL<<endl;
          xt[m] = xt[m]-l_c;
          cout<<"Relocating => new x[t]: "<<xt[m]<<endl;
        }
      }
      for(int k=0; k<ionpCounter; k++){//check if proton inside SPM
        pIONPDist = sqrt(pow((x1_ionp[k]-xt[0]),2.)+pow((x2_ionp[k]-xt[1]),2.)+pow((x3_ionp[k]-xt[2]),2.)) - Rionp;
        if(pIONPDist <= 0.){
          cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
          hit_ionp = true;
        }
        else if(pIONPDist > 0.){
          hit_ionp=false; //with this I don't have to reset flag in the end
          //calculations of Bloc for this position
          pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
          Bloc = pow(Rionp,3.)*Beq*(3.*pIONPCosTheta - 1.)/pow(pIONPDist,3.);
          Btot += Bloc;
          //cout<<"pSPMDist: "<<pSPMDist<<" pSPMCosTheta: "<<pSPMCosTheta<<" Bloc: "<<Bloc<<" Btot: "<<Btot<<endl;
        }
        else{
          cout<<"Something wrong with the calculation of pIONPDist! "<<pIONPDist<<endl;
          hit_ionp = true;
        }
      }

      if(!hit_ionp){
      //  hit_spm=false; //reset flag (unnessesary alreaty false)
        break;
      }
    }// end of while for new position -> the new position is determined, Btot calculated

    // Dphase, phase
    Dphase = gI*Btot*tstep;
    phase += Dphase;

    //store phase for this step
    //filled for each proton at this timepoint (step)

    cosPhase_S[stepCounter] += cos(phase);
    sinPhase_S[stepCounter] += sin(phase);

    //reset Btot
    Btot = 0.;
    stepCounter++;

   } //end of for loop step
  } //end of for loop particles

  //-----calculate the <m> the total magnetization
for(int t=0; t<tN; t++){
  m_tot[t] = sqrt(pow(cosPhase_S[t],2.) + pow(sinPhase_S[t],2.))/(double)parN;
  //cout<<"m_tot[t]: "<<m_tot[t]<<endl;
}
fclose(outP_tPos); //proton time-position
fclose(outP_tB); //proton time-B

//====== outfile data=============//

  //----- output data of SPM position---------//
  FILE *outP_S;
  outP_S = fopen("V3_MAT_positionSPM_spherical.dat","wb+");
  if(!outP_S){// file couldn't be opened
    cerr << "Error: file could not be opened" << endl;
    exit(1);
  }
  for (int i=0; i<ionpCounter; ++i){
    fprintf(outP_S,"%.10f \t %.10f \t %.10f\n",x1_ionp[i],x2_ionp[i],x3_ionp[i]);
  }
  fclose(outP_S);

  FILE *outP_agg;
  outP_agg = fopen("V3_MAT_positionAggreg_spherical.dat","wb+");
  if(!outP_agg){// file couldn't be opened
    cerr << "Error: file could not be opened" << endl;
    exit(1);
  }
  for (int j=0; j<ionpCounter; ++j){
    fprintf(outP_agg,"%.10f \t %.10f \t %.10f\n",x1_agg[j],x2_agg[j],x3_agg[j]);
  }
  fclose(outP_agg);

  FILE *outSngl;
  outSngl = fopen("V3_MAT_positionSingle_spherical.dat","wb+");
  if(!outSngl){// file couldn't be opened
    cerr << "Error: file could not be opened" << endl;
    exit(1);
  }
  int findAgg = (int)(realDist(rng)*aggN);
  int idxMin = findAgg*ionpN;
  int idxMax = idxMin + ionpN;
  for (int k=idxMin; k<idxMax; ++k){
    fprintf(outSngl,"%.10f\t%.10f\t%.10f\t%.10f\t%.10f\t%.10f\n",x1_agg[k],x2_agg[k],x3_agg[k],x1_ionp[k],x2_ionp[k],x3_ionp[k]);
  }
  fclose(outSngl);

  //delete new arrays
  delete[] x1_ionp;
  delete[] x2_ionp;
  delete[] x3_ionp;
  delete[] theta_ionp;
  delete[] phi_ionp;
  delete[] r_ionp;
  delete[] x1_agg;
  delete[] x2_agg;
  delete[] x3_agg;

  delete[] cosPhase_S;
  delete[] sinPhase_S;
  delete[] m_tot;



}

我在更多步骤中讨论了这个问题,首先我使 运行 可重现:

  mt19937 rng(127386261); //I want a deterministic seed

然后我创建一个脚本来比较程序生成的三个输出文件:

#!/bin/bash

diff V3_MAT_positionAggreg_spherical.dat V3_MAT_positionAggreg_spherical2.dat
diff V3_MAT_positionSingle_spherical.dat V3_MAT_positionSingle_spherical2.dat
diff V3_MAT_positionSPM_spherical.dat V3_MAT_positionSPM_spherical2.dat

以两个结尾的文件由优化代码创建,另一个由您的版本创建。

我 运行 你的版本用 O3 标志编译并标记了时间(对于 20 个磁性粒子和 10 个质子,在我的盒子上需要 79 秒,我的架构并不那么重要,因为我们只是要比较差异)。

然后我开始逐步重构,运行比较输出文件和时间的每一个小变化,这里是所有迭代:

如果获得 5 秒(总共 运行 74.0 秒)则删除多余的 else

if(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.)) <= 7*Rionp){
  //spm closer than 8R
  stepL = Rionp/8;
  cnt_stpMin ++;
  break;
}
else { //this was an else if and an else for error that will never happen
  stepL = Rionp;
}

至此,我运行它在profiler和pow函数下脱颖而出

用方形和立方体替换 pow 获得 61 秒(总计 运行 13.2 秒)

只需将 pow(x,2.) 替换为 square(x) 并将 pow(x,3.) 替换为 cube(x) 即可将 运行 时间减少约 600%

double square(double d)
{
  return d*d;
}
double cube(double d)
{
  return d*d*d;
}

现在每次改进都会减少很多增益,但仍然如此。

去除多余的sqrt增益(总计运行 12.9 s)

 double ionpDist = square(x1_ionp[m]-x1_ionp[ionpCounter])+square(x2_ionp[m]-x2_ionp[ionpCounter])+square(x3_ionp[m]-x3_ionp[ionpCounter]);
 //cout<<"spmDist: "<<spmDist<<endl;
 if((j>0) && (ionpDist <= 4*square_Rionp)){

引入常量变量square_Rionp和cube_Rionp(共运行12.7秒)

const double square_Rionp = square(Rionp);
const double cube_Rionp = cube(Rionp);
//replaced in the code like this
if((j>0) && (ionpDist <= 4*square_Rionp)){

为 pi 引入变量(总 运行 12.6 秒)

const double Two_PI = PI*2.0;
const double FourThird_PI = PI*4.0/3.0;

删除一个(另一个)多余的else if(总运行 11.9s)

if(pIONPDist <= 0.){
  cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
  hit_ionp = true;
}
else { //this was an else if without any reason
  hit_ionp=false; //with this I don't have to reset flag in the end
  //calculations of Bloc for this position
  pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
...
}

再去掉一个多余的平方根(共运行11.2秒)

const double Seven_Rionp_squared =square(7*Rionp);
...
for(int k=0; k<ionpCounter; k++){
  if(square(x1_ionp[k]-x0[0])+square(x2_ionp[k]-x0[1])+square(x3_ionp[k]-x0[2]) <= Seven_Rionp_squared){
  //spm closer than 8R
  stepL = stepL_min;
  cnt_stpMin ++;
  break;
}

我没有看到更多明显的东西可以从中挤出更多的性能。进一步的优化可能需要一些思考。

我用 50 个磁性粒子和 10 个质子进行了另一次比较 运行,我发现我的版本比你的快 7 倍,而且它生成的文件完全相同。

我会在源代码管理的帮助下完成这个练习。

您的代码可以简单地并行化,但我会在您优化单线程版本后才走这条路。

编辑

用=运算符改变+=(共运行 6.23秒)

我注意到 += 运算符被无缘无故地使用,替换为运算符 = 是一个巨大的收获:

cosPhase_S[stepCounter] = cos(phase);
sinPhase_S[stepCounter] = sin(phase);