优化独立 N 体模拟的方法

Ways to Optimize Independent N-Body Simulations

我对并行计算和 Numba 包比较陌生。我正在为我的极其并行的 N 体模拟寻找优化方法。到目前为止,我已经将我所知道的一切应用于 Numpy 数组、JIT 编译器和多处理。但是,我仍然没有达到我想要的速度(我看过他们的代码仍然快得多的视频)

我目前拥有的是一个相当简单的 python 积分器,它使用 Runge-Kutta 积分和两个运动方程。我在我的领域中经常与数值积分器合作,所以我肯定想从你们那里学到更多技巧。

我已经在下面发布了我的代码,但本质上,我有一个名为“Motion”的主要函数,它需要 2 个初始条件并在一定时间内整合它们的运动。我已经 JITTED 这个函数和它迭代调用的所有函数:“RK4”、“ODE”、“电场”。最后,我从 Multiprocessing 中调用 pool 函数来并行化“Motion”函数,并为其运行的每个模拟插入不同的初始条件。

同样,我已经实施了我所知道的所有类型的优化,但我仍然对其速度不太满意。我已经在下面发布了我的代码。如果有人能发现一个可以进一步优化的算法,那将是非常有帮助和教育意义的(至少对我而言)!谢谢你的时间。

import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange
from time import time
from tqdm import tqdm
import multiprocessing as mp
from IPython.display import clear_output
from scipy import interpolate


"Electric Field Information"
A = np.float32(1.00E-04)
N_waves = np.int(19)
frequency =  np.linspace(37.5,46.5,N_waves)*1e-3 #Set of frequencies used for Electric Field 
m = np.int(20) #Azimuthal Wave Number
sigma = np.float32(0.5) #Gaussian Width of E wave in L
zeta = np.float32(1)

"Particle Information"
N_Particles = np.int(10000)
q = np.float32(-1) #Charge of electron
mass = np.float32(0.511e6) #Mass of Proton eV/c^2
FirstAdiabatic = np.float32(2000e10) #MeV/Gauss Adiabatic Invariant


"Runge-Kutta Paramters"
Total_Time = np.float32(10) #hours
Step_Size = np.float32(0.2) #second 
Plot_Time = np.float32(60) #seconds
time_array = np.arange(0, Total_Time*3600+Step_Size, Step_Size) #Convert to seconds and Add End Point
N_points = len(time_array)

Skip_How_Many = int(Plot_Time/Step_Size) #Used to shorten our data set and save RAM

"Constants"    
Beq = np.float64(31221.60592e-9) #nT
Re = np.float32(6371e3) #Meters
c = np.float32(2.998e8) #m/s

"Start Electric Field Code"
def wave_peak(omega): #Called once so no need to JIT or Optimize this
    L_sample = np.linspace(1,10,100) 
    phidot = -3*FirstAdiabatic / (q* (L_sample*Re)**2 * np.sqrt(1+ (2*FirstAdiabatic*Beq/ (mass*L_sample**3)) ) )
    phidot_to_L = interpolate.interp1d(phidot,L_sample, kind = 'cubic')
    L0i = phidot_to_L(omega/m)
    return L0i 
omega = 2*np.pi*frequency
L0i_wave = wave_peak(omega)
Phi0i_wave = np.linspace(0,2*np.pi,N_waves)
np.random.shuffle(Phi0i_wave)

@njit(nogil= True)
def Electric_Field(t,r):
    E0 = A*np.exp(-(r[0]-L0i_wave)**2 / (2*sigma**2))
    Delta = np.arctan2( (r[0] * (r[0]-L0i_wave)/sigma**2 - 1), (2*np.pi*r[0]/zeta) )
    Er = E0/m * np.sqrt( (2*np.pi*r[0]/zeta)**2 + (r[0]*(r[0]-L0i_wave)/sigma**2 -1)**2 ) * np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta + Delta)
    Ephi = E0*np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta)
    return np.sum(Er),np.sum(Ephi)
"End of Electric Field Code"

"Particle's ODE - Equation of Motion"
@njit(nogil= True)
def ODE(t,r):
    Er, Ephi = Electric_Field(t,r) #Pull out the electric so we only call it once.
    Ldot = Ephi * r[0]**3 / (Re*Beq)
    Phidot = -Er * r[0]**2 / (Re*Beq) - 3* FirstAdiabatic / (q*r[0]**2*Re**2) * 1/np.sqrt(2*FirstAdiabatic*Beq/ (r[0]**3*mass) + 1)
    return np.array([Ldot,Phidot])

@njit(nogil= True)
def RK4(t,r): #Standard Runge-Kutta Integration Algorthim
    k1 = Step_Size*ODE(t,r)
    k2 = Step_Size*ODE(t+Step_Size/2, r+k1/2)
    k3 = Step_Size*ODE(t+Step_Size/2, r+k2)
    k4 = Step_Size*ODE(t+Step_Size, r+k3)
    return r + k1/6 + k2/3 + k3/3 + k4/6

@njit(nogil= True)
def Motion(L0,Phi0): #Insert Inital Conditions and it will loop through the RK4 integrator and output all its positions.
    L_Array = np.zeros_like(time_array)
    Phi_Array = np.zeros_like(time_array)
    
    L_Array[0] = L0
    Phi_Array[0] = Phi0
    for i in range(1,N_points):
        L_Array[i], Phi_Array[i] = RK4(time_array[i-1], np.array([ L_Array[i-1],Phi_Array[i-1] ]) )
    
    return L_Array[::Skip_How_Many], Phi_Array[::Skip_How_Many] 
    #Skip_How_Many is used to take up less RAM space since we don't need that kind of percsion in our data

# Location = Motion(5,0)
# x = Location[0]*np.cos(Location[1])
# y = Location[0]*np.sin(Location[1])
# plt.plot(x,y,"o", markersize = 0.5)
# ts = time()
# Motion(5,0)
# print('Solo Time:', time() - ts)

"Getting my Arrays ready so I can index it"
Split = int(np.sqrt(N_Particles))
L0i = np.linspace(4.4,5.5,Split)
Phi0i = np.linspace(0,360,Split) / 180 * np.pi
L0_Grid = np.repeat(L0i,Split) 
# ^Here I want to run a meshgrid of L0i and Phi0, so I repeat L0i using this function and mod (%) the index on the Phi Function


#Create Appending Array
results = []
def get_results(result): #Call back to this array from Multiprocessing to append the results it gives per run.
    results.append(result)
    clear_output()
    print("Getting Results %0.2f" %(len(results)/N_Particles * 100), end='\r')
    
if __name__ == '__main__':
    #Call In Multiprocessing
    pool = mp.Pool(mp.cpu_count()) #Counting number of threads to start
    ts = time() #Timing this process. Begins here
    for ii in range(N_Particles): #Not too sure what this does, but it works - I assume it parallelizes this loop
        pool.apply_async(Motion, args = (L0_Grid[ii],Phi0i[int(ii%Split)]), callback=get_results)
        
    pool.close() #I'm not too sure what this does but everyone uses it, and it won't work without it
    pool.join()
    print('Time in MP parallel:', time() - ts) #Output Time

我认为你的代码慢的主要原因是你的Runge-Kutta方法有固定的时间步长。花哨的 ODE 求解器将 select 最大的时间步长,允许可容忍的错误量。一个例子是来自 ODEPACK 的 LSODA ODE 求解器。

下面我使用 NumbaLSODA 重写了您的代码。在我的电脑上,它将您的代码速度提高了大约 200 倍。

import numpy as np
import matplotlib.pyplot as plt
from numba import njit, prange
from time import time
from tqdm import tqdm
import multiprocessing as mp
from scipy import interpolate

from NumbaLSODA import lsoda_sig, lsoda
from numba import cfunc
import numba as nb

"Electric Field Information"
A = np.float32(1.00E-04)
N_waves = np.int(19)
frequency =  np.linspace(37.5,46.5,N_waves)*1e-3 #Set of frequencies used for Electric Field 
m = np.int(20) #Azimuthal Wave Number
sigma = np.float32(0.5) #Gaussian Width of E wave in L
zeta = np.float32(1)

"Particle Information"
N_Particles = np.int(10000)
q = np.float32(-1) #Charge of electron
mass = np.float32(0.511e6) #Mass of Proton eV/c^2
FirstAdiabatic = np.float32(2000e10) #MeV/Gauss Adiabatic Invariant

"Runge-Kutta Paramters"
Total_Time = np.float32(10) #hours
Step_Size = np.float32(0.2) #second 
Plot_Time = np.float32(60) #seconds
time_array = np.arange(0, Total_Time*3600+Step_Size, Step_Size) #Convert to seconds and Add End Point
N_points = len(time_array)

Skip_How_Many = int(Plot_Time/Step_Size) #Used to shorten our data set and save RAM

"Constants"    
Beq = np.float64(31221.60592e-9) #nT
Re = np.float32(6371e3) #Meters
c = np.float32(2.998e8) #m/s

"Start Electric Field Code"
def wave_peak(omega): #Called once so no need to JIT or Optimize this
    L_sample = np.linspace(1,10,100) 
    phidot = -3*FirstAdiabatic / (q* (L_sample*Re)**2 * np.sqrt(1+ (2*FirstAdiabatic*Beq/ (mass*L_sample**3)) ) )
    phidot_to_L = interpolate.interp1d(phidot,L_sample, kind = 'cubic')
    L0i = phidot_to_L(omega/m)
    return L0i 
omega = 2*np.pi*frequency
L0i_wave = wave_peak(omega)
Phi0i_wave = np.linspace(0,2*np.pi,N_waves)
np.random.shuffle(Phi0i_wave)

@njit
def Electric_Field(t,r):
    E0 = A*np.exp(-(r[0]-L0i_wave)**2 / (2*sigma**2))
    Delta = np.arctan2( (r[0] * (r[0]-L0i_wave)/sigma**2 - 1), (2*np.pi*r[0]/zeta) )
    Er = E0/m * np.sqrt( (2*np.pi*r[0]/zeta)**2 + (r[0]*(r[0]-L0i_wave)/sigma**2 -1)**2 ) * np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta + Delta)
    Ephi = E0*np.cos(m*r[1] - omega*t + Phi0i_wave + 2*np.pi*r[0]/zeta)
    return np.sum(Er),np.sum(Ephi)
"End of Electric Field Code"

"Particle's ODE - Equation of Motion"
@cfunc(lsoda_sig)
def ODE(t, r_, dr, p):
    r = nb.carray(r_, (2,))
    Er, Ephi = Electric_Field(t,r)
    Ldot = Ephi * r[0]**3 / (Re*Beq)
    Phidot = -Er * r[0]**2 / (Re*Beq) - 3* FirstAdiabatic / (q*r[0]**2*Re**2) * 1/np.sqrt(2*FirstAdiabatic*Beq/ (r[0]**3*mass) + 1)
    dr[0] = Ldot
    dr[1] = Phidot
funcptr = ODE.address
    
@njit
def Motion(L0,Phi0):
    u0 = np.array([L0,Phi0],np.float64)
    data = np.array([5.0])
    usol, success = lsoda(funcptr, u0, time_array, data)
    L_Array = usol[:,0]
    Phi_Array = usol[:,1]
    return L_Array[::Skip_How_Many], Phi_Array[::Skip_How_Many]
    #Skip_How_Many is used to take up less RAM space since we don't need that kind of percsion in our data

Location = Motion(5,0)
x = Location[0]*np.cos(Location[1])
y = Location[0]*np.sin(Location[1])
plt.plot(x,y,"o", markersize = 0.5)
ts = time()
Motion(5,0)
print('Solo Time:', time() - ts)