有什么好的方法可以优化这段python代码的速度吗?
Is there any good way to optimize the speed of this python code?
我有以下一段代码,它主要计算一些数值表达式,并用它来积分一定范围的值。当前的代码 运行s 在大约 8.6 s
以内,但我只是使用模拟值,而我的实际数组要大得多。特别是,我的实际大小 freq_c= (3800, 101)
和大小 number_bin = (3800, 100)
,这使得下面的代码非常低效,因为实际数组的总执行时间将接近 9 分钟。代码中相当慢的一部分是 k_one_third
和 k_two_third
的计算,为此我也使用了 numexpr.evaluate("..")
,这使代码速度提高了大约 10-20% .但是,我避免了下面的 numexpr
,这样任何人都可以 运行 它而无需安装软件包。还有其他方法可以提高这段代码的速度吗?几个因素的改进也足够好。请注意 for loop
几乎是不可避免的,由于内存问题,因为数组非常大,我通过循环一次操纵每个轴。我也想知道这里是否可以进行 numba jit
优化。
import numpy as np
import scipy
from scipy.integrate import simps as simps
import time
def k_one_third(x):
return (2.*np.exp(-x**2)/x**(1/3) + 4./x**(1/6)*np.exp(-x)/(1+x**(1/3)))**2
def k_two_third(x):
return (np.exp(-x**2)/x**(2/3) + 2.*x**(5/2)*np.exp(-x)/(6.+x**3))**2
def spectrum(freq_c, number_bin, frequency, gamma, theta):
theta_gamma_factor = np.einsum('i,j->ij', theta**2, gamma**2)
theta_gamma_factor += 1.
t_g_bessel_factor = 1.-1./theta_gamma_factor
number = np.concatenate((number_bin, np.zeros((number_bin.shape[0], 1), dtype=number_bin.dtype)), axis=1)
number_theta_gamma = np.einsum('jk, ik->ijk', theta_gamma_factor**2*1./gamma**3, number)
final = np.zeros((np.size(freq_c[:,0]), np.size(theta), np.size(frequency)))
for i in xrange(np.size(frequency)):
b_n_omega_theta_gamma = frequency[i]**2*number_theta_gamma
eta = theta_gamma_factor**(1.5)*frequency[i]/2.
eta = np.einsum('jk, ik->ijk', eta, 1./freq_c)
bessel_eta = np.einsum('jl, ijl->ijl',t_g_bessel_factor, k_one_third(eta))
bessel_eta += k_two_third(eta)
eta = None
integrand = np.multiply(bessel_eta, b_n_omega_theta_gamma, out= bessel_eta)
final[:,:, i] = simps(integrand, gamma)
integrand = None
return final
frequency = np.linspace(1, 100, 100)
theta = np.linspace(1, 3, 100)
gamma = np.linspace(2, 200, 101)
freq_c = np.random.randint(1, 200, size=(50, 101))
number_bin = np.random.randint(1, 100, size=(50, 100))
time1 = time.time()
spectra = spectrum(freq_c, number_bin, frequency, gamma, theta)
print(time.time()-time1)
我分析了代码,发现 k_one_third()
和 k_two_third()
很慢。两个函数中有一些重复的计算。
通过将两个函数合并为一个函数,并用 @numba.jit(parallel=True)
修饰它,我得到了 4 倍的加速。
@jit(parallel=True)
def k_one_two_third(x):
x0 = x ** (1/3)
x1 = np.exp(-x ** 2)
x2 = np.exp(-x)
one = (2*x1/x0 + 4*x2/(x**(1/6)*(x0 + 1)))**2
two = (2*x**(5/2)*x2/(x**3 + 6) + x1/x**(2/3))**2
return one, two
如评论中所述,应重写大部分代码以获得最佳性能。
我只修改了 simpson 集成和修改了@HYRY 的答案。根据您提供的测试数据,这可以加快从 26.15s 到 1.76s (15x) 的计算速度。通过用简单的循环替换 np.einsums 这应该会在不到一秒的时间内结束。 (改进集成大约 0.4 秒,k_one_two_third(x)
大约 24 秒)
使用 Numba 获得性能 read。最新的 Numba 版本 (0.39)、英特尔 SVML 包和诸如 fastmath=True 之类的东西对您的示例产生了相当大的影响。
代码
#a bit faster than HYRY's version
@nb.njit(parallel=True,fastmath=True,error_model='numpy')
def k_one_two_third(x):
one=np.empty(x.shape,dtype=x.dtype)
two=np.empty(x.shape,dtype=x.dtype)
for i in nb.prange(x.shape[0]):
for j in range(x.shape[1]):
for k in range(x.shape[2]):
x0 = x[i,j,k] ** (1/3)
x1 = np.exp(-x[i,j,k] ** 2)
x2 = np.exp(-x[i,j,k])
one[i,j,k] = (2*x1/x0 + 4*x2/(x[i,j,k]**(1/6)*(x0 + 1)))**2
two[i,j,k] = (2*x[i,j,k]**(5/2)*x2/(x[i,j,k]**3 + 6) + x1/x[i,j,k]**(2/3))**2
return one, two
#improved integration
@nb.njit(fastmath=True)
def simpson_nb(y_in,dx):
s = y[0]+y[-1]
n=y.shape[0]//2
for i in range(n-1):
s += 4.*y[i*2+1]
s += 2.*y[i*2+2]
s += 4*y[(n-1)*2+1]
return(dx/ 3.)*s
@nb.jit(fastmath=True)
def spectrum(freq_c, number_bin, frequency, gamma, theta):
theta_gamma_factor = np.einsum('i,j->ij', theta**2, gamma**2)
theta_gamma_factor += 1.
t_g_bessel_factor = 1.-1./theta_gamma_factor
number = np.concatenate((number_bin, np.zeros((number_bin.shape[0], 1), dtype=number_bin.dtype)), axis=1)
number_theta_gamma = np.einsum('jk, ik->ijk', theta_gamma_factor**2*1./gamma**3, number)
final = np.empty((np.size(frequency),np.size(freq_c[:,0]), np.size(theta)))
#assume that dx is const. on integration
#speedimprovement of the scipy.simps is about 4x
#numba version to scipy.simps(y,x) is about 60x
dx=gamma[1]-gamma[0]
for i in range(np.size(frequency)):
b_n_omega_theta_gamma = frequency[i]**2*number_theta_gamma
eta = theta_gamma_factor**(1.5)*frequency[i]/2.
eta = np.einsum('jk, ik->ijk', eta, 1./freq_c)
one,two=k_one_two_third(eta)
bessel_eta = np.einsum('jl, ijl->ijl',t_g_bessel_factor, one)
bessel_eta += two
integrand = np.multiply(bessel_eta, b_n_omega_theta_gamma, out= bessel_eta)
#reorder array
for j in range(integrand.shape[0]):
for k in range(integrand.shape[1]):
final[i,j, k] = simpson_nb(integrand[j,k,:],dx)
return final
我有以下一段代码,它主要计算一些数值表达式,并用它来积分一定范围的值。当前的代码 运行s 在大约 8.6 s
以内,但我只是使用模拟值,而我的实际数组要大得多。特别是,我的实际大小 freq_c= (3800, 101)
和大小 number_bin = (3800, 100)
,这使得下面的代码非常低效,因为实际数组的总执行时间将接近 9 分钟。代码中相当慢的一部分是 k_one_third
和 k_two_third
的计算,为此我也使用了 numexpr.evaluate("..")
,这使代码速度提高了大约 10-20% .但是,我避免了下面的 numexpr
,这样任何人都可以 运行 它而无需安装软件包。还有其他方法可以提高这段代码的速度吗?几个因素的改进也足够好。请注意 for loop
几乎是不可避免的,由于内存问题,因为数组非常大,我通过循环一次操纵每个轴。我也想知道这里是否可以进行 numba jit
优化。
import numpy as np
import scipy
from scipy.integrate import simps as simps
import time
def k_one_third(x):
return (2.*np.exp(-x**2)/x**(1/3) + 4./x**(1/6)*np.exp(-x)/(1+x**(1/3)))**2
def k_two_third(x):
return (np.exp(-x**2)/x**(2/3) + 2.*x**(5/2)*np.exp(-x)/(6.+x**3))**2
def spectrum(freq_c, number_bin, frequency, gamma, theta):
theta_gamma_factor = np.einsum('i,j->ij', theta**2, gamma**2)
theta_gamma_factor += 1.
t_g_bessel_factor = 1.-1./theta_gamma_factor
number = np.concatenate((number_bin, np.zeros((number_bin.shape[0], 1), dtype=number_bin.dtype)), axis=1)
number_theta_gamma = np.einsum('jk, ik->ijk', theta_gamma_factor**2*1./gamma**3, number)
final = np.zeros((np.size(freq_c[:,0]), np.size(theta), np.size(frequency)))
for i in xrange(np.size(frequency)):
b_n_omega_theta_gamma = frequency[i]**2*number_theta_gamma
eta = theta_gamma_factor**(1.5)*frequency[i]/2.
eta = np.einsum('jk, ik->ijk', eta, 1./freq_c)
bessel_eta = np.einsum('jl, ijl->ijl',t_g_bessel_factor, k_one_third(eta))
bessel_eta += k_two_third(eta)
eta = None
integrand = np.multiply(bessel_eta, b_n_omega_theta_gamma, out= bessel_eta)
final[:,:, i] = simps(integrand, gamma)
integrand = None
return final
frequency = np.linspace(1, 100, 100)
theta = np.linspace(1, 3, 100)
gamma = np.linspace(2, 200, 101)
freq_c = np.random.randint(1, 200, size=(50, 101))
number_bin = np.random.randint(1, 100, size=(50, 100))
time1 = time.time()
spectra = spectrum(freq_c, number_bin, frequency, gamma, theta)
print(time.time()-time1)
我分析了代码,发现 k_one_third()
和 k_two_third()
很慢。两个函数中有一些重复的计算。
通过将两个函数合并为一个函数,并用 @numba.jit(parallel=True)
修饰它,我得到了 4 倍的加速。
@jit(parallel=True)
def k_one_two_third(x):
x0 = x ** (1/3)
x1 = np.exp(-x ** 2)
x2 = np.exp(-x)
one = (2*x1/x0 + 4*x2/(x**(1/6)*(x0 + 1)))**2
two = (2*x**(5/2)*x2/(x**3 + 6) + x1/x**(2/3))**2
return one, two
如评论中所述,应重写大部分代码以获得最佳性能。
我只修改了 simpson 集成和修改了@HYRY 的答案。根据您提供的测试数据,这可以加快从 26.15s 到 1.76s (15x) 的计算速度。通过用简单的循环替换 np.einsums 这应该会在不到一秒的时间内结束。 (改进集成大约 0.4 秒,k_one_two_third(x)
大约 24 秒)
使用 Numba 获得性能 read。最新的 Numba 版本 (0.39)、英特尔 SVML 包和诸如 fastmath=True 之类的东西对您的示例产生了相当大的影响。
代码
#a bit faster than HYRY's version
@nb.njit(parallel=True,fastmath=True,error_model='numpy')
def k_one_two_third(x):
one=np.empty(x.shape,dtype=x.dtype)
two=np.empty(x.shape,dtype=x.dtype)
for i in nb.prange(x.shape[0]):
for j in range(x.shape[1]):
for k in range(x.shape[2]):
x0 = x[i,j,k] ** (1/3)
x1 = np.exp(-x[i,j,k] ** 2)
x2 = np.exp(-x[i,j,k])
one[i,j,k] = (2*x1/x0 + 4*x2/(x[i,j,k]**(1/6)*(x0 + 1)))**2
two[i,j,k] = (2*x[i,j,k]**(5/2)*x2/(x[i,j,k]**3 + 6) + x1/x[i,j,k]**(2/3))**2
return one, two
#improved integration
@nb.njit(fastmath=True)
def simpson_nb(y_in,dx):
s = y[0]+y[-1]
n=y.shape[0]//2
for i in range(n-1):
s += 4.*y[i*2+1]
s += 2.*y[i*2+2]
s += 4*y[(n-1)*2+1]
return(dx/ 3.)*s
@nb.jit(fastmath=True)
def spectrum(freq_c, number_bin, frequency, gamma, theta):
theta_gamma_factor = np.einsum('i,j->ij', theta**2, gamma**2)
theta_gamma_factor += 1.
t_g_bessel_factor = 1.-1./theta_gamma_factor
number = np.concatenate((number_bin, np.zeros((number_bin.shape[0], 1), dtype=number_bin.dtype)), axis=1)
number_theta_gamma = np.einsum('jk, ik->ijk', theta_gamma_factor**2*1./gamma**3, number)
final = np.empty((np.size(frequency),np.size(freq_c[:,0]), np.size(theta)))
#assume that dx is const. on integration
#speedimprovement of the scipy.simps is about 4x
#numba version to scipy.simps(y,x) is about 60x
dx=gamma[1]-gamma[0]
for i in range(np.size(frequency)):
b_n_omega_theta_gamma = frequency[i]**2*number_theta_gamma
eta = theta_gamma_factor**(1.5)*frequency[i]/2.
eta = np.einsum('jk, ik->ijk', eta, 1./freq_c)
one,two=k_one_two_third(eta)
bessel_eta = np.einsum('jl, ijl->ijl',t_g_bessel_factor, one)
bessel_eta += two
integrand = np.multiply(bessel_eta, b_n_omega_theta_gamma, out= bessel_eta)
#reorder array
for j in range(integrand.shape[0]):
for k in range(integrand.shape[1]):
final[i,j, k] = simpson_nb(integrand[j,k,:],dx)
return final