cython 中的嵌套循环优化:是否有更快的方法来设置此代码示例?
Nested loop optimization in cython: is there a faster way to setup this code example?
作为一大段代码的一部分,我需要使用不同的参数调用(简化的)函数 example
(粘贴在下面)多次(数十万次)。因此,我需要此模块快速 运行。
模块的主要问题似乎是多重嵌套循环。但是,我不确定这些循环是否真的有不必要的开销(如所写),或者代码是否真的尽可能快。
一般来说,在cython中处理多个嵌套的for循环时,是否有循环优化技术可以用来减少开销并加快代码速度?这些技术是否适用于下面粘贴的示例代码?
我也在用 extra_compile_args=["-ffast-math",'-O3']
编译 cython,虽然这似乎并没有太大的区别。
如果这段代码真的不能在 cython 中变得更快(我希望不是这种情况),用 C 或 Fortran 编写这个模块的全部或部分是否有任何优势?
import numpy as np
import math
cimport numpy as np
cimport cython
DTYPE = np.float
ctypedef np.float_t DTYPE_t
cdef extern from "math.h":
double log(double x) nogil
double exp(double x) nogil
double pow(double x, double y) nogil
def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
cdef int k_max = np.max(data) + 1
cdef double A = np.float(theta[0])
cdef double n1 = np.float(theta[1])
cdef double n2 = np.float(theta[2])
cdef double Sb = np.float(theta[3])
cdef int npixROI = len(xbg_PSF_compressed)
cdef double f2 = 0.0
cdef double df_rho_div_f2 = 0.0
cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)
cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef Py_ssize_t f_index, p, k, n
#calculations for PS
cdef int do_half = 0
cdef double term1 = 0.0
cdef double term2 = 0.0
cdef double second_2_a = 0.0
cdef double second_2_b = 0.0
cdef double second_2_c = 0.0
cdef double second_2_d = 0.0
cdef double second_1_a = 0.0
cdef double second_1_b = 0.0
cdef double second_1_c = 0.0
cdef double second_1_d = 0.0
for f_index in range(len(f_ary)):
f2 = f_ary[f_index]
df_rho_div_f2 = df_rho_div_f_ary[f_index]
g1_ary_f = np.random.random(k_max+1)
g2_ary_f = np.random.random(k_max+1)
term1 = (A * Sb * f2) \
* (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
- (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
second_1_a = A * pow(Sb * f2, n1)
second_1_b = A * pow(Sb * f2, n2)
for p in range(npixROI):
x_m_sum_f[p] = term1 * PS_dist_compressed[p]
x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]
second_1_c = second_1_a * PS_dist_compressed[p]
second_1_d = second_1_b * PS_dist_compressed[p]
for k in range(data[p]+1):
x_m_ary_f[k,p] = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]
cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE)
cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE)
cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)
cdef double ll = 0.
for p in range(npixROI):
f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
nu_mat[0,p] = exp(f0_ary[p])
nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]
for k in range(2,data[p]+1):
for n in range(0, k - 1):
nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
ll+=log( nu_mat[data[p],p])
if math.isnan(ll) ==True or math.isinf(ll) ==True:
ll = -10.1**10.
return ll
作为参考,当尝试 运行 这段代码时,示例参数是
f_ary=np.array([ 0.05, 0.15, 0.25 , 0.35 , 0.45 ,0.55 , 0.65 , 0.75, 0.85 , 0.95])
df_rho_div_f_ary = np.array([ 24.27277928, 2.83852471 , 1.14224844 , 0.61687863 , 0.39948536,
0.30138642 , 0.24715899 , 0.22077999 , 0.21594814 , 0.19035121])
theta=[.002, 3.01,0.01, 10.013]
n_p=1000
data= np.random.randint(1,400,n_p).astype(dtype='int32')
k_max=int(np.max(data))+1
xbg_PSF_compressed= np.ones(n_p)*20
PS_dist_compressed= np.ones(n_p)
然后该示例可以称为 example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed)
。对于计时,我发现这个例子在 ~10 loops, best of 3: 147 ms per loop
中计算。由于完整代码需要几个小时才能完成 运行,因此 运行 时间的任何减少都会对整体性能产生很大的影响。
在您的代码上调用 cython -a
表明几乎所有相关部分 运行 都是纯 C 语言,因此这里没有太多收获。
不过,您还是过度使用了数组,其中一个标量就足够了。或者当一维数组就足够时,您正在使用矩阵。进行此优化会消除大量内存访问,如下所示:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
cdef int k_max = np.max(data) + 1
cdef double A = np.float(theta[0])
cdef double n1 = np.float(theta[1])
cdef double n2 = np.float(theta[2])
cdef double Sb = np.float(theta[3])
cdef int npixROI = len(xbg_PSF_compressed)
cdef double f2 = 0.0
cdef double df_rho_div_f2 = 0.0
cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
cdef double x_m_ary_f
cdef double x_m_sum_f
cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef Py_ssize_t f_index, p, k, n
#calculations for PS
cdef int do_half = 0
cdef double term1 = 0.0
cdef double term2 = 0.0
cdef double second_2_a = 0.0
cdef double second_2_b = 0.0
cdef double second_2_c = 0.0
cdef double second_2_d = 0.0
cdef double second_1_a = 0.0
cdef double second_1_b = 0.0
cdef double second_1_c = 0.0
cdef double second_1_d = 0.0
for f_index in range(len(f_ary)):
f2 = f_ary[f_index]
df_rho_div_f2 = df_rho_div_f_ary[f_index]
g1_ary_f = np.random.random(k_max+1)
g2_ary_f = np.random.random(k_max+1)
term1 = (A * Sb * f2) \
* (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
- (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
second_1_a = A * pow(Sb * f2, n1)
second_1_b = A * pow(Sb * f2, n2)
for p in range(npixROI):
x_m_sum_f = term1 * PS_dist_compressed[p]
x_m_sum[p] += df_rho_div_f2*x_m_sum_f
second_1_c = second_1_a * PS_dist_compressed[p]
second_1_d = second_1_b * PS_dist_compressed[p]
for k in range(data[p]+1):
x_m_ary_f = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f
cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)
cdef double f0_ary
cdef double f1_ary
cdef double[:] nu_mat = np.zeros((k_max+1), dtype=DTYPE)
cdef double ll = 0.
for p in range(npixROI):
f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
nu_mat[0] = exp(f0_ary)
nu_mat[1] = nu_mat[0] * f1_ary
for k in range(2,data[p]+1):
for n in range(0, k - 1):
nu_mat[k] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
nu_mat[k] += f1_ary * nu_mat[k-1] / float(k)
ll+=log( nu_mat[data[p]])
if math.isnan(ll) or math.isinf(ll):
ll = -10.1**10.
return ll
运行 您在此版本上的基准测试结果:
>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
10 loops, best of 3: 74.1 ms per loop
当原始代码 运行ning 慢得多时:
>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop
作为一大段代码的一部分,我需要使用不同的参数调用(简化的)函数 example
(粘贴在下面)多次(数十万次)。因此,我需要此模块快速 运行。
模块的主要问题似乎是多重嵌套循环。但是,我不确定这些循环是否真的有不必要的开销(如所写),或者代码是否真的尽可能快。
一般来说,在cython中处理多个嵌套的for循环时,是否有循环优化技术可以用来减少开销并加快代码速度?这些技术是否适用于下面粘贴的示例代码?
我也在用 extra_compile_args=["-ffast-math",'-O3']
编译 cython,虽然这似乎并没有太大的区别。
如果这段代码真的不能在 cython 中变得更快(我希望不是这种情况),用 C 或 Fortran 编写这个模块的全部或部分是否有任何优势?
import numpy as np
import math
cimport numpy as np
cimport cython
DTYPE = np.float
ctypedef np.float_t DTYPE_t
cdef extern from "math.h":
double log(double x) nogil
double exp(double x) nogil
double pow(double x, double y) nogil
def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
cdef int k_max = np.max(data) + 1
cdef double A = np.float(theta[0])
cdef double n1 = np.float(theta[1])
cdef double n2 = np.float(theta[2])
cdef double Sb = np.float(theta[3])
cdef int npixROI = len(xbg_PSF_compressed)
cdef double f2 = 0.0
cdef double df_rho_div_f2 = 0.0
cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)
cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef Py_ssize_t f_index, p, k, n
#calculations for PS
cdef int do_half = 0
cdef double term1 = 0.0
cdef double term2 = 0.0
cdef double second_2_a = 0.0
cdef double second_2_b = 0.0
cdef double second_2_c = 0.0
cdef double second_2_d = 0.0
cdef double second_1_a = 0.0
cdef double second_1_b = 0.0
cdef double second_1_c = 0.0
cdef double second_1_d = 0.0
for f_index in range(len(f_ary)):
f2 = f_ary[f_index]
df_rho_div_f2 = df_rho_div_f_ary[f_index]
g1_ary_f = np.random.random(k_max+1)
g2_ary_f = np.random.random(k_max+1)
term1 = (A * Sb * f2) \
* (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
- (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
second_1_a = A * pow(Sb * f2, n1)
second_1_b = A * pow(Sb * f2, n2)
for p in range(npixROI):
x_m_sum_f[p] = term1 * PS_dist_compressed[p]
x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]
second_1_c = second_1_a * PS_dist_compressed[p]
second_1_d = second_1_b * PS_dist_compressed[p]
for k in range(data[p]+1):
x_m_ary_f[k,p] = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]
cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE)
cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE)
cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)
cdef double ll = 0.
for p in range(npixROI):
f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
nu_mat[0,p] = exp(f0_ary[p])
nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]
for k in range(2,data[p]+1):
for n in range(0, k - 1):
nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
ll+=log( nu_mat[data[p],p])
if math.isnan(ll) ==True or math.isinf(ll) ==True:
ll = -10.1**10.
return ll
作为参考,当尝试 运行 这段代码时,示例参数是
f_ary=np.array([ 0.05, 0.15, 0.25 , 0.35 , 0.45 ,0.55 , 0.65 , 0.75, 0.85 , 0.95])
df_rho_div_f_ary = np.array([ 24.27277928, 2.83852471 , 1.14224844 , 0.61687863 , 0.39948536,
0.30138642 , 0.24715899 , 0.22077999 , 0.21594814 , 0.19035121])
theta=[.002, 3.01,0.01, 10.013]
n_p=1000
data= np.random.randint(1,400,n_p).astype(dtype='int32')
k_max=int(np.max(data))+1
xbg_PSF_compressed= np.ones(n_p)*20
PS_dist_compressed= np.ones(n_p)
然后该示例可以称为 example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed)
。对于计时,我发现这个例子在 ~10 loops, best of 3: 147 ms per loop
中计算。由于完整代码需要几个小时才能完成 运行,因此 运行 时间的任何减少都会对整体性能产生很大的影响。
在您的代码上调用 cython -a
表明几乎所有相关部分 运行 都是纯 C 语言,因此这里没有太多收获。
不过,您还是过度使用了数组,其中一个标量就足够了。或者当一维数组就足够时,您正在使用矩阵。进行此优化会消除大量内存访问,如下所示:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
cdef int k_max = np.max(data) + 1
cdef double A = np.float(theta[0])
cdef double n1 = np.float(theta[1])
cdef double n2 = np.float(theta[2])
cdef double Sb = np.float(theta[3])
cdef int npixROI = len(xbg_PSF_compressed)
cdef double f2 = 0.0
cdef double df_rho_div_f2 = 0.0
cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
cdef double x_m_ary_f
cdef double x_m_sum_f
cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef Py_ssize_t f_index, p, k, n
#calculations for PS
cdef int do_half = 0
cdef double term1 = 0.0
cdef double term2 = 0.0
cdef double second_2_a = 0.0
cdef double second_2_b = 0.0
cdef double second_2_c = 0.0
cdef double second_2_d = 0.0
cdef double second_1_a = 0.0
cdef double second_1_b = 0.0
cdef double second_1_c = 0.0
cdef double second_1_d = 0.0
for f_index in range(len(f_ary)):
f2 = f_ary[f_index]
df_rho_div_f2 = df_rho_div_f_ary[f_index]
g1_ary_f = np.random.random(k_max+1)
g2_ary_f = np.random.random(k_max+1)
term1 = (A * Sb * f2) \
* (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
- (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
second_1_a = A * pow(Sb * f2, n1)
second_1_b = A * pow(Sb * f2, n2)
for p in range(npixROI):
x_m_sum_f = term1 * PS_dist_compressed[p]
x_m_sum[p] += df_rho_div_f2*x_m_sum_f
second_1_c = second_1_a * PS_dist_compressed[p]
second_1_d = second_1_b * PS_dist_compressed[p]
for k in range(data[p]+1):
x_m_ary_f = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f
cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)
cdef double f0_ary
cdef double f1_ary
cdef double[:] nu_mat = np.zeros((k_max+1), dtype=DTYPE)
cdef double ll = 0.
for p in range(npixROI):
f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
nu_mat[0] = exp(f0_ary)
nu_mat[1] = nu_mat[0] * f1_ary
for k in range(2,data[p]+1):
for n in range(0, k - 1):
nu_mat[k] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
nu_mat[k] += f1_ary * nu_mat[k-1] / float(k)
ll+=log( nu_mat[data[p]])
if math.isnan(ll) or math.isinf(ll):
ll = -10.1**10.
return ll
运行 您在此版本上的基准测试结果:
>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
10 loops, best of 3: 74.1 ms per loop
当原始代码 运行ning 慢得多时:
>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop