cython 中的嵌套循环优化：是否有更快的方法来设置此代码示例？

Question

作为一大段代码的一部分，我需要使用不同的参数调用（简化的）函数 example（粘贴在下面）多次（数十万次）。因此，我需要此模块快速运行。

模块的主要问题似乎是多重嵌套循环。但是，我不确定这些循环是否真的有不必要的开销（如所写），或者代码是否真的尽可能快。

一般来说，在cython中处理多个嵌套的for循环时，是否有循环优化技术可以用来减少开销并加快代码速度？这些技术是否适用于下面粘贴的示例代码？

我也在用 extra_compile_args=["-ffast-math",'-O3'] 编译 cython，虽然这似乎并没有太大的区别。

如果这段代码真的不能在 cython 中变得更快（我希望不是这种情况），用 C 或 Fortran 编写这个模块的全部或部分是否有任何优势？

import numpy as np
import math
cimport numpy as np
cimport cython

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cdef extern from "math.h":
    double log(double x) nogil
    double exp(double x) nogil
    double pow(double x, double y) nogil


def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
    return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f[p] = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):            
                x_m_ary_f[k,p] = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE) 
    cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE) 

    cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0,p] = exp(f0_ary[p])
        nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
            nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
        ll+=log( nu_mat[data[p],p])

    if math.isnan(ll) ==True or math.isinf(ll) ==True:
        ll = -10.1**10.

    return ll

作为参考，当尝试运行这段代码时，示例参数是

f_ary=np.array([ 0.05,  0.15,  0.25 , 0.35 , 0.45  ,0.55 , 0.65 , 0.75,  0.85 , 0.95])
df_rho_div_f_ary = np.array([ 24.27277928,   2.83852471 ,  1.14224844 ,  0.61687863  , 0.39948536,
   0.30138642 ,  0.24715899 ,  0.22077999 ,  0.21594814 ,  0.19035121])
theta=[.002, 3.01,0.01, 10.013]
n_p=1000
data= np.random.randint(1,400,n_p).astype(dtype='int32')
k_max=int(np.max(data))+1
xbg_PSF_compressed= np.ones(n_p)*20 
PS_dist_compressed= np.ones(n_p)

然后该示例可以称为 example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed)。对于计时，我发现这个例子在 ~10 loops, best of 3: 147 ms per loop 中计算。由于完整代码需要几个小时才能完成运行，因此运行时间的任何减少都会对整体性能产生很大的影响。

Answer 1

在您的代码上调用 cython -a 表明几乎所有相关部分运行都是纯 C 语言，因此这里没有太多收获。

不过，您还是过度使用了数组，其中一个标量就足够了。或者当一维数组就足够时，您正在使用矩阵。进行此优化会消除大量内存访问，如下所示：

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double x_m_ary_f
    cdef double x_m_sum_f

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):
                x_m_ary_f = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double f0_ary
    cdef double f1_ary

    cdef double[:] nu_mat = np.zeros((k_max+1), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0] = exp(f0_ary)
        nu_mat[1] = nu_mat[0] * f1_ary

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
            nu_mat[k] += f1_ary * nu_mat[k-1] / float(k)
        ll+=log( nu_mat[data[p]])

    if math.isnan(ll) or math.isinf(ll):
        ll = -10.1**10.

    return ll

运行您在此版本上的基准测试结果：

>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
10 loops, best of 3: 74.1 ms per loop

当原始代码运行ning 慢得多时：

>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop

cython 中的嵌套循环优化：是否有更快的方法来设置此代码示例？

Nested loop optimization in cython: is there a faster way to setup this code example?

c

python

performance

fortran

cython