用两个数组作为 cython 中的输入向量化 class 的函数

Vectorizing a function of a class with two arrays as inputs in cython

我正在努力优化我的 cython 代码以尽可能提高其速度。我仍然无法弄清楚在 cython 中应该如何完成的挑战之一是将数组映射到函数上,就像在 numpy.vectorize 函数中所做的那样。

我的问题的简化版本是

from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
cdef class Test(object):
    cdef public double M, c, z
    cdef public double[::1] ks, zs, pos

    @cython.boundscheck(False)
    @cython.cdivision(True)
    @cython.wraparound(False)
    @cython.nonecheck(False)
    def __cinit__(self, M, c, z, pos, ks, zs=None):


        if path is None:
           raise ValueError("Could not find a path to the file which contains the table of angular diameter distances")

        self.M = M
        self.c = c
        self.z = z
        self.pos = pos

        if zs is None:
           raise ValueError("You must give an array which contains the steps where the redshift probability distribution are computed!")
        self.zs=zs
        self.ks=ks
    @cython.cdivision(True)    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef np.ndarray[double, ndim=1, mode='c'] __kappa(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
        cdef Py_ssize_t N = x.shape[0]
        cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask


        cdef np.ndarray[double, ndim=1, mode='c'] out  = np.zeros(N, dtype=np.float64 , order='C')

        mask = np.where(x < 0.999)[0]
        out[mask] = 2*ks/(x[mask]**2 - 1) * \
                (1 - np.log((1 + ((1 - x[mask])/(x[mask] + 1))**0.5)/(1 - ((1 - x[mask])/(x[mask] + 1))**0.5))/(1 - x[mask]**2)**0.5)

        mask = np.where(x > 1.001)[0]
        out[mask] = 2*ks/(x[mask]**2 - 1) * \
                (1 - 2*np.arctan(((x[mask] - 1)/(x[mask] + 1))**0.5)/(x[mask]**2 - 1)**0.5)


        mask = np.where((x >= 0.999) & (x <= 1.001))[0]
        out[mask] = ks*(22./15. - 0.8*x[mask])           

        return out

    @cython.cdivision(True)    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef np.ndarray[double, ndim=1, mode='c'] __gamma(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
        cdef Py_ssize_t N=len(x)
        cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask 
        cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')


        mask = np.where(x > 0.01)[0]
        out[mask] = 4*ks*(np.log(x[mask]/2) + 2* \
                x[mask]**(-2) - self.__kappa(x[mask], ks)


        mask = np.where(x <= 0.01)[0]
        out[mask] = 4*ks*(0.25 + 0.125 * x[mask]**2 * (3.25 + 3.0*np.log(x[mask]/2)))

        return out

    cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
        # Convert to numpy arrays for internal usage:
        cdef np.ndarray[double, ndim=1, mode='c'] g, kappa, r, ks, wg
        cdef np.ndarray[double, ndim=1, mode='c'] pos_x, pos_y 
        if not gpos[:,0].flags.c_contiguous:
           pos_x = gpos[:,0].copy(order='C')
        else:
           pos_x = gpos[:,0]
        if not gpos[:,1].flags.c_contiguous:
           pos_y = gpos[:,1].copy(order='C')
        else:
           pos_y = gpos[:,1]
        cdef Py_ssize_t i, mask, N

        r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5

        ks  = np.ascontiguousarray(self.ks)
        N   = len(ks)
        mask= np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]

        wg  = np.zeros(len(r), dtype=np.float64 , order='C')

        for i from N > i >= 0:  
            g = self.__gamma(r, ks[i])

            kappa = self.__kappa(r, ks[i])
            g /= 1 - kappa
            wg+=g*pdf_z[:,mask+i]

        cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
        dx = pos_x - self.halo_pos[0]
        dy = pos_y - self.halo_pos[1]
        drsq = dx*dx+dy*dy
        drsq[drsq==0.] = 1. # Avoid division by 0
        cos2phi = (dx*dx-dy*dy)/drsq
        sin2phi = 2*dx*dy/drsq
        g1 = -wg*cos2phi
        g2 = -wg*sin2phi

        return g1, g2

我想知道是否有一种方法可以通过 ks 数组对 Test class 的 getSh 方法进行矢量化,并避免使用循环让我的代码更快?

我认为 'vectorize' 不适用于 cython。在 numpy 中,您可以使用快速编译代码进行矢量化,如 +*sum 等操作。有一个 np.vectorize 函数,但它只是将您的代码包装在一个理解广播和多维数组的迭代器中。它不会重写您的函数,也不会加快速度。

Cython用于加速现有编译向量运算中无法表达的numpy代码。它通过将这些编译为快速 C 迭代来获得速度。

从表面上看,getSh 中的 i 循环看起来很快(c 风格),但它调用了 self.__kappaself.__gamma。两者都加载了 np 调用 - np.arraynp.wherenp.log 等。通过这些调用,您无法获得纯 c代码。

您需要关注这两个方法,将它们表示为对数字的简单操作,并根据需要显式迭代 c 样式。

如果您可以将整个数组 ks 传递给方法 self.__gamma()self.__kappa(),那么您的代码的矢量化将完成,从而避免每次循环迭代的函数调用开销因为您会将循环移动到最内部调用的方法。

还有一些其他技巧可以让您获得额外的表现:

  • 只在循环外计算一次掩码,因为它们与数组有关r
  • mask = x > 0.01 而不是 mask = np.where(x > 0.01)[0] 和类似的
  • 重复使用 out 数组,因为它的长度总是=N

编辑: 将上述想法付诸实践后,我想出了以下解决方案,其中方法 __kappa()__gamma() 不再是必需的。虽然没有经过测试:

cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
    # Convert to numpy arrays for internal usage:
    cdef np.ndarray[double, ndim=1] r, ks, wg
    cdef np.ndarray[double, ndim=1] pos_x, pos_y
    cdef np.ndarray[double, ndim=2, mode='c'] gamma, kappa, wgtmp

    if not gpos[:,0].flags.c_contiguous:
       pos_x = gpos[:,0].copy(order='C')
    else:
       pos_x = gpos[:,0]
    if not gpos[:,1].flags.c_contiguous:
       pos_y = gpos[:,1].copy(order='C')
    else:
       pos_y = gpos[:,1]
    cdef Py_ssize_t i, mask, N

    r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5

    m1 = r > 0.01
    m2 = ~m1
    r1 = r[m1]
    r2 = r[m2]

    ma = r < 0.999
    mb = (r >= 0.999) & (r <= 1.001)
    mc = r > 1.001
    ra = r[ma]
    rb = r[mb]
    rc = r[mc]

    ks  = np.ascontiguousarray(self.ks)
    one = np.ones_like(ks)
    N = len(ks)
    P = len(r)

    kappa = np.zeros((P, N), dtype=np.float64 , order='C')
    gamma = np.zeros((P, N), dtype=np.float64 , order='C')
    wgtmp = np.zeros((P, N), dtype=np.float64 , order='C')
    wg = np.zeros((P,), dtype=np.float64)

    kappa[ma] = (2*ks/(ra**2 - 1)[:, None] *
                 one*(1 - np.log((1 + ((1 - ra)/(ra + 1))**0.5)/(1 - ((1 -
                      ra)/(ra + 1))**0.5))/(1 - ra**2)**0.5)[:, None])

    kappa[mb] = ks*(22./15. - 0.8*rb)[:, None]

    kappa[mc] = (2*ks/(rc**2 - 1)[:, None] *
                 one*(1 - 2*np.arctan(((rc - 1)/(rc + 1))**0.5)/(rc**2 -
                     1)**0.5)[:, None])


    gamma[m1 & ma] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[ma])[:, None]
    gamma[m1 & mb] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mb])[:, None]
    gamma[m1 & mc] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mc])[:, None]

    gamma[m2] = 4*ks*(0.25 + 0.125 * r2**2 * (3.25 + 3.0*np.log(r2/2)))[:, None]


    init = np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]

    wgtmp = gamma/(1-kappa) * pdf_z[:, init:init+N]
    wg = wgtmp.sum(axis=1)

    cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
    dx = pos_x - self.halo_pos[0]
    dy = pos_y - self.halo_pos[1]
    drsq = dx*dx+dy*dy
    drsq[drsq==0.] = 1. # Avoid division by 0
    cos2phi = (dx*dx-dy*dy)/drsq
    sin2phi = 2*dx*dy/drsq
    g1 = -wg*cos2phi
    g2 = -wg*sin2phi

    return g1, g2