用两个数组作为 cython 中的输入向量化 class 的函数
Vectorizing a function of a class with two arrays as inputs in cython
我正在努力优化我的 cython
代码以尽可能提高其速度。我仍然无法弄清楚在 cython
中应该如何完成的挑战之一是将数组映射到函数上,就像在 numpy.vectorize
函数中所做的那样。
我的问题的简化版本是
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
cdef class Test(object):
cdef public double M, c, z
cdef public double[::1] ks, zs, pos
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.nonecheck(False)
def __cinit__(self, M, c, z, pos, ks, zs=None):
if path is None:
raise ValueError("Could not find a path to the file which contains the table of angular diameter distances")
self.M = M
self.c = c
self.z = z
self.pos = pos
if zs is None:
raise ValueError("You must give an array which contains the steps where the redshift probability distribution are computed!")
self.zs=zs
self.ks=ks
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __kappa(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N = x.shape[0]
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x < 0.999)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - np.log((1 + ((1 - x[mask])/(x[mask] + 1))**0.5)/(1 - ((1 - x[mask])/(x[mask] + 1))**0.5))/(1 - x[mask]**2)**0.5)
mask = np.where(x > 1.001)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - 2*np.arctan(((x[mask] - 1)/(x[mask] + 1))**0.5)/(x[mask]**2 - 1)**0.5)
mask = np.where((x >= 0.999) & (x <= 1.001))[0]
out[mask] = ks*(22./15. - 0.8*x[mask])
return out
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __gamma(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N=len(x)
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x > 0.01)[0]
out[mask] = 4*ks*(np.log(x[mask]/2) + 2* \
x[mask]**(-2) - self.__kappa(x[mask], ks)
mask = np.where(x <= 0.01)[0]
out[mask] = 4*ks*(0.25 + 0.125 * x[mask]**2 * (3.25 + 3.0*np.log(x[mask]/2)))
return out
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1, mode='c'] g, kappa, r, ks, wg
cdef np.ndarray[double, ndim=1, mode='c'] pos_x, pos_y
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
ks = np.ascontiguousarray(self.ks)
N = len(ks)
mask= np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wg = np.zeros(len(r), dtype=np.float64 , order='C')
for i from N > i >= 0:
g = self.__gamma(r, ks[i])
kappa = self.__kappa(r, ks[i])
g /= 1 - kappa
wg+=g*pdf_z[:,mask+i]
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2
我想知道是否有一种方法可以通过 ks
数组对 Test
class 的 getSh
方法进行矢量化,并避免使用循环让我的代码更快?
我认为 'vectorize' 不适用于 cython。在 numpy
中,您可以使用快速编译代码进行矢量化,如 +
、*
、sum
等操作。有一个 np.vectorize
函数,但它只是将您的代码包装在一个理解广播和多维数组的迭代器中。它不会重写您的函数,也不会加快速度。
Cython用于加速现有编译向量运算中无法表达的numpy代码。它通过将这些编译为快速 C 迭代来获得速度。
从表面上看,getSh
中的 i
循环看起来很快(c 风格),但它调用了 self.__kappa
和 self.__gamma
。两者都加载了 np
调用 - np.array
、np.where
、np.log
等。通过这些调用,您无法获得纯 c
代码。
您需要关注这两个方法,将它们表示为对数字的简单操作,并根据需要显式迭代 c
样式。
如果您可以将整个数组 ks
传递给方法 self.__gamma()
和 self.__kappa()
,那么您的代码的矢量化将完成,从而避免每次循环迭代的函数调用开销因为您会将循环移动到最内部调用的方法。
还有一些其他技巧可以让您获得额外的表现:
- 只在循环外计算一次掩码,因为它们与数组有关
r
mask = x > 0.01
而不是 mask = np.where(x > 0.01)[0]
和类似的
- 重复使用
out
数组,因为它的长度总是=N
编辑:
将上述想法付诸实践后,我想出了以下解决方案,其中方法 __kappa()
和 __gamma()
不再是必需的。虽然没有经过测试:
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1] r, ks, wg
cdef np.ndarray[double, ndim=1] pos_x, pos_y
cdef np.ndarray[double, ndim=2, mode='c'] gamma, kappa, wgtmp
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
m1 = r > 0.01
m2 = ~m1
r1 = r[m1]
r2 = r[m2]
ma = r < 0.999
mb = (r >= 0.999) & (r <= 1.001)
mc = r > 1.001
ra = r[ma]
rb = r[mb]
rc = r[mc]
ks = np.ascontiguousarray(self.ks)
one = np.ones_like(ks)
N = len(ks)
P = len(r)
kappa = np.zeros((P, N), dtype=np.float64 , order='C')
gamma = np.zeros((P, N), dtype=np.float64 , order='C')
wgtmp = np.zeros((P, N), dtype=np.float64 , order='C')
wg = np.zeros((P,), dtype=np.float64)
kappa[ma] = (2*ks/(ra**2 - 1)[:, None] *
one*(1 - np.log((1 + ((1 - ra)/(ra + 1))**0.5)/(1 - ((1 -
ra)/(ra + 1))**0.5))/(1 - ra**2)**0.5)[:, None])
kappa[mb] = ks*(22./15. - 0.8*rb)[:, None]
kappa[mc] = (2*ks/(rc**2 - 1)[:, None] *
one*(1 - 2*np.arctan(((rc - 1)/(rc + 1))**0.5)/(rc**2 -
1)**0.5)[:, None])
gamma[m1 & ma] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[ma])[:, None]
gamma[m1 & mb] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mb])[:, None]
gamma[m1 & mc] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mc])[:, None]
gamma[m2] = 4*ks*(0.25 + 0.125 * r2**2 * (3.25 + 3.0*np.log(r2/2)))[:, None]
init = np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wgtmp = gamma/(1-kappa) * pdf_z[:, init:init+N]
wg = wgtmp.sum(axis=1)
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2
我正在努力优化我的 cython
代码以尽可能提高其速度。我仍然无法弄清楚在 cython
中应该如何完成的挑战之一是将数组映射到函数上,就像在 numpy.vectorize
函数中所做的那样。
我的问题的简化版本是
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
cdef class Test(object):
cdef public double M, c, z
cdef public double[::1] ks, zs, pos
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.nonecheck(False)
def __cinit__(self, M, c, z, pos, ks, zs=None):
if path is None:
raise ValueError("Could not find a path to the file which contains the table of angular diameter distances")
self.M = M
self.c = c
self.z = z
self.pos = pos
if zs is None:
raise ValueError("You must give an array which contains the steps where the redshift probability distribution are computed!")
self.zs=zs
self.ks=ks
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __kappa(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N = x.shape[0]
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x < 0.999)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - np.log((1 + ((1 - x[mask])/(x[mask] + 1))**0.5)/(1 - ((1 - x[mask])/(x[mask] + 1))**0.5))/(1 - x[mask]**2)**0.5)
mask = np.where(x > 1.001)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - 2*np.arctan(((x[mask] - 1)/(x[mask] + 1))**0.5)/(x[mask]**2 - 1)**0.5)
mask = np.where((x >= 0.999) & (x <= 1.001))[0]
out[mask] = ks*(22./15. - 0.8*x[mask])
return out
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __gamma(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N=len(x)
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x > 0.01)[0]
out[mask] = 4*ks*(np.log(x[mask]/2) + 2* \
x[mask]**(-2) - self.__kappa(x[mask], ks)
mask = np.where(x <= 0.01)[0]
out[mask] = 4*ks*(0.25 + 0.125 * x[mask]**2 * (3.25 + 3.0*np.log(x[mask]/2)))
return out
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1, mode='c'] g, kappa, r, ks, wg
cdef np.ndarray[double, ndim=1, mode='c'] pos_x, pos_y
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
ks = np.ascontiguousarray(self.ks)
N = len(ks)
mask= np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wg = np.zeros(len(r), dtype=np.float64 , order='C')
for i from N > i >= 0:
g = self.__gamma(r, ks[i])
kappa = self.__kappa(r, ks[i])
g /= 1 - kappa
wg+=g*pdf_z[:,mask+i]
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2
我想知道是否有一种方法可以通过 ks
数组对 Test
class 的 getSh
方法进行矢量化,并避免使用循环让我的代码更快?
我认为 'vectorize' 不适用于 cython。在 numpy
中,您可以使用快速编译代码进行矢量化,如 +
、*
、sum
等操作。有一个 np.vectorize
函数,但它只是将您的代码包装在一个理解广播和多维数组的迭代器中。它不会重写您的函数,也不会加快速度。
Cython用于加速现有编译向量运算中无法表达的numpy代码。它通过将这些编译为快速 C 迭代来获得速度。
从表面上看,getSh
中的 i
循环看起来很快(c 风格),但它调用了 self.__kappa
和 self.__gamma
。两者都加载了 np
调用 - np.array
、np.where
、np.log
等。通过这些调用,您无法获得纯 c
代码。
您需要关注这两个方法,将它们表示为对数字的简单操作,并根据需要显式迭代 c
样式。
如果您可以将整个数组 ks
传递给方法 self.__gamma()
和 self.__kappa()
,那么您的代码的矢量化将完成,从而避免每次循环迭代的函数调用开销因为您会将循环移动到最内部调用的方法。
还有一些其他技巧可以让您获得额外的表现:
- 只在循环外计算一次掩码,因为它们与数组有关
r
mask = x > 0.01
而不是mask = np.where(x > 0.01)[0]
和类似的- 重复使用
out
数组,因为它的长度总是=N
编辑:
将上述想法付诸实践后,我想出了以下解决方案,其中方法 __kappa()
和 __gamma()
不再是必需的。虽然没有经过测试:
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1] r, ks, wg
cdef np.ndarray[double, ndim=1] pos_x, pos_y
cdef np.ndarray[double, ndim=2, mode='c'] gamma, kappa, wgtmp
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
m1 = r > 0.01
m2 = ~m1
r1 = r[m1]
r2 = r[m2]
ma = r < 0.999
mb = (r >= 0.999) & (r <= 1.001)
mc = r > 1.001
ra = r[ma]
rb = r[mb]
rc = r[mc]
ks = np.ascontiguousarray(self.ks)
one = np.ones_like(ks)
N = len(ks)
P = len(r)
kappa = np.zeros((P, N), dtype=np.float64 , order='C')
gamma = np.zeros((P, N), dtype=np.float64 , order='C')
wgtmp = np.zeros((P, N), dtype=np.float64 , order='C')
wg = np.zeros((P,), dtype=np.float64)
kappa[ma] = (2*ks/(ra**2 - 1)[:, None] *
one*(1 - np.log((1 + ((1 - ra)/(ra + 1))**0.5)/(1 - ((1 -
ra)/(ra + 1))**0.5))/(1 - ra**2)**0.5)[:, None])
kappa[mb] = ks*(22./15. - 0.8*rb)[:, None]
kappa[mc] = (2*ks/(rc**2 - 1)[:, None] *
one*(1 - 2*np.arctan(((rc - 1)/(rc + 1))**0.5)/(rc**2 -
1)**0.5)[:, None])
gamma[m1 & ma] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[ma])[:, None]
gamma[m1 & mb] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mb])[:, None]
gamma[m1 & mc] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mc])[:, None]
gamma[m2] = 4*ks*(0.25 + 0.125 * r2**2 * (3.25 + 3.0*np.log(r2/2)))[:, None]
init = np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wgtmp = gamma/(1-kappa) * pdf_z[:, init:init+N]
wg = wgtmp.sum(axis=1)
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2