Cython not giving speedup
I have been trying to test the potential speedup from using Cython compared to basic Python code. To that end, I wrote two scripts, 'linearAdvec_mat.py' and 'linearAdvec_mat.pyx', as follows:
linearAdvec_mat.py:
import numpy as np

def Adv_mat(N):
    A = np.zeros((N, N));
    for i in range(N):
        if i == 0:
            A[i, N - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
        elif i == N - 1:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, 0] = 1.0;
        else:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
    return A;

def Diff_mat(N):
    D = np.zeros((N, N));
    for i in range(N):
        if i == 0:
            D[i, N - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
        elif i == N - 1:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, 0] = 1.0;
        else:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
    return D;

def Compute_eigVals(N, alpha, kdt):
    A = Adv_mat(N);
    D = Diff_mat(N);
    ADt = A*(-alpha/2.0) + D*kdt;
    ldt = np.zeros(N, 'complex');
    beta = np.zeros(N);
    for m in range(N):
        beta[m] = 2*np.pi*m/N;
        if beta[m] > np.pi:
            beta[m] = 2*np.pi - beta[m];
        for j in range(N):
            ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
    return ldt;
and linearAdvec_mat.pyx:
import numpy as np
cimport numpy as np

DTYPE = np.float64;
DTYPE_c = np.complex128;
ctypedef np.float64_t DTYPE_t;

cdef np.ndarray[DTYPE_t, ndim = 2] Adv_mat(int N):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = np.zeros((N, N), dtype = DTYPE);
    cdef int i;
    for i in range(N):
        if i == 0:
            A[i, N - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
        elif i == N - 1:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, 0] = 1.0;
        else:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
    return A;

cdef np.ndarray[DTYPE_t, ndim = 2] Diff_mat(int N):
    cdef np.ndarray[DTYPE_t, ndim = 2] D = np.zeros((N, N), dtype = DTYPE);
    cdef int i;
    for i in range(N):
        if i == 0:
            D[i, N - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
        elif i == N - 1:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, 0] = 1.0;
        else:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
    return D;

cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N);
    cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N);
    cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j;
    cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c);
    cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE);
    cdef int m, k;
    for m in range(N):
        beta[m] = 2*np.pi*m/N;
        if beta[m] > np.pi:
            beta[m] = 2*np.pi - beta[m];
        for k in range(N):
            ldt[m] = ldt[m] + ADt[0, k]*np.exp(1j*2.0*np.pi*k*m/N);
    return ldt;
When I call the 'Compute_eigVals' function from both base python and the compiled .so file as shown below, I don't get any significant speedup from the cython script.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
from libs.linearAdvec_mat import Compute_eigVals as Compute_eigVals_cy
from linearAdvec_mat import Compute_eigVals as Compute_eigVals_py
import time
#%% ------------------ Inputs ---------------------
N = 1000;
alpha = 0.8;
kdt = 0.05;
st = time.time();
eigs = Compute_eigVals_cy(N, alpha, kdt);
t_cy = time.time() - st;
print('Cython time : %0.8fs\n'%(t_cy));
st = time.time();
eigs = Compute_eigVals_py(N, alpha, kdt);
t_py = time.time() - st;
print('Python time : %0.8fs\n'%(t_py));
print('Cython is %0.5f times faster'%(t_py/t_cy));
I tried to check the amount of python interaction by running

cython -a linearAdvec_mat.pyx

in the terminal, but I couldn't work anything out from it. Can someone provide some insight into why I don't see a significant speedup with cython? My first guess is that my base python script relies heavily on numpy, which is already well optimized, but I'm not completely sure and am eager to figure out what is actually going on.
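(For context, the post does not show how the .pyx file was compiled into a .so; a minimal setuptools build script for such a numpy-using Cython module typically looks like the sketch below, built with python setup.py build_ext --inplace.)

# Hypothetical setup.py sketch; the actual build configuration is not shown in the post.
import numpy
from setuptools import setup
from Cython.Build import cythonize

setup(
    # annotate=True produces the same HTML report as `cython -a`.
    ext_modules=cythonize("linearAdvec_mat.pyx", annotate=True),
    include_dirs=[numpy.get_include()],
)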
Cython solution:
Let's take your python function as a baseline for reference:
In [3]: %timeit Compute_eigVals(N, alpha, kdt)
3.85 s ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Profiling your Python code in a jupyter notebook gives:
In [4]: %lprun -f Compute_eigVals Compute_eigVals(N, alpha, kdt)
Timer unit: 1e-06 s
Total time: 4.35475 s
File: <ipython-input-1-61dba133ade4>
Function: Compute_eigVals at line 37
Line # Hits Time Per Hit % Time Line Contents
==============================================================
37 def Compute_eigVals(N, alpha, kdt):
38 1 2491.0 2491.0 0.1 A = Adv_mat(N);
39 1 2295.0 2295.0 0.1 D = Diff_mat(N);
40 1 8582.0 8582.0 0.2 ADt = A*(-alpha/2.0) + D*kdt;
41 1 11.0 11.0 0.0 ldt = np.zeros(N, 'complex');
42 1 2.0 2.0 0.0 beta = np.zeros(N);
43 1001 357.0 0.4 0.0 for m in range(N):
44 1000 713.0 0.7 0.0 beta[m] = 2*np.pi*m/N;
45 1000 720.0 0.7 0.0 if beta[m] > np.pi:
46 499 356.0 0.7 0.0 beta[m] = 2*np.pi - beta[m];
47 1001000 390717.0 0.4 9.0 for j in range(N):
48 1000000 3948510.0 3.9 90.7 ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
49 1 1.0 1.0 0.0 return ldt;
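As a side note, the %lprun magic used above is provided by the line_profiler package (an assumption about the notebook setup, which is not shown here); it has to be installed, e.g. via pip install line_profiler, and loaded before use:

%load_ext line_profiler
%lprun -f Compute_eigVals Compute_eigVals(N, alpha, kdt)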
From the profile above we can see that the time-critical part is the innermost loop. So let's take a look at your cython code.
There are a few key points for reducing the python overhead here:

- Accessing the constant np.pi has noticeable python overhead. Instead, you can use the C constant pi from libc.math. In addition, you can cache the results of 2.0*pi and 1j*2.0*pi, since you use them multiple times.
- Similarly, the function np.exp also has python overhead, and calling it for a scalar argument does not justify the cost of a python function call (see the micro-benchmark sketch after this list). Instead, you can use the C cexp function.
- Finally, you can speed up your code further with Cython compiler directives. Here, we enable C integer division (cdivision), disable bounds checking (boundscheck) and disable negative indexing (wraparound).
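To illustrate the second point, here is a small micro-benchmark sketch (not part of the original answer) comparing the per-call cost of np.exp on a single Python scalar with the standard-library cmath.exp; exact numbers will vary by machine:

import timeit
import cmath
import numpy as np

z = 0.3 + 0.7j
# np.exp goes through numpy's generic ufunc machinery even for a scalar,
# while cmath.exp is a thin wrapper around the C library call.
t_np = timeit.timeit(lambda: np.exp(z), number=100000)
t_cm = timeit.timeit(lambda: cmath.exp(z), number=100000)
print('np.exp    : %.4f s' % t_np)
print('cmath.exp : %.4f s' % t_cm)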
In code:
cimport cython
from libc.math cimport pi

cdef extern from "complex.h":
    double complex cexp(double complex)

# Adv_mat and Diff_mat are the same as above

@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N)
    cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N)
    cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j
    cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c)
    cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE)
    cdef int m, k
    cdef double two_pi = 2*pi
    cdef double complex factor = 1j*2.0*pi
    for m in range(N):
        beta[m] = two_pi*m / N;
        if beta[m] > pi:
            beta[m] = two_pi - beta[m];
        for k in range(N):
            ldt[m] = ldt[m] + ADt[0, k]*cexp(factor*k*m / N);
    return ldt;
This eliminates all python interactions inside the loops. Timing it on my machine gives:
In [6]: %timeit Compute_eigVals(N, alpha, kdt)
45.8 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
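Whenever the Cython module is rebuilt, a quick sanity check (a sketch reusing the import names from the question's driver script) can confirm that it still matches the pure-Python implementation:

import numpy as np
from libs.linearAdvec_mat import Compute_eigVals as Compute_eigVals_cy
from linearAdvec_mat import Compute_eigVals as Compute_eigVals_py

# Both implementations should agree to floating-point precision.
assert np.allclose(Compute_eigVals_cy(1000, 0.8, 0.05),
                   Compute_eigVals_py(1000, 0.8, 0.05))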
Improved Python version:
Also note that Cython isn't really needed here, since you can replace the python loops with vectorized numpy operations:
def Compute_eigVals2(N, alpha, kdt):
    A = Adv_mat(N);
    D = Diff_mat(N);
    ADt = A*(-alpha/2.0) + D*kdt;
    beta = 2*np.pi*np.arange(N)/N
    beta[beta > np.pi] = 2*np.pi - beta[beta > np.pi]
    JM = np.arange(N) * np.arange(N)[:, None]
    ldt = np.sum(ADt[0, :] * np.exp(1j*2.0*np.pi*JM/N), axis=-1)
    return ldt
In [7]: %timeit Compute_eigVals2(N, alpha, kdt)
35.8 ms ± 655 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
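As one further side note (not part of the original answer, just an observation about the formula): the inner sum over j is an inverse discrete Fourier transform of the first row of ADt scaled by N, so numpy's FFT routines can compute all N entries at once. A sketch, reusing Adv_mat and Diff_mat from above:

import numpy as np

def Compute_eigVals_fft(N, alpha, kdt):
    A = Adv_mat(N)
    D = Diff_mat(N)
    ADt = A*(-alpha/2.0) + D*kdt
    # np.fft.ifft(x)[m] = (1/N) * sum_j x[j] * exp(1j*2*pi*j*m/N),
    # which matches the sum over j above up to the factor N.
    return N * np.fft.ifft(ADt[0, :])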