为什么这个 log gamma numba 函数对于大数组比 scipy 慢,但对于单个值更快?
Why is this log gamma numba function slower than scipy for large arrays, but faster for single values?
我有一个函数可以计算 log gamma function that I am decorating with numba.njit
。
import numpy as np
from numpy import log
from scipy.special import gammaln
from numba import njit
coefs = np.array([
57.1562356658629235, -59.5979603554754912,
14.1360979747417471, -0.491913816097620199,
.339946499848118887e-4, .465236289270485756e-4,
-.983744753048795646e-4, .158088703224912494e-3,
-.210264441724104883e-3, .217439618115212643e-3,
-.164318106536763890e-3, .844182239838527433e-4,
-.261908384015814087e-4, .368991826595316234e-5
])
@njit(fastmath=True)
def gammaln_nr(z):
"""Numerical Recipes 6.1"""
y = z
tmp = z + 5.24218750000000000
tmp = (z + 0.5) * log(tmp) - tmp
ser = np.ones_like(y) * 0.999999999999997092
n = coefs.shape[0]
for j in range(n):
y = y + 1
ser = ser + coefs[j] / y
out = tmp + log(2.5066282746310005 * ser / z)
return out
当我对大数组使用 gammaln_nr
时,比如 np.linspace(0.001, 100, 10**7)
,我的 运行 时间比 scipy 慢大约 7 倍(见代码在下面的附录中)。但是,如果我 运行 用于任何单个值,我的 numba 函数总是快 2 倍左右。这是怎么回事?
z = 11.67
%timeit gammaln_nr(z)
%timeit gammaln(z)
>>> 470 ns ± 29.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
>>> 1.22 µs ± 28.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
我的直觉是,如果我的函数对于一个值更快,那么它对于一组值也应该更快。当然,情况可能并非如此,因为我不知道 numba 是否使用 SIMD 指令或某种其他类型的矢量化,而 scipy 可能是。
附录
import matplotlib.pyplot as plt
import seaborn as sns
n_trials = 8
scipy_times = np.zeros(n_trials)
fastats_times = np.zeros(n_trials)
for i in range(n_trials):
zs = np.linspace(0.001, 100, 10**i) # evaluate gammaln over this range
# dont take first timing - this is just compilation
start = time.time()
gammaln_nr(zs)
end = time.time()
start = time.time()
gammaln_nr(zs)
end = time.time()
fastats_times[i] = end - start
start = time.time()
gammaln(zs)
end = time.time()
scipy_times[i] = end - start
fig, ax = plt.subplots(figsize=(12,8))
sns.lineplot(np.logspace(0, n_trials-1, n_trials), fastats_times, label="numba");
sns.lineplot(np.logspace(0, n_trials-1, n_trials), scipy_times, label="scipy");
ax.set(xscale="log");
ax.set_xlabel("Array Size", fontsize=15);
ax.set_ylabel("Execution Time (s)", fontsize=15);
ax.set_title("Execution Time of Log Gamma");
在 Numba 中实施 gammaln
重新实现一些常用函数可能需要相当多的工作,不仅要达到性能,还要获得明确定义的精度级别。
所以直接的方法就是 .
如果 gammaln
scipy- 调用此函数的 C-implemntation。因此,scipy-实现的速度还取决于编译器和编译 scipy 依赖项时使用的编译器标志。
同样不足为奇的是,一个值的性能结果可能与较大数组的结果有很大差异。
在第一种情况下,调用开销(包括类型转换、输入检查等)占主导地位,在第二种情况下,
实施变得越来越重要。
改进您的实施
- 编写显式循环。在 Numba 中,向量化操作被扩展为循环,之后 Numba 尝试加入循环。通常最好手动写出并加入这个循环。
- 想想基本算术实现的差异。 Python 总是检查被 0 除并在这种情况下引发异常,这是非常昂贵的。 Numba 也默认使用此行为,但您也可以切换到 Numpy 错误检查。在这种情况下,除以 0 会导致 NaN。在进一步计算中处理 NaN 和 Inf -0/+0 的方式也受 fast-math 标志的影响。
代码
import numpy as np
from numpy import log
from scipy.special import gammaln
from numba import njit
import numba as nb
@njit(fastmath=True,error_model='numpy')
def gammaln_nr(z):
"""Numerical Recipes 6.1"""
#Don't use global variables.. (They only can be changed if you recompile the function)
coefs = np.array([
57.1562356658629235, -59.5979603554754912,
14.1360979747417471, -0.491913816097620199,
.339946499848118887e-4, .465236289270485756e-4,
-.983744753048795646e-4, .158088703224912494e-3,
-.210264441724104883e-3, .217439618115212643e-3,
-.164318106536763890e-3, .844182239838527433e-4,
-.261908384015814087e-4, .368991826595316234e-5])
out=np.empty(z.shape[0])
for i in range(z.shape[0]):
y = z[i]
tmp = z[i] + 5.24218750000000000
tmp = (z[i] + 0.5) * np.log(tmp) - tmp
ser = 0.999999999999997092
n = coefs.shape[0]
for j in range(n):
y = y + 1.
ser = ser + coefs[j] / y
out[i] = tmp + log(2.5066282746310005 * ser / z[i])
return out
@njit(fastmath=True,error_model='numpy',parallel=True)
def gammaln_nr_p(z):
"""Numerical Recipes 6.1"""
#Don't use global variables.. (They only can be changed if you recompile the function)
coefs = np.array([
57.1562356658629235, -59.5979603554754912,
14.1360979747417471, -0.491913816097620199,
.339946499848118887e-4, .465236289270485756e-4,
-.983744753048795646e-4, .158088703224912494e-3,
-.210264441724104883e-3, .217439618115212643e-3,
-.164318106536763890e-3, .844182239838527433e-4,
-.261908384015814087e-4, .368991826595316234e-5])
out=np.empty(z.shape[0])
for i in nb.prange(z.shape[0]):
y = z[i]
tmp = z[i] + 5.24218750000000000
tmp = (z[i] + 0.5) * np.log(tmp) - tmp
ser = 0.999999999999997092
n = coefs.shape[0]
for j in range(n):
y = y + 1.
ser = ser + coefs[j] / y
out[i] = tmp + log(2.5066282746310005 * ser / z[i])
return out
import matplotlib.pyplot as plt
import seaborn as sns
import time
n_trials = 8
scipy_times = np.zeros(n_trials)
fastats_times = np.zeros(n_trials)
fastats_times_p = np.zeros(n_trials)
for i in range(n_trials):
zs = np.linspace(0.001, 100, 10**i) # evaluate gammaln over this range
# dont take first timing - this is just compilation
start = time.time()
arr_1=gammaln_nr(zs)
end = time.time()
start = time.time()
arr_1=gammaln_nr(zs)
end = time.time()
fastats_times[i] = end - start
start = time.time()
arr_3=gammaln_nr_p(zs)
end = time.time()
fastats_times_p[i] = end - start
start = time.time()
start = time.time()
arr_3=gammaln_nr_p(zs)
end = time.time()
fastats_times_p[i] = end - start
start = time.time()
arr_2=gammaln(zs)
end = time.time()
scipy_times[i] = end - start
print(np.allclose(arr_1,arr_2))
print(np.allclose(arr_1,arr_3))
fig, ax = plt.subplots(figsize=(12,8))
sns.lineplot(np.logspace(0, n_trials-1, n_trials), fastats_times, label="numba");
sns.lineplot(np.logspace(0, n_trials-1, n_trials), fastats_times_p, label="numba_parallel");
sns.lineplot(np.logspace(0, n_trials-1, n_trials), scipy_times, label="scipy");
ax.set(xscale="log");
ax.set_xlabel("Array Size", fontsize=15);
ax.set_ylabel("Execution Time (s)", fontsize=15);
ax.set_title("Execution Time of Log Gamma");
fig.show()
我有一个函数可以计算 log gamma function that I am decorating with numba.njit
。
import numpy as np
from numpy import log
from scipy.special import gammaln
from numba import njit
coefs = np.array([
57.1562356658629235, -59.5979603554754912,
14.1360979747417471, -0.491913816097620199,
.339946499848118887e-4, .465236289270485756e-4,
-.983744753048795646e-4, .158088703224912494e-3,
-.210264441724104883e-3, .217439618115212643e-3,
-.164318106536763890e-3, .844182239838527433e-4,
-.261908384015814087e-4, .368991826595316234e-5
])
@njit(fastmath=True)
def gammaln_nr(z):
"""Numerical Recipes 6.1"""
y = z
tmp = z + 5.24218750000000000
tmp = (z + 0.5) * log(tmp) - tmp
ser = np.ones_like(y) * 0.999999999999997092
n = coefs.shape[0]
for j in range(n):
y = y + 1
ser = ser + coefs[j] / y
out = tmp + log(2.5066282746310005 * ser / z)
return out
当我对大数组使用 gammaln_nr
时,比如 np.linspace(0.001, 100, 10**7)
,我的 运行 时间比 scipy 慢大约 7 倍(见代码在下面的附录中)。但是,如果我 运行 用于任何单个值,我的 numba 函数总是快 2 倍左右。这是怎么回事?
z = 11.67
%timeit gammaln_nr(z)
%timeit gammaln(z)
>>> 470 ns ± 29.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
>>> 1.22 µs ± 28.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
我的直觉是,如果我的函数对于一个值更快,那么它对于一组值也应该更快。当然,情况可能并非如此,因为我不知道 numba 是否使用 SIMD 指令或某种其他类型的矢量化,而 scipy 可能是。
附录
import matplotlib.pyplot as plt
import seaborn as sns
n_trials = 8
scipy_times = np.zeros(n_trials)
fastats_times = np.zeros(n_trials)
for i in range(n_trials):
zs = np.linspace(0.001, 100, 10**i) # evaluate gammaln over this range
# dont take first timing - this is just compilation
start = time.time()
gammaln_nr(zs)
end = time.time()
start = time.time()
gammaln_nr(zs)
end = time.time()
fastats_times[i] = end - start
start = time.time()
gammaln(zs)
end = time.time()
scipy_times[i] = end - start
fig, ax = plt.subplots(figsize=(12,8))
sns.lineplot(np.logspace(0, n_trials-1, n_trials), fastats_times, label="numba");
sns.lineplot(np.logspace(0, n_trials-1, n_trials), scipy_times, label="scipy");
ax.set(xscale="log");
ax.set_xlabel("Array Size", fontsize=15);
ax.set_ylabel("Execution Time (s)", fontsize=15);
ax.set_title("Execution Time of Log Gamma");
在 Numba 中实施 gammaln
重新实现一些常用函数可能需要相当多的工作,不仅要达到性能,还要获得明确定义的精度级别。
所以直接的方法就是
如果 gammaln
scipy- 调用此函数的 C-implemntation。因此,scipy-实现的速度还取决于编译器和编译 scipy 依赖项时使用的编译器标志。
同样不足为奇的是,一个值的性能结果可能与较大数组的结果有很大差异。 在第一种情况下,调用开销(包括类型转换、输入检查等)占主导地位,在第二种情况下, 实施变得越来越重要。
改进您的实施
- 编写显式循环。在 Numba 中,向量化操作被扩展为循环,之后 Numba 尝试加入循环。通常最好手动写出并加入这个循环。
- 想想基本算术实现的差异。 Python 总是检查被 0 除并在这种情况下引发异常,这是非常昂贵的。 Numba 也默认使用此行为,但您也可以切换到 Numpy 错误检查。在这种情况下,除以 0 会导致 NaN。在进一步计算中处理 NaN 和 Inf -0/+0 的方式也受 fast-math 标志的影响。
代码
import numpy as np
from numpy import log
from scipy.special import gammaln
from numba import njit
import numba as nb
@njit(fastmath=True,error_model='numpy')
def gammaln_nr(z):
"""Numerical Recipes 6.1"""
#Don't use global variables.. (They only can be changed if you recompile the function)
coefs = np.array([
57.1562356658629235, -59.5979603554754912,
14.1360979747417471, -0.491913816097620199,
.339946499848118887e-4, .465236289270485756e-4,
-.983744753048795646e-4, .158088703224912494e-3,
-.210264441724104883e-3, .217439618115212643e-3,
-.164318106536763890e-3, .844182239838527433e-4,
-.261908384015814087e-4, .368991826595316234e-5])
out=np.empty(z.shape[0])
for i in range(z.shape[0]):
y = z[i]
tmp = z[i] + 5.24218750000000000
tmp = (z[i] + 0.5) * np.log(tmp) - tmp
ser = 0.999999999999997092
n = coefs.shape[0]
for j in range(n):
y = y + 1.
ser = ser + coefs[j] / y
out[i] = tmp + log(2.5066282746310005 * ser / z[i])
return out
@njit(fastmath=True,error_model='numpy',parallel=True)
def gammaln_nr_p(z):
"""Numerical Recipes 6.1"""
#Don't use global variables.. (They only can be changed if you recompile the function)
coefs = np.array([
57.1562356658629235, -59.5979603554754912,
14.1360979747417471, -0.491913816097620199,
.339946499848118887e-4, .465236289270485756e-4,
-.983744753048795646e-4, .158088703224912494e-3,
-.210264441724104883e-3, .217439618115212643e-3,
-.164318106536763890e-3, .844182239838527433e-4,
-.261908384015814087e-4, .368991826595316234e-5])
out=np.empty(z.shape[0])
for i in nb.prange(z.shape[0]):
y = z[i]
tmp = z[i] + 5.24218750000000000
tmp = (z[i] + 0.5) * np.log(tmp) - tmp
ser = 0.999999999999997092
n = coefs.shape[0]
for j in range(n):
y = y + 1.
ser = ser + coefs[j] / y
out[i] = tmp + log(2.5066282746310005 * ser / z[i])
return out
import matplotlib.pyplot as plt
import seaborn as sns
import time
n_trials = 8
scipy_times = np.zeros(n_trials)
fastats_times = np.zeros(n_trials)
fastats_times_p = np.zeros(n_trials)
for i in range(n_trials):
zs = np.linspace(0.001, 100, 10**i) # evaluate gammaln over this range
# dont take first timing - this is just compilation
start = time.time()
arr_1=gammaln_nr(zs)
end = time.time()
start = time.time()
arr_1=gammaln_nr(zs)
end = time.time()
fastats_times[i] = end - start
start = time.time()
arr_3=gammaln_nr_p(zs)
end = time.time()
fastats_times_p[i] = end - start
start = time.time()
start = time.time()
arr_3=gammaln_nr_p(zs)
end = time.time()
fastats_times_p[i] = end - start
start = time.time()
arr_2=gammaln(zs)
end = time.time()
scipy_times[i] = end - start
print(np.allclose(arr_1,arr_2))
print(np.allclose(arr_1,arr_3))
fig, ax = plt.subplots(figsize=(12,8))
sns.lineplot(np.logspace(0, n_trials-1, n_trials), fastats_times, label="numba");
sns.lineplot(np.logspace(0, n_trials-1, n_trials), fastats_times_p, label="numba_parallel");
sns.lineplot(np.logspace(0, n_trials-1, n_trials), scipy_times, label="scipy");
ax.set(xscale="log");
ax.set_xlabel("Array Size", fontsize=15);
ax.set_ylabel("Execution Time (s)", fontsize=15);
ax.set_title("Execution Time of Log Gamma");
fig.show()