Numba 快速数学不会提高速度
Numba fast math does not improve speed
我分别在启用和禁用 fastmath
选项的情况下运行了以下代码。
import numpy as np
from numba import jit
from threading import Thread
import time
import psutil
from tqdm import tqdm


@jit(nopython=True, fastmath=True)
def compute_angle(vectors):
    """Map each 2-D vector (x, y) in the last axis to an angle in degrees in [0, 360)."""
    return 180 + np.degrees(np.arctan2(vectors[:, :, 1], vectors[:, :, 0]))


cpu_usage = list()
times = list()

# Flag used to coordinate the benchmark with the CPU-logging thread.
running = False


def threaded_function():
    """Sample CPU usage in the background while the benchmark runs."""
    # Wait until the benchmark signals that timing has started.
    while not running:
        time.sleep(0.1)
    print("Start logging CPU")
    while running:
        cpu_usage.append(psutil.cpu_percent())
        # psutil.cpu_percent() without an interval compares against the
        # previous call; sampling back-to-back in a tight loop both burns
        # a core (skewing the measurement) and yields meaningless values.
        time.sleep(0.1)
    print("Stop logging CPU")


thread = Thread(target=threaded_function, args=())
thread.start()

iterations = 1000

# Generate frames up front so allocation is not part of the timed loop.
vectors_list = list()
for i in tqdm(range(iterations), total=iterations):
    vectors = np.random.randint(-50, 50, (500, 1000, 2))
    vectors_list.append(vectors)

# BUG FIX: `running = True` must be set BEFORE the timing loop. In the
# original code it was set after the loop and immediately reset to False,
# so the logger thread flipped straight from waiting to stopped and never
# collected any CPU samples.
running = True
for i in tqdm(range(iterations), total=iterations):
    s = time.time()
    compute_angle(vectors_list[i])
    e = time.time()
    times.append(e - s)
running = False
thread.join()

# Skip the first iteration: it includes Numba's one-time JIT compilation.
print("Average time per iteration", np.mean(times[1:]))
print("Average CPU usage:", np.mean(cpu_usage))
fastmath=True
的结果是:
Average time per iteration 0.02076407738992044
Average CPU usage: 6.738916256157635
fastmath=False
的结果是:
Average time per iteration 0.020854528721149738
Average CPU usage: 6.676455696202531
因为我正在使用数学运算,所以我应该期待一些收获吗?
我也尝试安装 icc-rt
但我不确定如何检查它是否已启用。
谢谢!
要使 SIMD 向量化工作,还缺少一些东西。为了获得最佳性能,还必须避免使用代价高昂的临时数组,如果您使用部分矢量化函数,临时数组可能不会被优化掉。
- 函数调用必须内联
- 内存访问模式必须在编译时已知。在以下示例中,这是使用
assert vectors.shape[2]==2
完成的。通常最后一个数组的形状也可以大于两个,这对 SIMD 向量化来说会复杂得多。
- 除零检查也会阻止 SIMD 向量化,而且如果没有被优化掉,它们本身也很慢。我通过在循环外只计算一次
div_pi=180/np.pi
,然后在循环内用简单的乘法代替除法来手动避免这一点。如果无法避免重复的除法,您可以使用 error_model="numpy"
来省去除零检查。
例子
import numpy as np
import numba as nb
@nb.njit(fastmath=True)
def your_function(vectors):
    """Reference version: one whole-array NumPy expression (allocates temporaries)."""
    x = vectors[:, :, 0]
    y = vectors[:, :, 1]
    angles = np.degrees(np.arctan2(y, x))
    return angles + 180
@nb.njit(fastmath=True)#False
def optimized_function(vectors):
    """Angle computation written as explicit loops so Numba can SIMD-vectorize it.

    Computes 180 + degrees(arctan2(y, x)) for every (x, y) pair along the
    last axis of `vectors` and returns a float64 array of shape
    (vectors.shape[0], vectors.shape[1]).
    """
    # Fixing the last dimension at compile time tells Numba the exact
    # memory-access pattern, which is required for SIMD vectorization.
    assert vectors.shape[2] == 2
    # Always accumulate into float64: the original `dtype=vectors.dtype`
    # silently truncated the angles to integers when the input array was
    # integer-typed (as with the question's np.random.randint data).
    res = np.empty((vectors.shape[0], vectors.shape[1]), dtype=np.float64)
    # Hoist the radians->degrees factor out of the loop: multiplying by a
    # precomputed constant is cheaper than dividing per element and avoids
    # repeated division (and its zero checks) inside the hot loop.
    div_pi = 180 / np.pi
    for i in range(vectors.shape[0]):
        for j in range(vectors.shape[1]):
            res[i, j] = np.arctan2(vectors[i, j, 1], vectors[i, j, 0]) * div_pi + 180
    return res
时间
vectors=np.random.rand(1000,1000,2)
%timeit your_function(vectors)
#no difference between fastmath=True or False, no SIMD-vectorization at all
#23.3 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit optimized_function(vectors)
#with fastmath=False #SIMD-vectorized, but with the slower (more accurate) SVML algorithm
#9.03 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#with fastmath=True #SIMD-vectorized, but with the faster(less accurate) SVML algorithm
#4.45 ms ± 14.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
我分别在启用和禁用 fastmath
选项的情况下运行了以下代码。
import numpy as np
from numba import jit
from threading import Thread
import time
import psutil
from tqdm import tqdm


@jit(nopython=True, fastmath=True)
def compute_angle(vectors):
    """Map each 2-D vector (x, y) in the last axis to an angle in degrees in [0, 360)."""
    return 180 + np.degrees(np.arctan2(vectors[:, :, 1], vectors[:, :, 0]))


cpu_usage = list()
times = list()

# Flag used to coordinate the benchmark with the CPU-logging thread.
running = False


def threaded_function():
    """Sample CPU usage in the background while the benchmark runs."""
    # Wait until the benchmark signals that timing has started.
    while not running:
        time.sleep(0.1)
    print("Start logging CPU")
    while running:
        cpu_usage.append(psutil.cpu_percent())
        # psutil.cpu_percent() without an interval compares against the
        # previous call; sampling back-to-back in a tight loop both burns
        # a core (skewing the measurement) and yields meaningless values.
        time.sleep(0.1)
    print("Stop logging CPU")


thread = Thread(target=threaded_function, args=())
thread.start()

iterations = 1000

# Generate frames up front so allocation is not part of the timed loop.
vectors_list = list()
for i in tqdm(range(iterations), total=iterations):
    vectors = np.random.randint(-50, 50, (500, 1000, 2))
    vectors_list.append(vectors)

# BUG FIX: `running = True` must be set BEFORE the timing loop. In the
# original code it was set after the loop and immediately reset to False,
# so the logger thread flipped straight from waiting to stopped and never
# collected any CPU samples.
running = True
for i in tqdm(range(iterations), total=iterations):
    s = time.time()
    compute_angle(vectors_list[i])
    e = time.time()
    times.append(e - s)
running = False
thread.join()

# Skip the first iteration: it includes Numba's one-time JIT compilation.
print("Average time per iteration", np.mean(times[1:]))
print("Average CPU usage:", np.mean(cpu_usage))
fastmath=True
的结果是:
Average time per iteration 0.02076407738992044
Average CPU usage: 6.676455696202531
fastmath=False
的结果是:
Average time per iteration 0.020854528721149738
Average CPU usage: 6.676455696202531
因为我正在使用数学运算,所以我应该期待一些收获吗?
我也尝试安装 icc-rt
但我不确定如何检查它是否已启用。
谢谢!
要使 SIMD 向量化工作,还缺少一些东西。为了获得最佳性能,还必须避免使用代价高昂的临时数组,如果您使用部分矢量化函数,临时数组可能不会被优化掉。
- 函数调用必须内联
- 内存访问模式必须在编译时已知。在以下示例中,这是使用
assert vectors.shape[2]==2
完成的。通常最后一个数组维度的形状也可以大于二,这对 SIMD 向量化来说会复杂得多。
- 除零检查也会阻止 SIMD 向量化,而且如果没有被优化掉,它们本身也很慢。我通过在循环外只计算一次
div_pi=180/np.pi
,然后在循环内用简单的乘法代替除法来手动避免这一点。如果无法避免重复的除法,您可以使用 error_model="numpy"
来省去除零检查。
例子
import numpy as np
import numba as nb
@nb.njit(fastmath=True)
def your_function(vectors):
    """Reference version: one whole-array NumPy expression (allocates temporaries)."""
    x = vectors[:, :, 0]
    y = vectors[:, :, 1]
    angles = np.degrees(np.arctan2(y, x))
    return angles + 180
@nb.njit(fastmath=True)#False
def optimized_function(vectors):
    """Angle computation written as explicit loops so Numba can SIMD-vectorize it.

    Computes 180 + degrees(arctan2(y, x)) for every (x, y) pair along the
    last axis of `vectors` and returns a float64 array of shape
    (vectors.shape[0], vectors.shape[1]).
    """
    # Fixing the last dimension at compile time tells Numba the exact
    # memory-access pattern, which is required for SIMD vectorization.
    assert vectors.shape[2] == 2
    # Always accumulate into float64: the original `dtype=vectors.dtype`
    # silently truncated the angles to integers when the input array was
    # integer-typed (as with the question's np.random.randint data).
    res = np.empty((vectors.shape[0], vectors.shape[1]), dtype=np.float64)
    # Hoist the radians->degrees factor out of the loop: multiplying by a
    # precomputed constant is cheaper than dividing per element and avoids
    # repeated division (and its zero checks) inside the hot loop.
    div_pi = 180 / np.pi
    for i in range(vectors.shape[0]):
        for j in range(vectors.shape[1]):
            res[i, j] = np.arctan2(vectors[i, j, 1], vectors[i, j, 0]) * div_pi + 180
    return res
时间
vectors=np.random.rand(1000,1000,2)
%timeit your_function(vectors)
#no difference between fastmath=True or False, no SIMD-vectorization at all
#23.3 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit optimized_function(vectors)
#with fastmath=False #SIMD-vectorized, but with the slower (more accurate) SVML algorithm
#9.03 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#with fastmath=True #SIMD-vectorized, but with the faster(less accurate) SVML algorithm
#4.45 ms ± 14.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)