如何使用numba并行?
How to use numba parallel?
我正在尝试使用 numba 并行化或优化以下示例代码。但是,我仍然不明白这背后的逻辑。
并行诊断表明并行结构已经是最优的。
from numba import njit
@njit(parallel=True)
def SwVer(ResSwc, SwNew, Ia, Ja, Ka):
    """Clamp every element of the 3-D array SwNew into [ResSwc, 1 - ResSwc], in place.

    Parameters
    ----------
    ResSwc : float
        Lower clamp bound; the upper bound is 1 - ResSwc.
    SwNew : 3-D float array of shape (len(Ia), len(Ja), len(Ka))
        Modified in place and also returned.
    Ia, Ja, Ka : index arrays for axes 0, 1 and 2 respectively.
    """
    # BUG FIX: the original iterated k over Ia and i over Ka, so for a
    # (100, 100, 5) array the last axis was indexed up to 99 -> out of
    # bounds inside njit-compiled code.  Each loop now runs over the
    # index array of its own axis, in C order (last axis innermost)
    # for contiguous memory access.
    for i in Ia:
        for j in Ja:
            for k in Ka:
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > (1 - ResSwc):
                    SwNew[i, j, k] = (1 - ResSwc)
    return SwNew
import numpy as np
# Grid dimensions: a 100 x 100 x 5 array of cells.
Imax=100
Jmax=100
Kmax=5
# Index arrays for axes 0, 1 and 2 (values 0 .. max-1, step 1).
Ia=np.arange(0,Imax,1)
Ja=np.arange(0,Jmax,1)
Ka=np.arange(0,Kmax,1)
# Random saturations in [0, 1); SwVer clamps them into [0.25, 0.75].
SwNew=np.random.random((Imax,Jmax,Kmax))
SwNew=SwVer(0.25,SwNew,Ia,Ja,Ka)
我怎样才能真正并行化这个函数?
循环展开是否会缩短执行时间?
还有其他方法可以改进我的三重循环吗?
谢谢
Q : "Does loop unrolling improve execution time?"
循环展开可以很好地避免循环开销,但更大的多级循环会带来更多问题:A[N,N,N] 扁平化后的"长度"随 N 迅速增长,当 N 达到 ~1E3、1E4、1E5 时,内存占用远超 GB 级别。
Q : "Is there any other way to improve my triple loop?"
避免传递多余的“参数”。它是昂贵的。这些越多越大。
Ia, Ja, Ka
- 重新表示 A[i,j,k]
的自然域索引,它们已经存在于 A
实例中,不是吗?传递和接收大量冗余参数是我们通常希望避免的一种奢侈。
# NOTE(review): quoted "as-is" answer code; the indentation was lost in
# the page extraction, so this snippet is not runnable as shown.
# It clamps SwNew element-wise into [ResSwc, 1 - ResSwc], in place.
@njit(parallel=True)
def SwVer( ResSwc, SwNew, Ia, Ja, Ka ):
# [: : :] || || ||
# Loop order is k-outer / i-inner; for a C-ordered array the LAST axis
# should be the innermost loop instead — kept as-is to match the answer.
for k in ( Ka ):
for j in ( Ja ):
for i in ( Ia ):
if ( SwNew[i,j,k] < ResSwc ):
SwNew[i,j,k] = ResSwc
if ( SwNew[i,j,k] > ( 1 - ResSwc ) ):
SwNew[i,j,k] = ( 1 - ResSwc )
# SwNew[:,:,:]
return SwNew
原样 STATE-0:
原样代码的执行时间大约为 ~ 1,824,168 [us] ≈ 1.8 [s]。
男人的小STEP :
就地修改总是比使用许多相同大小的中间实例来收集最终结果更快,这是另一个高性能级代码反模式。
只是删除最后一行 return SwNew
产生 ~ 714,977 [us] ~ 0.7 [s]
现在
一个一个男人的小步骤,
但是
一个人类的巨大飞跃 ( ... 性能 ... ) :
对于琐碎的 [i,j,k]
-mapable transformations 的独特性能可能喜欢尝试勇敢的 numpy
-矢量化计算。
整个技巧就是这么短:np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ))
您的函数可以 运行 在 12086 [us]
和 97810 [us]
之间的任何位置 numpy
-向量化模式。物理 RAM、缓存、CPU、O/S 工作负载的详细信息将导致可变影响,但 RAM 大小对于智能矢量化代码很重要,numpy
聪明很多,最多:
A[1000,1000,1000] ~ 8 [GB]
-RAM 占用空间。然而细节很重要。很多。
A[ 100, 100, 100] ~ 8 [MB]
-RAM 占用空间。现在它确实适合 L3/L2 缓存……这很重要……
A[ 10, 10, 10] ~ 8 [kB]
-RAM 占用空间。现在它确实适合 L1d 缓存......这很重要。很多...
让我们从非常开始量化:
由于 RAM,我们从二维降维开始。
>>> from zmq import Stopwatch; aClk = Stopwatch() # a [us]-resolution clock
>>> pass; import gc; gc.disable() # ... elementary
>>> pass; import numpy as np
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N, N ) ); aClk.stop()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "mtrand.pyx", line 861, in mtrand.RandomState.random_sample
File "mtrand.pyx", line 167, in mtrand.cont0_array
MemoryError
然而,矢量化代码技巧在原则上对于 1D- ... nD-tensor 处理保持不变:
#_______________________________________________________________________________
# [us]
#_______________________________________________________________________________
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
17801992
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
184895
>>> N = int( 1E2 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
1585
>>> N = int( 1E1 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
44
>>> N = int( 1E2 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
465
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
48651
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
4954694
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
25549190
#_______________________________________________________________________________
# [us] SEE THE GROWING COSTS FOR ram-ALLOCATIONS & STORAGE OF RANDOM num-s
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
471956
50067
49184
42891
48897
52639
45464
48828
#_______________________________________________________________________________
# [us] SEE THE 1st, resp. WAY-LOWER COSTS FOR 1st, resp. OTHER ram-ACCESS
#_______________________________________________________________________________
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
71044
12086
16277
28192
#_______________________________________________________________________________
# [us] SEE ALSO THE "noise" IN THE LATENCY-COSTS IN 2nd+ RE-RUN-s
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
45759
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
38362
46640
37927
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
4885472
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
35747022
#_______________________________________________________________________________
# [us] SEE THE SHIFT IN LATENCY-COSTS AS ram-i/o COSTS DOMINATE > 1 [GB]
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
2307509
50446
49464
43006
50958
54800
43418
57091
52135
46451
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
97810
20117
14480
22696
31172
14846
#_______________________________________________________________________________
# [us] SEE THE "noise" FOR 2nd+ RE-RUN-s
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
47437
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
39298
19422
#_______________________________________________________________________________
# [us] SEE THE "noise" FOR 2nd+ RE-RUN
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
44814
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
42565
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
43120
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
38039
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
45296
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
41898
#_______________________________________________________________________________
# [us] SEE THE "noise" FOR block-RE-randomised-RE-RUN-s
#_______________________________________________________________________________
您的代码中至少有两个性能关键问题。
- Numpy 数组默认为 C 序(最后一个维度变化最快),因此内部循环应该在最后一个轴上迭代。
- 您正在使用索引数组迭代数据。这有很大的开销(内存和性能)并且完全没有必要。
三重循环
除了非常简单的问题,通常需要明确地使用 nb.parfor
。另请注意,并行版本在运行时间在 µs 范围内的非常小的问题上速度较慢。
@njit(parallel=False) #True for parallel
def SwVer(ResSwc, SwNew):
    """Clamp the 3-D array SwNew into [ResSwc, 1 - ResSwc], in place.

    Iterates in C order (axis 2 innermost) and derives the loop bounds
    from SwNew.shape instead of passing redundant index arrays.
    Returns the same array object.
    """
    for i in range(SwNew.shape[0]):
        for j in range(SwNew.shape[1]):
            for k in range(SwNew.shape[2]):
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > (1 - ResSwc):
                    SwNew[i, j, k] = (1 - ResSwc)
    # BUG FIX: the original returned the misspelled name `SwNe`,
    # which raises a NameError / compile error.
    return SwNew
一个循环
#only works on contigous arrays, otherwise reshape will fail
# NOTE(review): `nb` is used below, but only `from numba import njit` is
# shown earlier — this snippet additionally needs `import numba as nb`.
# Indentation was lost in the page extraction.
@njit(parallel=False)
def SwVer_unrolled(ResSwc,SwNew):
# Remember the original shape so the flat view can be reshaped back.
shape_orig=SwNew.shape
SwNew_flat=SwNew.reshape(-1) #create a 1d-view
# nb.prange only parallelises under parallel=True; with parallel=False
# it degrades to a plain serial range.
for i in nb.prange(SwNew_flat.shape[0]):
if(SwNew_flat[i]<ResSwc):
SwNew_flat[i]=ResSwc
if(SwNew_flat[i]>(1-ResSwc)):
SwNew_flat[i]=(1-ResSwc)
return SwNew_flat.reshape(shape_orig)
Numpy 向量化版本
def SwVer_np(ResSwc, SwNew):
    """Clamp SwNew element-wise into [ResSwc, 1 - ResSwc], in place.

    Parameters
    ----------
    ResSwc : float
        Lower bound; the upper bound is 1 - ResSwc.
    SwNew : numpy.ndarray
        Array modified in place and also returned.
    """
    # np.clip with out= performs both boolean-mask assignments of the
    # original in a single vectorised pass, without allocating the masks.
    np.clip(SwNew, ResSwc, 1 - ResSwc, out=SwNew)
    return SwNew
计时
#very small array
Imax=100
Jmax=100
Kmax=5
Ia=np.arange(0,Imax,1)
Ja=np.arange(0,Jmax,1)
Ka=np.arange(0,Kmax,1)
#your version
%timeit SwVer_orig(0.25,SwNew,Ia,Ja,Ka)
#181 µs ± 1.92 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit SwVer(0.25,SwNew)
#parallel=False
#44.6 µs ± 213 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
#parallel=True
#104 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit SwVer_unrolled(0.25,SwNew)
#parallel=False
#11.4 µs ± 96.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#parallel=True
#116 µs ± 4.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit SwVer_np(0.25,SwNew)
#30.1 µs ± 568 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
#bigger arrays -> parallelization beneficial
Imax=1000
Jmax=1000
Kmax=5
SwNew=np.random.random((Imax,Jmax,Kmax))
#your version
%timeit SwVer_orig(0.25,SwNew,Ia,Ja,Ka)
#17.7 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit SwVer(0.25,SwNew)
#parallel=False
#4.73 ms ± 96.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#parallel=True
#1.3 ms ± 63.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SwVer_unrolled(0.25,SwNew)
#parallel=False
#2.03 ms ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#parallel=True
#1.17 ms ± 30.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SwVer_np(0.25,SwNew)
#7.9 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
结论
如果这部分对性能不是非常关键,我会更喜欢 Numpy 向量化版本,因为它很简单。你可以胜过它,但我认为这里不值得付出努力。
我正在尝试使用 numba 并行化或优化以下示例代码。但是,我仍然不明白这背后的逻辑。
并行诊断表明并行结构已经是最优的。
from numba import njit
@njit(parallel=True)
def SwVer(ResSwc, SwNew, Ia, Ja, Ka):
    """Clamp every element of the 3-D array SwNew into [ResSwc, 1 - ResSwc], in place.

    Parameters
    ----------
    ResSwc : float
        Lower clamp bound; the upper bound is 1 - ResSwc.
    SwNew : 3-D float array of shape (len(Ia), len(Ja), len(Ka))
        Modified in place and also returned.
    Ia, Ja, Ka : index arrays for axes 0, 1 and 2 respectively.
    """
    # BUG FIX: the original iterated k over Ia and i over Ka, so for a
    # (100, 100, 5) array the last axis was indexed up to 99 -> out of
    # bounds inside njit-compiled code.  Each loop now runs over the
    # index array of its own axis, in C order (last axis innermost)
    # for contiguous memory access.
    for i in Ia:
        for j in Ja:
            for k in Ka:
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > (1 - ResSwc):
                    SwNew[i, j, k] = (1 - ResSwc)
    return SwNew
import numpy as np
# Grid dimensions: a 100 x 100 x 5 array of cells.
Imax=100
Jmax=100
Kmax=5
# Index arrays for axes 0, 1 and 2 (values 0 .. max-1, step 1).
Ia=np.arange(0,Imax,1)
Ja=np.arange(0,Jmax,1)
Ka=np.arange(0,Kmax,1)
# Random saturations in [0, 1); SwVer clamps them into [0.25, 0.75].
SwNew=np.random.random((Imax,Jmax,Kmax))
SwNew=SwVer(0.25,SwNew,Ia,Ja,Ka)
我怎样才能真正并行化这个函数? 循环展开是否会缩短执行时间? 还有其他方法可以改进我的三重循环吗?
谢谢
Q : "Does loop unrolling improve execution time?"
循环展开可以很好地避免循环开销,但更大的多级循环会带来更多问题:A[N,N,N] 扁平化后的"长度"随 N 迅速增长,当 N 达到 ~1E3、1E4、1E5 时,内存占用远超 GB 级别。
Q : "Is there any other way to improve my triple loop?"
避免传递多余的“参数”。它是昂贵的。这些越多越大。
Ia, Ja, Ka
- 重新表示 A[i,j,k]
的自然域索引,它们已经存在于 A
实例中,不是吗?传递和接收大量冗余参数是我们通常希望避免的一种奢侈。
# NOTE(review): quoted "as-is" answer code; the indentation was lost in
# the page extraction, so this snippet is not runnable as shown.
# It clamps SwNew element-wise into [ResSwc, 1 - ResSwc], in place.
@njit(parallel=True)
def SwVer( ResSwc, SwNew, Ia, Ja, Ka ):
# [: : :] || || ||
# Loop order is k-outer / i-inner; for a C-ordered array the LAST axis
# should be the innermost loop instead — kept as-is to match the answer.
for k in ( Ka ):
for j in ( Ja ):
for i in ( Ia ):
if ( SwNew[i,j,k] < ResSwc ):
SwNew[i,j,k] = ResSwc
if ( SwNew[i,j,k] > ( 1 - ResSwc ) ):
SwNew[i,j,k] = ( 1 - ResSwc )
# SwNew[:,:,:]
return SwNew
原样 STATE-0:
原样代码的执行时间大约为 ~ 1,824,168 [us] ≈ 1.8 [s]。
男人的小STEP :
就地修改总是比使用许多相同大小的中间实例来收集最终结果更快,这是另一个高性能级代码反模式。
只是删除最后一行 return SwNew
产生 ~ 714,977 [us] ~ 0.7 [s]
现在
一个一个男人的小步骤,
但是
一个人类的巨大飞跃 ( ... 性能 ... ) :
对于琐碎的 [i,j,k]
-mapable transformations 的独特性能可能喜欢尝试勇敢的 numpy
-矢量化计算。
整个技巧就是这么短:np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ))
您的函数可以 运行 在 12086 [us]
和 97810 [us]
之间的任何位置 numpy
-向量化模式。物理 RAM、缓存、CPU、O/S 工作负载的详细信息将导致可变影响,但 RAM 大小对于智能矢量化代码很重要,numpy
聪明很多,最多:
A[1000,1000,1000] ~ 8 [GB]
-RAM 占用空间。然而细节很重要。很多。
A[ 100, 100, 100] ~ 8 [MB]
-RAM 占用空间。现在它确实适合 L3/L2 缓存……这很重要……
A[ 10, 10, 10] ~ 8 [kB]
-RAM 占用空间。现在它确实适合 L1d 缓存......这很重要。很多...
让我们从非常开始量化:
由于 RAM,我们从二维降维开始。
>>> from zmq import Stopwatch; aClk = Stopwatch() # a [us]-resolution clock
>>> pass; import gc; gc.disable() # ... elementary
>>> pass; import numpy as np
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N, N ) ); aClk.stop()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "mtrand.pyx", line 861, in mtrand.RandomState.random_sample
File "mtrand.pyx", line 167, in mtrand.cont0_array
MemoryError
然而,矢量化代码技巧在原则上对于 1D- ... nD-tensor 处理保持不变:
#_______________________________________________________________________________
# [us]
#_______________________________________________________________________________
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
17801992
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
184895
>>> N = int( 1E2 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
1585
>>> N = int( 1E1 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
44
>>> N = int( 1E2 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
465
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
48651
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
4954694
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
25549190
#_______________________________________________________________________________
# [us] SEE THE GROWING COSTS FOR ram-ALLOCATIONS & STORAGE OF RANDOM num-s
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
471956
50067
49184
42891
48897
52639
45464
48828
#_______________________________________________________________________________
# [us] SEE THE 1st, resp. WAY-LOWER COSTS FOR 1st, resp. OTHER ram-ACCESS
#_______________________________________________________________________________
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
71044
12086
16277
28192
#_______________________________________________________________________________
# [us] SEE ALSO THE "noise" IN THE LATENCY-COSTS IN 2nd+ RE-RUN-s
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
45759
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
38362
46640
37927
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
4885472
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
35747022
#_______________________________________________________________________________
# [us] SEE THE SHIFT IN LATENCY-COSTS AS ram-i/o COSTS DOMINATE > 1 [GB]
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
2307509
50446
49464
43006
50958
54800
43418
57091
52135
46451
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
97810
20117
14480
22696
31172
14846
#_______________________________________________________________________________
# [us] SEE THE "noise" FOR 2nd+ RE-RUN-s
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
47437
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
39298
19422
#_______________________________________________________________________________
# [us] SEE THE "noise" FOR 2nd+ RE-RUN
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
44814
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
42565
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
43120
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
38039
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
45296
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
41898
#_______________________________________________________________________________
# [us] SEE THE "noise" FOR block-RE-randomised-RE-RUN-s
#_______________________________________________________________________________
您的代码中至少有两个性能关键问题。
- Numpy 数组默认为 C 序(最后一个维度变化最快),因此内部循环应该在最后一个轴上迭代。
- 您正在使用索引数组迭代数据。这有很大的开销(内存和性能)并且完全没有必要。
三重循环
除了非常简单的问题,通常需要明确地使用 nb.parfor
。另请注意,并行版本在运行时间在 µs 范围内的非常小的问题上速度较慢。
@njit(parallel=False) #True for parallel
def SwVer(ResSwc, SwNew):
    """Clamp the 3-D array SwNew into [ResSwc, 1 - ResSwc], in place.

    Iterates in C order (axis 2 innermost) and derives the loop bounds
    from SwNew.shape instead of passing redundant index arrays.
    Returns the same array object.
    """
    for i in range(SwNew.shape[0]):
        for j in range(SwNew.shape[1]):
            for k in range(SwNew.shape[2]):
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > (1 - ResSwc):
                    SwNew[i, j, k] = (1 - ResSwc)
    # BUG FIX: the original returned the misspelled name `SwNe`,
    # which raises a NameError / compile error.
    return SwNew
一个循环
#only works on contigous arrays, otherwise reshape will fail
# NOTE(review): `nb` is used below, but only `from numba import njit` is
# shown earlier — this snippet additionally needs `import numba as nb`.
# Indentation was lost in the page extraction.
@njit(parallel=False)
def SwVer_unrolled(ResSwc,SwNew):
# Remember the original shape so the flat view can be reshaped back.
shape_orig=SwNew.shape
SwNew_flat=SwNew.reshape(-1) #create a 1d-view
# nb.prange only parallelises under parallel=True; with parallel=False
# it degrades to a plain serial range.
for i in nb.prange(SwNew_flat.shape[0]):
if(SwNew_flat[i]<ResSwc):
SwNew_flat[i]=ResSwc
if(SwNew_flat[i]>(1-ResSwc)):
SwNew_flat[i]=(1-ResSwc)
return SwNew_flat.reshape(shape_orig)
Numpy 向量化版本
def SwVer_np(ResSwc, SwNew):
    """Clamp SwNew element-wise into [ResSwc, 1 - ResSwc], in place.

    Parameters
    ----------
    ResSwc : float
        Lower bound; the upper bound is 1 - ResSwc.
    SwNew : numpy.ndarray
        Array modified in place and also returned.
    """
    # np.clip with out= performs both boolean-mask assignments of the
    # original in a single vectorised pass, without allocating the masks.
    np.clip(SwNew, ResSwc, 1 - ResSwc, out=SwNew)
    return SwNew
计时
#very small array
Imax=100
Jmax=100
Kmax=5
Ia=np.arange(0,Imax,1)
Ja=np.arange(0,Jmax,1)
Ka=np.arange(0,Kmax,1)
#your version
%timeit SwVer_orig(0.25,SwNew,Ia,Ja,Ka)
#181 µs ± 1.92 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit SwVer(0.25,SwNew)
#parallel=False
#44.6 µs ± 213 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
#parallel=True
#104 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit SwVer_unrolled(0.25,SwNew)
#parallel=False
#11.4 µs ± 96.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#parallel=True
#116 µs ± 4.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit SwVer_np(0.25,SwNew)
#30.1 µs ± 568 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
#bigger arrays -> parallelization beneficial
Imax=1000
Jmax=1000
Kmax=5
SwNew=np.random.random((Imax,Jmax,Kmax))
#your version
%timeit SwVer_orig(0.25,SwNew,Ia,Ja,Ka)
#17.7 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit SwVer(0.25,SwNew)
#parallel=False
#4.73 ms ± 96.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#parallel=True
#1.3 ms ± 63.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SwVer_unrolled(0.25,SwNew)
#parallel=False
#2.03 ms ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#parallel=True
#1.17 ms ± 30.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SwVer_np(0.25,SwNew)
#7.9 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
结论
如果这部分对性能不是非常关键,我会更喜欢 Numpy 向量化版本,因为它很简单。你可以胜过它,但我认为这里不值得付出努力。