如何使用numba并行?

How to use numba parallel?

我正在尝试使用 numba 并行化或优化以下示例代码。但是,我仍然不明白这背后的逻辑。

并行诊断表明并行结构已经是最优的。

from numba import njit

@njit(parallel=True)
def SwVer(ResSwc, SwNew, Ia, Ja, Ka):
    """Clamp every element of SwNew into [ResSwc, 1 - ResSwc], in place.

    Parameters
    ----------
    ResSwc : float
        Lower clamp bound; the upper bound is 1 - ResSwc.
    SwNew : 3-D ndarray
        Array to clamp; modified in place and also returned.
    Ia, Ja, Ka : 1-D index arrays
        Index ranges for axes 0, 1 and 2 of SwNew respectively.

    Fix: the original loops paired the index vectors with the wrong axes
    (k iterated Ia, i iterated Ka) while indexing SwNew[i, j, k].  With
    Imax=100 and Kmax=5 that indexes axis 2 with values up to 99 --
    out of bounds, and njit does not bounds-check by default.  Iterating
    i over Ia (axis 0) down to k over Ka (axis 2, the fastest-varying
    axis in C order) innermost is both correct and cache-friendly.
    """
    for i in Ia:
        for j in Ja:
            for k in Ka:
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > (1 - ResSwc):
                    SwNew[i, j, k] = (1 - ResSwc)

    return SwNew

import numpy as np

# Grid dimensions: 100 x 100 x 5 cells.
Imax = 100
Jmax = 100
Kmax = 5

# Index vectors 0 .. max-1 for each axis (np.arange(N) == np.arange(0, N, 1)).
Ia = np.arange(Imax)
Ja = np.arange(Jmax)
Ka = np.arange(Kmax)

# Random saturations in [0, 1), then clamp them into [0.25, 0.75].
SwNew = np.random.random((Imax, Jmax, Kmax))
SwNew = SwVer(0.25, SwNew, Ia, Ja, Ka)

我怎样才能真正并行化这个函数? 循环展开是否会缩短执行时间? 还有其他方法可以改进我的三重循环吗?

谢谢

Q : "Does loop unrolling improve execution time?"

循环展开可以很好地避免循环开销,但要注意 A[N,N,N] 扁平化后的"长度"会随 N 迅速增长——一旦 N 高于 ~1E3、1E4、1E5,内存占用就远超 GB 量级,更大的多级循环只会带来更多问题。

Q : "Is there any other way to improve my triple loop?"

避免传递多余的“参数”。它是昂贵的。这些越多越大。 Ia, Ja, Ka - 重新表示 A[i,j,k] 的自然域索引,它们已经存在于 A 实例中,不是吗?传递和接收大量冗余参数是我们通常希望避免的一种奢侈。

@njit(parallel=True)
def SwVer(ResSwc, SwNew, Ia, Ja, Ka):
    """Clamp every element of SwNew into [ResSwc, 1 - ResSwc], in place.

    Ia, Ja, Ka index axes 0, 1 and 2 of SwNew; the clamped array is
    returned (it is the same object that was passed in).
    """
    upper = 1 - ResSwc  # hoisted loop invariant
    for k in Ka:
        for j in Ja:
            for i in Ia:
                # Re-read after the first clamp, matching the original order.
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > upper:
                    SwNew[i, j, k] = upper
    return SwNew

原样 STATE-0:

原样代码大约在 ~ 1,824,168 [us] ~ 1.8 [s] 内执行完毕。

一个人的一小步:

就地修改总是比使用许多相同大小的中间实例来收集最终结果更快,这是另一个高性能级代码反模式。

只是删除最后一行 return SwNew 产生 ~ 714,977 [us] ~ 0.7 [s]


现在——
这是一个人的一小步,
但是
却是(……性能上的……)一次巨大飞跃:

对于琐碎的 [i,j,k]-mapable transformations 的独特性能可能喜欢尝试勇敢的 numpy -矢量化计算。

整个技巧就是这么短:np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ))

您的函数在 numpy-向量化模式下可以 运行 在 12086 [us] 到 97810 [us] 之间的任何位置。物理 RAM、缓存、CPU、O/S 工作负载的详细信息将导致可变影响,但 RAM 大小对于智能矢量化代码很重要,numpy 在这方面相当聪明,但最多:
A[1000,1000,1000] ~ 8 [GB]-RAM 占用空间。然而细节很重要。很多。
A[ 100, 100, 100] ~ 8 [MB]-RAM 占用空间。现在它确实适合 L3/L2 缓存……这很重要……
A[ 10, 10, 10] ~ 8 [kB]-RAM 占用空间。现在它确实适合 L1d 缓存......这很重要。很多...


让我们从非常开始量化:

由于 RAM,我们从二维降维开始。

>>> from zmq import Stopwatch;  aClk = Stopwatch()     # a [us]-resolution clock
>>> pass;    import gc;               gc.disable()     #   ...  elementary
>>> pass;    import numpy as np
#_______________________________________________________________________________
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N, N ) ); aClk.stop()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "mtrand.pyx", line 861, in mtrand.RandomState.random_sample
  File "mtrand.pyx", line 167, in mtrand.cont0_array
MemoryError

然而,矢量化代码技巧在原则上对于 1D- ... nD-tensor 处理保持不变:

#_______________________________________________________________________________
#       [us]
#_______________________________________________________________________________
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
17801992
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
  184895
>>> N = int( 1E2 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
    1585
>>> N = int( 1E1 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
      44
>>> N = int( 1E2 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
     465
>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
   48651
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
 4954694
>>> N = int( 1E4 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
25549190
#_______________________________________________________________________________
#       [us] SEE THE GROWING COSTS FOR ram-ALLOCATIONS & STORAGE OF RANDOM num-s
#_______________________________________________________________________________


>>> N = int( 1E3 ); aClk.start(); A = np.random.random( ( N, N ) ); aClk.stop()
  471956
   50067
   49184
   42891
   48897
   52639
   45464
   48828
#_______________________________________________________________________________
#       [us] SEE THE 1st, resp. WAY-LOWER COSTS FOR 1st, resp. OTHER ram-ACCESS
#_______________________________________________________________________________


>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   71044
   12086
   16277
   28192
#_______________________________________________________________________________
#       [us] SEE ALSO THE "noise" IN THE LATENCY-COSTS IN 2nd+ RE-RUN-s
#_______________________________________________________________________________

>>> N = int( 1E3 );               aClk.start(); A = np.random.random( ( N, N ) );                              aClk.stop()
   45759
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   38362    
   46640
   37927

>>> N = int( 1E4 );               aClk.start(); A = np.random.random( ( N, N ) );                              aClk.stop()
 4885472
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
35747022
#_______________________________________________________________________________
#       [us] SEE THE SHIFT IN LATENCY-COSTS AS ram-i/o COSTS DOMINATE > 1 [GB]
#_______________________________________________________________________________


>>> N = int( 1E3 );               aClk.start(); A = np.random.random( ( N, N ) );                              aClk.stop()
 2307509
   50446
   49464
   43006
   50958
   54800
   43418
   57091
   52135
   46451

>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   97810
   20117
   14480
   22696
   31172
   14846
#_______________________________________________________________________________
#       [us] SEE THE "noise" FOR 2nd+ RE-RUN-s
#_______________________________________________________________________________

>>> N = int( 1E3 );               aClk.start(); A = np.random.random( ( N, N ) );                              aClk.stop()
   47437
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   39298
   19422
#_______________________________________________________________________________
#       [us] SEE THE "noise" FOR 2nd+ RE-RUN
#_______________________________________________________________________________

>>> N = int( 1E3 ); aClk.start();               A = np.random.random( ( N, N ) );                              aClk.stop()
   44814
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   42565

>>> N = int( 1E3 ); aClk.start();               A = np.random.random( ( N, N ) );                              aClk.stop()
   43120
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   38039

>>> N = int( 1E3 ); aClk.start();               A = np.random.random( ( N, N ) );                              aClk.stop()
   45296
>>> T_lo = 0.25; T_hi = 1 - T_lo; aClk.start(); A = np.where( A > T_hi, T_hi, np.where( A < T_lo, T_lo, A ) ); aClk.stop()
   41898
#_______________________________________________________________________________
#       [us] SEE THE "noise" FOR block-RE-randomised-RE-RUN-s
#_______________________________________________________________________________

您的代码中至少有两个性能关键问题。

  • Numpy 数组默认为 C 序(最后一个维度变化最快),因此内部循环应该在最后一个轴上迭代。
  • 您正在使用索引数组迭代数据。这有很大的开销(内存和性能)并且完全没有必要。

三圈

除了非常简单的问题,通常需要明确地使用 nb.prange。另请注意,并行版本在运行时间在 µs 范围内的非常小的问题上速度较慢。

@njit(parallel=False)  # set True for parallel execution on larger arrays
def SwVer(ResSwc, SwNew):
    """Clamp every element of SwNew into [ResSwc, 1 - ResSwc], in place.

    Iterates in C order (last axis innermost), so memory access is
    contiguous; the array's own shape replaces the redundant Ia/Ja/Ka
    index-array parameters of the original version.
    """
    for i in range(SwNew.shape[0]):
        for j in range(SwNew.shape[1]):
            for k in range(SwNew.shape[2]):
                if SwNew[i, j, k] < ResSwc:
                    SwNew[i, j, k] = ResSwc
                if SwNew[i, j, k] > (1 - ResSwc):
                    SwNew[i, j, k] = (1 - ResSwc)
    # Fix: the original read `return SwNe` -- a NameError typo.
    return SwNew

一个循环

# Only works on contiguous arrays, otherwise the reshape(-1) view fails.
@njit(parallel=False)
def SwVer_unrolled(ResSwc, SwNew):
    """Clamp SwNew into [ResSwc, 1 - ResSwc] via a single flat loop."""
    orig_shape = SwNew.shape
    # Collapse to a 1-D view so one prange loop covers every element.
    flat = SwNew.reshape(-1)
    for idx in nb.prange(flat.shape[0]):
        if flat[idx] < ResSwc:
            flat[idx] = ResSwc
        if flat[idx] > (1 - ResSwc):
            flat[idx] = (1 - ResSwc)
    # Restore the caller's original shape on the way out.
    return flat.reshape(orig_shape)

Numpy 向量化版本

def SwVer_np(ResSwc, SwNew):
    """Clamp SwNew element-wise into [ResSwc, 1 - ResSwc], in place.

    Uses np.clip with out=SwNew: one vectorized pass instead of the two
    boolean-mask assignment passes, with identical in-place semantics
    (including the degenerate ResSwc > 0.5 case, where the upper bound
    wins, just as the original's second mask was applied last).
    """
    np.clip(SwNew, ResSwc, 1 - ResSwc, out=SwNew)
    return SwNew

计时

#very small array
Imax=100
Jmax=100
Kmax=5

Ia=np.arange(0,Imax,1)
Ja=np.arange(0,Jmax,1)
Ka=np.arange(0,Kmax,1)

#your version
%timeit SwVer_orig(0.25,SwNew,Ia,Ja,Ka)
#181 µs ± 1.92 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit SwVer(0.25,SwNew)
#parallel=False
#44.6 µs ± 213 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
#parallel=True
#104 µs ± 5.61 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit SwVer_unrolled(0.25,SwNew)
#parallel=False
#11.4 µs ± 96.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#parallel=True
#116 µs ± 4.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit SwVer_np(0.25,SwNew)
#30.1 µs ± 568 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

#bigger arrays -> parallelization beneficial
Imax=1000
Jmax=1000
Kmax=5
SwNew=np.random.random((Imax,Jmax,Kmax))

#your version
%timeit SwVer_orig(0.25,SwNew,Ia,Ja,Ka)
#17.7 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit SwVer(0.25,SwNew)
#parallel=False
#4.73 ms ± 96.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#parallel=True
#1.3 ms ± 63.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SwVer_unrolled(0.25,SwNew)
#parallel=False
#2.03 ms ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
#parallel=True
#1.17 ms ± 30.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SwVer_np(0.25,SwNew)
#7.9 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

结论

如果这部分对性能不是非常关键,我会更喜欢 Numpy 向量化版本,因为它很简单。你可以胜过它,但我认为这里不值得付出努力。