Numba 比 numpy 慢 3 倍
Numba 3x slower than numpy
我们有一个矢量化的 numpy 函数 get_pos_neg_bitwise,它使用掩码 mask=[132 20 192],
输入数组 df 的形状 df.shape 为 (500e3, 4);我们想用 numba 来加速这个函数。
from numba import jit
import numpy as np
from time import time
def get_pos_neg_bitwise(df, mask):
    """Return the indices of rows that pass the bit mask, split by target.

    A row passes when every value column has all bits of the matching
    ``mask`` entry set, i.e. ``(value & mask) == mask`` for each column.

    Args:
        df: 2-D integer array (or array-like). Column 0 is the target
            (0 or 1); the remaining columns are the values to test.
        mask: Integer array (or array-like) broadcastable against the
            value columns, e.g. ``[132, 20, 192]`` for three columns.

    Returns:
        Tuple ``(pos, neg)`` of index arrays: passing rows whose target
        is 1, and passing rows whose target is 0.

    Example:
        >>> df = np.array([[1, 133, 20, 192], [0, 255, 255, 255], [1, 0, 20, 192]])
        >>> pos, neg = get_pos_neg_bitwise(df, np.array([132, 20, 192]))
        >>> pos.tolist(), neg.tolist()
        ([0], [1])
    """
    # Accept plain nested lists as well as arrays.
    df = np.asarray(df)
    mask = np.asarray(mask)

    # One boolean per row: True when all value columns contain the mask bits.
    check = (np.bitwise_and(mask, df[:, 1:]) == mask).all(axis=1)
    pos = (df[:, 0] == 1) & check
    neg = (df[:, 0] == 0) & check
    # np.nonzero returns a tuple with one index array per dimension.
    pos = np.nonzero(pos)[0]
    neg = np.nonzero(neg)[0]
    return (pos, neg)
使用来自@morningsun 的提示,我们制作了这个 numba 版本:
@jit(nopython=True)
def numba_get_pos_neg_bitwise(df, mask):
    """Numba version of the mask test, calling ``np.bitwise_and`` once per row.

    Args:
        df: 2-D integer array; column 0 is the target (0 or 1), the
            remaining columns are the values tested against ``mask``.
        mask: 1-D integer array with one entry per value column.

    Returns:
        Tuple ``(pos, neg)`` of lists with the indices of passing rows
        whose target is 1 and 0 respectively. Note that these are lists,
        whereas the NumPy implementation returns index arrays.
    """
    # Column 0 flags positive rows, column 1 flags negative rows.
    posneg = np.zeros((df.shape[0], 2))
    for idx in range(df.shape[0]):
        vandmask = np.bitwise_and(df[idx, 1:], mask)
        # Explicit element loop: ``np.all(vandmask == mask)`` failed to
        # compile under numba.
        vandm_equal_m = 1
        for i, val in enumerate(vandmask):
            if val != mask[i]:
                vandm_equal_m = 0
                break
        if vandm_equal_m == 1:
            if df[idx, 0] == 1:
                posneg[idx, 0] = 1
            elif df[idx, 0] == 0:
                # Explicit 0 test so targets other than 0/1 are ignored,
                # matching the ``df[:, 0] == 0`` test of the NumPy version.
                posneg[idx, 1] = 1
    pos = list(np.nonzero(posneg[:, 0])[0])
    neg = list(np.nonzero(posneg[:, 1])[0])
    return (pos, neg)
但它仍然比 numpy 慢 3 倍(约 0.06 s 对约 0.02 s)。
if __name__ == '__main__':
    # Benchmark input: 500k rows of 4 random values in [0, 256).
    df = np.array(np.random.randint(256, size=(int(500e3), 4)))
    df[:, 0] = np.random.randint(2, size=(1, df.shape[0]))  # set target to 0 or 1
    mask = np.array([132, 20, 192])

    # NumPy reference implementation.
    start = time()
    pos, neg = get_pos_neg_bitwise(df, mask)
    msg = '==> pos, neg made; p={}, n={} in [{:.4} s] numpy'
    print(msg.format(len(pos), len(neg), time() - start))

    # First numba call: the measured time also includes JIT compilation.
    start = time()
    msg = '==> pos, neg made; p={}, n={} in [{:.4} s] numba'
    pos, neg = numba_get_pos_neg_bitwise(df, mask)
    print(msg.format(len(pos), len(neg), time() - start))

    # Second numba call: reuses the already compiled function.
    start = time()
    pos, neg = numba_get_pos_neg_bitwise(df, mask)
    print(msg.format(len(pos), len(neg), time() - start))
我是不是漏掉了什么?
In [1]: %run numba_test2.py
==> pos, neg made; p=3852, n=3957 in [0.02306 s] numpy
==> pos, neg made; p=3852, n=3957 in [0.3492 s] numba
==> pos, neg made; p=3852, n=3957 in [0.06425 s] numba
In [1]:
可以尝试把对 np.bitwise_and
的调用移到循环之外,因为 numba 无法加速这个调用:
@jit(nopython=True)
def numba_get_pos_neg_bitwise(df, mask):
    """Numba version of the mask test with ``np.bitwise_and`` hoisted out of the loop.

    Args:
        df: 2-D integer array; column 0 is the target (0 or 1), the
            remaining columns are the values tested against ``mask``.
        mask: 1-D integer array with one entry per value column.

    Returns:
        Tuple ``(pos, neg)`` of index arrays: passing rows whose target
        is 1, and passing rows whose target is 0.
    """
    # Column 0 flags positive rows, column 1 flags negative rows.
    posneg = np.zeros((df.shape[0], 2))
    # Single vectorised AND over all value columns instead of one call per row.
    vandmask = np.bitwise_and(df[:, 1:], mask)
    for idx in range(df.shape[0]):
        # Explicit element loop: ``np.all(vandmask == mask)`` failed to
        # compile under numba.
        vandm_equal_m = 1
        for i, val in enumerate(vandmask[idx]):
            if val != mask[i]:
                vandm_equal_m = 0
                break
        if vandm_equal_m == 1:
            if df[idx, 0] == 1:
                posneg[idx, 0] = 1
            elif df[idx, 0] == 0:
                # Explicit 0 test so targets other than 0/1 are ignored,
                # matching the ``df[:, 0] == 0`` test of the NumPy version.
                posneg[idx, 1] = 1
    pos = np.nonzero(posneg[:, 0])[0]
    neg = np.nonzero(posneg[:, 1])[0]
    return (pos, neg)
然后我得到时间:
==> pos, neg made; p=3920, n=4023 in [0.02352 s] numpy
==> pos, neg made; p=3920, n=4023 in [0.2896 s] numba
==> pos, neg made; p=3920, n=4023 in [0.01539 s] numba
所以现在 numba 比 numpy 快一点。
另外,虽然影响不大,但你的原始函数返回的是 numpy 数组,而在 numba 版本中你把 pos
和 neg
转换成了列表。
但一般来说,我猜想函数调用主要由 numpy 函数主导,numba 无法加速,并且代码的 numpy 版本已经在使用快速矢量化例程。
更新:
您可以通过删除 enumerate
调用并直接索引到数组中而不是获取切片来使其更快。此外,将 pos
和 neg
拆分为单独的数组有助于避免沿内存中的 non-contiguous 轴切片:
@jit(nopython=True)
def numba_get_pos_neg_bitwise(df, mask):
    """Numba version of the mask test using direct indexing and separate flag arrays.

    Args:
        df: 2-D integer array; column 0 is the target (0 or 1), the
            remaining columns are the values tested against ``mask``.
        mask: 1-D integer array with one entry per value column.

    Returns:
        Tuple ``(pos, neg)`` of index arrays: passing rows whose target
        is 1, and passing rows whose target is 0.
    """
    # Separate 1-D flag arrays instead of slicing columns of one 2-D array.
    pos = np.zeros(df.shape[0])
    neg = np.zeros(df.shape[0])
    vandmask = np.bitwise_and(df[:, 1:], mask)
    for idx in range(df.shape[0]):
        # Explicit element loop: ``np.all(vandmask == mask)`` failed to
        # compile under numba.
        vandm_equal_m = 1
        # ``range`` rather than ``xrange`` so the code also runs on Python 3.
        for i in range(vandmask.shape[1]):
            if vandmask[idx, i] != mask[i]:
                vandm_equal_m = 0
                break
        if vandm_equal_m == 1:
            if df[idx, 0] == 1:
                pos[idx] = 1
            elif df[idx, 0] == 0:
                # Explicit 0 test so targets other than 0/1 are ignored,
                # matching the ``df[:, 0] == 0`` test of the NumPy version.
                neg[idx] = 1
    pos = np.nonzero(pos)[0]
    neg = np.nonzero(neg)[0]
    return pos, neg
以及 ipython 笔记本中的计时:
%timeit pos1, neg1 = get_pos_neg_bitwise(df, mask)
%timeit pos2, neg2 = numba_get_pos_neg_bitwise(df, mask)
100 loops, best of 3: 18.2 ms per loop
100 loops, best of 3: 7.89 ms per loop
我们有一个矢量化的 numpy 函数 get_pos_neg_bitwise,它使用掩码 mask=[132 20 192],输入数组 df 的形状 df.shape 为 (500e3, 4);我们想用 numba 来加速这个函数。
from numba import jit
import numpy as np
from time import time
def get_pos_neg_bitwise(df, mask):
    """Return the indices of rows that pass the bit mask, split by target.

    A row passes when every value column has all bits of the matching
    ``mask`` entry set, i.e. ``(value & mask) == mask`` for each column.

    Args:
        df: 2-D integer array (or array-like). Column 0 is the target
            (0 or 1); the remaining columns are the values to test.
        mask: Integer array (or array-like) broadcastable against the
            value columns, e.g. ``[132, 20, 192]`` for three columns.

    Returns:
        Tuple ``(pos, neg)`` of index arrays: passing rows whose target
        is 1, and passing rows whose target is 0.

    Example:
        >>> df = np.array([[1, 133, 20, 192], [0, 255, 255, 255], [1, 0, 20, 192]])
        >>> pos, neg = get_pos_neg_bitwise(df, np.array([132, 20, 192]))
        >>> pos.tolist(), neg.tolist()
        ([0], [1])
    """
    # Accept plain nested lists as well as arrays.
    df = np.asarray(df)
    mask = np.asarray(mask)

    # One boolean per row: True when all value columns contain the mask bits.
    check = (np.bitwise_and(mask, df[:, 1:]) == mask).all(axis=1)
    pos = (df[:, 0] == 1) & check
    neg = (df[:, 0] == 0) & check
    # np.nonzero returns a tuple with one index array per dimension.
    pos = np.nonzero(pos)[0]
    neg = np.nonzero(neg)[0]
    return (pos, neg)
使用来自@morningsun 的提示,我们制作了这个 numba 版本:
@jit(nopython=True)
def numba_get_pos_neg_bitwise(df, mask):
    """Numba version of the mask test, calling ``np.bitwise_and`` once per row.

    Args:
        df: 2-D integer array; column 0 is the target (0 or 1), the
            remaining columns are the values tested against ``mask``.
        mask: 1-D integer array with one entry per value column.

    Returns:
        Tuple ``(pos, neg)`` of lists with the indices of passing rows
        whose target is 1 and 0 respectively. Note that these are lists,
        whereas the NumPy implementation returns index arrays.
    """
    # Column 0 flags positive rows, column 1 flags negative rows.
    posneg = np.zeros((df.shape[0], 2))
    for idx in range(df.shape[0]):
        vandmask = np.bitwise_and(df[idx, 1:], mask)
        # Explicit element loop: ``np.all(vandmask == mask)`` failed to
        # compile under numba.
        vandm_equal_m = 1
        for i, val in enumerate(vandmask):
            if val != mask[i]:
                vandm_equal_m = 0
                break
        if vandm_equal_m == 1:
            if df[idx, 0] == 1:
                posneg[idx, 0] = 1
            elif df[idx, 0] == 0:
                # Explicit 0 test so targets other than 0/1 are ignored,
                # matching the ``df[:, 0] == 0`` test of the NumPy version.
                posneg[idx, 1] = 1
    pos = list(np.nonzero(posneg[:, 0])[0])
    neg = list(np.nonzero(posneg[:, 1])[0])
    return (pos, neg)
但它仍然比 numpy 慢 3 倍(约 0.06 s 对约 0.02 s)。
if __name__ == '__main__':
    # Benchmark input: 500k rows of 4 random values in [0, 256).
    df = np.array(np.random.randint(256, size=(int(500e3), 4)))
    df[:, 0] = np.random.randint(2, size=(1, df.shape[0]))  # set target to 0 or 1
    mask = np.array([132, 20, 192])

    # NumPy reference implementation.
    start = time()
    pos, neg = get_pos_neg_bitwise(df, mask)
    msg = '==> pos, neg made; p={}, n={} in [{:.4} s] numpy'
    print(msg.format(len(pos), len(neg), time() - start))

    # First numba call: the measured time also includes JIT compilation.
    start = time()
    msg = '==> pos, neg made; p={}, n={} in [{:.4} s] numba'
    pos, neg = numba_get_pos_neg_bitwise(df, mask)
    print(msg.format(len(pos), len(neg), time() - start))

    # Second numba call: reuses the already compiled function.
    start = time()
    pos, neg = numba_get_pos_neg_bitwise(df, mask)
    print(msg.format(len(pos), len(neg), time() - start))
我是不是漏掉了什么?
In [1]: %run numba_test2.py
==> pos, neg made; p=3852, n=3957 in [0.02306 s] numpy
==> pos, neg made; p=3852, n=3957 in [0.3492 s] numba
==> pos, neg made; p=3852, n=3957 in [0.06425 s] numba
In [1]:
可以尝试把对 np.bitwise_and
的调用移到循环之外,因为 numba 无法加速这个调用:
@jit(nopython=True)
def numba_get_pos_neg_bitwise(df, mask):
    """Numba version of the mask test with ``np.bitwise_and`` hoisted out of the loop.

    Args:
        df: 2-D integer array; column 0 is the target (0 or 1), the
            remaining columns are the values tested against ``mask``.
        mask: 1-D integer array with one entry per value column.

    Returns:
        Tuple ``(pos, neg)`` of index arrays: passing rows whose target
        is 1, and passing rows whose target is 0.
    """
    # Column 0 flags positive rows, column 1 flags negative rows.
    posneg = np.zeros((df.shape[0], 2))
    # Single vectorised AND over all value columns instead of one call per row.
    vandmask = np.bitwise_and(df[:, 1:], mask)
    for idx in range(df.shape[0]):
        # Explicit element loop: ``np.all(vandmask == mask)`` failed to
        # compile under numba.
        vandm_equal_m = 1
        for i, val in enumerate(vandmask[idx]):
            if val != mask[i]:
                vandm_equal_m = 0
                break
        if vandm_equal_m == 1:
            if df[idx, 0] == 1:
                posneg[idx, 0] = 1
            elif df[idx, 0] == 0:
                # Explicit 0 test so targets other than 0/1 are ignored,
                # matching the ``df[:, 0] == 0`` test of the NumPy version.
                posneg[idx, 1] = 1
    pos = np.nonzero(posneg[:, 0])[0]
    neg = np.nonzero(posneg[:, 1])[0]
    return (pos, neg)
然后我得到时间:
==> pos, neg made; p=3920, n=4023 in [0.02352 s] numpy
==> pos, neg made; p=3920, n=4023 in [0.2896 s] numba
==> pos, neg made; p=3920, n=4023 in [0.01539 s] numba
所以现在 numba 比 numpy 快一点。
另外,虽然影响不大,但你的原始函数返回的是 numpy 数组,而在 numba 版本中你把 pos
和 neg
转换成了列表。
但一般来说,我猜想函数调用主要由 numpy 函数主导,numba 无法加速,并且代码的 numpy 版本已经在使用快速矢量化例程。
更新:
您可以通过删除 enumerate
调用并直接索引到数组中而不是获取切片来使其更快。此外,将 pos
和 neg
拆分为单独的数组有助于避免沿内存中的 non-contiguous 轴切片:
@jit(nopython=True)
def numba_get_pos_neg_bitwise(df, mask):
    """Numba version of the mask test using direct indexing and separate flag arrays.

    Args:
        df: 2-D integer array; column 0 is the target (0 or 1), the
            remaining columns are the values tested against ``mask``.
        mask: 1-D integer array with one entry per value column.

    Returns:
        Tuple ``(pos, neg)`` of index arrays: passing rows whose target
        is 1, and passing rows whose target is 0.
    """
    # Separate 1-D flag arrays instead of slicing columns of one 2-D array.
    pos = np.zeros(df.shape[0])
    neg = np.zeros(df.shape[0])
    vandmask = np.bitwise_and(df[:, 1:], mask)
    for idx in range(df.shape[0]):
        # Explicit element loop: ``np.all(vandmask == mask)`` failed to
        # compile under numba.
        vandm_equal_m = 1
        # ``range`` rather than ``xrange`` so the code also runs on Python 3.
        for i in range(vandmask.shape[1]):
            if vandmask[idx, i] != mask[i]:
                vandm_equal_m = 0
                break
        if vandm_equal_m == 1:
            if df[idx, 0] == 1:
                pos[idx] = 1
            elif df[idx, 0] == 0:
                # Explicit 0 test so targets other than 0/1 are ignored,
                # matching the ``df[:, 0] == 0`` test of the NumPy version.
                neg[idx] = 1
    pos = np.nonzero(pos)[0]
    neg = np.nonzero(neg)[0]
    return pos, neg
以及 ipython 笔记本中的计时:
%timeit pos1, neg1 = get_pos_neg_bitwise(df, mask)
%timeit pos2, neg2 = numba_get_pos_neg_bitwise(df, mask)
100 loops, best of 3: 18.2 ms per loop
100 loops, best of 3: 7.89 ms per loop