在 Python 中实现 MATLAB 的 im2col 'sliding'
Implement MATLAB's im2col 'sliding' in Python
问:如何加快速度?
下面是我对 Matlab 的 im2col 'sliding' 的实现,具有返回每第 n 列的附加功能。该函数获取一个图像(或任何 2 个 dim 数组)并从左到右、从上到下滑动,挑选出每个给定大小的重叠子图像,并返回一个列为子图像的数组。
import numpy as np
def im2col_sliding(image, block_size, skip=1):
rows, cols = image.shape
horz_blocks = cols - block_size[1] + 1
vert_blocks = rows - block_size[0] + 1
output_vectors = np.zeros((block_size[0] * block_size[1], horz_blocks * vert_blocks))
itr = 0
for v_b in xrange(vert_blocks):
for h_b in xrange(horz_blocks):
output_vectors[:, itr] = image[v_b: v_b + block_size[0], h_b: h_b + block_size[1]].ravel()
itr += 1
return output_vectors[:, ::skip]
示例:
a = np.arange(16).reshape(4, 4)
print a
print im2col_sliding(a, (2, 2)) # return every overlapping 2x2 patch
print im2col_sliding(a, (2, 2), 4) # return every 4th vector
returns:
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]
[12 13 14 15]]
[[ 0. 1. 2. 4. 5. 6. 8. 9. 10.]
[ 1. 2. 3. 5. 6. 7. 9. 10. 11.]
[ 4. 5. 6. 8. 9. 10. 12. 13. 14.]
[ 5. 6. 7. 9. 10. 11. 13. 14. 15.]]
[[ 0. 5. 10.]
[ 1. 6. 11.]
[ 4. 9. 14.]
[ 5. 10. 15.]]
性能不是很好,特别是考虑到我调用 im2col_sliding(big_matrix, (8, 8))
(62001 列)还是 im2col_sliding(big_matrix, (8, 8), 10)
(6201 列;只保留每 10 个向量)将花费相同的时间[其中 big_matrix 的大小为 256 x 256]。
我正在寻找任何可以加快速度的想法。
我不认为你能做得更好。显然,您必须 运行 一个大小为
的循环
cols - block_size[1] * rows - block_size[0]
但是您在示例中使用的是 3、3 补丁,而不是 2、2。
方法 #1
我们可以使用一些 broadcasting
here to get all the indices of all those sliding windows in one go and thus with indexing achieve a vectorized solution
. This is inspired by Efficient Implementation of im2col and col2im
.
这是实现 -
def im2col_sliding_broadcasting(A, BSZ, stepsize=1):
# Parameters
M,N = A.shape
col_extent = N - BSZ[1] + 1
row_extent = M - BSZ[0] + 1
# Get Starting block indices
start_idx = np.arange(BSZ[0])[:,None]*N + np.arange(BSZ[1])
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[:,None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
return np.take (A,start_idx.ravel()[:,None] + offset_idx.ravel()[::stepsize])
方法 #2
使用新获得的 NumPy array strides
知识让我们创建这样的滑动 windows,我们将有另一个有效的解决方案 -
def im2col_sliding_strided(A, BSZ, stepsize=1):
# Parameters
m,n = A.shape
s0, s1 = A.strides
nrows = m-BSZ[0]+1
ncols = n-BSZ[1]+1
shp = BSZ[0],BSZ[1],nrows,ncols
strd = s0,s1,s0,s1
out_view = np.lib.stride_tricks.as_strided(A, shape=shp, strides=strd)
return out_view.reshape(BSZ[0]*BSZ[1],-1)[:,::stepsize]
方法 #3
之前方法中列出的跨步方法已合并到 scikit-image
module 中以减少混乱,例如 -
from skimage.util import view_as_windows as viewW
def im2col_sliding_strided_v2(A, BSZ, stepsize=1):
return viewW(A, (BSZ[0],BSZ[1])).reshape(-1,BSZ[0]*BSZ[1]).T[:,::stepsize]
样品运行 -
In [106]: a # Input array
Out[106]:
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19]])
In [107]: im2col_sliding_broadcasting(a, (2,3))
Out[107]:
array([[ 0, 1, 2, 5, 6, 7, 10, 11, 12],
[ 1, 2, 3, 6, 7, 8, 11, 12, 13],
[ 2, 3, 4, 7, 8, 9, 12, 13, 14],
[ 5, 6, 7, 10, 11, 12, 15, 16, 17],
[ 6, 7, 8, 11, 12, 13, 16, 17, 18],
[ 7, 8, 9, 12, 13, 14, 17, 18, 19]])
In [108]: im2col_sliding_broadcasting(a, (2,3), stepsize=2)
Out[108]:
array([[ 0, 2, 6, 10, 12],
[ 1, 3, 7, 11, 13],
[ 2, 4, 8, 12, 14],
[ 5, 7, 11, 15, 17],
[ 6, 8, 12, 16, 18],
[ 7, 9, 13, 17, 19]])
运行时测试
In [183]: a = np.random.randint(0,255,(1024,1024))
In [184]: %timeit im2col_sliding(img, (8,8), skip=1)
...: %timeit im2col_sliding_broadcasting(img, (8,8), stepsize=1)
...: %timeit im2col_sliding_strided(img, (8,8), stepsize=1)
...: %timeit im2col_sliding_strided_v2(img, (8,8), stepsize=1)
...:
1 loops, best of 3: 1.29 s per loop
1 loops, best of 3: 226 ms per loop
10 loops, best of 3: 84.5 ms per loop
10 loops, best of 3: 111 ms per loop
In [185]: %timeit im2col_sliding(img, (8,8), skip=4)
...: %timeit im2col_sliding_broadcasting(img, (8,8), stepsize=4)
...: %timeit im2col_sliding_strided(img, (8,8), stepsize=4)
...: %timeit im2col_sliding_strided_v2(img, (8,8), stepsize=4)
...:
1 loops, best of 3: 1.31 s per loop
10 loops, best of 3: 104 ms per loop
10 loops, best of 3: 84.4 ms per loop
10 loops, best of 3: 109 ms per loop
在 16x
附近,使用 strided 方法比原来的 loopy 版本提速!
为了在不同的图像通道上滑动window,我们可以使用Divakar@提供的代码的更新版本,即
import numpy as np
A = np.random.randint(0,9,(2,4,4)) # Sample input array
# Sample blocksize (rows x columns)
B = [2,2]
skip=[2,2]
# Parameters
D,M,N = A.shape
col_extent = N - B[1] + 1
row_extent = M - B[0] + 1
# Get Starting block indices
start_idx = np.arange(B[0])[:,None]*N + np.arange(B[1])
# Generate Depth indeces
didx=M*N*np.arange(D)
start_idx=(didx[:,None]+start_idx.ravel()).reshape((-1,B[0],B[1]))
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[:,None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
out = np.take (A,start_idx.ravel()[:,None] + offset_idx[::skip[0],::skip[1]].ravel())
测试
示例 运行
A=
[[[6 2 8 5]
[6 4 7 6]
[8 6 5 2]
[3 1 3 7]]
[[6 0 4 3]
[7 6 4 6]
[2 6 7 1]
[7 6 7 7]]]
out=
[6 8 8 5]
[2 5 6 2]
[6 7 3 3]
[4 6 1 7]
[6 4 2 7]
[0 3 6 1]
[7 4 7 7]
[6 6 6 7]
您还可以对 M Eliya 的 添加进一步的优化(虽然不是那么重要)
而不是 "applying" 在最后跳过,你可以在生成偏移数组时应用它,所以而不是:
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[:,None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
out = np.take (A,start_idx.ravel()[:,None] + offset_idx[::skip[0],::skip[1]].ravel())
您可以使用 numpy 的 arange 函数的 step 参数添加跳过:
# Get offsetted indices across the height and width of input array and add skips
offset_idx = np.arange(row_extent, step=skip[0])[:, None] * N + np.arange(col_extent, step=skip[1])
之后只需添加不带 [::] 索引的偏移数组
# Get all actual indices & index into input array for final output
out = np.take(A, start_idx.ravel()[:, None] + offset_idx.ravel())
对于较小的跳过值,它几乎不会节省任何时间:
In[25]:
A = np.random.randint(0,9,(3, 1024, 1024))
B = [2, 2]
skip = [2, 2]
In[26]: %timeit im2col(A, B, skip)
10 loops, best of 3: 19.7 ms per loop
In[27]: %timeit im2col_optimized(A, B, skip)
100 loops, best of 3: 17.5 ms per loop
然而,使用较大的跳过值可以节省更多时间:
In[28]: skip = [10, 10]
In[29]: %timeit im2col(A, B, skip)
100 loops, best of 3: 3.85 ms per loop
In[30]: %timeit im2col_optimized(A, B, skip)
1000 loops, best of 3: 1.02 ms per loop
A = np.random.randint(0,9,(3, 2000, 2000))
B = [10, 10]
skip = [10, 10]
In[43]: %timeit im2col(A, B, skip)
10 loops, best of 3: 87.8 ms per loop
In[44]: %timeit im2col_optimized(A, B, skip)
10 loops, best of 3: 76.3 ms per loop
为了进一步提高性能(例如在卷积上),我们还可以使用基于扩展代码的批处理实现,由 M Elyia@ 提供,即
import numpy as np
A = np.arange(3*1*4*4).reshape(3,1,4,4)+1 # 3 Sample input array with 1 channel
B = [2,2] # Sample blocksize (rows x columns)
skip = [2,2]
# Parameters
batch, D,M,N = A.shape
col_extent = N - B[1] + 1
row_extent = M - B[0] + 1
# Get batch block indices
batch_idx = np.arange(batch)[:, None, None] * D * M * N
# Get Starting block indices
start_idx = np.arange(B[0])[None, :,None]*N + np.arange(B[1])
# Generate Depth indeces
didx=M*N*np.arange(D)
start_idx=(didx[None, :, None]+start_idx.ravel()).reshape((-1,B[0],B[1]))
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[None, :, None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
act_idx = (batch_idx +
start_idx.ravel()[None, :, None] +
offset_idx[:,::skip[0],::skip[1]].ravel())
out = np.take (A, act_idx)
测试样本运行:
A =
[[[[ 1 2 3 4]
[ 5 6 7 8]
[ 9 10 11 12]
[13 14 15 16]]]
[[[17 18 19 20]
[21 22 23 24]
[25 26 27 28]
[29 30 31 32]]]
[[[33 34 35 36]
[37 38 39 40]
[41 42 43 44]
[45 46 47 48]]]]
out =
[[[ 1 2 3 9 10 11]
[ 2 3 4 10 11 12]
[ 5 6 7 13 14 15]
[ 6 7 8 14 15 16]]
[[17 18 19 25 26 27]
[18 19 20 26 27 28]
[21 22 23 29 30 31]
[22 23 24 30 31 32]]
[[33 34 35 41 42 43]
[34 35 36 42 43 44]
[37 38 39 45 46 47]
[38 39 40 46 47 48]]]
我使用 Numba JIT 编译器实现了快速解决方案。根据块大小和跳过大小,它提供从 5.67x
到 3597x
的加速。
Speedup 表示 numba 算法比原始算法快多少倍,例如20x
的加速意味着如果原始算法采用 200ms
那么快速 numba 算法采用 10ms
.
我的代码需要通过 python -m pip install numpy numba timerit matplotlib
.
安装一次以下 pip 模块
接下来是定位代码,然后是加速图,然后是时间测量的控制台输出。
import numpy as np
# ----- Original Implementation -----
def im2col_sliding(image, block_size, skip = 1):
rows, cols = image.shape
horz_blocks = cols - block_size[1] + 1
vert_blocks = rows - block_size[0] + 1
if vert_blocks <= 0 or horz_blocks <= 0:
return np.zeros((block_size[0] * block_size[1], 0), dtype = image.dtype)
output_vectors = np.zeros((block_size[0] * block_size[1], horz_blocks * vert_blocks), dtype = image.dtype)
itr = 0
for v_b in range(vert_blocks):
for h_b in range(horz_blocks):
output_vectors[:, itr] = image[v_b: v_b + block_size[0], h_b: h_b + block_size[1]].ravel()
itr += 1
return output_vectors[:, ::skip]
# ----- Fast Numba Implementation -----
import numba
@numba.njit(cache = True)
def im2col_sliding_numba(image, block_size, skip = 1):
assert skip >= 1
rows, cols = image.shape
horz_blocks = cols - block_size[1] + 1
vert_blocks = rows - block_size[0] + 1
if vert_blocks <= 0 or horz_blocks <= 0:
return np.zeros((block_size[0] * block_size[1], 0), dtype = image.dtype)
res = np.zeros((block_size[0] * block_size[1], (horz_blocks * vert_blocks + skip - 1) // skip), dtype = image.dtype)
itr, to_skip, v_b = 0, 0, 0
while True:
v_b += to_skip // horz_blocks
if v_b >= vert_blocks:
break
h_b_start = to_skip % horz_blocks
h_cnt = (horz_blocks - h_b_start + skip - 1) // skip
for i, h_b in zip(range(itr, itr + h_cnt), range(h_b_start, horz_blocks, skip)):
ii = 0
for iv in range(v_b, v_b + block_size[0]):
for ih in range(h_b, h_b + block_size[1]):
res[ii, i] = image[iv, ih]
ii += 1
to_skip = skip - (horz_blocks - h_b_start - skip * (h_cnt - 1))
itr += h_cnt
v_b += 1
assert itr == res.shape[1]#, (itr, res.shape)
return res
# ----- Testing -----
from timerit import Timerit
Timerit._default_asciimode = True
side = 256
a = np.random.randint(0, 256, (side, side), dtype = np.uint8)
stats = []
for block_size in [16, 8, 4, 2, 1]:
for skip_size in [1, 2, 5, 11, 23]:
print(f'block_size {block_size} skip_size {skip_size}', flush = True)
for ifn, f in enumerate([im2col_sliding, im2col_sliding_numba]):
print(f'{f.__name__}: ', end = '', flush = True)
tim = Timerit(num = 3, verbose = 1)
for i, t in enumerate(tim):
if i == 0 and ifn == 1:
f(a, (block_size, block_size), skip_size)
with t:
r = f(a, (block_size, block_size), skip_size)
rt = tim.mean()
if ifn == 0:
bt, ba = rt, r
else:
assert np.array_equal(ba, r)
print(f'speedup {round(bt / rt, 2)}x')
stats.append({
'block_size': block_size,
'skip_size': skip_size,
'speedup': bt / rt,
})
stats = sorted(stats, key = lambda e: e['speedup'])
import math, matplotlib, matplotlib.pyplot as plt
x = np.arange(len(stats))
y = np.array([e['speedup'] for e in stats])
plt.rcParams['figure.figsize'] = (12.8, 7.2)
for scale in ['linear', 'log']:
plt.clf()
plt.xlabel('iteration')
plt.ylabel(f'speedup_{scale}')
plt.yscale(scale)
plt.scatter(x, y, marker = '.')
for i in range(x.size):
plt.annotate(
(f"b{str(stats[i]['block_size']).zfill(2)}s{str(stats[i]['skip_size']).zfill(2)}\n" +
f"x{round(stats[i]['speedup'], 2 if stats[i]['speedup'] < 100 else 1 if stats[i]['speedup'] < 1000 else None)}"),
(x[i], y[i]), fontsize = 'small',
)
plt.subplots_adjust(left = 0.055, right = 0.99, bottom = 0.08, top = 0.99)
plt.xlim(left = -0.1)
if scale == 'linear':
ymin, ymax = np.amin(y), np.amax(y)
plt.ylim((ymin - (ymax - ymin) * 0.02, ymax + (ymax - ymin) * 0.05))
plt.yticks([ymin] + [e for e in plt.yticks()[0] if ymin + 0.01 < e < ymax - 0.01] + [ymax])
#plt.gca().get_yaxis().set_major_formatter(matplotlib.ticker.FormatStrFormatter('%.1f'))
plt.savefig(f'im2col_numba_{scale}.png', dpi = 150)
plt.show()
下一个图的迭代为 x
轴,加速为 y
轴,第一个图有 linear
y
轴,第二个图有 logarithmic
y
轴。此外,每个点都有标签 bXXsYYxZZ
,其中 XX
等于块大小,YY
等于跳过(步长)大小,ZZ
等于加速。
线性图:
对数图:
控制台输出:
block_size 16 skip_size 1
im2col_sliding: Timed best=549.069 ms, mean=549.069 +- 0.0 ms
im2col_sliding_numba: Timed best=96.841 ms, mean=96.841 +- 0.0 ms
speedup 5.67x
block_size 16 skip_size 2
im2col_sliding: Timed best=559.396 ms, mean=559.396 +- 0.0 ms
im2col_sliding_numba: Timed best=71.132 ms, mean=71.132 +- 0.0 ms
speedup 7.86x
block_size 16 skip_size 5
im2col_sliding: Timed best=561.030 ms, mean=561.030 +- 0.0 ms
im2col_sliding_numba: Timed best=15.000 ms, mean=15.000 +- 0.0 ms
speedup 37.4x
block_size 16 skip_size 11
im2col_sliding: Timed best=559.045 ms, mean=559.045 +- 0.0 ms
im2col_sliding_numba: Timed best=6.719 ms, mean=6.719 +- 0.0 ms
speedup 83.21x
block_size 16 skip_size 23
im2col_sliding: Timed best=562.462 ms, mean=562.462 +- 0.0 ms
im2col_sliding_numba: Timed best=2.514 ms, mean=2.514 +- 0.0 ms
speedup 223.72x
block_size 8 skip_size 1
im2col_sliding: Timed best=373.790 ms, mean=373.790 +- 0.0 ms
im2col_sliding_numba: Timed best=17.441 ms, mean=17.441 +- 0.0 ms
speedup 21.43x
block_size 8 skip_size 2
im2col_sliding: Timed best=375.858 ms, mean=375.858 +- 0.0 ms
im2col_sliding_numba: Timed best=8.791 ms, mean=8.791 +- 0.0 ms
speedup 42.75x
block_size 8 skip_size 5
im2col_sliding: Timed best=376.767 ms, mean=376.767 +- 0.0 ms
im2col_sliding_numba: Timed best=3.115 ms, mean=3.115 +- 0.0 ms
speedup 120.94x
block_size 8 skip_size 11
im2col_sliding: Timed best=378.284 ms, mean=378.284 +- 0.0 ms
im2col_sliding_numba: Timed best=1.406 ms, mean=1.406 +- 0.0 ms
speedup 268.97x
block_size 8 skip_size 23
im2col_sliding: Timed best=376.268 ms, mean=376.268 +- 0.0 ms
im2col_sliding_numba: Timed best=661.404 us, mean=661.404 +- 0.0 us
speedup 568.89x
block_size 4 skip_size 1
im2col_sliding: Timed best=378.813 ms, mean=378.813 +- 0.0 ms
im2col_sliding_numba: Timed best=4.950 ms, mean=4.950 +- 0.0 ms
speedup 76.54x
block_size 4 skip_size 2
im2col_sliding: Timed best=377.620 ms, mean=377.620 +- 0.0 ms
im2col_sliding_numba: Timed best=2.119 ms, mean=2.119 +- 0.0 ms
speedup 178.24x
block_size 4 skip_size 5
im2col_sliding: Timed best=374.792 ms, mean=374.792 +- 0.0 ms
im2col_sliding_numba: Timed best=854.986 us, mean=854.986 +- 0.0 us
speedup 438.36x
block_size 4 skip_size 11
im2col_sliding: Timed best=373.296 ms, mean=373.296 +- 0.0 ms
im2col_sliding_numba: Timed best=415.028 us, mean=415.028 +- 0.0 us
speedup 899.45x
block_size 4 skip_size 23
im2col_sliding: Timed best=374.075 ms, mean=374.075 +- 0.0 ms
im2col_sliding_numba: Timed best=219.491 us, mean=219.491 +- 0.0 us
speedup 1704.28x
block_size 2 skip_size 1
im2col_sliding: Timed best=377.698 ms, mean=377.698 +- 0.0 ms
im2col_sliding_numba: Timed best=1.477 ms, mean=1.477 +- 0.0 ms
speedup 255.67x
block_size 2 skip_size 2
im2col_sliding: Timed best=378.155 ms, mean=378.155 +- 0.0 ms
im2col_sliding_numba: Timed best=841.298 us, mean=841.298 +- 0.0 us
speedup 449.49x
block_size 2 skip_size 5
im2col_sliding: Timed best=376.381 ms, mean=376.381 +- 0.0 ms
im2col_sliding_numba: Timed best=392.541 us, mean=392.541 +- 0.0 us
speedup 958.83x
block_size 2 skip_size 11
im2col_sliding: Timed best=374.720 ms, mean=374.720 +- 0.0 ms
im2col_sliding_numba: Timed best=193.093 us, mean=193.093 +- 0.0 us
speedup 1940.62x
block_size 2 skip_size 23
im2col_sliding: Timed best=378.092 ms, mean=378.092 +- 0.0 ms
im2col_sliding_numba: Timed best=105.101 us, mean=105.101 +- 0.0 us
speedup 3597.42x
block_size 1 skip_size 1
im2col_sliding: Timed best=203.410 ms, mean=203.410 +- 0.0 ms
im2col_sliding_numba: Timed best=686.335 us, mean=686.335 +- 0.0 us
speedup 296.37x
block_size 1 skip_size 2
im2col_sliding: Timed best=202.865 ms, mean=202.865 +- 0.0 ms
im2col_sliding_numba: Timed best=361.255 us, mean=361.255 +- 0.0 us
speedup 561.56x
block_size 1 skip_size 5
im2col_sliding: Timed best=200.929 ms, mean=200.929 +- 0.0 ms
im2col_sliding_numba: Timed best=164.740 us, mean=164.740 +- 0.0 us
speedup 1219.68x
block_size 1 skip_size 11
im2col_sliding: Timed best=202.163 ms, mean=202.163 +- 0.0 ms
im2col_sliding_numba: Timed best=96.791 us, mean=96.791 +- 0.0 us
speedup 2088.65x
block_size 1 skip_size 23
im2col_sliding: Timed best=202.492 ms, mean=202.492 +- 0.0 ms
im2col_sliding_numba: Timed best=64.527 us, mean=64.527 +- 0.0 us
speedup 3138.1x
问:如何加快速度?
下面是我对 Matlab 的 im2col 'sliding' 的实现,具有返回每第 n 列的附加功能。该函数获取一个图像(或任何 2 个 dim 数组)并从左到右、从上到下滑动,挑选出每个给定大小的重叠子图像,并返回一个列为子图像的数组。
import numpy as np
def im2col_sliding(image, block_size, skip=1):
rows, cols = image.shape
horz_blocks = cols - block_size[1] + 1
vert_blocks = rows - block_size[0] + 1
output_vectors = np.zeros((block_size[0] * block_size[1], horz_blocks * vert_blocks))
itr = 0
for v_b in xrange(vert_blocks):
for h_b in xrange(horz_blocks):
output_vectors[:, itr] = image[v_b: v_b + block_size[0], h_b: h_b + block_size[1]].ravel()
itr += 1
return output_vectors[:, ::skip]
示例:
a = np.arange(16).reshape(4, 4)
print a
print im2col_sliding(a, (2, 2)) # return every overlapping 2x2 patch
print im2col_sliding(a, (2, 2), 4) # return every 4th vector
returns:
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]
[12 13 14 15]]
[[ 0. 1. 2. 4. 5. 6. 8. 9. 10.]
[ 1. 2. 3. 5. 6. 7. 9. 10. 11.]
[ 4. 5. 6. 8. 9. 10. 12. 13. 14.]
[ 5. 6. 7. 9. 10. 11. 13. 14. 15.]]
[[ 0. 5. 10.]
[ 1. 6. 11.]
[ 4. 9. 14.]
[ 5. 10. 15.]]
性能不是很好,特别是考虑到我调用 im2col_sliding(big_matrix, (8, 8))
(62001 列)还是 im2col_sliding(big_matrix, (8, 8), 10)
(6201 列;只保留每 10 个向量)将花费相同的时间[其中 big_matrix 的大小为 256 x 256]。
我正在寻找任何可以加快速度的想法。
我不认为你能做得更好。显然,您必须 运行 一个大小为
的循环cols - block_size[1] * rows - block_size[0]
但是您在示例中使用的是 3、3 补丁,而不是 2、2。
方法 #1
我们可以使用一些 broadcasting
here to get all the indices of all those sliding windows in one go and thus with indexing achieve a vectorized solution
. This is inspired by Efficient Implementation of im2col and col2im
.
这是实现 -
def im2col_sliding_broadcasting(A, BSZ, stepsize=1):
# Parameters
M,N = A.shape
col_extent = N - BSZ[1] + 1
row_extent = M - BSZ[0] + 1
# Get Starting block indices
start_idx = np.arange(BSZ[0])[:,None]*N + np.arange(BSZ[1])
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[:,None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
return np.take (A,start_idx.ravel()[:,None] + offset_idx.ravel()[::stepsize])
方法 #2
使用新获得的 NumPy array strides
知识让我们创建这样的滑动 windows,我们将有另一个有效的解决方案 -
def im2col_sliding_strided(A, BSZ, stepsize=1):
# Parameters
m,n = A.shape
s0, s1 = A.strides
nrows = m-BSZ[0]+1
ncols = n-BSZ[1]+1
shp = BSZ[0],BSZ[1],nrows,ncols
strd = s0,s1,s0,s1
out_view = np.lib.stride_tricks.as_strided(A, shape=shp, strides=strd)
return out_view.reshape(BSZ[0]*BSZ[1],-1)[:,::stepsize]
方法 #3
之前方法中列出的跨步方法已合并到 scikit-image
module 中以减少混乱,例如 -
from skimage.util import view_as_windows as viewW
def im2col_sliding_strided_v2(A, BSZ, stepsize=1):
return viewW(A, (BSZ[0],BSZ[1])).reshape(-1,BSZ[0]*BSZ[1]).T[:,::stepsize]
样品运行 -
In [106]: a # Input array
Out[106]:
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19]])
In [107]: im2col_sliding_broadcasting(a, (2,3))
Out[107]:
array([[ 0, 1, 2, 5, 6, 7, 10, 11, 12],
[ 1, 2, 3, 6, 7, 8, 11, 12, 13],
[ 2, 3, 4, 7, 8, 9, 12, 13, 14],
[ 5, 6, 7, 10, 11, 12, 15, 16, 17],
[ 6, 7, 8, 11, 12, 13, 16, 17, 18],
[ 7, 8, 9, 12, 13, 14, 17, 18, 19]])
In [108]: im2col_sliding_broadcasting(a, (2,3), stepsize=2)
Out[108]:
array([[ 0, 2, 6, 10, 12],
[ 1, 3, 7, 11, 13],
[ 2, 4, 8, 12, 14],
[ 5, 7, 11, 15, 17],
[ 6, 8, 12, 16, 18],
[ 7, 9, 13, 17, 19]])
运行时测试
In [183]: a = np.random.randint(0,255,(1024,1024))
In [184]: %timeit im2col_sliding(img, (8,8), skip=1)
...: %timeit im2col_sliding_broadcasting(img, (8,8), stepsize=1)
...: %timeit im2col_sliding_strided(img, (8,8), stepsize=1)
...: %timeit im2col_sliding_strided_v2(img, (8,8), stepsize=1)
...:
1 loops, best of 3: 1.29 s per loop
1 loops, best of 3: 226 ms per loop
10 loops, best of 3: 84.5 ms per loop
10 loops, best of 3: 111 ms per loop
In [185]: %timeit im2col_sliding(img, (8,8), skip=4)
...: %timeit im2col_sliding_broadcasting(img, (8,8), stepsize=4)
...: %timeit im2col_sliding_strided(img, (8,8), stepsize=4)
...: %timeit im2col_sliding_strided_v2(img, (8,8), stepsize=4)
...:
1 loops, best of 3: 1.31 s per loop
10 loops, best of 3: 104 ms per loop
10 loops, best of 3: 84.4 ms per loop
10 loops, best of 3: 109 ms per loop
在 16x
附近,使用 strided 方法比原来的 loopy 版本提速!
为了在不同的图像通道上滑动window,我们可以使用Divakar@
import numpy as np
A = np.random.randint(0,9,(2,4,4)) # Sample input array
# Sample blocksize (rows x columns)
B = [2,2]
skip=[2,2]
# Parameters
D,M,N = A.shape
col_extent = N - B[1] + 1
row_extent = M - B[0] + 1
# Get Starting block indices
start_idx = np.arange(B[0])[:,None]*N + np.arange(B[1])
# Generate Depth indeces
didx=M*N*np.arange(D)
start_idx=(didx[:,None]+start_idx.ravel()).reshape((-1,B[0],B[1]))
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[:,None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
out = np.take (A,start_idx.ravel()[:,None] + offset_idx[::skip[0],::skip[1]].ravel())
测试 示例 运行
A=
[[[6 2 8 5]
[6 4 7 6]
[8 6 5 2]
[3 1 3 7]]
[[6 0 4 3]
[7 6 4 6]
[2 6 7 1]
[7 6 7 7]]]
out=
[6 8 8 5]
[2 5 6 2]
[6 7 3 3]
[4 6 1 7]
[6 4 2 7]
[0 3 6 1]
[7 4 7 7]
[6 6 6 7]
您还可以对 M Eliya 的
而不是 "applying" 在最后跳过,你可以在生成偏移数组时应用它,所以而不是:
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[:,None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
out = np.take (A,start_idx.ravel()[:,None] + offset_idx[::skip[0],::skip[1]].ravel())
您可以使用 numpy 的 arange 函数的 step 参数添加跳过:
# Get offsetted indices across the height and width of input array and add skips
offset_idx = np.arange(row_extent, step=skip[0])[:, None] * N + np.arange(col_extent, step=skip[1])
之后只需添加不带 [::] 索引的偏移数组
# Get all actual indices & index into input array for final output
out = np.take(A, start_idx.ravel()[:, None] + offset_idx.ravel())
对于较小的跳过值,它几乎不会节省任何时间:
In[25]:
A = np.random.randint(0,9,(3, 1024, 1024))
B = [2, 2]
skip = [2, 2]
In[26]: %timeit im2col(A, B, skip)
10 loops, best of 3: 19.7 ms per loop
In[27]: %timeit im2col_optimized(A, B, skip)
100 loops, best of 3: 17.5 ms per loop
然而,使用较大的跳过值可以节省更多时间:
In[28]: skip = [10, 10]
In[29]: %timeit im2col(A, B, skip)
100 loops, best of 3: 3.85 ms per loop
In[30]: %timeit im2col_optimized(A, B, skip)
1000 loops, best of 3: 1.02 ms per loop
A = np.random.randint(0,9,(3, 2000, 2000))
B = [10, 10]
skip = [10, 10]
In[43]: %timeit im2col(A, B, skip)
10 loops, best of 3: 87.8 ms per loop
In[44]: %timeit im2col_optimized(A, B, skip)
10 loops, best of 3: 76.3 ms per loop
为了进一步提高性能(例如在卷积上),我们还可以使用基于扩展代码的批处理实现,由 M Elyia@
import numpy as np
A = np.arange(3*1*4*4).reshape(3,1,4,4)+1 # 3 Sample input array with 1 channel
B = [2,2] # Sample blocksize (rows x columns)
skip = [2,2]
# Parameters
batch, D,M,N = A.shape
col_extent = N - B[1] + 1
row_extent = M - B[0] + 1
# Get batch block indices
batch_idx = np.arange(batch)[:, None, None] * D * M * N
# Get Starting block indices
start_idx = np.arange(B[0])[None, :,None]*N + np.arange(B[1])
# Generate Depth indeces
didx=M*N*np.arange(D)
start_idx=(didx[None, :, None]+start_idx.ravel()).reshape((-1,B[0],B[1]))
# Get offsetted indices across the height and width of input array
offset_idx = np.arange(row_extent)[None, :, None]*N + np.arange(col_extent)
# Get all actual indices & index into input array for final output
act_idx = (batch_idx +
start_idx.ravel()[None, :, None] +
offset_idx[:,::skip[0],::skip[1]].ravel())
out = np.take (A, act_idx)
测试样本运行:
A =
[[[[ 1 2 3 4]
[ 5 6 7 8]
[ 9 10 11 12]
[13 14 15 16]]]
[[[17 18 19 20]
[21 22 23 24]
[25 26 27 28]
[29 30 31 32]]]
[[[33 34 35 36]
[37 38 39 40]
[41 42 43 44]
[45 46 47 48]]]]
out =
[[[ 1 2 3 9 10 11]
[ 2 3 4 10 11 12]
[ 5 6 7 13 14 15]
[ 6 7 8 14 15 16]]
[[17 18 19 25 26 27]
[18 19 20 26 27 28]
[21 22 23 29 30 31]
[22 23 24 30 31 32]]
[[33 34 35 41 42 43]
[34 35 36 42 43 44]
[37 38 39 45 46 47]
[38 39 40 46 47 48]]]
我使用 Numba JIT 编译器实现了快速解决方案。根据块大小和跳过大小,它提供从 5.67x
到 3597x
的加速。
Speedup 表示 numba 算法比原始算法快多少倍,例如20x
的加速意味着如果原始算法采用 200ms
那么快速 numba 算法采用 10ms
.
我的代码需要通过 python -m pip install numpy numba timerit matplotlib
.
接下来是定位代码,然后是加速图,然后是时间测量的控制台输出。
import numpy as np
# ----- Original Implementation -----
def im2col_sliding(image, block_size, skip = 1):
rows, cols = image.shape
horz_blocks = cols - block_size[1] + 1
vert_blocks = rows - block_size[0] + 1
if vert_blocks <= 0 or horz_blocks <= 0:
return np.zeros((block_size[0] * block_size[1], 0), dtype = image.dtype)
output_vectors = np.zeros((block_size[0] * block_size[1], horz_blocks * vert_blocks), dtype = image.dtype)
itr = 0
for v_b in range(vert_blocks):
for h_b in range(horz_blocks):
output_vectors[:, itr] = image[v_b: v_b + block_size[0], h_b: h_b + block_size[1]].ravel()
itr += 1
return output_vectors[:, ::skip]
# ----- Fast Numba Implementation -----
import numba
@numba.njit(cache = True)
def im2col_sliding_numba(image, block_size, skip = 1):
assert skip >= 1
rows, cols = image.shape
horz_blocks = cols - block_size[1] + 1
vert_blocks = rows - block_size[0] + 1
if vert_blocks <= 0 or horz_blocks <= 0:
return np.zeros((block_size[0] * block_size[1], 0), dtype = image.dtype)
res = np.zeros((block_size[0] * block_size[1], (horz_blocks * vert_blocks + skip - 1) // skip), dtype = image.dtype)
itr, to_skip, v_b = 0, 0, 0
while True:
v_b += to_skip // horz_blocks
if v_b >= vert_blocks:
break
h_b_start = to_skip % horz_blocks
h_cnt = (horz_blocks - h_b_start + skip - 1) // skip
for i, h_b in zip(range(itr, itr + h_cnt), range(h_b_start, horz_blocks, skip)):
ii = 0
for iv in range(v_b, v_b + block_size[0]):
for ih in range(h_b, h_b + block_size[1]):
res[ii, i] = image[iv, ih]
ii += 1
to_skip = skip - (horz_blocks - h_b_start - skip * (h_cnt - 1))
itr += h_cnt
v_b += 1
assert itr == res.shape[1]#, (itr, res.shape)
return res
# ----- Testing -----
from timerit import Timerit
Timerit._default_asciimode = True
side = 256
a = np.random.randint(0, 256, (side, side), dtype = np.uint8)
stats = []
for block_size in [16, 8, 4, 2, 1]:
for skip_size in [1, 2, 5, 11, 23]:
print(f'block_size {block_size} skip_size {skip_size}', flush = True)
for ifn, f in enumerate([im2col_sliding, im2col_sliding_numba]):
print(f'{f.__name__}: ', end = '', flush = True)
tim = Timerit(num = 3, verbose = 1)
for i, t in enumerate(tim):
if i == 0 and ifn == 1:
f(a, (block_size, block_size), skip_size)
with t:
r = f(a, (block_size, block_size), skip_size)
rt = tim.mean()
if ifn == 0:
bt, ba = rt, r
else:
assert np.array_equal(ba, r)
print(f'speedup {round(bt / rt, 2)}x')
stats.append({
'block_size': block_size,
'skip_size': skip_size,
'speedup': bt / rt,
})
stats = sorted(stats, key = lambda e: e['speedup'])
import math, matplotlib, matplotlib.pyplot as plt
x = np.arange(len(stats))
y = np.array([e['speedup'] for e in stats])
plt.rcParams['figure.figsize'] = (12.8, 7.2)
for scale in ['linear', 'log']:
plt.clf()
plt.xlabel('iteration')
plt.ylabel(f'speedup_{scale}')
plt.yscale(scale)
plt.scatter(x, y, marker = '.')
for i in range(x.size):
plt.annotate(
(f"b{str(stats[i]['block_size']).zfill(2)}s{str(stats[i]['skip_size']).zfill(2)}\n" +
f"x{round(stats[i]['speedup'], 2 if stats[i]['speedup'] < 100 else 1 if stats[i]['speedup'] < 1000 else None)}"),
(x[i], y[i]), fontsize = 'small',
)
plt.subplots_adjust(left = 0.055, right = 0.99, bottom = 0.08, top = 0.99)
plt.xlim(left = -0.1)
if scale == 'linear':
ymin, ymax = np.amin(y), np.amax(y)
plt.ylim((ymin - (ymax - ymin) * 0.02, ymax + (ymax - ymin) * 0.05))
plt.yticks([ymin] + [e for e in plt.yticks()[0] if ymin + 0.01 < e < ymax - 0.01] + [ymax])
#plt.gca().get_yaxis().set_major_formatter(matplotlib.ticker.FormatStrFormatter('%.1f'))
plt.savefig(f'im2col_numba_{scale}.png', dpi = 150)
plt.show()
下一个图的迭代为 x
轴,加速为 y
轴,第一个图有 linear
y
轴,第二个图有 logarithmic
y
轴。此外,每个点都有标签 bXXsYYxZZ
,其中 XX
等于块大小,YY
等于跳过(步长)大小,ZZ
等于加速。
线性图:
对数图:
控制台输出:
block_size 16 skip_size 1
im2col_sliding: Timed best=549.069 ms, mean=549.069 +- 0.0 ms
im2col_sliding_numba: Timed best=96.841 ms, mean=96.841 +- 0.0 ms
speedup 5.67x
block_size 16 skip_size 2
im2col_sliding: Timed best=559.396 ms, mean=559.396 +- 0.0 ms
im2col_sliding_numba: Timed best=71.132 ms, mean=71.132 +- 0.0 ms
speedup 7.86x
block_size 16 skip_size 5
im2col_sliding: Timed best=561.030 ms, mean=561.030 +- 0.0 ms
im2col_sliding_numba: Timed best=15.000 ms, mean=15.000 +- 0.0 ms
speedup 37.4x
block_size 16 skip_size 11
im2col_sliding: Timed best=559.045 ms, mean=559.045 +- 0.0 ms
im2col_sliding_numba: Timed best=6.719 ms, mean=6.719 +- 0.0 ms
speedup 83.21x
block_size 16 skip_size 23
im2col_sliding: Timed best=562.462 ms, mean=562.462 +- 0.0 ms
im2col_sliding_numba: Timed best=2.514 ms, mean=2.514 +- 0.0 ms
speedup 223.72x
block_size 8 skip_size 1
im2col_sliding: Timed best=373.790 ms, mean=373.790 +- 0.0 ms
im2col_sliding_numba: Timed best=17.441 ms, mean=17.441 +- 0.0 ms
speedup 21.43x
block_size 8 skip_size 2
im2col_sliding: Timed best=375.858 ms, mean=375.858 +- 0.0 ms
im2col_sliding_numba: Timed best=8.791 ms, mean=8.791 +- 0.0 ms
speedup 42.75x
block_size 8 skip_size 5
im2col_sliding: Timed best=376.767 ms, mean=376.767 +- 0.0 ms
im2col_sliding_numba: Timed best=3.115 ms, mean=3.115 +- 0.0 ms
speedup 120.94x
block_size 8 skip_size 11
im2col_sliding: Timed best=378.284 ms, mean=378.284 +- 0.0 ms
im2col_sliding_numba: Timed best=1.406 ms, mean=1.406 +- 0.0 ms
speedup 268.97x
block_size 8 skip_size 23
im2col_sliding: Timed best=376.268 ms, mean=376.268 +- 0.0 ms
im2col_sliding_numba: Timed best=661.404 us, mean=661.404 +- 0.0 us
speedup 568.89x
block_size 4 skip_size 1
im2col_sliding: Timed best=378.813 ms, mean=378.813 +- 0.0 ms
im2col_sliding_numba: Timed best=4.950 ms, mean=4.950 +- 0.0 ms
speedup 76.54x
block_size 4 skip_size 2
im2col_sliding: Timed best=377.620 ms, mean=377.620 +- 0.0 ms
im2col_sliding_numba: Timed best=2.119 ms, mean=2.119 +- 0.0 ms
speedup 178.24x
block_size 4 skip_size 5
im2col_sliding: Timed best=374.792 ms, mean=374.792 +- 0.0 ms
im2col_sliding_numba: Timed best=854.986 us, mean=854.986 +- 0.0 us
speedup 438.36x
block_size 4 skip_size 11
im2col_sliding: Timed best=373.296 ms, mean=373.296 +- 0.0 ms
im2col_sliding_numba: Timed best=415.028 us, mean=415.028 +- 0.0 us
speedup 899.45x
block_size 4 skip_size 23
im2col_sliding: Timed best=374.075 ms, mean=374.075 +- 0.0 ms
im2col_sliding_numba: Timed best=219.491 us, mean=219.491 +- 0.0 us
speedup 1704.28x
block_size 2 skip_size 1
im2col_sliding: Timed best=377.698 ms, mean=377.698 +- 0.0 ms
im2col_sliding_numba: Timed best=1.477 ms, mean=1.477 +- 0.0 ms
speedup 255.67x
block_size 2 skip_size 2
im2col_sliding: Timed best=378.155 ms, mean=378.155 +- 0.0 ms
im2col_sliding_numba: Timed best=841.298 us, mean=841.298 +- 0.0 us
speedup 449.49x
block_size 2 skip_size 5
im2col_sliding: Timed best=376.381 ms, mean=376.381 +- 0.0 ms
im2col_sliding_numba: Timed best=392.541 us, mean=392.541 +- 0.0 us
speedup 958.83x
block_size 2 skip_size 11
im2col_sliding: Timed best=374.720 ms, mean=374.720 +- 0.0 ms
im2col_sliding_numba: Timed best=193.093 us, mean=193.093 +- 0.0 us
speedup 1940.62x
block_size 2 skip_size 23
im2col_sliding: Timed best=378.092 ms, mean=378.092 +- 0.0 ms
im2col_sliding_numba: Timed best=105.101 us, mean=105.101 +- 0.0 us
speedup 3597.42x
block_size 1 skip_size 1
im2col_sliding: Timed best=203.410 ms, mean=203.410 +- 0.0 ms
im2col_sliding_numba: Timed best=686.335 us, mean=686.335 +- 0.0 us
speedup 296.37x
block_size 1 skip_size 2
im2col_sliding: Timed best=202.865 ms, mean=202.865 +- 0.0 ms
im2col_sliding_numba: Timed best=361.255 us, mean=361.255 +- 0.0 us
speedup 561.56x
block_size 1 skip_size 5
im2col_sliding: Timed best=200.929 ms, mean=200.929 +- 0.0 ms
im2col_sliding_numba: Timed best=164.740 us, mean=164.740 +- 0.0 us
speedup 1219.68x
block_size 1 skip_size 11
im2col_sliding: Timed best=202.163 ms, mean=202.163 +- 0.0 ms
im2col_sliding_numba: Timed best=96.791 us, mean=96.791 +- 0.0 us
speedup 2088.65x
block_size 1 skip_size 23
im2col_sliding: Timed best=202.492 ms, mean=202.492 +- 0.0 ms
im2col_sliding_numba: Timed best=64.527 us, mean=64.527 +- 0.0 us
speedup 3138.1x