如何优化这个 MaxPool2d 实现
How to optimize this MaxPool2d implementation
我做了一些 MaxPool2d 的实现(运行 正确,与 pytorch 相比)。在 mnist 数据集上进行测试时,此函数 (updateOutput) 需要很长时间才能完成。如何使用 numpy 优化此代码?
class MaxPool2d(Module):
def __init__(self, kernel_size):
super(MaxPool2d, self).__init__()
self.kernel_size = kernel_size
self.gradInput = None
def updateOutput(self, input):
#print("MaxPool updateOutput")
#start_time = time.time()
kernel = self.kernel_size
poolH = input.shape[2] // kernel
poolW = input.shape[3] // kernel
self.output = np.zeros((input.shape[0],
input.shape[1],
poolH,
poolW))
self.index = np.zeros((input.shape[0],
input.shape[1],
poolH,
poolW,
2),
dtype='int32')
for i in range(input.shape[0]):
for j in range(input.shape[1]):
for k in range(0, input.shape[2] - kernel+1, kernel):
for m in range(0, input.shape[3] - kernel+1, kernel):
M = input[i, j, k : k+kernel, m : m+kernel]
self.output[i, j, k // kernel, m // kernel] = M.max()
self.index[i, j, k // kernel, m // kernel] = np.array(np.unravel_index(M.argmax(), M.shape)) + np.array((k, m))
#print(f"time: {time.time() - start_time:.3f}s")
return self.output
输入形状 = (batch_size, n_input_channels, h, w)
输出形状 = (batch_size, n_output_channels, h // kern_size, w // kern_size)
为清楚起见,我通过删除批量大小和渠道维度简化了您的示例。
大部分时间花在M.max()
的计算上。我创建了基准函数 update_output_b
来使用常量数组执行此循环。
import time
import numpy as np
def timeit(cycles):
def timed(func):
def wrapper(*args, **kwargs):
start_t = time.time()
for _ in range(cycles):
func(*args, **kwargs)
t = (time.time() - start_t) / cycles
print(f'{func.__name__} mean execution time: {t:.3f}s')
return wrapper
return timed
@timeit(100)
def update_output_b(input, kernel):
ones = np.ones((kernel, kernel))
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
output = np.zeros((pool_h, pool_w))
for i in range(0, input.shape[0] - kernel + 1, kernel):
for j in range(0, input.shape[1] - kernel + 1, kernel):
output[i // kernel, j // kernel] = ones.max()
return output
in_arr = np.random.rand(3001, 200)
update_output_b(in_arr, 3)
它的输出是update_output_b mean execution time: 0.277s
因为它不使用 numpy 完全矢量化的操作。如果可能,您应该始终优先使用本机 numpy 函数而不是循环。
此外,使用输入数组的切片执行速度较慢,因为在大多数情况下访问连续内存速度更快。
@timeit(100)
def update_output_1(input, kernel):
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
output = np.zeros((pool_h, pool_w))
for i in range(0, input.shape[0] - kernel + 1, kernel):
for j in range(0, input.shape[1] - kernel + 1, kernel):
M = input[i : i + kernel, j : j + kernel]
output[i // kernel, j // kernel] = M.max()
return output
update_output_1(in_arr, 3)
代码 returns update_output_1 mean execution time: 0.332s
(与上一个相比增加了 55 毫秒)
我在下面添加了矢量化代码。它的运行速度快了约 20 倍 (update_output_2 mean execution time: 0.015s
),但它可能远非最佳。
@timeit(100)
def update_output_2(input, kernel):
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
input_h = pool_h * kernel
input_w = pool_w * kernel
# crop input
output = input[:input_h, :input_w]
# calculate max along second axis
output = output.reshape((-1, kernel))
output = output.max(axis=1)
# calculate max along first axis
output = output.reshape((pool_h, kernel, pool_w))
output = output.max(axis=1)
return output
update_output_2(in_arr, 3)
它分 3 个步骤生成输出:
- 将输入裁剪为可被内核整除的大小
- 沿第二个轴计算最大值(它减少了第一个轴中切片之间的偏移量)
- 计算第一个轴的最大值
编辑:
我添加了用于检索最大值索引的修改。但是,您应该检查索引算法,因为我只在随机数组上测试过它。
它在 ech window 中沿第二个轴计算 output_indices
,然后沿第二个轴使用 output_indices_selector
到 select 最大值。
def update_output_3(input, kernel):
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
input_h = pool_h * kernel
input_w = pool_w * kernel
# crop input
output = input[:input_h, :input_w]
# calculate max along second axis
output_tmp = output.reshape((-1, kernel))
output_indices = output_tmp.argmax(axis=1)
output_indices += np.arange(output_indices.shape[0]) * kernel
output_indices = np.unravel_index(output_indices, output.shape)
output_tmp = output[output_indices]
# calculate max along first axis
output_tmp = output_tmp.reshape((pool_h, kernel, pool_w))
output_indices_selector = (kernel * pool_w * np.arange(pool_h).reshape(pool_h, 1))
output_indices_selector = output_indices_selector.repeat(pool_w, axis=1)
output_indices_selector += pool_w * output_tmp.argmax(axis=1)
output_indices_selector += np.arange(pool_w)
output_indices_selector = output_indices_selector.flatten()
output_indices = (output_indices[0][output_indices_selector],
output_indices[1][output_indices_selector])
output = output[output_indices].reshape(pool_h, pool_w)
return output, output_indices
我做了一些 MaxPool2d 的实现(运行 正确,与 pytorch 相比)。在 mnist 数据集上进行测试时,此函数 (updateOutput) 需要很长时间才能完成。如何使用 numpy 优化此代码?
class MaxPool2d(Module):
def __init__(self, kernel_size):
super(MaxPool2d, self).__init__()
self.kernel_size = kernel_size
self.gradInput = None
def updateOutput(self, input):
#print("MaxPool updateOutput")
#start_time = time.time()
kernel = self.kernel_size
poolH = input.shape[2] // kernel
poolW = input.shape[3] // kernel
self.output = np.zeros((input.shape[0],
input.shape[1],
poolH,
poolW))
self.index = np.zeros((input.shape[0],
input.shape[1],
poolH,
poolW,
2),
dtype='int32')
for i in range(input.shape[0]):
for j in range(input.shape[1]):
for k in range(0, input.shape[2] - kernel+1, kernel):
for m in range(0, input.shape[3] - kernel+1, kernel):
M = input[i, j, k : k+kernel, m : m+kernel]
self.output[i, j, k // kernel, m // kernel] = M.max()
self.index[i, j, k // kernel, m // kernel] = np.array(np.unravel_index(M.argmax(), M.shape)) + np.array((k, m))
#print(f"time: {time.time() - start_time:.3f}s")
return self.output
输入形状 = (batch_size, n_input_channels, h, w)
输出形状 = (batch_size, n_output_channels, h // kern_size, w // kern_size)
为清楚起见,我通过删除批量大小和渠道维度简化了您的示例。
大部分时间花在M.max()
的计算上。我创建了基准函数 update_output_b
来使用常量数组执行此循环。
import time
import numpy as np
def timeit(cycles):
def timed(func):
def wrapper(*args, **kwargs):
start_t = time.time()
for _ in range(cycles):
func(*args, **kwargs)
t = (time.time() - start_t) / cycles
print(f'{func.__name__} mean execution time: {t:.3f}s')
return wrapper
return timed
@timeit(100)
def update_output_b(input, kernel):
ones = np.ones((kernel, kernel))
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
output = np.zeros((pool_h, pool_w))
for i in range(0, input.shape[0] - kernel + 1, kernel):
for j in range(0, input.shape[1] - kernel + 1, kernel):
output[i // kernel, j // kernel] = ones.max()
return output
in_arr = np.random.rand(3001, 200)
update_output_b(in_arr, 3)
它的输出是update_output_b mean execution time: 0.277s
因为它不使用 numpy 完全矢量化的操作。如果可能,您应该始终优先使用本机 numpy 函数而不是循环。
此外,使用输入数组的切片执行速度较慢,因为在大多数情况下访问连续内存速度更快。
@timeit(100)
def update_output_1(input, kernel):
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
output = np.zeros((pool_h, pool_w))
for i in range(0, input.shape[0] - kernel + 1, kernel):
for j in range(0, input.shape[1] - kernel + 1, kernel):
M = input[i : i + kernel, j : j + kernel]
output[i // kernel, j // kernel] = M.max()
return output
update_output_1(in_arr, 3)
代码 returns update_output_1 mean execution time: 0.332s
(与上一个相比增加了 55 毫秒)
我在下面添加了矢量化代码。它的运行速度快了约 20 倍 (update_output_2 mean execution time: 0.015s
),但它可能远非最佳。
@timeit(100)
def update_output_2(input, kernel):
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
input_h = pool_h * kernel
input_w = pool_w * kernel
# crop input
output = input[:input_h, :input_w]
# calculate max along second axis
output = output.reshape((-1, kernel))
output = output.max(axis=1)
# calculate max along first axis
output = output.reshape((pool_h, kernel, pool_w))
output = output.max(axis=1)
return output
update_output_2(in_arr, 3)
它分 3 个步骤生成输出:
- 将输入裁剪为可被内核整除的大小
- 沿第二个轴计算最大值(它减少了第一个轴中切片之间的偏移量)
- 计算第一个轴的最大值
编辑:
我添加了用于检索最大值索引的修改。但是,您应该检查索引算法,因为我只在随机数组上测试过它。
它在 ech window 中沿第二个轴计算 output_indices
,然后沿第二个轴使用 output_indices_selector
到 select 最大值。
def update_output_3(input, kernel):
pool_h = input.shape[0] // kernel
pool_w = input.shape[1] // kernel
input_h = pool_h * kernel
input_w = pool_w * kernel
# crop input
output = input[:input_h, :input_w]
# calculate max along second axis
output_tmp = output.reshape((-1, kernel))
output_indices = output_tmp.argmax(axis=1)
output_indices += np.arange(output_indices.shape[0]) * kernel
output_indices = np.unravel_index(output_indices, output.shape)
output_tmp = output[output_indices]
# calculate max along first axis
output_tmp = output_tmp.reshape((pool_h, kernel, pool_w))
output_indices_selector = (kernel * pool_w * np.arange(pool_h).reshape(pool_h, 1))
output_indices_selector = output_indices_selector.repeat(pool_w, axis=1)
output_indices_selector += pool_w * output_tmp.argmax(axis=1)
output_indices_selector += np.arange(pool_w)
output_indices_selector = output_indices_selector.flatten()
output_indices = (output_indices[0][output_indices_selector],
output_indices[1][output_indices_selector])
output = output[output_indices].reshape(pool_h, pool_w)
return output, output_indices