Theano:如何有效地 undo/reverse 最大池化
Theano: how to efficiently undo/reverse max-pooling
我正在使用 Theano 0.7 创建一个 convolutional neural net which uses max-pooling(即通过仅保留局部最大值来缩小矩阵)。
为了 "undo" 或 "reverse" 最大池化步骤,一种方法是将最大值的位置存储为辅助数据,然后通过制作零的大数组并使用这些辅助位置将最大值放在适当的位置。
这是我目前的做法:
import numpy as np
import theano
import theano.tensor as T
minibatchsize = 2
numfilters = 3
numsamples = 4
upsampfactor = 5
# HERE is the function that I hope could be improved
def upsamplecode(encoded, auxpos):
shp = encoded.shape
upsampled = T.zeros((shp[0], shp[1], shp[2] * upsampfactor))
for whichitem in range(minibatchsize):
for whichfilt in range(numfilters):
upsampled = T.set_subtensor(upsampled[whichitem, whichfilt, auxpos[whichitem, whichfilt, :]], encoded[whichitem, whichfilt, :])
return upsampled
totalitems = minibatchsize * numfilters * numsamples
code = theano.shared(np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples)))
auxpos = np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples)) % upsampfactor # arbitrary positions within a bin
auxpos += (np.arange(4) * 5).reshape((1,1,-1)) # shifted to the actual temporal bin location
auxpos = theano.shared(auxpos.astype(np.int))
print "code:"
print code.get_value()
print "locations:"
print auxpos.get_value()
get_upsampled = theano.function([], upsamplecode(code, auxpos))
print "the un-pooled data:"
print get_upsampled()
(By the way, in this case I have a 3D tensor, and it's only the third axis that gets max-pooled. People who work with image data might expect to see two dimensions getting max-pooled.)
输出为:
code:
[[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
[[12 13 14 15]
[16 17 18 19]
[20 21 22 23]]]
locations:
[[[ 0 6 12 18]
[ 4 5 11 17]
[ 3 9 10 16]]
[[ 2 8 14 15]
[ 1 7 13 19]
[ 0 6 12 18]]]
the un-pooled data:
[[[ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 2. 0.
0. 0. 0. 0. 3. 0.]
[ 0. 0. 0. 0. 4. 5. 0. 0. 0. 0. 0. 6. 0. 0.
0. 0. 0. 7. 0. 0.]
[ 0. 0. 0. 8. 0. 0. 0. 0. 0. 9. 10. 0. 0. 0.
0. 0. 11. 0. 0. 0.]]
[[ 0. 0. 12. 0. 0. 0. 0. 0. 13. 0. 0. 0. 0. 0.
14. 15. 0. 0. 0. 0.]
[ 0. 16. 0. 0. 0. 0. 0. 17. 0. 0. 0. 0. 0. 18.
0. 0. 0. 0. 0. 19.]
[ 20. 0. 0. 0. 0. 0. 21. 0. 0. 0. 0. 0. 22. 0.
0. 0. 0. 0. 23. 0.]]]
这种方法 有效 但它是一个 瓶颈 ,占用了我计算机的大部分时间(我认为 set_subtensor 调用可能暗示 cpu<->gpu 数据复制)。那么:这可以更有效地实施吗?
我怀疑有一种方法可以将其表达为单个 set_subtensor()
调用,这可能会更快,但我不知道如何让张量索引正确广播。
更新:我想到了一种通过处理扁平张量在一次调用中完成的方法:
def upsamplecode2(encoded, auxpos):
shp = encoded.shape
upsampled = T.zeros((shp[0], shp[1], shp[2] * upsampfactor))
add_to_flattened_indices = theano.shared(np.array([ [[(y + z * numfilters) * numsamples * upsampfactor for x in range(numsamples)] for y in range(numfilters)] for z in range(minibatchsize)], dtype=theano.config.floatX).flatten(), name="add_to_flattened_indices")
upsampled = T.set_subtensor(upsampled.flatten()[T.cast(auxpos.flatten() + add_to_flattened_indices, 'int32')], encoded.flatten()).reshape(upsampled.shape)
return upsampled
get_upsampled2 = theano.function([], upsamplecode2(code, auxpos))
print "the un-pooled data v2:"
ups2 = get_upsampled2()
print ups2
然而,这在效率方面仍然不是很好,因为当我 运行 这个(添加到上面脚本的末尾)时,我发现 Cuda 库目前不能做整数索引有效操作:
ERROR (theano.gof.opt): Optimization failure due to: local_gpu_advanced_incsubtensor1
ERROR (theano.gof.opt): TRACEBACK:
ERROR (theano.gof.opt): Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/theano/gof/opt.py", line 1493, in process_node
replacements = lopt.transform(node)
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/opt.py", line 952, in local_gpu_advanced_incsubtensor1
gpu_y = gpu_from_host(y)
File "/usr/local/lib/python2.7/dist-packages/theano/gof/op.py", line 507, in __call__
node = self.make_node(*inputs, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/basic_ops.py", line 133, in make_node
dtype=x.dtype)()])
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/type.py", line 69, in __init__
(self.__class__.__name__, dtype, name))
TypeError: CudaNdarrayType only supports dtype float32 for now. Tried using dtype int64 for variable None
我不知道这样是不是更快,但可能更简洁一点。看看对你的情况是否有用。
import numpy as np
import theano
import theano.tensor as T
minibatchsize = 2
numfilters = 3
numsamples = 4
upsampfactor = 5
totalitems = minibatchsize * numfilters * numsamples
code = np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples))
auxpos = np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples)) % upsampfactor
auxpos += (np.arange(4) * 5).reshape((1,1,-1))
# first in numpy
shp = code.shape
upsampled_np = np.zeros((shp[0], shp[1], shp[2] * upsampfactor))
upsampled_np[np.arange(shp[0]).reshape(-1, 1, 1), np.arange(shp[1]).reshape(1, -1, 1), auxpos] = code
print "numpy output:"
print upsampled_np
# now the same idea in theano
encoded = T.tensor3()
positions = T.tensor3(dtype='int64')
shp = encoded.shape
upsampled = T.zeros((shp[0], shp[1], shp[2] * upsampfactor))
upsampled = T.set_subtensor(upsampled[T.arange(shp[0]).reshape((-1, 1, 1)), T.arange(shp[1]).reshape((1, -1, 1)), positions], encoded)
print "theano output:"
print upsampled.eval({encoded: code, positions: auxpos})
我正在使用 Theano 0.7 创建一个 convolutional neural net which uses max-pooling(即通过仅保留局部最大值来缩小矩阵)。
为了 "undo" 或 "reverse" 最大池化步骤,一种方法是将最大值的位置存储为辅助数据,然后通过制作零的大数组并使用这些辅助位置将最大值放在适当的位置。
这是我目前的做法:
import numpy as np
import theano
import theano.tensor as T
minibatchsize = 2
numfilters = 3
numsamples = 4
upsampfactor = 5
# HERE is the function that I hope could be improved
def upsamplecode(encoded, auxpos):
shp = encoded.shape
upsampled = T.zeros((shp[0], shp[1], shp[2] * upsampfactor))
for whichitem in range(minibatchsize):
for whichfilt in range(numfilters):
upsampled = T.set_subtensor(upsampled[whichitem, whichfilt, auxpos[whichitem, whichfilt, :]], encoded[whichitem, whichfilt, :])
return upsampled
totalitems = minibatchsize * numfilters * numsamples
code = theano.shared(np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples)))
auxpos = np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples)) % upsampfactor # arbitrary positions within a bin
auxpos += (np.arange(4) * 5).reshape((1,1,-1)) # shifted to the actual temporal bin location
auxpos = theano.shared(auxpos.astype(np.int))
print "code:"
print code.get_value()
print "locations:"
print auxpos.get_value()
get_upsampled = theano.function([], upsamplecode(code, auxpos))
print "the un-pooled data:"
print get_upsampled()
(By the way, in this case I have a 3D tensor, and it's only the third axis that gets max-pooled. People who work with image data might expect to see two dimensions getting max-pooled.)
输出为:
code:
[[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
[[12 13 14 15]
[16 17 18 19]
[20 21 22 23]]]
locations:
[[[ 0 6 12 18]
[ 4 5 11 17]
[ 3 9 10 16]]
[[ 2 8 14 15]
[ 1 7 13 19]
[ 0 6 12 18]]]
the un-pooled data:
[[[ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 2. 0.
0. 0. 0. 0. 3. 0.]
[ 0. 0. 0. 0. 4. 5. 0. 0. 0. 0. 0. 6. 0. 0.
0. 0. 0. 7. 0. 0.]
[ 0. 0. 0. 8. 0. 0. 0. 0. 0. 9. 10. 0. 0. 0.
0. 0. 11. 0. 0. 0.]]
[[ 0. 0. 12. 0. 0. 0. 0. 0. 13. 0. 0. 0. 0. 0.
14. 15. 0. 0. 0. 0.]
[ 0. 16. 0. 0. 0. 0. 0. 17. 0. 0. 0. 0. 0. 18.
0. 0. 0. 0. 0. 19.]
[ 20. 0. 0. 0. 0. 0. 21. 0. 0. 0. 0. 0. 22. 0.
0. 0. 0. 0. 23. 0.]]]
这种方法 有效 但它是一个 瓶颈 ,占用了我计算机的大部分时间(我认为 set_subtensor 调用可能暗示 cpu<->gpu 数据复制)。那么:这可以更有效地实施吗?
我怀疑有一种方法可以将其表达为单个 set_subtensor()
调用,这可能会更快,但我不知道如何让张量索引正确广播。
更新:我想到了一种通过处理扁平张量在一次调用中完成的方法:
def upsamplecode2(encoded, auxpos):
shp = encoded.shape
upsampled = T.zeros((shp[0], shp[1], shp[2] * upsampfactor))
add_to_flattened_indices = theano.shared(np.array([ [[(y + z * numfilters) * numsamples * upsampfactor for x in range(numsamples)] for y in range(numfilters)] for z in range(minibatchsize)], dtype=theano.config.floatX).flatten(), name="add_to_flattened_indices")
upsampled = T.set_subtensor(upsampled.flatten()[T.cast(auxpos.flatten() + add_to_flattened_indices, 'int32')], encoded.flatten()).reshape(upsampled.shape)
return upsampled
get_upsampled2 = theano.function([], upsamplecode2(code, auxpos))
print "the un-pooled data v2:"
ups2 = get_upsampled2()
print ups2
然而,这在效率方面仍然不是很好,因为当我 运行 这个(添加到上面脚本的末尾)时,我发现 Cuda 库目前不能做整数索引有效操作:
ERROR (theano.gof.opt): Optimization failure due to: local_gpu_advanced_incsubtensor1
ERROR (theano.gof.opt): TRACEBACK:
ERROR (theano.gof.opt): Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/theano/gof/opt.py", line 1493, in process_node
replacements = lopt.transform(node)
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/opt.py", line 952, in local_gpu_advanced_incsubtensor1
gpu_y = gpu_from_host(y)
File "/usr/local/lib/python2.7/dist-packages/theano/gof/op.py", line 507, in __call__
node = self.make_node(*inputs, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/basic_ops.py", line 133, in make_node
dtype=x.dtype)()])
File "/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/type.py", line 69, in __init__
(self.__class__.__name__, dtype, name))
TypeError: CudaNdarrayType only supports dtype float32 for now. Tried using dtype int64 for variable None
我不知道这样是不是更快,但可能更简洁一点。看看对你的情况是否有用。
import numpy as np
import theano
import theano.tensor as T
minibatchsize = 2
numfilters = 3
numsamples = 4
upsampfactor = 5
totalitems = minibatchsize * numfilters * numsamples
code = np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples))
auxpos = np.arange(totalitems).reshape((minibatchsize, numfilters, numsamples)) % upsampfactor
auxpos += (np.arange(4) * 5).reshape((1,1,-1))
# first in numpy
shp = code.shape
upsampled_np = np.zeros((shp[0], shp[1], shp[2] * upsampfactor))
upsampled_np[np.arange(shp[0]).reshape(-1, 1, 1), np.arange(shp[1]).reshape(1, -1, 1), auxpos] = code
print "numpy output:"
print upsampled_np
# now the same idea in theano
encoded = T.tensor3()
positions = T.tensor3(dtype='int64')
shp = encoded.shape
upsampled = T.zeros((shp[0], shp[1], shp[2] * upsampfactor))
upsampled = T.set_subtensor(upsampled[T.arange(shp[0]).reshape((-1, 1, 1)), T.arange(shp[1]).reshape((1, -1, 1)), positions], encoded)
print "theano output:"
print upsampled.eval({encoded: code, positions: auxpos})