Can I (selectively) invert Theano gradients during backpropagation?
I would like to use the architecture proposed in the recent paper "Unsupervised Domain Adaptation by Backpropagation" within the Lasagne/Theano framework.
What makes this paper a bit unusual is that it includes a 'gradient reversal layer', which inverts the gradient during backpropagation:
(The arrows along the bottom of the figure are the backpropagated gradients.)
In the paper the authors claim that the approach "can be implemented using any deep learning package", and indeed they provide a version made in Caffe.
However, for various reasons I am using the Lasagne/Theano framework.
Is it possible to create such a gradient reversal layer in Lasagne/Theano? I have not seen any examples of applying a custom scalar transform to gradients like this. If it is possible, can I do it by creating a custom layer in Lasagne?
Here is a sketch implementation using plain Theano. It should be easy enough to integrate into Lasagne.
You need to create a custom op that acts as the identity in the forward pass but reverses the gradient in the backward pass.
Here is a suggestion for how it could be implemented. It is untested and I am not 100% sure I have understood everything correctly, so verify and fix as required.
import theano


class ReverseGradient(theano.gof.Op):
    """Identity in the forward pass; scales the gradient by -hp_lambda in the backward pass."""
    view_map = {0: [0]}

    __props__ = ('hp_lambda',)

    def __init__(self, hp_lambda):
        super(ReverseGradient, self).__init__()
        self.hp_lambda = hp_lambda

    def make_node(self, x):
        return theano.gof.graph.Apply(self, [x], [x.type.make_variable()])

    def perform(self, node, inputs, output_storage):
        xin, = inputs
        xout, = output_storage
        xout[0] = xin

    def grad(self, input, output_gradients):
        # Reverse (and scale) the gradient flowing back through this op.
        return [-self.hp_lambda * output_gradients[0]]
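As a quick sanity check (my own untested sketch, not part of the original answer), you can run the op through theano.grad and confirm that the forward pass is the identity while the backpropagated gradient comes back scaled by -hp_lambda:

import numpy
import theano
import theano.tensor as tt

x = tt.vector('x')
y = ReverseGradient(hp_lambda=1.0)(x)
g = theano.grad(y.sum(), x)  # gradient of a scalar cost through the op

f = theano.function([x], [y, g])
out, grad = f(numpy.asarray([1., 2., 3.], dtype=theano.config.floatX))
# Expected: out equals the input, grad is [-1., -1., -1.]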
Using the paper's notation and naming conventions, here is a simple, complete Theano implementation of the general model they propose.
import numpy
import theano
import theano.tensor as tt


def g_f(z, theta_f):
    # Feature extractor: a stack of tanh layers.
    for w_f, b_f in theta_f:
        z = tt.tanh(theano.dot(z, w_f) + b_f)
    return z


def g_y(z, theta_y):
    # Label predictor: tanh hidden layers followed by a softmax output.
    for w_y, b_y in theta_y[:-1]:
        z = tt.tanh(theano.dot(z, w_y) + b_y)
    w_y, b_y = theta_y[-1]
    z = tt.nnet.softmax(theano.dot(z, w_y) + b_y)
    return z


def g_d(z, theta_d):
    # Domain classifier: tanh hidden layers followed by a sigmoid output.
    for w_d, b_d in theta_d[:-1]:
        z = tt.tanh(theano.dot(z, w_d) + b_d)
    w_d, b_d = theta_d[-1]
    z = tt.nnet.sigmoid(theano.dot(z, w_d) + b_d)
    return z


def l_y(z, y):
    # Label prediction loss.
    return tt.nnet.categorical_crossentropy(z, y).mean()


def l_d(z, d):
    # Domain classification loss.
    return tt.nnet.binary_crossentropy(z, d).mean()


def mlp_parameters(input_size, layer_sizes):
    # Create shared weight matrices and bias vectors for an MLP.
    parameters = []
    previous_size = input_size
    for layer_size in layer_sizes:
        parameters.append((theano.shared(numpy.random.randn(previous_size, layer_size).astype(theano.config.floatX)),
                           theano.shared(numpy.zeros(layer_size, dtype=theano.config.floatX))))
        previous_size = layer_size
    return parameters, previous_size


def compile(input_size, f_layer_sizes, y_layer_sizes, d_layer_sizes, hp_lambda, hp_mu):
    r = ReverseGradient(hp_lambda)

    theta_f, f_size = mlp_parameters(input_size, f_layer_sizes)
    theta_y, _ = mlp_parameters(f_size, y_layer_sizes)
    theta_d, _ = mlp_parameters(f_size, d_layer_sizes)

    xs = tt.matrix('xs')
    xs.tag.test_value = numpy.random.randn(9, input_size).astype(theano.config.floatX)

    xt = tt.matrix('xt')
    xt.tag.test_value = numpy.random.randn(10, input_size).astype(theano.config.floatX)

    ys = tt.ivector('ys')
    ys.tag.test_value = numpy.random.randint(y_layer_sizes[-1], size=9).astype(numpy.int32)

    fs = g_f(xs, theta_f)

    # Total objective: label loss on source data plus domain losses on source
    # (domain 0) and target (domain 1) data, with the gradient reversal op r
    # sitting between the features and the domain classifier.
    e = l_y(g_y(fs, theta_y), ys) + l_d(g_d(r(fs), theta_d), 0) + l_d(g_d(r(g_f(xt, theta_f)), theta_d), 1)

    updates = [(p, p - hp_mu * theano.grad(e, p)) for theta in theta_f + theta_y + theta_d for p in theta]
    train = theano.function([xs, xt, ys], outputs=e, updates=updates)

    return train


def main():
    theano.config.compute_test_value = 'raise'
    numpy.random.seed(1)
    compile(input_size=2, f_layer_sizes=[3, 4], y_layer_sizes=[7, 8], d_layer_sizes=[5, 6], hp_lambda=.5, hp_mu=.01)


main()
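For completeness, here is a hypothetical usage sketch (my own addition, not part of the original answer; the batch data and shapes are made up) showing how the compiled train function could be called to take one gradient step on a batch of labelled source samples and unlabelled target samples:

# Untested sketch: build the training function directly and run one update.
train = compile(input_size=2, f_layer_sizes=[3, 4], y_layer_sizes=[7, 8],
                d_layer_sizes=[5, 6], hp_lambda=.5, hp_mu=.01)

xs_batch = numpy.random.randn(9, 2).astype(theano.config.floatX)   # source inputs
ys_batch = numpy.random.randint(8, size=9).astype(numpy.int32)     # source labels
xt_batch = numpy.random.randn(10, 2).astype(theano.config.floatX)  # target inputs (unlabelled)

loss = train(xs_batch, xt_batch, ys_batch)  # returns the combined objective e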
This is untested, but the following may allow the custom op above to be used as a Lasagne layer:
import lasagne


class ReverseGradientLayer(lasagne.layers.Layer):
    def __init__(self, incoming, hp_lambda, **kwargs):
        super(ReverseGradientLayer, self).__init__(incoming, **kwargs)
        self.op = ReverseGradient(hp_lambda)

    def get_output_for(self, input, **kwargs):
        # Apply the gradient-reversing op; the output shape is unchanged.
        return self.op(input)
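As a hypothetical usage example (my own sketch, not from the original answer; the layer sizes are arbitrary), the reversal layer would sit between the shared feature layers and the domain classifier branch, mirroring the paper's architecture:

l_in = lasagne.layers.InputLayer(shape=(None, 2))
l_features = lasagne.layers.DenseLayer(l_in, num_units=4,
                                       nonlinearity=lasagne.nonlinearities.tanh)

# The label predictor branch sees the features directly ...
l_label = lasagne.layers.DenseLayer(l_features, num_units=8,
                                    nonlinearity=lasagne.nonlinearities.softmax)

# ... while the domain classifier branch sees them through the reversal layer,
# so its gradient is flipped before reaching the shared feature layers.
l_reversed = ReverseGradientLayer(l_features, hp_lambda=0.5)
l_domain = lasagne.layers.DenseLayer(l_reversed, num_units=1,
                                     nonlinearity=lasagne.nonlinearities.sigmoid)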