在玩具示例(pet example)中实现自定义负对数似然损失时遇到问题
Trouble implementing custom Negative Log-Likelihood loss in pet example
我正在用一个玩具示例来优化一个全连接(Dense)层的参数:该层后接 softplus,输出负二项分布的参数。我的流程是:
1) 为负对数似然损失构建一个带有前向(forward)和后向(backward)方法的自定义类。它以分布参数和目标值作为输入,输出损失(即所建模的分布在目标值处的负对数似然):
from scipy.special import gamma, digamma, binom
from cntk.ops.functions import UserFunction
import numpy as np
class NegBin_Loss_Deep_NoSP(UserFunction):
    """Negative log-likelihood of a negative binomial distribution as a CNTK user function.

    The first input carries the distribution parameters ``(miu, alpha)`` and the
    second input carries the observed target. ``forward`` evaluates the negative
    log-likelihood of the target; ``backward`` supplies its derivatives with
    respect to ``miu`` and ``alpha``.

    Only the first sample of each argument is read (``arguments[...][0]``), so the
    function effectively handles a minibatch of size one.
    """

    def __init__(self, dis_params, target, name='NegBin_Loss_Deep_NoSP'):
        super(NegBin_Loss_Deep_NoSP, self).__init__([dis_params, target], name=name)

    def forward(self, arguments, device=None, outputs_to_retain=None):
        """Evaluate the negative log-likelihood of the target.

        Args:
            arguments: pair ``(dis_params, target)``; ``arguments[0][0]`` must hold
                ``[miu, alpha]`` and ``arguments[1][0]`` the target.
            device: unused.
            outputs_to_retain: unused.

        Returns:
            ``(state, loss)`` where ``state`` is the target (handed back to
            ``backward``) and ``loss`` is the negative log-likelihood.
        """
        # Imported locally so the class does not rely on names the file header
        # may not provide.
        from scipy.special import gammaln, xlogy

        # Cached on the instance because backward() needs them again.
        self.miu = arguments[0][0][0]
        self.alpha = arguments[0][0][1]
        target = arguments[1][0]

        inv_alpha = 1 / self.alpha
        alpha_miu = self.alpha * self.miu
        # Same likelihood as
        #   binom(t + 1/a - 1, t) * (1 / (1 + a*m))**(1/a) * (a*m / (1 + a*m))**t
        # but evaluated directly in log space: the raw product overflows or
        # underflows for large targets / small alpha and then np.log() yields
        # inf or nan. xlogy() keeps the t == 0 case equal to 0, matching x**0 == 1.
        log_ll = (gammaln(target + inv_alpha)
                  - gammaln(target + 1)
                  - gammaln(inv_alpha)
                  - inv_alpha * np.log1p(alpha_miu)
                  + xlogy(target, alpha_miu / (1 + alpha_miu)))
        # Loss is the negative log likelihood
        return target, -log_ll

    def backward(self, state, root_gradients, variables):
        """Back-propagate ``root_gradients`` to the distribution parameters.

        Args:
            state: the target returned as state by ``forward``.
            root_gradients: gradient flowing in from the function's output.
            variables: container of the inputs whose gradient is requested. When
                it supports item assignment (a dict), the entry for the parameter
                input is filled in; nothing is written for the target input.

        Returns:
            ``root_gradients`` multiplied by ``[d loss / d miu, d loss / d alpha]``.
        """
        target = state
        miu = self.miu
        alpha = self.alpha
        inv_alpha = 1 / alpha

        # d(-log L) / d miu
        d_miu = -(target - miu) / (alpha * (miu ** 2) + miu)
        # d(-log L) / d alpha; the digamma terms of the original expression are
        # factored out, the value is algebraically unchanged.
        digamma_diff = digamma(inv_alpha + target) - digamma(inv_alpha)
        d_alpha = -((miu * alpha + 1) * (np.log(miu * alpha + 1) - digamma_diff)
                    + (target - miu) * alpha) / (miu * (alpha ** 3) + (alpha ** 2))

        # ravel()[0] accepts both a scalar target and a one-element array target.
        gradient = np.array([np.ravel(d_miu)[0], np.ravel(d_alpha)[0]])
        # Chain rule: the value handed back must include the incoming gradient.
        scaled = root_gradients * gradient

        params_input = self.inputs[0]
        # Only the parameter input receives this gradient; writing it for the
        # target input as well would attach a gradient of the wrong meaning.
        if hasattr(variables, '__setitem__') and params_input in variables:
            variables[params_input] = scaled
        return scaled

    def infer_outputs(self):
        """Describe the output: target's shape and dynamic axes, parameters' dtype."""
        from cntk import output_variable

        return [output_variable(self.inputs[1].shape, self.inputs[0].dtype,
                                self.inputs[1].dynamic_axes)]

    @staticmethod
    def deserialize(inputs, name, state):
        """Rebuild the function from its two inputs; ``state`` is not used."""
        f = NegBin_Loss_Deep_NoSP(inputs[0], inputs[1], name)
        return f
2) 使用自定义脚本对前向和后向方法进行数值测试:实例化上面的类,传入一个输入数组,用前向和后向方法反复更新该数组,并验证损失在下降。这一步按预期工作。
def grad_clipper(grad_v, trunc_max=0.005, trunc_min=-0.005):
    """Clamp every entry of ``grad_v`` into ``[trunc_min, trunc_max]`` in place.

    Args:
        grad_v: mutable sequence of gradient components; it is modified.
        trunc_max: upper clipping bound.
        trunc_min: lower clipping bound; must be strictly below ``trunc_max``.

    Returns:
        The same ``grad_v`` object, after clipping.

    Raises:
        Exception: if ``trunc_min`` is not strictly below ``trunc_max``.
    """
    bounds_are_invalid = trunc_min >= trunc_max
    if bounds_are_invalid:
        raise Exception("lower clipping bound has to be lower than upper bound")

    for position, component in enumerate(grad_v):
        capped_above = min(component, trunc_max)
        grad_v[position] = max(capped_above, trunc_min)
    return grad_v
def convergence_test():
    """Drive NegBin_Loss_Deep_NoSP by hand and print how loss and parameters evolve.

    Each iteration perturbs the target with uniform noise, calls ``forward`` and
    ``backward`` on the loss object directly, clips the returned gradient with
    ``grad_clipper`` and moves the parameters by ``lr * gradient``. Loss, parameters
    and gradient are printed every 1000 iterations.
    """
    total_steps = 500000
    jitter = 0.15
    base_target = np.array([3.])  # [0.5, 0]
    params = [5, 0.5]
    v = C.input_variable(shape=(2,), needs_gradient=True)
    t = C.input_variable(shape=(1,))
    step_size = 0.0005
    loss_fn = NegBin_Loss_Deep_NoSP(v, t)

    for step in range(total_steps):
        # Relative perturbation of the target, at most +/- jitter / 2.
        noisy_target = (1 + (0.5 - np.random.rand()) * jitter) * base_target
        state, loss = loss_fn.forward([[params], noisy_target], [loss_fn.output], {loss_fn.output})
        if step % 1000 == 0:
            print("loss", loss)

        grad = grad_clipper(loss_fn.backward(state, np.ones_like(params), {v}))
        if step % 1000 == 0:
            print("params:", params, "gradient:", grad)

        # NOTE(review): the step is params + lr * grad; confirm the sign is the
        # intended direction for the gradient that backward() returns.
        params = np.sum([params, [component * step_size for component in grad]], axis=0)
        # Keep the second parameter (alpha) strictly positive.
        params[1] = max(params[1], 5e-6)


convergence_test()
3) 使用 CNTK 的 learner 搭建玩具示例模型来优化分布参数。全连接(Dense)层的参数始终没有被更新,我遗漏了什么?
import cntk as C
from cntk.layers import *
# Minimal network used to try gradient descent on the weights of one dense layer.
def model0(x):
    """Apply a 2-unit linear Dense layer to ``x`` and pass the result through softplus.

    The Dense layer is built under ``default_options`` with ``initial_state=0.1``,
    self-stabilization enabled and Glorot-uniform initialization.
    """
    layer_defaults = default_options(initial_state=0.1,
                                     enable_self_stabilization=True,
                                     init=glorot_uniform())
    with layer_defaults:
        linear_out = Dense(2, activation=None)(x)
    return C.ops.softplus(linear_out)
# Train model0 against the custom negative-binomial loss on one fixed sample and
# print the loss after every minibatch.
import time  # needed for time.time() below; no import of it is visible in this file

# input sequences
x = C.input_variable(shape=(2,))
t = C.input_variable(shape=(1,))
# create the model
m0 = model0(x)
# define the learning rate
learning_rate = 0.02
lr_schedule = C.learning_parameter_schedule(learning_rate)
# define loss and error function
loss = NegBin_Loss_Deep_NoSP(m0.output, t)
error = NegBin_Loss_Deep_NoSP(m0.output, t)
# use fsadagrad optimizer
momentum_schedule = C.momentum_schedule(0.9)
learner = C.fsadagrad(m0.parameters,
                      lr=lr_schedule,
                      momentum=momentum_schedule,
                      unit_gain=True)
trainer = C.Trainer(m0, (loss, error), [learner])
loss_summary = []
start = time.time()
x1 = np.array([3, 0.002], dtype=np.float32)
y1 = np.array([2.5], dtype=np.float32)
num_epochs = 10
# The original condition was `epoch % (10 / 10) == 0`, i.e. report on every epoch.
report_every = 1
for epoch in range(num_epochs):
    trainer.train_minibatch({x: x1, t: y1})
    if epoch % report_every == 0:
        training_loss = trainer.previous_minibatch_loss_average
        loss_summary.append(training_loss)
        print("epoch: {}, loss: {:.5f}".format(epoch, training_loss))
4) 按照 snowflake 的建议,使用 CNTK 原生的损失函数测试这个玩具示例。我改动了 3) 中的代码片段,引入了一个使用平方误差损失的类似测试。可以确认,这样模型确实在更新参数,损失也随训练下降。我认为这个测试把问题范围缩小到了我在 1)
中给出的自定义损失函数:
# Control experiment: same model and optimizer, but with CNTK's built-in squared
# error as loss and metric, printing the loss after every minibatch.
import time  # needed for time.time() below; no import of it is visible in this file

x = C.input_variable(shape=(2,))
t = C.input_variable(shape=(2,))  # -> Changed in 4)
# create the model
m0 = model0(x)
# the learning rate
learning_rate = 0.02
lr_schedule = C.learning_parameter_schedule(learning_rate)
# loss function
loss = C.squared_error(m0.output, t)  # -> Changed in 4)
error = C.squared_error(m0.output, t)  # -> Changed in 4)
# use fsadagrad optimizer
momentum_schedule = C.momentum_schedule(0.9)
learner = C.fsadagrad(m0.parameters,
                      lr=lr_schedule,
                      momentum=momentum_schedule,
                      unit_gain=True)
trainer = C.Trainer(m0, (loss, error), [learner])
loss_summary = []
start = time.time()
x1 = np.array([3, 0.002], dtype=np.float32)
y1 = np.array([2.5, 0.005], dtype=np.float32)  # -> Changed in 4)
num_epochs = 10
# The original condition was `epoch % (10 / 10) == 0`, i.e. report on every epoch.
report_every = 1
for epoch in range(num_epochs):
    trainer.train_minibatch({x: x1, t: y1})
    if epoch % report_every == 0:
        training_loss = trainer.previous_minibatch_loss_average
        loss_summary.append(training_loss)
        print("epoch: {}, loss: {:.5f}".format(epoch, training_loss))
是什么原因导致 1) 中的损失函数无法让我的模型训练?
谢谢
编辑:已解决,谢谢 snowflake!
由于您的损失函数接受多个输入,variables 将是一个以输入变量为键的字典。
执行 variables = gradient
只会让 variables 这个名称指向新的对象(覆盖了对字典的引用),梯度并不会写回字典。
您应该改用类似 variables[var] = ... # compute the gradient for var
的写法,为每个变量填入对应的梯度。
我正在用一个玩具示例来优化一个全连接(Dense)层的参数:该层后接 softplus,输出负二项分布的参数。我的流程是:
1) 为负对数似然损失构建一个带有前向(forward)和后向(backward)方法的自定义类。它以分布参数和目标值作为输入,输出损失(即所建模的分布在目标值处的负对数似然):
from scipy.special import gamma, digamma, binom
from cntk.ops.functions import UserFunction
import numpy as np
class NegBin_Loss_Deep_NoSP(UserFunction):
    """Negative log-likelihood of a negative binomial distribution as a CNTK user function.

    The first input carries the distribution parameters ``(miu, alpha)`` and the
    second input carries the observed target. ``forward`` evaluates the negative
    log-likelihood of the target; ``backward`` supplies its derivatives with
    respect to ``miu`` and ``alpha``.

    Only the first sample of each argument is read (``arguments[...][0]``), so the
    function effectively handles a minibatch of size one.
    """

    def __init__(self, dis_params, target, name='NegBin_Loss_Deep_NoSP'):
        super(NegBin_Loss_Deep_NoSP, self).__init__([dis_params, target], name=name)

    def forward(self, arguments, device=None, outputs_to_retain=None):
        """Evaluate the negative log-likelihood of the target.

        Args:
            arguments: pair ``(dis_params, target)``; ``arguments[0][0]`` must hold
                ``[miu, alpha]`` and ``arguments[1][0]`` the target.
            device: unused.
            outputs_to_retain: unused.

        Returns:
            ``(state, loss)`` where ``state`` is the target (handed back to
            ``backward``) and ``loss`` is the negative log-likelihood.
        """
        # Imported locally so the class does not rely on names the file header
        # may not provide.
        from scipy.special import gammaln, xlogy

        # Cached on the instance because backward() needs them again.
        self.miu = arguments[0][0][0]
        self.alpha = arguments[0][0][1]
        target = arguments[1][0]

        inv_alpha = 1 / self.alpha
        alpha_miu = self.alpha * self.miu
        # Same likelihood as
        #   binom(t + 1/a - 1, t) * (1 / (1 + a*m))**(1/a) * (a*m / (1 + a*m))**t
        # but evaluated directly in log space: the raw product overflows or
        # underflows for large targets / small alpha and then np.log() yields
        # inf or nan. xlogy() keeps the t == 0 case equal to 0, matching x**0 == 1.
        log_ll = (gammaln(target + inv_alpha)
                  - gammaln(target + 1)
                  - gammaln(inv_alpha)
                  - inv_alpha * np.log1p(alpha_miu)
                  + xlogy(target, alpha_miu / (1 + alpha_miu)))
        # Loss is the negative log likelihood
        return target, -log_ll

    def backward(self, state, root_gradients, variables):
        """Back-propagate ``root_gradients`` to the distribution parameters.

        Args:
            state: the target returned as state by ``forward``.
            root_gradients: gradient flowing in from the function's output.
            variables: container of the inputs whose gradient is requested. When
                it supports item assignment (a dict), the entry for the parameter
                input is filled in; nothing is written for the target input.

        Returns:
            ``root_gradients`` multiplied by ``[d loss / d miu, d loss / d alpha]``.
        """
        target = state
        miu = self.miu
        alpha = self.alpha
        inv_alpha = 1 / alpha

        # d(-log L) / d miu
        d_miu = -(target - miu) / (alpha * (miu ** 2) + miu)
        # d(-log L) / d alpha; the digamma terms of the original expression are
        # factored out, the value is algebraically unchanged.
        digamma_diff = digamma(inv_alpha + target) - digamma(inv_alpha)
        d_alpha = -((miu * alpha + 1) * (np.log(miu * alpha + 1) - digamma_diff)
                    + (target - miu) * alpha) / (miu * (alpha ** 3) + (alpha ** 2))

        # ravel()[0] accepts both a scalar target and a one-element array target.
        gradient = np.array([np.ravel(d_miu)[0], np.ravel(d_alpha)[0]])
        # Chain rule: the value handed back must include the incoming gradient.
        scaled = root_gradients * gradient

        params_input = self.inputs[0]
        # Only the parameter input receives this gradient; writing it for the
        # target input as well would attach a gradient of the wrong meaning.
        if hasattr(variables, '__setitem__') and params_input in variables:
            variables[params_input] = scaled
        return scaled

    def infer_outputs(self):
        """Describe the output: target's shape and dynamic axes, parameters' dtype."""
        from cntk import output_variable

        return [output_variable(self.inputs[1].shape, self.inputs[0].dtype,
                                self.inputs[1].dynamic_axes)]

    @staticmethod
    def deserialize(inputs, name, state):
        """Rebuild the function from its two inputs; ``state`` is not used."""
        f = NegBin_Loss_Deep_NoSP(inputs[0], inputs[1], name)
        return f
2) 使用自定义脚本对前向和后向方法进行数值测试:实例化上面的类,传入一个输入数组,用前向和后向方法反复更新该数组,并验证损失在下降。这一步按预期工作。
def grad_clipper(grad_v, trunc_max=0.005, trunc_min=-0.005):
    """Clamp every entry of ``grad_v`` into ``[trunc_min, trunc_max]`` in place.

    Args:
        grad_v: mutable sequence of gradient components; it is modified.
        trunc_max: upper clipping bound.
        trunc_min: lower clipping bound; must be strictly below ``trunc_max``.

    Returns:
        The same ``grad_v`` object, after clipping.

    Raises:
        Exception: if ``trunc_min`` is not strictly below ``trunc_max``.
    """
    bounds_are_invalid = trunc_min >= trunc_max
    if bounds_are_invalid:
        raise Exception("lower clipping bound has to be lower than upper bound")

    for position, component in enumerate(grad_v):
        capped_above = min(component, trunc_max)
        grad_v[position] = max(capped_above, trunc_min)
    return grad_v
def convergence_test():
    """Drive NegBin_Loss_Deep_NoSP by hand and print how loss and parameters evolve.

    Each iteration perturbs the target with uniform noise, calls ``forward`` and
    ``backward`` on the loss object directly, clips the returned gradient with
    ``grad_clipper`` and moves the parameters by ``lr * gradient``. Loss, parameters
    and gradient are printed every 1000 iterations.
    """
    total_steps = 500000
    jitter = 0.15
    base_target = np.array([3.])  # [0.5, 0]
    params = [5, 0.5]
    v = C.input_variable(shape=(2,), needs_gradient=True)
    t = C.input_variable(shape=(1,))
    step_size = 0.0005
    loss_fn = NegBin_Loss_Deep_NoSP(v, t)

    for step in range(total_steps):
        # Relative perturbation of the target, at most +/- jitter / 2.
        noisy_target = (1 + (0.5 - np.random.rand()) * jitter) * base_target
        state, loss = loss_fn.forward([[params], noisy_target], [loss_fn.output], {loss_fn.output})
        if step % 1000 == 0:
            print("loss", loss)

        grad = grad_clipper(loss_fn.backward(state, np.ones_like(params), {v}))
        if step % 1000 == 0:
            print("params:", params, "gradient:", grad)

        # NOTE(review): the step is params + lr * grad; confirm the sign is the
        # intended direction for the gradient that backward() returns.
        params = np.sum([params, [component * step_size for component in grad]], axis=0)
        # Keep the second parameter (alpha) strictly positive.
        params[1] = max(params[1], 5e-6)


convergence_test()
3) 使用 CNTK 的 learner 搭建玩具示例模型来优化分布参数。全连接(Dense)层的参数始终没有被更新,我遗漏了什么?
import cntk as C
from cntk.layers import *
# Minimal network used to try gradient descent on the weights of one dense layer.
def model0(x):
    """Apply a 2-unit linear Dense layer to ``x`` and pass the result through softplus.

    The Dense layer is built under ``default_options`` with ``initial_state=0.1``,
    self-stabilization enabled and Glorot-uniform initialization.
    """
    layer_defaults = default_options(initial_state=0.1,
                                     enable_self_stabilization=True,
                                     init=glorot_uniform())
    with layer_defaults:
        linear_out = Dense(2, activation=None)(x)
    return C.ops.softplus(linear_out)
# Train model0 against the custom negative-binomial loss on one fixed sample and
# print the loss after every minibatch.
import time  # needed for time.time() below; no import of it is visible in this file

# input sequences
x = C.input_variable(shape=(2,))
t = C.input_variable(shape=(1,))
# create the model
m0 = model0(x)
# define the learning rate
learning_rate = 0.02
lr_schedule = C.learning_parameter_schedule(learning_rate)
# define loss and error function
loss = NegBin_Loss_Deep_NoSP(m0.output, t)
error = NegBin_Loss_Deep_NoSP(m0.output, t)
# use fsadagrad optimizer
momentum_schedule = C.momentum_schedule(0.9)
learner = C.fsadagrad(m0.parameters,
                      lr=lr_schedule,
                      momentum=momentum_schedule,
                      unit_gain=True)
trainer = C.Trainer(m0, (loss, error), [learner])
loss_summary = []
start = time.time()
x1 = np.array([3, 0.002], dtype=np.float32)
y1 = np.array([2.5], dtype=np.float32)
num_epochs = 10
# The original condition was `epoch % (10 / 10) == 0`, i.e. report on every epoch.
report_every = 1
for epoch in range(num_epochs):
    trainer.train_minibatch({x: x1, t: y1})
    if epoch % report_every == 0:
        training_loss = trainer.previous_minibatch_loss_average
        loss_summary.append(training_loss)
        print("epoch: {}, loss: {:.5f}".format(epoch, training_loss))
4) 按照 snowflake 的建议,使用 CNTK 原生的损失函数测试这个玩具示例。我改动了 3) 中的代码片段,引入了一个使用平方误差损失的类似测试。可以确认,这样模型确实在更新参数,损失也随训练下降。我认为这个测试把问题范围缩小到了我在 1)
中给出的自定义损失函数:
x = C.input_variable(shape=(2,))
# Control experiment: same model and optimizer, but with CNTK's built-in squared
# error as loss and metric, printing the loss after every minibatch.
import time  # needed for time.time() below; no import of it is visible in this file

t = C.input_variable(shape=(2,))  # -> Changed in 4)
# create the model
m0 = model0(x)
# the learning rate
learning_rate = 0.02
lr_schedule = C.learning_parameter_schedule(learning_rate)
# loss function
loss = C.squared_error(m0.output, t)  # -> Changed in 4)
error = C.squared_error(m0.output, t)  # -> Changed in 4)
# use fsadagrad optimizer
momentum_schedule = C.momentum_schedule(0.9)
learner = C.fsadagrad(m0.parameters,
                      lr=lr_schedule,
                      momentum=momentum_schedule,
                      unit_gain=True)
trainer = C.Trainer(m0, (loss, error), [learner])
loss_summary = []
start = time.time()
x1 = np.array([3, 0.002], dtype=np.float32)
y1 = np.array([2.5, 0.005], dtype=np.float32)  # -> Changed in 4)
num_epochs = 10
# The original condition was `epoch % (10 / 10) == 0`, i.e. report on every epoch.
report_every = 1
for epoch in range(num_epochs):
    trainer.train_minibatch({x: x1, t: y1})
    if epoch % report_every == 0:
        training_loss = trainer.previous_minibatch_loss_average
        loss_summary.append(training_loss)
        print("epoch: {}, loss: {:.5f}".format(epoch, training_loss))
是什么原因导致 1) 中的损失函数无法让我的模型训练?
谢谢
编辑:已解决,谢谢 snowflake!
由于您的损失函数接受多个输入,variables 将是一个以输入变量为键的字典。
执行 variables = gradient
只会让 variables 这个名称指向新的对象(覆盖了对字典的引用),梯度并不会写回字典。
您应该改用类似这样的写法,为每个变量填入对应的梯度: variables[var] = ... # compute the gradient for var