在 DNN 训练结束时返回逆 Hessian 矩阵以及关于输入的偏导数

Question

使用 Keras 并以 Tensorflow 作为后端，我构建了一个 DNN，它将恒星光谱作为输入（7213 个数据点）并输出三个恒星参数（温度、重力和金属丰度）。网络训练得很好，并且在我的测试集上预测得也很好，但为了使结果具有科学意义，我需要能够估计预测的误差。这样做的第一步是获得逆 Hessian 矩阵，而这似乎仅使用 Keras 是做不到的。因此，我尝试借助 scipy 构造一个变通办法：使用 scipy.optimize.minimize，并以 BFGS、L-BFGS-B 或 Newton-CG 作为方法。其中任何一个都会返回逆 Hessian 矩阵。

想法是先使用 Adam 优化器训练模型 100 个 epoch（或直到模型收敛），然后运行一次 BFGS（或其他方法之一）的迭代或函数调用，以返回我的模型的 Hessian 矩阵。

这是我的代码：

from scipy.optimize import minimize

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam


# Hyper-parameters shared by the layers and the optimizer
activation = 'relu'
init = 'he_normal'
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-08

# NOTE(review): `n` is not defined in this snippet; presumably it is the number
# of input features (7213, per the train_X shape noted below) -- confirm.
input_shape = (None, n)
n_hidden = [2048, 1024, 512, 256, 128, 32]
output_dim = 3

# Training settings
epochs = 100
lr = 0.0008
batch_size = 64
decay = 0.00

# Layer stack: the first hidden layer carries the input shape, the remaining
# hidden layers only differ in width, and the output layer is linear.
model = Sequential(
    [Dense(n_hidden[0], batch_input_shape=input_shape, init=init, activation=activation)]
    + [Dense(width, init=init, activation=activation) for width in n_hidden[1:]]
    + [Dense(output_dim, init=init, activation='linear')]
)

# Adam optimizer configured from the hyper-parameters above
optimizer = Adam(
    lr=lr,
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
    decay=decay,
)

# Compile with a mean-squared-error loss, then train while tracking the
# validation set.
model.compile(loss='mean_squared_error', optimizer=optimizer)

# Data shapes noted by the author:
#   train_X.shape = (50000,7213)
#   train_Y.shape = (50000,3)
#   cv_X.shape = (10000,7213)
#   cv_Y.shape = (10000,3)

history = model.fit(
    train_X,
    train_Y,
    validation_data=(cv_X, cv_Y),
    nb_epoch=epochs,
    batch_size=batch_size,
    verbose=2,
)


# Per-layer weights as returned by Keras: one list of arrays per layer.
weights = [layer.get_weights() for layer in model.layers]

# scipy.optimize.minimize operates on a single flat 1-D float vector. Passing
# the nested per-layer list instead is what produces the "could not be
# broadcast together" / "setting an array element with a sequence" errors, so
# record every weight array's shape and size to convert between both layouts.
_weight_shapes = [np.shape(w) for w in model.get_weights()]
_weight_sizes = [int(np.prod(shape)) for shape in _weight_shapes]


def _flatten_weights(arrays):
    """Concatenate a list of weight arrays into one 1-D float64 vector."""
    return np.concatenate([np.asarray(a, dtype=np.float64).ravel() for a in arrays])


def _unflatten_weights(flat):
    """Split a flat vector back into arrays shaped like the model's weights."""
    arrays = []
    start = 0
    for shape, size in zip(_weight_shapes, _weight_sizes):
        arrays.append(flat[start:start + size].reshape(shape))
        start += size
    return arrays


def loss(W):
    """Evaluate the training loss of the model for the weight vector ``W``.

    Parameters:
        W: flat sequence of all model weights, ordered like the arrays
           returned by ``model.get_weights()``.

    Returns:
        The sum of squared residuals over all outputs, divided by the number
        of rows in ``train_X``.

    Side effects:
        Overwrites the model's weights with ``W`` and prints the loss value.
    """
    flat = np.asarray(W, dtype=np.float64).ravel()
    model.set_weights(_unflatten_weights(flat))
    preds = model.predict(train_X)
    mse = np.sum(np.square(np.subtract(preds, train_Y))) / len(train_X[:, 0])
    print(mse)
    return mse


# Start the optimizer from the trained weights, flattened to a 1-D vector.
x0 = _flatten_weights(model.get_weights())
# NOTE(review): without a `jac` argument BFGS estimates the gradient by finite
# differences (one loss evaluation per parameter) and returns a dense
# len(x0)-by-len(x0) `hess_inv`; for a network with millions of weights this is
# likely impractical in both time and memory -- confirm before running.
res = minimize(loss, x0, args=(), method='BFGS', options={'maxiter': 1, 'eps': 1e-6, 'disp': True})
#res = minimize(loss, x0, method='L-BFGS-B', options={'disp': True, 'maxls': 1, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 1, 'ftol': 0.5, 'maxcor': 1, 'maxfun': 1})
#res = minimize(loss, x0, args=(), method='Newton-CG', jac=None, hess=None, hessp=None, tol=None, callback=None, options={'disp': False, 'xtol': 1e-05, 'eps': 1.4901161193847656e-08, 'return_all': False, 'maxiter': 1})
inv_hess = res['hess_inv']

1) 该模型训练得非常好，但是当我尝试以先前训练好的权重为起点运行单次迭代的 scipy minimize 时，遇到了问题。

尝试 method=BFGS 时的输出：

0.458706819754
0.457811632697
0.458706716791
...
0.350124572422
0.350186770445
0.350125320636

ValueErrorTraceback (most recent call last)
---> 19 res = minimize(loss, x0, args=(), method = 'BFGS', tol=1, options={'maxiter':1,'eps':1e-6,'disp':True})#,'gtol':0.1}, tol=5)

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    442         return _minimize_cg(fun, x0, args, jac, callback, **options)
    443     elif meth == 'bfgs':
--> 444         return _minimize_bfgs(fun, x0, args, jac, callback, **options)

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_bfgs(fun, x0, args, jac, callback, gtol, norm, eps, maxiter, disp, return_all, **unknown_options)
    963         try:  # this was handled in numeric, let it remaines for more safety
--> 964             rhok = 1.0 / (numpy.dot(yk, sk))
    965         except ZeroDivisionError:
    966             rhok = 1000.0

ValueError: operands could not be broadcast together with shapes (7213,2048) (2048,1024)

尝试 method=L-BFGS-B 时的输出：

ValueErrorTraceback (most recent call last)

---> 20 res = minimize(loss, x0, method='L-BFGS-B', options={'disp': True, 'maxls': 1, 'gtol': 1e-05, 'eps': 1e-08, 'maxiter': 1, 'ftol': 0.5, 'maxcor': 1, 'maxfun': 1})


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    448     elif meth == 'l-bfgs-b':
    449         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 450                                 callback=callback, **options)


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/lbfgsb.pyc in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
    300         raise ValueError('maxls must be positive.')
    301 
--> 302     x = array(x0, float64)
    303     f = array(0.0, float64)
    304     g = zeros((n,), float64)

ValueError: setting an array element with a sequence.

尝试 method=Newton-CG 时的输出

ValueErrorTraceback (most recent call last)

---> 21 res = minimize(loss, x0, args=(), method='Newton-CG', jac=None, hess=None, hessp=None, tol=None, callback=None, options={'disp': False, 'xtol': 1e-05, 'eps': 1.4901161193847656e-08, 'return_all': False, 'maxiter': 1})


/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    445     elif meth == 'newton-cg':
    446         return _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,
--> 447                                   **options)
    448     elif meth == 'l-bfgs-b':
    449         return _minimize_lbfgsb(fun, x0, args, jac, bounds,

/opt/anaconda3/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback, xtol, eps, maxiter, disp, return_all, **unknown_options)
   1438     _check_unknown_options(unknown_options)
   1439     if jac is None:
-> 1440         raise ValueError('Jacobian is required for Newton-CG method')

ValueError: Jacobian is required for Newton-CG method

2) 下一个任务是获得模型输出相对于模型输入的导数。例如，对于一个恒星参数（输出之一），比如温度，我需要找到关于 7213 个输入中每个输入的偏导数。然后对 3 个输出中的每一个执行相同的操作。

所以基本上，我的第一个任务 (1) 是找到一种方法来返回我的模型的逆 Hessian 矩阵，接下来 (2) 我需要找到一种方法来返回输出关于输入的一阶偏导数。

有人对这两项任务有一些了解吗？谢谢。

编辑

我正在尝试使用 theano.gradient.jacobian() 来返回我的输出关于输入的雅可比矩阵。我已将我的模型转换为模型权重的函数，并将该函数用作 theano.gradient.jacobian() 中的第一个参数。当我尝试对多维数组求梯度时，问题就出现了，而我的模型权重和输入数据正是这种形式。

import theano.tensor as T

# Symbolic placeholders, both declared as 1-D double-precision vectors.
weights_in_model = T.dvector('model_weights')
x = T.dvector('x')

def pred(x,weights_in_model):
    """Build a symbolic forward pass through seven stacked layers.

    For each layer, two consecutive entries of ``weights_in_model`` are
    stacked along axis 0, the running value gets a trailing broadcastable
    axis via ``shape_padright`` and is multiplied with the stacked weights.
    Every layer except the last clips its result to [0, 9999.].

    Returns the symbolic ``prediction`` of the last layer.
    """
    weights = T.stack((weights_in_model[0],weights_in_model[1]), axis=0)
    x = T.shape_padright(x, n_ones=1)

    prediction=T.dot(x, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[2],weights_in_model[3]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[4],weights_in_model[5]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[6],weights_in_model[7]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[8],weights_in_model[9]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)

    weights = T.stack((weights_in_model[10],weights_in_model[11]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    prediction = T.clip(prediction, 0, 9999.)


    # Final layer: no clipping is applied to its output.
    weights = T.stack((weights_in_model[12],weights_in_model[13]), axis=0)
    prediction = T.shape_padright(prediction, n_ones=1)
    prediction = T.dot(prediction, weights)
    # NOTE(review): the result of T.flatten is discarded here, so the value
    # returned below is NOT flattened.
    T.flatten(prediction)

    return prediction


# Symbolic jacobian of the prediction with respect to the input placeholder,
# compiled into a callable taking (x, weights_in_model).
# NOTE(review): only `theano.tensor` is imported (as T) in this snippet; the
# bare name `theano` is presumably imported elsewhere -- confirm.
f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)
h=theano.function([x,weights_in_model],f,allow_input_downcast=True)


# Rebind the placeholder names to concrete values and evaluate the jacobian.
x = train_X
weights_in_model = model.get_weights()
h(x,weights_in_model)

最后一行给出了错误：

TypeError: ('Bad input argument to theano function with name "<ipython-input-365-a1ab256aa220>:1"  at index 0(0-based)', 'Wrong number of dimensions: expected 1, got 2 with shape (2000, 7213).')

但是当我将输入更改为：

weights_in_model = T.matrix('model_weights')
x = T.matrix('x')

我从以下行收到错误：

f=theano.gradient.jacobian(pred(x,weights_in_model),wrt=x)

内容为：

AssertionError: tensor.jacobian expects a 1 dimensional variable as `expression`. If not use flatten to make it a vector

关于如何解决这个问题的任何想法？

Answer 1

已找到答案！：此代码用于预测模型的一个输出值。目前我正在修改它以计算 3 个雅可比矩阵；每个输出一个。

import theano
import theano.tensor as T
import theano.typed_list
# Theano settings: verbose exception reports and the 'fast_compile' optimizer.
theano.config.exception_verbosity = 'high'
theano.config.optimizer = 'fast_compile'

# Symbolic inputs of the function to compile: a matrix of input rows and a
# typed list of double-precision matrices holding the model weights.
x = T.matrix('x')
weights_in_model = theano.typed_list.TypedListType(T.dmatrix)()

# Define model function
def pred(x,weights_in_model):
    """Build the symbolic forward pass of the seven-layer network.

    Parameters:
        x: symbolic matrix of inputs; a column of ones is appended along
           axis 1 before each layer.
        weights_in_model: indexable collection of 14 symbolic matrices,
           read as consecutive pairs (entries 2k and 2k + 1) that are
           concatenated along axis 0 to form the weights of layer k.

    Returns:
        The flattened symbolic output of the last layer.
    """
    n_layers = 7
    prediction = x
    for layer_index in range(n_layers):
        first = weights_in_model[2 * layer_index]
        second = weights_in_model[2 * layer_index + 1]
        weights = T.concatenate((first, second), axis=0)

        # Append a ones column so the last row of `weights` is added to every
        # output row.
        ones_column = T.ones((T.shape(prediction)[0], 1))
        prediction = T.concatenate((prediction, ones_column), axis=1)
        prediction = T.dot(prediction, weights)

        # Every layer but the last clips its output to [0, 9999.].
        # NOTE(review): presumably this stands in for the ReLU activation;
        # unlike a plain ReLU it also caps values above 9999 -- confirm.
        if layer_index != n_layers - 1:
            prediction = T.clip(prediction, 0, 9999.)

    return T.flatten(prediction)

# Symbolic jacobian of the flattened prediction with respect to the inputs
f = theano.gradient.jacobian(pred(x, weights_in_model), wrt=x)

# Compile it into a callable taking (inputs, list of weight matrices)
h = theano.function([x, weights_in_model], f, allow_input_downcast=True)


# Concrete function inputs
weights_in_model_ = model.get_weights()
x_ = train_data

# Turn every odd-indexed entry into a single-row matrix so it can be
# concatenated below the preceding entry inside `pred`. Using (1, -1) instead
# of hard-coded widths keeps this valid for any layer sizes.
# NOTE(review): assumes the list alternates kernel, bias for every layer --
# confirm against the model definition.
for bias_index in range(1, len(weights_in_model_), 2):
    weights_in_model_[bias_index] = np.reshape(weights_in_model_[bias_index], (1, -1))

# Compute Jacobian (returns format with a bunch of zero rows)
jacs = h(x_, weights_in_model_)

# Put Jacobian matrix in proper format (ie. shape = (number_of_input_examples, number_of_input_features)
# by keeping, for each index i, the slice jacs[i, i].
# NOTE(review): this pairing presumably relies on the network producing a
# single output per example, so that entry i of the flattened prediction
# belongs to example i -- confirm before using with several outputs.
example_index = np.arange(jacs.shape[0])
jacobian_matrix = jacs[example_index, example_index].astype(np.float64)

下一个任务是求输出关于模型权重的 Hessian 矩阵！

Return DNN 训练结束时的逆 Hessian 矩阵和输入的偏导数

Return Inverse Hessian Matrix at the end of DNN Training and Partial Derivatives wrt the Inputs

python

scipy

neural-network

keras

hessian-matrix