Hessian matrix of a Keras model with tf.hessians

I want to compute the Hessian matrix of a Keras model w.r.t. its input in graph mode using tf.hessians. Here is a minimal example:

import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.Input((10,)),
    keras.layers.Dense(1)
])
model.summary()

@tf.function
def get_grads(inputs):
    loss = tf.reduce_sum(model(inputs))
    return tf.gradients(loss, inputs)

@tf.function
def get_hessian(inputs):
    loss = tf.reduce_sum(model(inputs))
    return tf.hessians(loss, inputs)

batch_size = 3
test_input = tf.random.uniform((batch_size, 10))
out = model(test_input) # works fine
grads = get_grads(test_input) # works fine
hessian = get_hessian(test_input) # raises ValueError: None values not supported.

The forward pass and the get_grads function work fine, but the get_hessian function raises ValueError: None values not supported.

However, in this example

@tf.function
def get_hessian_(inputs):
    loss = tf.reduce_sum(inputs**2)
    return tf.hessians(loss, inputs)

get_hessian_(tf.random.uniform((3,)))[0]
# <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
# array([[2., 0., 0.],
#        [0., 2., 0.],
#        [0., 0., 2.]], dtype=float32)>

tf.hessians produces the expected result (2·I) without any error: here the first gradient, 2*inputs, still depends on inputs, so the second differentiation pass is well defined.

In your code example, you are trying to get the Hessian of f(x) (the model output) w.r.t. x (the input), and f is linear because the model is linear.

The Hessian of f(x) w.r.t. x should therefore be a zero tensor, but tf.hessians does not handle this case: the first differentiation pass produces a gradient that is constant in x, so the second pass returns None for the disconnected gradient, and that None is what triggers ValueError: None values not supported. Adding an extra layer with a nonlinear activation makes the error go away.
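The disconnection is easy to reproduce directly. A minimal sketch with tf.GradientTape in eager mode (illustrative values, not the model above): the first-order gradient of a linear function is a constant, so differentiating it again yields None.

import tensorflow as tf

x = tf.random.uniform((3,))
with tf.GradientTape() as t2:
    t2.watch(x)
    with tf.GradientTape() as t1:
        t1.watch(x)
        loss = tf.reduce_sum(3.0 * x)  # linear in x
    g = t1.gradient(loss, x)           # constant [3. 3. 3.]; no dependence on x left
h = t2.gradient(g, x)                  # None: g is disconnected from x
print(g)  # tf.Tensor([3. 3. 3.], shape=(3,), dtype=float32)
print(h)  # None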

Code examples:

Getting the Hessian with tf.hessians:

import tensorflow as tf
from tensorflow.keras.layers import Dense

model = tf.keras.Sequential([
    Dense(10, activation='sigmoid'),  # remove this line and you get the error again
    Dense(1)
])
@tf.function
def get_hessian(inputs):
    loss = tf.reduce_sum(model(inputs))
    return tf.hessians(loss, inputs)

batch_size = 3
tf.random.set_seed(123)
test_input = tf.random.uniform((batch_size, 10), minval=1.5, maxval=2.5)
hessian = get_hessian(test_input)
print(type(hessian))
print(len(hessian))
print(hessian[0].shape)
print(hessian[0][0,0,0,0])
print(hessian[0][0,0,0,1])
'''
<class 'list'>
1
(3, 10, 3, 10)
tf.Tensor(0.0028595054, shape=(), dtype=float32)
tf.Tensor(0.0009458237, shape=(), dtype=float32)
''' 
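Note the shape (3, 10, 3, 10): tf.hessians differentiates w.r.t. the whole batch at once, so entry [i, j, k, l] is the second derivative of the loss w.r.t. feature j of sample i and feature l of sample k. Because the loss sums over independent samples, the cross-sample blocks (i != k) should be zero; a small sanity check of that claim:

h = hessian[0]                             # shape (3, 10, 3, 10)
cross_block = h[0, :, 1, :]                # block coupling sample 0 and sample 1
print(tf.reduce_max(tf.abs(cross_block)))  # expect tf.Tensor(0.0, ...)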

Getting the Hessian with tf.GradientTape():

model = tf.keras.Sequential([
    Dense(10, activation='sigmoid'),  # remove this line and get_hessian returns None
    Dense(1)
])

@tf.function
def get_hessian(inputs):
    with tf.GradientTape() as t2:
        t2.watch(inputs)
        with tf.GradientTape() as t1:
            t1.watch(inputs)
            loss = tf.reduce_sum(model(inputs))
        g = t1.gradient(loss, inputs)
    return t2.jacobian(g, inputs)

batch_size = 3
tf.random.set_seed(123)
test_input = tf.random.uniform((batch_size, 10), minval=1.5, maxval=2.5)
hessian = get_hessian(test_input)
print(type(hessian))
print(hessian.shape if hessian is not None else None)
print(hessian[0,0,0,0] if hessian is not None else None)
print(hessian[0,0,0,1] if hessian is not None else None)
'''
<class 'tensorflow.python.framework.ops.EagerTensor'>
(3, 10, 3, 10)
tf.Tensor(0.0028595058, shape=(), dtype=float32)
tf.Tensor(0.0009458238, shape=(), dtype=float32)
'''
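The printed values match the tf.hessians output above up to float round-off. A quick cross-check, assuming the model and test_input from this snippet are still in scope (get_hessian_ref is an illustrative name):

@tf.function
def get_hessian_ref(inputs):
    # same model, but differentiated with tf.hessians for comparison
    loss = tf.reduce_sum(model(inputs))
    return tf.hessians(loss, inputs)

ref = get_hessian_ref(test_input)[0]
print(tf.reduce_max(tf.abs(hessian - ref)))  # expect a tiny value, e.g. ~1e-9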

If you want to get a zero tensor back instead of an error, you can use unconnected_gradients=tf.UnconnectedGradients.ZERO:

model = tf.keras.Sequential([
    Dense(1)
])

@tf.function
def get_hessian(inputs):
    with tf.GradientTape() as t2:
        t2.watch(inputs)
        with tf.GradientTape() as t1:
            t1.watch(inputs)
            loss = tf.reduce_sum(model(inputs))
        g = t1.gradient(loss, inputs, unconnected_gradients=tf.UnconnectedGradients.ZERO)
    return t2.jacobian(g, inputs, unconnected_gradients=tf.UnconnectedGradients.ZERO)

batch_size = 3
tf.random.set_seed(123)
test_input = tf.random.uniform((batch_size, 10), minval=1.5, maxval=2.5)
hessian = get_hessian(test_input)
print(type(hessian))
print(hessian.shape)
print(tf.math.count_nonzero(hessian))
'''
<class 'tensorflow.python.framework.ops.EagerTensor'>
(3, 10, 3, 10)
tf.Tensor(0, shape=(), dtype=int64)
'''
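As far as I can tell, tf.hessians itself does not expose an unconnected_gradients option, so for a purely linear model the tape-based version above is the practical way to get an explicit zero Hessian instead of an error.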