Nan losses using "Learning Rate Step Decay" Scheduler with Adam Optimizer in Keras?
I have this very deep model:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2


def get_model2(mask_kind):
    decay = 0.0

    # Encoder: four strided convolutions followed by dense layers.
    inp_1 = keras.Input(shape=(64, 101, 1), name="RST_inputs")
    x = layers.Conv2D(256, kernel_size=(3, 3), kernel_regularizer=l2(1e-6), strides=(3, 3), padding="same")(inp_1)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Conv2D(128, kernel_size=(3, 3), kernel_regularizer=l2(1e-6), strides=(3, 3), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Conv2D(64, kernel_size=(2, 2), kernel_regularizer=l2(1e-6), strides=(2, 2), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Conv2D(32, kernel_size=(2, 2), kernel_regularizer=l2(1e-6), strides=(2, 2), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(512)(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Dense(256)(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    out1 = layers.Dense(128, name="ls_weights")(x)

    if mask_kind == 1:  # apply the first mask (mask_layer1 is defined elsewhere)
        binary_mask = layers.Lambda(mask_layer1, name="lambda_layer1", dtype='float64')(out1)
        print('shape', binary_mask.shape[0])
    elif mask_kind == 2:  # apply the second mask (mask_layer2 is defined elsewhere)
        binary_mask = layers.Lambda(mask_layer2, name="lambda_layer2", dtype='float64')(out1)
    else:  # apply no mask
        binary_mask = out1

    # Decoder: dense layers followed by four strided transposed convolutions.
    x = layers.Dense(256)(binary_mask)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Dense(512)(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Dense(192)(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Reshape((2, 2, 48))(x)
    x = layers.Conv2DTranspose(32, kernel_size=(2, 2), strides=(2, 2), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Conv2DTranspose(64, kernel_size=(3, 3), strides=(3, 3), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Conv2DTranspose(128, kernel_size=(3, 3), strides=(3, 3), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    x = layers.Conv2DTranspose(256, kernel_size=(3, 3), strides=(5, 5), padding="same")(x)
    x = layers.LeakyReLU(alpha=0.3)(x)
    soundfield_layer = layers.Conv2DTranspose(1, kernel_size=(1, 1), strides=(1, 1), padding='same')(x)
    # soundfield_layer = layers.Dense(40000, name="sf_vec")(x)

    if mask_kind == 1:
        model = keras.Model(inp_1, [binary_mask, soundfield_layer], name="2_out_model")
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, decay=decay),  # if needed, put back 0.001
                      loss=["mse", "mse"], loss_weights=[1, 1])
        # plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
        model.summary()
    else:
        model = keras.Model(inp_1, [binary_mask, soundfield_layer], name="2_out_model")
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, decay=decay),  # if needed, put back 0.001
                      loss=["mse", "mse"], loss_weights=[0, 1])
        # plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
        model.summary()

    return model
I am trying to use a learning-rate step decay to see whether I can improve my validation loss during training. I defined the scheduler class as follows:
import numpy as np


class StepDecay:
    def __init__(self, initAlpha=0.1, factor=0.25, dropEvery=30):
        # store the base initial learning rate, drop factor, and
        # number of epochs between drops
        self.initAlpha = initAlpha
        self.factor = factor
        self.dropEvery = dropEvery

    def __call__(self, epoch):
        # compute the learning rate for the current epoch
        exp = np.floor((1 + epoch) / self.dropEvery)
        alpha = self.initAlpha * (self.factor ** exp)

        # return the learning rate
        return float(alpha)
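As a sanity check, this is what the schedule evaluates to around the drop boundaries (a minimal standalone sketch, not part of the original code):

schedule = StepDecay(initAlpha=0.1, factor=0.25, dropEvery=30)
for epoch in (0, 28, 29, 30, 59):
    print(epoch, schedule(epoch))
# 0  -> 0.1      floor(1/30)  == 0
# 28 -> 0.1      floor(29/30) == 0
# 29 -> 0.025    floor(30/30) == 1 (the "+1" makes each drop land one epoch early)
# 30 -> 0.025    floor(31/30) == 1
# 59 -> 0.00625  floor(60/30) == 2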
Then I run my training:
from tensorflow.keras.callbacks import LearningRateScheduler

schedule = StepDecay(initAlpha=1e-1, factor=0.25, dropEvery=30)
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
callbacks = [es, LearningRateScheduler(schedule)]

model = get_model2(mask_kind=1)
history = model.fit(X_train, [Y_train, Z_train], validation_data=(X_val, [Y_val, Z_val]), epochs=300,
                    batch_size=32, callbacks=callbacks, verbose=1)

test_loss, _, _ = model.evaluate(X_test, [Y_test, Z_test], verbose=1)
print('Test: %.3f' % test_loss)
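To confirm the scheduler is actually being applied, note that in recent tf.keras versions LearningRateScheduler writes the rate it sets into the epoch logs, so after fitting you can print it per epoch (a small sketch; it assumes the fit above completed):

# 'lr' is added to the logs by LearningRateScheduler at the end of each epoch
for epoch, lr in enumerate(history.history['lr']):
    print('epoch %3d: lr = %.6f' % (epoch, lr))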
But when I train, I get "nan" losses:
25/25 [==============================] - 17s 684ms/step - loss: nan - lambda_layer1_loss: nan - conv2d_transpose_4_loss: nan - val_loss: nan - val_lambda_layer1_loss: nan etc....
and I don't understand why. The problem could be the decay rate, which is a parameter that exists for the SGD optimizer but, according to the documentation, does not exist for Adam; yet I don't get any error, so... any ideas?
You can tune the parameters to find a good balance, but this is one way to use exponential decay as a callback function with the Adam optimizer.
LR_MAX = 0.0001
LR_MIN = 0.00001
LR_EXP_DECAY = 0.85

def lrfn(epoch):
    lr = (LR_MAX - LR_MIN) * LR_EXP_DECAY ** epoch + LR_MIN
    return lr

lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)
Then simply pass the callback in model.fit, as in the example below.
model.fit(..
          ..
          callbacks=[lr_callback],
          ..
          ..)
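To see what this schedule does, here is a quick sketch that prints the rate over training; it starts at LR_MAX and decays geometrically toward the LR_MIN floor:

for epoch in range(0, 40, 10):
    print('epoch %2d: lr = %.6f' % (epoch, lrfn(epoch)))
# epoch  0: lr = 0.000100
# epoch 10: lr = 0.000028
# epoch 20: lr = 0.000013
# epoch 30: lr = 0.000011

A roughly equivalent built-in option is tf.keras.optimizers.schedules.ExponentialDecay, which can be passed directly as the learning_rate of Adam; note it decays per optimizer step rather than per epoch and has no LR_MIN floor, and the decay_steps value below is an arbitrary placeholder:

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=LR_MAX, decay_steps=1000, decay_rate=LR_EXP_DECAY)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)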