Restoring the learning rate from a TensorFlow Checkpoint
I am trying to use a TensorFlow checkpoint, and everything restores correctly except the learning rate: every time I run the script it is reinitialized instead of being restored from the previous run. Here is a toy example that reproduces the problem:
import numpy as np
import tensorflow as tf

X = tf.range(10.)
Y = 50. * X

class CGMM(tf.Module):
    def __init__(self):
        super(CGMM, self).__init__(name='CGMM')
        self.beta = tf.Variable(1., dtype=np.float32)
        self.learning_rate = tf.Variable(1., dtype=np.float32)

    @tf.function
    def objfun(self):
        beta = self.beta
        obj = tf.reduce_mean(tf.square(beta * self.X - self.Y))
        return obj

    def build_model(self, X, Y, decay_steps):
        self.X, self.Y = X, Y
        starter_learning_rate = 0.05
        global_step = tf.Variable(1, trainable=False)
        self.learning_rate = tf.compat.v1.train.exponential_decay(
            starter_learning_rate, global_step, decay_steps, 0.96, staircase=True)
        optimizer = tf.compat.v1.train.RMSPropOptimizer(self.learning_rate)
        ckpt = tf.train.Checkpoint(step=tf.Variable(1), model=self, optimizer=optimizer)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts_cg', max_to_keep=3)
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            print("Restored from {}".format(manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")
        for i in range(500):
            optimizer.minimize(self.objfun, global_step=global_step, var_list=[self.beta])
            loss, beta, learning_rate = self.objfun(), self.beta, self.learning_rate().numpy()
            ckpt.step.assign_add(1)
            if (int(ckpt.step) - 1) % 100 == 0:
                save_path = manager.save()
                print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))
                print("learning_rate : " + str(learning_rate))
        return beta

model = CGMM()
opt_beta = model.build_model(X, Y, 100)
Results of the first run:
Initializing from scratch.
Saved checkpoint for step 101: ./tf_ckpts_cg/ckpt-1
learning_rate : 0.048
Saved checkpoint for step 201: ./tf_ckpts_cg/ckpt-2
learning_rate : 0.04608
Saved checkpoint for step 301: ./tf_ckpts_cg/ckpt-3
learning_rate : 0.044236798
Saved checkpoint for step 401: ./tf_ckpts_cg/ckpt-4
learning_rate : 0.042467322
Saved checkpoint for step 501: ./tf_ckpts_cg/ckpt-5
learning_rate : 0.04076863
Results of the second run:
Restored from ./tf_ckpts_cg/ckpt-5
Saved checkpoint for step 601: ./tf_ckpts_cg/ckpt-6
learning_rate : 0.048
Saved checkpoint for step 701: ./tf_ckpts_cg/ckpt-7
learning_rate : 0.04608
Saved checkpoint for step 801: ./tf_ckpts_cg/ckpt-8
learning_rate : 0.044236798
Saved checkpoint for step 901: ./tf_ckpts_cg/ckpt-9
learning_rate : 0.042467322
Saved checkpoint for step 1001: ./tf_ckpts_cg/ckpt-10
learning_rate : 0.04076863
As you can see, the second run repeats exactly the same learning-rate values instead of continuing the decay, while the other variables are restored correctly. Can you help me fix this?
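To see what is actually being saved, the checkpoint contents can be listed directly (a quick diagnostic; the path assumes the ./tf_ckpts_cg directory from the script above). I would expect ckpt.step, beta and the optimizer slots to show up, but nothing corresponding to the local global_step that drives the decay:

import tensorflow as tf

# Print every variable stored in the latest checkpoint under ./tf_ckpts_cg
for name, shape in tf.train.list_variables('./tf_ckpts_cg'):
    print(name, shape)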
I found a workaround and want to leave it in this post; maybe it helps someone in the future. You have to make the global step part of the module by adding self.global_step = tf.Variable(1, trainable=False) in __init__ and passing it to exponential_decay. In the original version, global_step was a local variable inside build_model, so tf.train.Checkpoint never tracked it; since the decayed learning rate is computed from the global step, checkpointing the step is enough for the schedule to resume where it left off. Here is the full script:
import numpy as np
import tensorflow as tf
from matplotlib import pyplot as plt

X = tf.range(10.)
Y = 50. * X

class CGMM(tf.Module):
    def __init__(self):
        super(CGMM, self).__init__(name='CGMM')
        self.beta = tf.Variable(1., dtype=np.float32)
        # Attaching the step to the module makes it part of the tracked state,
        # so the checkpoint saves and restores it
        self.global_step = tf.Variable(1, trainable=False)

    @tf.function
    def objfun(self):
        beta = self.beta
        obj = tf.reduce_mean(tf.square(beta * self.X - self.Y))
        return obj

    def build_model(self, X, Y, decay_steps):
        self.X, self.Y = X, Y
        loss_vec, lr_vector = [], []
        starter_learning_rate = 0.05
        # The decayed rate is a function of self.global_step, which is now restored
        learning_rate = tf.compat.v1.train.exponential_decay(
            starter_learning_rate, self.global_step, decay_steps, 0.96, staircase=True)
        optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate)
        ckpt = tf.train.Checkpoint(step=tf.Variable(1), model=self, optimizer=optimizer)
        manager = tf.train.CheckpointManager(ckpt, './tf_ckpts_cg', max_to_keep=3)
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            print("Restored from {}".format(manager.latest_checkpoint))
        else:
            print("Initializing from scratch.")
        for i in range(500):
            optimizer.minimize(self.objfun, global_step=self.global_step, var_list=[self.beta])
            loss, beta = self.objfun(), self.beta
            ckpt.step.assign_add(1)
            loss_vec.append(loss.numpy())
            lr_vector.append(learning_rate().numpy())
            if (int(ckpt.step) - 1) % 100 == 0:
                save_path = manager.save()
                print("Saved checkpoint for step {}: {}".format(int(ckpt.step), save_path))
                print("learning_rate : " + str(learning_rate().numpy()))
        return loss_vec, beta, lr_vector

model = CGMM()
loss_vec, opt_beta, lr_vector = model.build_model(X, Y, 100)
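As an aside: if you can drop the compat.v1 optimizer, the TF2-native schedules make the manual global step unnecessary. A tf.keras.optimizers.schedules.ExponentialDecay schedule is evaluated at optimizer.iterations, and tf.train.Checkpoint saves and restores that counter together with the optimizer, so the decay resumes automatically. A minimal sketch of this approach (not a drop-in replacement for the script above; RMSprop stands in for RMSPropOptimizer, and ./tf_ckpts_keras is a fresh directory):

import tensorflow as tf

X = tf.range(10.)
Y = 50. * X
beta = tf.Variable(1.)

# The schedule is evaluated at optimizer.iterations on every update
schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.05, decay_steps=100, decay_rate=0.96, staircase=True)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=schedule)

# optimizer.iterations is part of the optimizer's tracked state,
# so restoring the checkpoint also restores the position in the schedule
ckpt = tf.train.Checkpoint(beta=beta, optimizer=optimizer)
manager = tf.train.CheckpointManager(ckpt, './tf_ckpts_keras', max_to_keep=3)
ckpt.restore(manager.latest_checkpoint)

for i in range(500):
    optimizer.minimize(lambda: tf.reduce_mean(tf.square(beta * X - Y)),
                       var_list=[beta])
    if (i + 1) % 100 == 0:
        manager.save()
        print("learning_rate :", schedule(optimizer.iterations).numpy())

Since nothing outside the optimizer has to be tracked by hand, there is no step variable left to forget.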