恢复 Keras 模型的训练
Resuming Training of Keras Model
我正在使用 tensorflow.keras 构建一个包含 3 个全连接(Dense)层的简单神经网络。我已经成功将模型训练了 9000 个 epoch,均方误差(MSE)达到 0.0496。然而,每当我恢复模型继续训练时,MSE 都从大约 57 开始。
这似乎表明模型权重没有成功加载;但是,如果从头开始训练(不加载之前保存的权重),MSE 则从 9000 以上开始。
编辑:
- 这是正常现象,还是我做错了什么?
- 为什么即使训练了 9000 个 epoch,准确率仍然始终为 0.0?
我的代码如下:
from __future__ import absolute_import, division, print_function
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import load_model
# Report the TensorFlow version in use.
print(tf.__version__)

# Raw string: '\d' is not a valid escape sequence, so spell the Windows
# backslash out explicitly instead of relying on Python passing it through.
dataset_path = r'D:\data.csv'
# Checkpoint files to resume from: architecture as JSON, weights as HDF5.
checkpoint_model_json_path = 'modelBackup/model.json'
checkpoint_weights_h5_path = 'modelBackup/weights00009000.h5'
# True: restore the checkpointed model; False: build a new one.
resume_from_checkpoint = True

print('reading dataset...')
# 252 column names: 'paircode', then fifty groups x1..x50 that each carry the
# suffixes o, h, l, c, v (in that order), and finally 'nextclose'.
column_names = (
    ['paircode']
    + ['x{}{}'.format(bar, field) for bar in range(1, 51) for field in 'ohlcv']
    + ['nextclose']
)
# Parse the CSV using the explicit column names; row 0 of the file is skipped
# and "?" is treated as a missing value.
dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values="?",
    comment='\t',
    sep=",",
    skipinitialspace=True,
    skiprows=[0],
)
print('printing dataset tail...')
print(dataset.tail())

# Seeded 80% sample for training; the rows not sampled form the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# pop() removes 'nextclose' from each frame and returns it as the labels.
train_labels = train_dataset.pop('nextclose')
test_labels = test_dataset.pop('nextclose')
def norm(x):
    """Return ``x`` unchanged.

    Normalization is currently a pass-through. The z-score form
    ``(x - train_stats['mean']) / train_stats['std']`` was left disabled,
    and ``train_stats`` is not defined anywhere in the code shown here.

    Args:
        x: The data to "normalize".

    Returns:
        The very same object that was passed in.
    """
    return x
print('normalizing dataset...')
# Both splits go through the same norm() call.
normed_train_data, normed_test_data = norm(train_dataset), norm(test_dataset)
def build_model():
    """Build an uncompiled 512-512-256-1 fully connected network.

    The input width is the number of columns in the module-level
    ``train_dataset``. The three hidden layers use ReLU; the single output
    unit is created without an activation argument.

    Returns:
        The new ``keras.Sequential`` model.
    """
    print('building the model')
    n_features = len(train_dataset.keys())
    stack = [
        layers.Dense(512, activation=tf.nn.relu, input_shape=[n_features]),
        layers.Dense(512, activation=tf.nn.relu),
        layers.Dense(256, activation=tf.nn.relu),
        layers.Dense(1),
    ]
    return keras.Sequential(stack)
def load_model_():
    """Recreate the checkpointed model from its JSON architecture and weights.

    Reads the architecture from ``checkpoint_model_json_path`` and then loads
    the weights stored at ``checkpoint_weights_h5_path``. The model is not
    compiled here.

    Returns:
        The Keras model with the saved weights loaded.
    """
    print('loading the model')
    # The context manager closes the file even if reading raises.
    with open(checkpoint_model_json_path, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # Load the saved weights into the freshly created model.
    loaded_model.load_weights(checkpoint_weights_h5_path)
    print("Loaded model from disk")
    return loaded_model
# Either restore the checkpointed model or start from a new one.
model = load_model_() if resume_from_checkpoint else build_model()

# compile() runs in both cases, with the optimizer given by name ('adam').
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error', 'mean_squared_error', 'accuracy'],
)
model.summary()

# Sanity check: predict on the first ten training rows and print the result.
print('testing 10 widthed batch...')
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
print(example_result)
def plot_history(history):
    """Plot train vs. validation error per epoch, one figure per metric.

    Draws one figure for mean absolute error (y-axis limited to 0-5) and one
    for mean squared error (y-axis limited to 0-20), then shows them.

    Args:
        history: An object exposing ``.history`` (passed to ``pd.DataFrame``)
            and ``.epoch`` -- presumably the value returned by ``model.fit``.
    """
    frame = pd.DataFrame(history.history)
    frame['epoch'] = history.epoch

    # (metric column, y-axis label, upper y limit) for each figure.
    panels = (
        ('mean_absolute_error', 'Mean Abs Error [nextclose]', 5),
        ('mean_squared_error', 'Mean Square Error [$nextclose^2$]', 20),
    )
    for metric, y_label, y_max in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(frame['epoch'], frame[metric], label='Train Error')
        plt.plot(frame['epoch'], frame['val_' + metric], label='Val Error')
        plt.ylim([0, y_max])
        plt.legend()
    plt.show()
print('fitting the model...')
# Checkpoint callback: weights only, written to an epoch-numbered .h5 file.
mc = keras.callbacks.ModelCheckpoint(
    'weights{epoch:08d}.h5',
    save_weights_only=True,
    period=500,
)

# Store the architecture as JSON next to the weight checkpoints.
print('saving the model...')
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

history = model.fit(
    normed_train_data,
    train_labels,
    epochs=1,
    validation_split=0.2,
    verbose=2,
    batch_size=100000,
    callbacks=[mc],
)

print('evaluating the model...')
# The evaluation result is unpacked into four values: loss, MAE, MSE, accuracy.
test_scores = model.evaluate(normed_test_data, test_labels, verbose=0)
loss, mae, mse, accuracy = test_scores
print("Testing set Mean Abs Error: {:5.2f} nextclose".format(mae))
print("Testing set Accuracy: {:5.2f} nextclose".format(accuracy))
输出:
1.13.1
reading dataset...
printing dataset tail...
paircode x1o x1h x1l x1c x1v x2o x2h x2l x2c x2v x3o x3h ... x48c x48v x49o x49h x49l x49c x49v x50o x50h x50l x50c x50v nextclose
381045 50 112.606 112.622 112.606 112.619 0.0 112.580 112.581 112.561 112.575 0.0 112.601 112.612 ... 112.118 0.0 112.083 112.090 112.079 112.087 0.0 112.025 112.033 112.023 112.032 0.0 112.033
381046 50 112.580 112.581 112.561 112.575 0.0 112.601 112.612 112.598 112.599 0.0 112.581 112.599 ... 112.087 0.0 112.025 112.033 112.023 112.032 0.0 112.031 112.034 112.031 112.033 0.0 112.141
381047 50 112.601 112.612 112.598 112.599 0.0 112.581 112.599 112.580 112.593 0.0 112.548 112.548 ... 112.032 0.0 112.031 112.034 112.031 112.033 0.0 112.142 112.149 112.140 112.141 0.0 112.157
381048 50 112.581 112.599 112.580 112.593 0.0 112.548 112.548 112.540 112.542 0.0 112.551 112.565 ... 112.033 0.0 112.142 112.149 112.140 112.141 0.0 112.161 112.161 112.157 112.157 0.0 112.121
381049 50 112.548 112.548 112.540 112.542 0.0 112.551 112.565 112.551 112.565 0.0 112.564 112.577 ... 112.141 0.0 112.161 112.161 112.157 112.157 0.0 112.121 112.129 112.121 112.121 0.0 112.140
[5 rows x 252 columns]
normalizing dataset...
loading the model
WARNING:tensorflow:From C:\Program Files\Python36\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
2019-04-05 12:10:15.520118: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
Loaded model from disk
WARNING:tensorflow:From C:\Program Files\Python36\lib\site-packages\tensorflow\python\keras\utils\losses_utils.py:170: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 512) 129024
_________________________________________________________________
dense_1 (Dense) (None, 512) 262656
_________________________________________________________________
dense_2 (Dense) (None, 256) 131328
_________________________________________________________________
dense_3 (Dense) (None, 1) 257
=================================================================
Total params: 523,265
Trainable params: 523,265
Non-trainable params: 0
_________________________________________________________________
testing 10 widthed batch...
[[106.244064]
[ 76.667534]
[ 82.01627 ]
[ 79.776405]
[116.600204]
[ 95.28444 ]
[ 76.96633 ]
[118.25993 ]
[120.39911 ]
[108.5381 ]]
fitting the model...
saving the model...
Train on 243872 samples, validate on 60968 samples
WARNING:tensorflow:From C:\Program Files\Python36\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
- 6s - loss: 56.9330 - mean_absolute_error: 5.3921 - mean_squared_error: 56.9330 - acc: 0.0000e+00 - val_loss: 38.9868 - val_mean_absolute_error: 6.1875 - val_mean_squared_error: 38.9868 - val_acc: 0.0000e+00
evaluating the model...
Testing set Mean Abs Error: 6.19 nextclose
Testing set Accuracy: 0.00 nextclose
提前致谢。
您可以简单地构建模型、编译它并保留随机初始化的权重以开始训练。接下来,恢复训练:构建模型,编译它,然后重新加载保存的权重。
要恢复训练,您不应该运行完整的代码,只需运行以下这段:
# Load the model stored in 'model.h5' and continue fitting it.
model = load_model('model.h5')
history = model.fit(
    normed_train_data, train_labels,
    epochs=1, validation_split=0.2, verbose=2,
    batch_size=128, callbacks=[mc],
)
但是你应该编辑这个:
mc = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', save_weights_only=True, period=100)
改成这样:
mc = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', save_weights_only=False, period=100)
如前所述,这是一个已在 tensorflow-gpu 2.0 nightly 版本中修复的错误(原回答在此处附有链接)。
我正在使用 tensorflow.keras 构建一个包含 3 个全连接(Dense)层的简单神经网络。我已经成功将模型训练了 9000 个 epoch,均方误差(MSE)达到 0.0496。然而,每当我恢复模型继续训练时,MSE 都从大约 57 开始。
这似乎表明模型权重没有成功加载;但是,如果从头开始训练(不加载之前保存的权重),MSE 则从 9000 以上开始。
编辑:
- 这是正常现象,还是我做错了什么?
- 为什么即使训练了 9000 个 epoch,准确率仍然始终为 0.0?
我的代码如下:
from __future__ import absolute_import, division, print_function
import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import load_model
# Report the TensorFlow version in use.
print(tf.__version__)

# Raw string: '\d' is not a valid escape sequence, so spell the Windows
# backslash out explicitly instead of relying on Python passing it through.
dataset_path = r'D:\data.csv'
# Checkpoint files to resume from: architecture as JSON, weights as HDF5.
checkpoint_model_json_path = 'modelBackup/model.json'
checkpoint_weights_h5_path = 'modelBackup/weights00009000.h5'
# True: restore the checkpointed model; False: build a new one.
resume_from_checkpoint = True

print('reading dataset...')
# 252 column names: 'paircode', then fifty groups x1..x50 that each carry the
# suffixes o, h, l, c, v (in that order), and finally 'nextclose'.
column_names = (
    ['paircode']
    + ['x{}{}'.format(bar, field) for bar in range(1, 51) for field in 'ohlcv']
    + ['nextclose']
)
# Parse the CSV using the explicit column names; row 0 of the file is skipped
# and "?" is treated as a missing value.
dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values="?",
    comment='\t',
    sep=",",
    skipinitialspace=True,
    skiprows=[0],
)
print('printing dataset tail...')
print(dataset.tail())

# Seeded 80% sample for training; the rows not sampled form the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# pop() removes 'nextclose' from each frame and returns it as the labels.
train_labels = train_dataset.pop('nextclose')
test_labels = test_dataset.pop('nextclose')
def norm(x):
    """Return ``x`` unchanged.

    Normalization is currently a pass-through. The z-score form
    ``(x - train_stats['mean']) / train_stats['std']`` was left disabled,
    and ``train_stats`` is not defined anywhere in the code shown here.

    Args:
        x: The data to "normalize".

    Returns:
        The very same object that was passed in.
    """
    return x
print('normalizing dataset...')
# Both splits go through the same norm() call.
normed_train_data, normed_test_data = norm(train_dataset), norm(test_dataset)
def build_model():
    """Build an uncompiled 512-512-256-1 fully connected network.

    The input width is the number of columns in the module-level
    ``train_dataset``. The three hidden layers use ReLU; the single output
    unit is created without an activation argument.

    Returns:
        The new ``keras.Sequential`` model.
    """
    print('building the model')
    n_features = len(train_dataset.keys())
    stack = [
        layers.Dense(512, activation=tf.nn.relu, input_shape=[n_features]),
        layers.Dense(512, activation=tf.nn.relu),
        layers.Dense(256, activation=tf.nn.relu),
        layers.Dense(1),
    ]
    return keras.Sequential(stack)
def load_model_():
    """Recreate the checkpointed model from its JSON architecture and weights.

    Reads the architecture from ``checkpoint_model_json_path`` and then loads
    the weights stored at ``checkpoint_weights_h5_path``. The model is not
    compiled here.

    Returns:
        The Keras model with the saved weights loaded.
    """
    print('loading the model')
    # The context manager closes the file even if reading raises.
    with open(checkpoint_model_json_path, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # Load the saved weights into the freshly created model.
    loaded_model.load_weights(checkpoint_weights_h5_path)
    print("Loaded model from disk")
    return loaded_model
# Either restore the checkpointed model or start from a new one.
model = load_model_() if resume_from_checkpoint else build_model()

# compile() runs in both cases, with the optimizer given by name ('adam').
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error', 'mean_squared_error', 'accuracy'],
)
model.summary()

# Sanity check: predict on the first ten training rows and print the result.
print('testing 10 widthed batch...')
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
print(example_result)
def plot_history(history):
    """Plot train vs. validation error per epoch, one figure per metric.

    Draws one figure for mean absolute error (y-axis limited to 0-5) and one
    for mean squared error (y-axis limited to 0-20), then shows them.

    Args:
        history: An object exposing ``.history`` (passed to ``pd.DataFrame``)
            and ``.epoch`` -- presumably the value returned by ``model.fit``.
    """
    frame = pd.DataFrame(history.history)
    frame['epoch'] = history.epoch

    # (metric column, y-axis label, upper y limit) for each figure.
    panels = (
        ('mean_absolute_error', 'Mean Abs Error [nextclose]', 5),
        ('mean_squared_error', 'Mean Square Error [$nextclose^2$]', 20),
    )
    for metric, y_label, y_max in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(frame['epoch'], frame[metric], label='Train Error')
        plt.plot(frame['epoch'], frame['val_' + metric], label='Val Error')
        plt.ylim([0, y_max])
        plt.legend()
    plt.show()
print('fitting the model...')
# Checkpoint callback: weights only, written to an epoch-numbered .h5 file.
mc = keras.callbacks.ModelCheckpoint(
    'weights{epoch:08d}.h5',
    save_weights_only=True,
    period=500,
)

# Store the architecture as JSON next to the weight checkpoints.
print('saving the model...')
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

history = model.fit(
    normed_train_data,
    train_labels,
    epochs=1,
    validation_split=0.2,
    verbose=2,
    batch_size=100000,
    callbacks=[mc],
)

print('evaluating the model...')
# The evaluation result is unpacked into four values: loss, MAE, MSE, accuracy.
test_scores = model.evaluate(normed_test_data, test_labels, verbose=0)
loss, mae, mse, accuracy = test_scores
print("Testing set Mean Abs Error: {:5.2f} nextclose".format(mae))
print("Testing set Accuracy: {:5.2f} nextclose".format(accuracy))
输出:
1.13.1
reading dataset...
printing dataset tail...
paircode x1o x1h x1l x1c x1v x2o x2h x2l x2c x2v x3o x3h ... x48c x48v x49o x49h x49l x49c x49v x50o x50h x50l x50c x50v nextclose
381045 50 112.606 112.622 112.606 112.619 0.0 112.580 112.581 112.561 112.575 0.0 112.601 112.612 ... 112.118 0.0 112.083 112.090 112.079 112.087 0.0 112.025 112.033 112.023 112.032 0.0 112.033
381046 50 112.580 112.581 112.561 112.575 0.0 112.601 112.612 112.598 112.599 0.0 112.581 112.599 ... 112.087 0.0 112.025 112.033 112.023 112.032 0.0 112.031 112.034 112.031 112.033 0.0 112.141
381047 50 112.601 112.612 112.598 112.599 0.0 112.581 112.599 112.580 112.593 0.0 112.548 112.548 ... 112.032 0.0 112.031 112.034 112.031 112.033 0.0 112.142 112.149 112.140 112.141 0.0 112.157
381048 50 112.581 112.599 112.580 112.593 0.0 112.548 112.548 112.540 112.542 0.0 112.551 112.565 ... 112.033 0.0 112.142 112.149 112.140 112.141 0.0 112.161 112.161 112.157 112.157 0.0 112.121
381049 50 112.548 112.548 112.540 112.542 0.0 112.551 112.565 112.551 112.565 0.0 112.564 112.577 ... 112.141 0.0 112.161 112.161 112.157 112.157 0.0 112.121 112.129 112.121 112.121 0.0 112.140
[5 rows x 252 columns]
normalizing dataset...
loading the model
WARNING:tensorflow:From C:\Program Files\Python36\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
2019-04-05 12:10:15.520118: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
Loaded model from disk
WARNING:tensorflow:From C:\Program Files\Python36\lib\site-packages\tensorflow\python\keras\utils\losses_utils.py:170: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 512) 129024
_________________________________________________________________
dense_1 (Dense) (None, 512) 262656
_________________________________________________________________
dense_2 (Dense) (None, 256) 131328
_________________________________________________________________
dense_3 (Dense) (None, 1) 257
=================================================================
Total params: 523,265
Trainable params: 523,265
Non-trainable params: 0
_________________________________________________________________
testing 10 widthed batch...
[[106.244064]
[ 76.667534]
[ 82.01627 ]
[ 79.776405]
[116.600204]
[ 95.28444 ]
[ 76.96633 ]
[118.25993 ]
[120.39911 ]
[108.5381 ]]
fitting the model...
saving the model...
Train on 243872 samples, validate on 60968 samples
WARNING:tensorflow:From C:\Program Files\Python36\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
- 6s - loss: 56.9330 - mean_absolute_error: 5.3921 - mean_squared_error: 56.9330 - acc: 0.0000e+00 - val_loss: 38.9868 - val_mean_absolute_error: 6.1875 - val_mean_squared_error: 38.9868 - val_acc: 0.0000e+00
evaluating the model...
Testing set Mean Abs Error: 6.19 nextclose
Testing set Accuracy: 0.00 nextclose
提前致谢。
您可以简单地构建模型、编译它并保留随机初始化的权重以开始训练。接下来,恢复训练:构建模型,编译它,然后重新加载保存的权重。
要恢复训练,您不应该运行完整的代码,只需运行以下这段:
# Load the model stored in 'model.h5' and continue fitting it.
model = load_model('model.h5')
history = model.fit(
    normed_train_data, train_labels,
    epochs=1, validation_split=0.2, verbose=2,
    batch_size=128, callbacks=[mc],
)
但是你应该编辑这个:
mc = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', save_weights_only=True, period=100)
改成这样:
mc = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', save_weights_only=False, period=100)
如前所述,这是一个已在 tensorflow-gpu 2.0 nightly 版本中修复的错误(原回答在此处附有链接)。