Tensorflow: Run training phase on GPU and test phase on CPU
I would like to run the training phase of my tensorflow code on my GPU, and then, once I have finished and stored the results, load the model I created and run its test phase on the CPU.
I have created this code (I have included only part of it, for reference, because it is huge; otherwise I know the guidelines require a fully functional piece of code, and I apologize for that).
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn.python.ops import rnn_cell, rnn
# Import MNIST data http://yann.lecun.com/exdb/mnist/
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x_train = mnist.train.images
# Check that the dataset contains 55,000 rows and 784 columns
N,D = x_train.shape
tf.reset_default_graph()
sess = tf.InteractiveSession()
x = tf.placeholder("float", [None, n_steps,n_input])
y_true = tf.placeholder("float", [None, n_classes])
keep_prob = tf.placeholder(tf.float32,shape=[])
learning_rate = tf.placeholder(tf.float32,shape=[])
#[............Build the RNN graph model.............]
sess.run(tf.global_variables_initializer())
# Because I am using my GPU for the training, I avoid allocating the whole
# mnist.validation set because of memory error, so I fragment it into
# small batches (100)
x_validation_bin, y_validation_bin = mnist.validation.next_batch(batch_size)
x_validation_bin = binarize(x_validation_bin, threshold=0.1)
x_validation_bin = x_validation_bin.reshape((-1,n_steps,n_input))
for k in range(epochs):
    steps = 0
    for i in range(training_iters):
        # Stochastic descent
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        batch_x = binarize(batch_x, threshold=0.1)
        batch_x = batch_x.reshape((-1, n_steps, n_input))
        # Feed Python scalars into the placeholders defined above
        # (keep_prob_value and eta are assumed to be floats set elsewhere)
        sess.run(train_step, feed_dict={x: batch_x, y_true: batch_y, keep_prob: keep_prob_value, learning_rate: eta})
        if do_report_err == 1:
            if steps % display_step == 0:
                # Calculate batch accuracy
                acc = sess.run(accuracy, feed_dict={x: batch_x, y_true: batch_y, keep_prob: 1.0})
                # Calculate batch loss
                loss = sess.run(total_loss, feed_dict={x: batch_x, y_true: batch_y, keep_prob: 1.0})
                print("Iter " + str(i) + ", Minibatch Loss= " + "{:.6f}".format(loss) + ", Training Accuracy = " + "{:.5f}".format(acc))
        steps += 1
    # Validation Accuracy and Cost
    validation_accuracy = sess.run(accuracy, feed_dict={x: x_validation_bin, y_true: y_validation_bin, keep_prob: 1.0})
    validation_cost = sess.run(total_loss, feed_dict={x: x_validation_bin, y_true: y_validation_bin, keep_prob: 1.0})
    validation_loss_array.append(validation_cost)
    validation_accuracy_array.append(validation_accuracy)
    saver.save(sess, savefilename)
    total_epochs = total_epochs + 1

np.savez(datasavefilename, epochs_saved=total_epochs, learning_rate_saved=eta, keep_prob_saved=best_keep_prob, validation_loss_array_saved=validation_loss_array, validation_accuracy_array_saved=validation_accuracy_array, modelsavefilename=savefilename)
After that, my model has been trained successfully and the relevant data saved, so I would like to load the files and do the final train and test part on the model, but this time using my CPU. The reason is that the GPU cannot handle the entire dataset of mnist.train.images and mnist.train.labels.
So, manually I select this part and I run it:
with tf.device('/cpu:0'):
    # Initialise variables
    sess.run(tf.global_variables_initializer())
    # Accuracy and Cost
    saver.restore(sess, savefilename)
    x_train_bin = binarize(mnist.train.images, threshold=0.1)
    x_train_bin = x_train_bin.reshape((-1, n_steps, n_input))
    final_train_accuracy = sess.run(accuracy, feed_dict={x: x_train_bin, y_true: mnist.train.labels, keep_prob: 1.0})
    final_train_cost = sess.run(total_loss, feed_dict={x: x_train_bin, y_true: mnist.train.labels, keep_prob: 1.0})
    x_test_bin = binarize(mnist.test.images, threshold=0.1)
    x_test_bin = x_test_bin.reshape((-1, n_steps, n_input))
    final_test_accuracy = sess.run(accuracy, feed_dict={x: x_test_bin, y_true: mnist.test.labels, keep_prob: 1.0})
    final_test_cost = sess.run(total_loss, feed_dict={x: x_test_bin, y_true: mnist.test.labels, keep_prob: 1.0})
But I get an OOM GPU memory error, which does not make sense to me, since I believe I have forced the program to rely on the CPU. I did not put a sess.close() command in the first (batch-training) code, but I am not sure whether that is really the reason behind it. I actually followed this post.
Any suggestions on how to run the last part on the CPU only?
with tf.device() statements apply only to graph construction, not to execution, so doing sess.run inside a device block is equivalent to having no device at all.
To accomplish what you want to do, you need to build separate training and test graphs that share variables.
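A minimal sketch of that idea, under the question's own assumptions: the model-building code, n_steps, n_input, n_classes, binarize, mnist and savefilename all come from the poster's script, and accuracy is assumed to be produced by the same graph-building code as in training.

import tensorflow as tf

# Start from a clean graph so nothing is still pinned to the GPU
tf.reset_default_graph()

with tf.device('/cpu:0'):
    # Rebuild the *same* graph as in training, pinned to the CPU at
    # construction time; this is the stage where tf.device actually matters.
    x = tf.placeholder("float", [None, n_steps, n_input])
    y_true = tf.placeholder("float", [None, n_classes])
    keep_prob = tf.placeholder(tf.float32, shape=[])
    # [............Build the same RNN graph model here.............]
    # accuracy is assumed to come out of that model code, as in training.

saver = tf.train.Saver()

# Additionally hide the GPU from the session entirely, so even ops
# without an explicit device cannot be placed on it.
config = tf.ConfigProto(device_count={'GPU': 0})
with tf.Session(config=config) as sess:
    saver.restore(sess, savefilename)  # load the trained variables
    x_test_bin = binarize(mnist.test.images, threshold=0.1)
    x_test_bin = x_test_bin.reshape((-1, n_steps, n_input))
    final_test_accuracy = sess.run(accuracy, feed_dict={x: x_test_bin, y_true: mnist.test.labels, keep_prob: 1.0})

Running this as a separate script also sidesteps the earlier session still holding GPU memory; setting the environment variable CUDA_VISIBLE_DEVICES="" before importing tensorflow has a similar effect of keeping the process off the GPU.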