TensorFlow loss not converging
I am working on eye landmark extraction. I applied some augmentation and normalization to the training data, but during the training phase the error function does not seem to decrease.
The initial learning rate is set to 1e-3 and is decayed every 20 epochs; the batch size is 64.
Here is my code:
from __future__ import division, print_function
import tensorflow as tf
import cv2
import numpy as np
import os
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
class DataSet(object):
    def __init__(self, images, landmark, size, fake=False):
        if fake:
            self._num_examples = size
        else:
            assert images.shape[0] == landmark.shape[0], ("images.shape: %s landmark.shape: %s" % (images.shape, landmark.shape))
            self._num_examples = images.shape[0]
        self._images = np.asanyarray(images, dtype=np.float32)
        self._landmark = np.asanyarray(landmark, dtype=np.float32)
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self.train = None
        self.test = None

    @property
    def images(self):
        return self._images

    @property
    def landmark(self):
        return self._landmark

    @property
    def num_examples(self):
        return self._num_examples

    @property
    def epochs_completed(self):
        return self._epochs_completed

    def get_next_batch(self, batch_size, fake=False):
        if fake:
            fake_image = [1.0 for _ in range(1024)]
            fake_landmark = [0.0] * 4
            return [fake_image for _ in range(batch_size)], [fake_landmark for _ in range(batch_size)]
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            self._epochs_completed += 1
            perm = np.arange(self._num_examples)
            np.random.shuffle(perm)
            self._images = self._images[perm]
            self._landmark = self._landmark[perm]
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self._images[start:end], self._landmark[start:end]
def read_data_sets(fake=False):
    empty = np.empty([0], dtype=np.float32)
    data_sets = DataSet(empty, empty, size=0, fake=False)
    if fake:
        data_sets.train = DataSet(empty, empty, size=0, fake=True)
        data_sets.test = DataSet(empty, empty, size=0, fake=True)

    print("reading training data...")
    train_dir = "train_imgs_aug.txt"
    f = open(train_dir, 'r')
    lines = f.readlines()
    total_num = len(lines)
    lines = np.random.permutation(lines)
    VALIDATION_SIZE = 25000
    images_train = np.empty((VALIDATION_SIZE, 32, 32), dtype=np.float32)
    landmark_train = np.empty((VALIDATION_SIZE, 4), dtype=np.float32)
    count = 0
    for line in lines:
        content = line.split(" ")
        if not content: break
        img = cv2.imread(content[0], 0)
        images_train[count] = img / 255.0
        landmark_train[count] = [float(content[1]), float(content[2]), float(content[3]), float(content[4])]
        count += 1
        if count % 100 == 0:
            print(count)
        if count >= VALIDATION_SIZE:
            lines = lines[count:]
            break
    f.close()

    print("reading testing data...")
    VALIDATION_SIZE = total_num - VALIDATION_SIZE
    images_test = np.empty((VALIDATION_SIZE, 32, 32), dtype=np.float32)
    landmarks_test = np.empty((VALIDATION_SIZE, 4), dtype=np.float32)
    count = 0
    for line in lines:
        content = line.split(" ")
        if not content: break
        img = cv2.imread(content[0], 0)
        images_test[count] = img / 255.0
        landmarks_test[count] = [float(content[1]), float(content[2]), float(content[3]), float(content[4])]
        count += 1
        if count % 100 == 0:
            print(count)

    data_sets.train = DataSet(images_train, landmark_train, size=0, fake=False)
    data_sets.test = DataSet(images_test, landmarks_test, size=0, fake=False)
    print(data_sets.train.num_examples, data_sets.test.num_examples)
    return data_sets
if __name__ == '__main__':
    data = read_data_sets(fake=False)
    print("Read data end!!\n")

    image = tf.placeholder(tf.float32, [None, 32, 32])
    landmarks = tf.placeholder(tf.float32, [None, 4])

    # paras
    W_conv1 = weight_variable([3, 3, 1, 16])
    b_conv1 = bias_variable([16])

    # conv layer-1
    x_image = tf.reshape(image, [-1, 32, 32, 1])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    print(h_conv1.get_shape())
    h_pool1 = max_pool_2x2(h_conv1)
    print(h_pool1.get_shape())

    # conv layer-2
    W_conv2 = weight_variable([3, 3, 16, 32])
    b_conv2 = bias_variable([32])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    print(h_conv2.get_shape())
    h_pool2 = max_pool_2x2(h_conv2)
    print(h_pool2.get_shape())

    # conv layer-3
    W_conv3 = weight_variable([3, 3, 32, 64])
    b_conv3 = bias_variable([64])
    h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
    print(h_conv3.get_shape())
    h_pool3 = max_pool_2x2(h_conv3)
    print(h_pool3.get_shape())

    # dense
    W_fc1 = weight_variable([1024, 512])
    b_fc1 = bias_variable([512])
    h_pool3_flat = tf.reshape(h_pool3, [-1, 1024])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)

    # dropout
    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    # output-dense
    W_fc2 = weight_variable([512, 4])
    b_fc2 = bias_variable([4])
    y_conv = tf.nn.relu(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

    # training
    print(landmarks.get_shape())
    print(y_conv.get_shape())
    error = 1 / 2 * tf.reduce_mean(tf.squared_difference(landmarks, y_conv)) + 2 * tf.nn.l2_loss(W_fc2)
    # error = 1 / 2 * tf.reduce_mean(tf.square(landmarks - y_conv)) + 2 * tf.nn.l2_loss(W_fc2)
    lr = tf.placeholder(tf.float64)
    train_step = tf.train.AdamOptimizer(lr).minimize(error)

    train_name = "train(32)"
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(train_name)
    print("start Training...")

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if ckpt and ckpt.model_checkpoint_path:
            import re
            ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(train_name, ckpt_name))
            counter = int(next(re.finditer("(\d+)(?!.*\d)", ckpt_name)).group(0))
            print(" [*] Success to read {}".format(ckpt_name))
        else:
            counter = 0
            print(" [*] Failed to find a checkpoint")

        numberr = 0
        soll_EPOCH = 500
        EPOCH = 0
        learning_rate = 0.001
        for i in range(25000 * soll_EPOCH):
            image_, landmark_ = data.train.get_next_batch(64)
            error_data = sess.run(error, feed_dict={
                image: image_, landmarks: landmark_, keep_prob: 0.9, lr: learning_rate
            })
            if i % 25000 == 0:
                EPOCH += 1
                print("epoch:", EPOCH, ",training err :", error_data)
                test_image, test_landmark = data.test.get_next_batch(32)
                error_test = sess.run(error, feed_dict={
                    image: test_image, landmarks: test_landmark, keep_prob: 1, lr: learning_rate
                })
                print("landmark testing error :", error_test)
                if cv2.waitKey(1) == ord('q'):
                    print("early stopping")
                    break
                if EPOCH % 20 == 0:
                    learning_rate = learning_rate * 0.5
            numberr = i

        model_name = "model"
        checkpoint_dir = train_name
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        print("total training count: ", counter + numberr)
        saver.save(sess, os.path.join(checkpoint_dir, model_name), global_step=counter + numberr)
And the printed error values are:
epoch: 1 ,training err : 16.404902
landmark testing error : 16.376444
epoch: 2 ,training err : 16.40308
landmark testing error : 16.361961
epoch: 3 ,training err : 16.43909
landmark testing error : 16.377134
epoch: 4 ,training err : 16.404552
landmark testing error : 16.370085
epoch: 5 ,training err : 16.433796
landmark testing error : 16.374432
epoch: 6 ,training err : 16.4025
landmark testing error : 16.362604
epoch: 7 ,training err : 16.378864
landmark testing error : 16.36681
epoch: 8 ,training err : 16.408669
landmark testing error : 16.37526
epoch: 9 ,training err : 16.414948
landmark testing error : 16.389187
epoch: 10 ,training err : 16.416836
landmark testing error : 16.373346
epoch: 11 ,training err : 16.429422
landmark testing error : 16.378914
epoch: 12 ,training err : 16.424402
landmark testing error : 16.357906
I sincerely hope someone can help me... Thank you very much!
I think your l2_loss term is weighted far too heavily. tf.nn.l2_loss(W_fc2) sums the squares of all 512 × 4 output weights, so with a coefficient of 2 it can easily dominate the mean squared error, and the optimizer mostly ends up shrinking the weights instead of fitting the landmarks.
Maybe try:
error = tf.reduce_mean(tf.squared_difference(landmarks, y_conv)) + 0.01 * tf.nn.l2_loss(W_fc2)
Apart from that, you should consider using a higher-level API such as tf.layers. It will save you a lot of work, probably do some things better (such as initialization), and make your model more readable.
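For example, here is a minimal sketch of the same network written with tf.layers (TF 1.x API). The layer sizes match your model; the is_training placeholder, the fixed dropout rate of 0.1 (the counterpart of keep_prob = 0.9), and the linear output layer are assumptions I am making for illustration, not things taken from your code:

import tensorflow as tf

# Placeholders with the same shapes as in the original model.
image = tf.placeholder(tf.float32, [None, 32, 32])
landmarks = tf.placeholder(tf.float32, [None, 4])
is_training = tf.placeholder(tf.bool)  # assumed switch for dropout, replaces keep_prob

x_image = tf.reshape(image, [-1, 32, 32, 1])

# Three conv/pool blocks: 16 -> 32 -> 64 filters, 3x3 kernels, 2x2 pooling.
net = tf.layers.conv2d(x_image, filters=16, kernel_size=3, padding='same', activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2, padding='same')
net = tf.layers.conv2d(net, filters=32, kernel_size=3, padding='same', activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2, padding='same')
net = tf.layers.conv2d(net, filters=64, kernel_size=3, padding='same', activation=tf.nn.relu)
net = tf.layers.max_pooling2d(net, pool_size=2, strides=2, padding='same')

# 4 * 4 * 64 = 1024 features after three 2x2 poolings of a 32x32 input.
net = tf.layers.flatten(net)
net = tf.layers.dense(net, 512, activation=tf.nn.relu)
net = tf.layers.dropout(net, rate=0.1, training=is_training)  # rate is the DROP probability
y_conv = tf.layers.dense(net, 4)  # linear output; a ReLU on a regression output is usually unnecessary

error = tf.reduce_mean(tf.squared_difference(landmarks, y_conv))

If you still want weight decay on the output layer, tf.layers.dense also accepts a kernel_regularizer argument; the penalties it creates are collected in tf.GraphKeys.REGULARIZATION_LOSSES, so you can fetch them with tf.losses.get_regularization_loss() and add them to the loss with a small coefficient, like the 0.01 above.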