I want to use tf.Softmax
I built a model that predicts MNIST data using tf.softmax.
But it doesn't work: the cost is printed as nan.
I know that using tf.nn.softmax_cross_entropy_with_logits like this works:
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis,labels=Y))+ (0.01 * l2reg)
I think something is wrong with my cost code.
I would like to know why the tf.softmax approach does not work in the neural network.
And is my adaptation of l2reg correct??
Thanks~
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import matplotlib.pyplot as plt
import random
import numpy as np

mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)

nb_classes = 10

X = tf.placeholder(tf.float32, [None, 28*28], name='x-input')
Y = tf.placeholder(tf.float32, [None, nb_classes], name='y-input')

with tf.name_scope('layer1') as scope:
    W1 = tf.Variable(tf.random_normal([28*28, 28*28]), name='weight1')
    b1 = tf.Variable(tf.random_normal([28*28]), name='bias1')
    layer1 = tf.nn.relu(tf.matmul(X, W1) + b1)

    w1_hist = tf.summary.histogram('weight1', W1)
    layer1_hist = tf.summary.histogram('layer1', layer1)

with tf.name_scope('layer2') as scope:
    W2 = tf.Variable(tf.random_normal([28*28, 28*28]), name='weight2')
    b2 = tf.Variable(tf.random_normal([28*28]), name='bias2')
    layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2)

    # 1. From TF graph, decide which tensors you want to log
    w2_hist = tf.summary.histogram('weight2', W2)
    layer2_hist = tf.summary.histogram('layer2', layer2)

with tf.name_scope('layer3') as scope:
    W3 = tf.Variable(tf.random_normal([28*28, nb_classes]), name='weight3')
    b3 = tf.Variable(tf.random_normal([nb_classes]), name='bias3')
    logits = tf.matmul(layer2, W3) + b3
    # hypothesis = tf.div(tf.exp(logits),tf.exp(logit,dim)
    hypothesis = tf.nn.softmax(logits)

    w3_hist = tf.summary.histogram('weight3', W3)
    hypothesis_hist = tf.summary.histogram('hypothesis', hypothesis)

with tf.name_scope('cost') as scope:
    # the method of l2reg, when deep
    l2reg = tf.reduce_sum(tf.square(W1)) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)
    cost = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(hypothesis), reduction_indices=1)) + (0.01 * l2reg)
    # cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y)) + (0.01 * l2reg)  => this worked very well
    cost_summ = tf.summary.scalar('cost', cost)

with tf.name_scope('train') as scope:
    train = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(cost)

predicted = tf.argmax(hypothesis, 1)
correction = tf.cast(tf.equal(predicted, tf.argmax(Y, 1)), dtype=tf.float32)
Accuracy = tf.reduce_mean(correction)

# parameters
training_epochs = 15
batch_size = 100

with tf.Session() as sess:
    # 2. merge all summaries
    summary = tf.summary.merge_all()
    # 3. Create writer and add graph
    writer = tf.summary.FileWriter('./logs/mnist_l2reg_1e-2', sess.graph)
    # writer.add_graph(sess.graph)
    # 4. Run summary merge and add_summary
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            s, cost_val, _ = sess.run([summary, cost, train], feed_dict={X: batch_xs, Y: batch_ys})
            writer.add_summary(s, global_step=i)
            avg_cost += cost_val / total_batch
        print('{:5} cost: {:.2f}'.format(epoch + 1, avg_cost))

    print('Accuracy: ', Accuracy.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))

    # Get one example and predict it
    r = random.randint(0, mnist.test.num_examples - 1)
    print(r, 'test_num: {}, train_num: {}'.format(mnist.test.num_examples, mnist.train.num_examples))
    # numpy.array[i] selects a single row of the array
    # notice that slicing like numpy.array[s:s+1] keeps a 2-D shape of [1, -1]
    print(mnist.test.labels[r], np.shape(mnist.test.labels))
    print('argmax test none axis when array vector {}'.format(tf.argmax(mnist.test.labels[r]).eval(session=sess)))
    print('Label:', sess.run(tf.argmax(mnist.test.labels[r:r+1], 1)))
    print('Prediction:', sess.run(tf.argmax(hypothesis, 1), feed_dict={X: mnist.test.images[r:r+1]}))

    plt.imshow(mnist.test.images[r:r+1].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.show()
The problem you are running into is exactly the reason tf.nn.softmax_cross_entropy_with_logits is so important to use: the numerical instability of the log operation itself.

Explanation: you have a fairly large network here, and it will end up being very confident about some of its class predictions. In particular, it will end up assigning an extremely low probability to some image (say, a picture of a 1) being a particular class (say, class 5). The logit will be very negative, and the tf.nn.softmax entry for that very negative logit can be numerically zero (not exactly zero, but with finite precision it is represented as zero). When you then compute the cross entropy yourself with log, you run into numerical problems, and that is what produces the nan values in your cost. The function tf.nn.softmax_cross_entropy_with_logits handles this with a trick that avoids log and exp under/overflow. The trick is sometimes called the exp-normalize trick; see this blog post (not written by me; I just think it is a clear explanation) for more details.
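To make the underflow concrete, here is a minimal NumPy sketch (my own illustration, not taken from the linked post) of the naive softmax-then-log path producing nan while the log-sum-exp form of the trick stays finite:

import numpy as np

# One very negative logit is enough to break the naive computation.
logits = np.array([50.0, -60.0, 0.0], dtype=np.float32)
label = np.array([1.0, 0.0, 0.0], dtype=np.float32)   # one-hot, like Y in the question

# Naive path: softmax, then log, then cross entropy (what the cost line in the question does).
probs = np.exp(logits) / np.sum(np.exp(logits))        # the second entry underflows to exactly 0.0
print(-np.sum(label * np.log(probs)))                  # nan (plus RuntimeWarnings): log(0) = -inf and 0 * -inf = nan

# Stable path (exp-normalize / log-sum-exp): log_softmax(x) = x - max(x) - log(sum(exp(x - max(x))))
shifted = logits - np.max(logits)
log_probs = shifted - np.log(np.sum(np.exp(shifted)))
print(-np.sum(label * log_probs))                      # ~0.0, finite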
In short, use tf.nn.softmax_cross_entropy_with_logits and do not try to compute the cross entropy yourself.
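Applied to the cost block in the question, the change would look roughly like the sketch below (my adaptation, not a tested drop-in). The important detail is that tf.nn.softmax_cross_entropy_with_logits expects the unscaled logits, so the raw logits tensor is passed in rather than hypothesis (the softmax output); hypothesis can still be kept for the argmax predictions and the accuracy calculation.

with tf.name_scope('cost') as scope:
    # kept as in the question; note that tf.nn.l2_loss(w) is sum(w**2) / 2, so the W1 term
    # differs from the other two by a factor of 2 -- using tf.nn.l2_loss for all three
    # would keep the regularizer consistent
    l2reg = tf.reduce_sum(tf.square(W1)) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)
    # pass the raw logits: the op applies its own numerically stable softmax internally
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
    cost = tf.reduce_mean(cross_entropy) + 0.01 * l2reg
    cost_summ = tf.summary.scalar('cost', cost)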