I want to use tf.Softmax
I built a model that predicts MNIST data using tf.softmax.
But it doesn't work: the cost is printed as nan.
I know that using tf.nn.softmax_cross_entropy_with_logits like this works:
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis,labels=Y))+ (0.01 * l2reg)
I think something is wrong with my cost code.
I would like to know why the tf.softmax approach does not work in the neural network.
And is my adaptation of l2reg correct??
Thanks~
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import matplotlib.pyplot as plt
import random
import numpy as np

mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)

nb_classes = 10

X = tf.placeholder(tf.float32, [None, 28*28], name='x-input')
Y = tf.placeholder(tf.float32, [None, nb_classes], name='y-input')

with tf.name_scope('layer1') as scope:
    W1 = tf.Variable(tf.random_normal([28*28, 28*28]), name='weight1')
    b1 = tf.Variable(tf.random_normal([28*28]), name='bias1')
    layer1 = tf.nn.relu(tf.matmul(X, W1) + b1)

    w1_hist = tf.summary.histogram('weight1', W1)
    layer1_hist = tf.summary.histogram('layer1', layer1)

with tf.name_scope('layer2') as scope:
    W2 = tf.Variable(tf.random_normal([28*28, 28*28]), name='weight2')
    b2 = tf.Variable(tf.random_normal([28*28]), name='bias2')
    layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2)

    # 1. From TF graph, decide which tensors you want to log
    w2_hist = tf.summary.histogram('weight2', W2)
    layer2_hist = tf.summary.histogram('layer2', layer2)

with tf.name_scope('layer3') as scope:
    W3 = tf.Variable(tf.random_normal([28*28, nb_classes]), name='weight3')
    b3 = tf.Variable(tf.random_normal([nb_classes]), name='bias3')
    logits = tf.matmul(layer2, W3) + b3
    # hypothesis = tf.div(tf.exp(logits),tf.exp(logit,dim)
    hypothesis = tf.nn.softmax(logits)

    w3_hist = tf.summary.histogram('weight3', W3)
    hypothesis_hist = tf.summary.histogram('hypothesis', hypothesis)

with tf.name_scope('cost') as scope:
    # the method of l2reg, when deep
    l2reg = tf.reduce_sum(tf.square(W1)) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)
    cost = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(hypothesis), reduction_indices=1)) + (0.01 * l2reg)
    # cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y)) + (0.01 * l2reg)  => this worked very well
    cost_summ = tf.summary.scalar('cost', cost)

with tf.name_scope('train') as scope:
    train = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(cost)

predicted = tf.argmax(hypothesis, 1)
correction = tf.cast(tf.equal(predicted, tf.argmax(Y, 1)), dtype=tf.float32)
Accuracy = tf.reduce_mean(correction)

# parameters
training_epochs = 15
batch_size = 100

with tf.Session() as sess:
    # 2. merge all summaries
    summary = tf.summary.merge_all()
    # 3. Create writer and add graph
    writer = tf.summary.FileWriter('./logs/mnist_l2reg_1e-2', sess.graph)
    # writer.add_graph(sess.graph)
    # 4. Run summary merge and add_summary
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            s, cost_val, _ = sess.run([summary, cost, train], feed_dict={X: batch_xs, Y: batch_ys})
            writer.add_summary(s, global_step=i)
            avg_cost += cost_val / total_batch
        print('{:5} cost: {:.2f}'.format(epoch + 1, avg_cost))

    print('Accuracy: ', Accuracy.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))

    # Get one example and predict it
    r = random.randint(0, mnist.test.num_examples - 1)
    print(r, 'test_num: {}, train_num: {}'.format(mnist.test.num_examples, mnist.train.num_examples))
    # numpy.array[i] selects a single row of the array
    # notice that slicing like numpy.array[s:s+1] keeps a 2-D shape of [1, -1]
    print(mnist.test.labels[r], np.shape(mnist.test.labels))
    print('argmax test none axis when array vector {}'.format(tf.argmax(mnist.test.labels[r]).eval(session=sess)))
    print('Label:', sess.run(tf.argmax(mnist.test.labels[r:r+1], 1)))
    print('Prediction:', sess.run(tf.argmax(hypothesis, 1), feed_dict={X: mnist.test.images[r:r+1]}))

    plt.imshow(mnist.test.images[r:r+1].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.show()
The problem you are running into is exactly the reason tf.nn.softmax_cross_entropy_with_logits is so important to use: the numerical instability of the log operation itself.

Explanation: you have a fairly large network here, and it will end up being very confident about some of its class predictions. In particular, it will end up assigning an extremely low probability to some image (say, a picture of a 1) being a particular class (say, class 5). The logit will be very negative, and the tf.nn.softmax entry for that very negative logit can be numerically zero (not exactly zero, but with finite precision it is represented as zero). When you then compute the cross entropy yourself with log, you run into numerical problems, and that is what produces the nan values in your cost. The function tf.nn.softmax_cross_entropy_with_logits handles this with a trick that avoids log and exp under/overflow. The trick is sometimes called the exp-normalize trick; see this blog post (not written by me; I just think it is a clear explanation) for more details.
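To make the underflow concrete, here is a minimal NumPy sketch (my own illustration, not taken from the linked post) of the naive softmax-then-log path producing nan while the log-sum-exp form of the trick stays finite:

import numpy as np

# One very negative logit is enough to break the naive computation.
logits = np.array([50.0, -60.0, 0.0], dtype=np.float32)
label = np.array([1.0, 0.0, 0.0], dtype=np.float32)   # one-hot, like Y in the question

# Naive path: softmax, then log, then cross entropy (what the cost line in the question does).
probs = np.exp(logits) / np.sum(np.exp(logits))        # the second entry underflows to exactly 0.0
print(-np.sum(label * np.log(probs)))                  # nan (plus RuntimeWarnings): log(0) = -inf and 0 * -inf = nan

# Stable path (exp-normalize / log-sum-exp): log_softmax(x) = x - max(x) - log(sum(exp(x - max(x))))
shifted = logits - np.max(logits)
log_probs = shifted - np.log(np.sum(np.exp(shifted)))
print(-np.sum(label * log_probs))                      # ~0.0, finite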
In short, use tf.nn.softmax_cross_entropy_with_logits and do not try to compute the cross entropy yourself.
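Applied to the cost block in the question, the change would look roughly like the sketch below (my adaptation, not a tested drop-in). The important detail is that tf.nn.softmax_cross_entropy_with_logits expects the unscaled logits, so the raw logits tensor is passed in rather than hypothesis (the softmax output); hypothesis can still be kept for the argmax predictions and the accuracy calculation.

with tf.name_scope('cost') as scope:
    # kept as in the question; note that tf.nn.l2_loss(w) is sum(w**2) / 2, so the W1 term
    # differs from the other two by a factor of 2 -- using tf.nn.l2_loss for all three
    # would keep the regularizer consistent
    l2reg = tf.reduce_sum(tf.square(W1)) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)
    # pass the raw logits: the op applies its own numerically stable softmax internally
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
    cost = tf.reduce_mean(cross_entropy) + 0.01 * l2reg
    cost_summ = tf.summary.scalar('cost', cost)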