Neural Network for MNIST digits is not learning at all - problem with backpropagation
After a long time I still cannot get my NN to run without errors. The accuracy of this toy NN is an astonishing 1-2% (60 neurons in the hidden layer, 100 epochs, 0.3 learning rate, tanh activation, MNIST dataset downloaded via TF) - so basically it is not learning at all. After watching so many videos/posts about backpropagation, I still cannot fix it.
So my mistake has to be between the sections marked with the two ##### lines. I think my understanding of derivatives is fine in general, I just cannot connect that knowledge to backpropagation.
If the backpropagation basics are correct, then the mistake must be in axis = 0/1, because I also cannot figure out how to decide which axis I should be working on.
Also, I have a strong hunch that dZ2 = A2 - Y might be wrong and it should be dZ2 = Y - A2, but after that correction the NN starts guessing only one number.
(Yes, I did not write the backpropagation itself, I found it online.)
#importing data and normalizing it
#"x_test" will be my X
#"y_test" will be my Y
import numpy as np
import tensorflow as tf
(traindataX, traindataY), (testdataX, testdataY) = tf.keras.datasets.mnist.load_data()
x_test = testdataX.reshape(testdataX.shape[0], testdataX.shape[1]**2).astype('float32')
x_test = x_test / 255
y_test = testdataY
y_test = np.eye(10)[y_test]
#Activation functions:
def tanh(z):
    a = (np.exp(z)-np.exp(-z))/(np.exp(z)+np.exp(-z))
    return a
###############################################################################START
def softmax(z):
    smExp = np.exp(z - np.max(z, axis=0))  # subtract the max for numerical stability
    out = smExp / np.sum(smExp, axis=0)
    return out
###############################################################################STOP
def neural_network(num_hid, epochs,
                   learning_rate, X, Y):
    #num_hid - number of neurons in the hidden layer
    #X - dataX - shape (10000, 784)
    #Y - labels - shape (10000, 10)
    #inicialization
    W1 = np.random.randn(784, num_hid) * 0.01
    W2 = np.random.randn(num_hid, 10) * 0.01
    b1 = np.zeros((1, num_hid))
    b2 = np.zeros((1, 10))
    correct = 0
    for x in range(1, epochs+1):
        #feedforward
        Z1 = np.dot(X, W1) + b1
        A1 = tanh(Z1)
        Z2 = np.dot(A1, W2) + b2
        A2 = softmax(Z2)
        ###############################################################################START
        m = X.shape[1] #-> 784
        loss = - np.sum((Y * np.log(A2)), axis=0, keepdims=True)
        cost = np.sum(loss, axis=1) / m
        #backpropagation
        dZ2 = A2 - Y
        dW2 = (1/m)*np.dot(A1.T, dZ2)
        db2 = (1/m)*np.sum(dZ2, axis = 1, keepdims = True)
        dZ1 = np.multiply(np.dot(dZ2, W2.T), 1 - np.power(A1, 2))
        dW1 = (1/m)*np.dot(X.T, dZ1)
        db1 = (1/m)*np.sum(dZ1, axis = 1, keepdims = True)
        ###############################################################################STOP
        #parameters update - gradient descent
        W1 = W1 - dW1*learning_rate
        b1 = b1 - db1*learning_rate
        W2 = W2 - dW2*learning_rate
        b2 = b2 - db2*learning_rate
        for i in range(np.shape(Y)[1]):
            guess = np.argmax(A2[i, :])
            ans = np.argmax(Y[i, :])
            print(str(x) + " " + str(i) + ". " +"guess: ", guess, "| ans: ", ans)
            if guess == ans:
                correct = correct + 1;
    accuracy = (correct/np.shape(Y)[0]) * 100
That may be because you should normalize the inputs between 0 and 1 by dividing X by 255 (255 being the maximum pixel value). You should also one-hot encode Y into a set of vectors of size 10. I think your backpropagation is fine, but you should implement gradient checking to double check.
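For reference, a minimal sketch of such a gradient check using central differences; forward_cost, params and analytic_grads are hypothetical stand-ins for your own cost computation and parameter/gradient dicts, not names from the code in the question, and in practice you would only perturb a handful of entries because this is slow. For softmax with cross-entropy the check should also confirm that dZ2 = A2 - Y (not Y - A2) has the right sign.

import numpy as np

def gradient_check(forward_cost, params, analytic_grads, eps=1e-7):
    # forward_cost(params) recomputes the scalar cost for the current parameter values;
    # params and analytic_grads are dicts keyed the same way, e.g.
    # {"W1": W1, "b1": b1, "W2": W2, "b2": b2} and {"W1": dW1, ...}
    for name, p in params.items():
        num_grad = np.zeros_like(p)
        it = np.nditer(p, flags=["multi_index"])
        while not it.finished:
            idx = it.multi_index
            orig = p[idx]
            p[idx] = orig + eps
            cost_plus = forward_cost(params)
            p[idx] = orig - eps
            cost_minus = forward_cost(params)
            p[idx] = orig                       # restore the original value
            num_grad[idx] = (cost_plus - cost_minus) / (2 * eps)
            it.iternext()
        denom = np.linalg.norm(num_grad) + np.linalg.norm(analytic_grads[name]) + 1e-12
        diff = np.linalg.norm(num_grad - analytic_grads[name]) / denom
        print(f"{name}: relative difference = {diff:.2e}")  # around 1e-7 or less is a pass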
Your method of computing the accuracy is wrong.
First, the correct variable should be reset to 0 for every iteration (epoch); second, if Y.shape is (10000, 10), then to compute the accuracy the loop should be for i in range(np.shape(Y)[0]) instead of for i in range(np.shape(Y)[1]) - the former iterates 10,000 times, the latter only 10 times.
A better way is to let NumPy count the correct guesses: correct = np.sum(np.argmax(A2, axis=1) == np.argmax(Y, axis=1))
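A quick way to convince yourself about the axes, using dummy arrays with the same shapes as your A2 and Y (the values below are random and only for illustration):

import numpy as np

A2 = np.random.rand(10000, 10)                     # dummy network outputs, one row per sample
Y = np.eye(10)[np.random.randint(0, 10, 10000)]    # dummy one-hot labels, shape (10000, 10)

# argmax along axis=1 picks a class index for each of the 10,000 rows
correct = np.sum(np.argmax(A2, axis=1) == np.argmax(Y, axis=1))
accuracy = correct / Y.shape[0] * 100
print(correct, accuracy)                           # roughly 10% for random guesses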
Your learning rate is also too high; with a learning rate of 0.003, 50 epochs and 60 hidden neurons I was able to reach 50% accuracy.
def neural_network(num_hid, epochs,
                   learning_rate, X, Y):
    #num_hid - number of neurons in the hidden layer
    #X - dataX - shape (10000, 784)
    #Y - labels - shape (10000, 10)
    #inicialization
    W1 = np.random.randn(784, num_hid) * 0.01
    W2 = np.random.randn(num_hid, 10) * 0.01
    b1 = np.zeros((1, num_hid))
    b2 = np.zeros((1, 10))
    correct = 0
    for x in range(1, epochs+1):
        #feedforward
        Z1 = np.dot(X, W1) + b1
        A1 = tanh(Z1)
        Z2 = np.dot(A1, W2) + b2
        A2 = softmax(Z2)
        ###############################################################################START
        m = X.shape[1] #-> 784
        loss = - np.sum((Y * np.log(A2)), axis=0, keepdims=True)
        cost = np.sum(loss, axis=1) / m
        #backpropagation
        dZ2 = A2 - Y
        dW2 = (1/m)*np.dot(A1.T, dZ2)
        db2 = (1/m)*np.sum(dZ2, axis = 1, keepdims = True)
        dZ1 = np.multiply(np.dot(dZ2, W2.T), 1 - np.power(A1, 2))
        dW1 = (1/m)*np.dot(X.T, dZ1)
        db1 = (1/m)*np.sum(dZ1, axis = 1, keepdims = True)
        ###############################################################################STOP
        #parameters update - gradient descent
        W1 = W1 - dW1*learning_rate
        b1 = b1 - db1*learning_rate
        W2 = W2 - dW2*learning_rate
        b2 = b2 - db2*learning_rate
        correct = 0
        for i in range(np.shape(Y)[0]):
            guess = np.argmax(A2[i, :])
            ans = np.argmax(Y[i, :])
            # print(str(x) + " " + str(i) + ". " +"guess: ", guess, "| ans: ", ans)
            if guess == ans:
                correct = correct + 1
        # correct = np.sum(np.argmax(A2,axis=1) == np.argmax(Y,axis=1))
        # print(correct)
        accuracy = (correct/np.shape(Y)[0]) * 100
        print(accuracy)
You need to experiment: to get good accuracy, try tuning the hidden layer size, the number of epochs and the learning rate.
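For example, a rough sweep could look like the sketch below; the candidate values are only an illustration, and the function above prints its accuracy per epoch rather than returning it:

for num_hid in (30, 60, 120):
    for learning_rate in (0.001, 0.003, 0.01):
        print(f"--- hidden neurons: {num_hid}, learning rate: {learning_rate} ---")
        neural_network(num_hid, 50, learning_rate, x_test, y_test)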
Lukas,
Good question for refreshing the basics. I made a few fixes to your code:
- the calculation of m
- transposed all weights and biases (I can't explain it properly, but it does not work otherwise - see the shape sketch after this list)
- changed the computation of the accuracy (and of the unused loss).
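Roughly, the reason the transposed layout works (not part of the original fix, just an illustration): with samples stored as columns, every matrix product and bias broadcast lines up by construction. A quick shape check with the same dimensions as above:

import numpy as np

m, num_hid = 10000, 60                    # batch size and hidden width as in the question
X = np.zeros((m, 784))                    # samples as rows, like x_test
W1, b1 = np.zeros((num_hid, 784)), np.zeros((num_hid, 1))
W2, b2 = np.zeros((10, num_hid)), np.zeros((10, 1))

Z1 = W1 @ X.T + b1                        # (60, 784) @ (784, 10000) -> (60, 10000), b1 broadcasts over columns
Z2 = W2 @ np.tanh(Z1) + b2                # (10, 60) @ (60, 10000) -> (10, 10000)
print(Z1.shape, Z2.shape)                 # (60, 10000) (10, 10000)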
See the corrected code below. With your original parameters it reaches 90% accuracy:
def neural_network(num_hid, epochs, learning_rate, X, Y):
    #num_hid - number of neurons in the hidden layer
    #X - dataX - shape (10000, 784)
    #Y - labels - shape (10000, 10)
    #inicialization
    # W1 = np.random.randn(784, num_hid) * 0.01
    # W2 = np.random.randn(num_hid, 10) * 0.01
    # b1 = np.zeros((1, num_hid))
    # b2 = np.zeros((1, 10))
    W1 = np.random.randn(num_hid, 784) * 0.01
    W2 = np.random.randn(10, num_hid) * 0.01
    b1 = np.zeros((num_hid, 1))
    b2 = np.zeros((10, 1))
    for x in range(1, epochs+1):
        correct = 0  # moved inside cycle
        #feedforward
        # Z1 = np.dot(X, W1) + b1
        Z1 = np.dot(W1, X.T) + b1
        A1 = tanh(Z1)
        # Z2 = np.dot(A1, W2) + b2
        Z2 = np.dot(W2, A1) + b2
        A2 = softmax(Z2)
        ###############################################################################START
        m = X.shape[0] # SHOULD BE NUMBER OF SAMPLES IN THE BATCH
        # loss = - np.sum((Y * np.log(A2)), axis=0, keepdims=True)
        loss = - np.sum((Y.T * np.log(A2)), axis=0, keepdims=True)
        cost = np.sum(loss, axis=1) / m
        #backpropagation
        # dZ2 = A2 - Y
        # dW2 = (1/m)*np.dot(A1.T, dZ2)
        # db2 = (1/m)*np.sum(dZ2, axis = 1, keepdims = True)
        # dZ1 = np.multiply(np.dot(dZ2, W2.T), 1 - np.power(A1, 2))
        # dW1 = (1/m)*np.dot(X.T, dZ1)
        dZ2 = A2 - Y.T
        dW2 = (1/m)*np.dot(dZ2, A1.T)
        db2 = (1/m)*np.sum(dZ2, axis = 1, keepdims = True)
        dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
        dW1 = (1/m)*np.dot(dZ1, X)
        db1 = (1/m)*np.sum(dZ1, axis = 1, keepdims = True)
        ###############################################################################STOP
        #parameters update - gradient descent
        W1 = W1 - dW1*learning_rate
        b1 = b1 - db1*learning_rate
        W2 = W2 - dW2*learning_rate
        b2 = b2 - db2*learning_rate
        guess = np.argmax(A2, axis=0)  # axis fixed
        ans = np.argmax(Y, axis=1)  # axis fixed
        # print (guess.shape, ans.shape)
        correct += sum(guess == ans)
        # #print(str(x) + " " + str(i) + ". " +"guess: ", guess, "| ans: ", ans)
        # if guess == ans:
        #     correct = correct + 1;
        accuracy = correct / x_test.shape[0]
        print(f"Epoch {x}. accuracy = {accuracy*100:.2f}%")
neural_network (64, 100, 0.3, x_test, y_test)
Epoch 1. accuracy = 14.93%
Epoch 2. accuracy = 34.70%
Epoch 3. accuracy = 47.41%
(...)
Epoch 98. accuracy = 89.29%
Epoch 99. accuracy = 89.33%
Epoch 100. accuracy = 89.37%