How to perform back propagation with different sized layers?
I'm developing my first neural network, using the well-known MNIST database of handwritten digits. I want the NN to be able to classify a digit from 0 to 9 given an image.
My neural network consists of three layers: the input layer (784 neurons, one for each pixel of the digit), a hidden layer of 30 neurons (it could also be 100 or 50, but I'm not too worried about hyperparameter tuning yet), and the output layer of 10 neurons, each one representing the activation for one digit. That gives me two weight matrices: one of 30x784 and another of 10x30.
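To make those shapes concrete, here is a minimal forward-pass sketch in numpy (the names W1, W2 and x are mine, and biases are omitted for brevity):

import numpy as np

W1 = np.random.uniform(-1, 1, (30, 784))  # hidden layer weights
W2 = np.random.uniform(-1, 1, (10, 30))   # output layer weights
x = np.random.rand(784)                   # one flattened 28x28 image

sigmoid = lambda v: 1 / (1 + np.exp(-v))
hidden = sigmoid(W1 @ x)       # shape (30,)
output = sigmoid(W2 @ hidden)  # shape (10,)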
I know and understand the theory behind back propagation and optimization, and the math behind it; that's not the problem as such. I can optimize the weights of the second weight matrix, and the cost does decrease over time. But I'm not able to keep propagating back because of the matrix structure.
Knowing that, I found the derivative of the cost w.r.t. the weights:
d(cost) / d(w) = d(cost) / d(f(z)) * d(f(z)) / d(z) * d(z) / d(w)
(where f is the activation function and z is the dot product plus the neuron's bias)
So I'm at the rightmost layer, with an output array of 10 elements. d(cost) / d(f(z)) is the subtraction of the observed and predicted values. I can multiply that element-wise by d(f(z)) / d(z), which is just f'(z) of the rightmost layer, also a one-dimensional vector of 10 elements, and now I have d(cost) / d(z) computed. Then, d(z) / d(w) is just the input to that layer, i.e. the output of the previous one, which is a vector of 30 elements. I figured I can transpose d(cost) / d(z) so that T( d(cost) / d(z) ) * d(z) / d(w) gives me a (10, 30) matrix, which makes sense because it matches the dimensions of the rightmost weight matrix.
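As a sanity check on that transpose trick, here's a small numpy sketch of the shape algebra (the arrays are random placeholders, not real gradients):

import numpy as np

d_cost_d_z = np.random.rand(10)  # error at the output layer, shape (10,)
prev_a = np.random.rand(30)      # hidden layer activations, shape (30,)

# outer product: (10, 1) * (1, 30) -> (10, 30), the shape of the output weight matrix
d_cost_d_w = np.atleast_2d(d_cost_d_z).T * np.atleast_2d(prev_a)
assert d_cost_d_w.shape == (10, 30)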
But then I get stuck. The dimension of d(cost) / d(f(z)) is (1, 10), of d(f(z)) / d(z) it is (1, 30), and of d(z) / d(w) it is (1, 784). I don't know how to come up with a result from these.
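In code form, this is the mismatch (a minimal sketch with random placeholders; the commented line is the step I can't take):

import numpy as np

err = np.random.rand(1, 10)      # d(cost) / d(f(z)), from the output layer
f_prime = np.random.rand(1, 30)  # d(f(z)) / d(z), the hidden layer's f'(z)
x_in = np.random.rand(1, 784)    # d(z) / d(w), the input to the hidden layer

# err * f_prime  # ValueError: operands could not be broadcast together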
Here's the code I've written so far. The incomplete part is the _propagate_back method. I don't care about the biases yet, because I'm stuck on the weights and I want to figure that out first.
import random
from typing import List, Tuple

import numpy as np
from matplotlib import pyplot as plt

import mnist_loader

np.random.seed(42)

NETWORK_LAYER_SIZES = [784, 30, 10]
LEARNING_RATE = 0.05
BATCH_SIZE = 20
NUMBER_OF_EPOCHS = 5000


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_der(x):
    return sigmoid(x) * (1 - sigmoid(x))


class Layer:

    def __init__(self, input_size: int, output_size: int):
        self.weights = np.random.uniform(-1, 1, [output_size, input_size])
        self.biases = np.random.uniform(-1, 1, [output_size])
        self.z = np.zeros(output_size)
        self.a = np.zeros(output_size)
        self.dz = np.zeros(output_size)

    def feed_forward(self, input_data: np.ndarray):
        input_data_t = np.atleast_2d(input_data).T
        dot_product = self.weights.dot(input_data_t).T[0]
        self.z = dot_product + self.biases
        self.a = sigmoid(self.z)
        self.dz = sigmoid_der(self.z)


class Network:

    def __init__(self, layer_sizes: List[int], X_train: np.ndarray, y_train: np.ndarray):
        self.layers = [
            Layer(input_size, output_size)
            for input_size, output_size
            in zip(layer_sizes[0:], layer_sizes[1:])
        ]
        self.X_train = X_train
        self.y_train = y_train

    @property
    def predicted(self) -> np.ndarray:
        return self.layers[-1].a

    def _normalize_y(self, y: int) -> np.ndarray:
        output_layer_size = len(self.predicted)
        normalized_y = np.zeros(output_layer_size)
        normalized_y[y] = 1.
        return normalized_y

    def _calculate_cost(self, y_observed: np.ndarray) -> int:
        y_observed = self._normalize_y(y_observed)
        y_predicted = self.layers[-1].a
        squared_difference = (y_predicted - y_observed) ** 2
        return np.sum(squared_difference)

    def _get_training_batches(self, X_train: np.ndarray, y_train: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        train_batch_indexes = random.sample(range(len(X_train)), BATCH_SIZE)
        return X_train[train_batch_indexes], y_train[train_batch_indexes]

    def _feed_forward(self, input_data: np.ndarray):
        for layer in self.layers:
            layer.feed_forward(input_data)
            input_data = layer.a

    def _propagate_back(self, X: np.ndarray, y_observed: int):
        """
        der(cost) / der(weight) = der(cost) / der(predicted) * der(predicted) / der(z) * der(z) / der(weight)
        """
        y_observed = self._normalize_y(y_observed)
        d_cost_d_pred = self.predicted - y_observed

        hidden_layer = self.layers[0]
        output_layer = self.layers[1]

        # Output layer weights
        d_pred_d_z = output_layer.dz
        d_z_d_weight = hidden_layer.a  # Input to the current layer, i.e. the output of the previous one
        d_cost_d_z = d_cost_d_pred * d_pred_d_z
        d_cost_d_weight = np.atleast_2d(d_cost_d_z).T * np.atleast_2d(d_z_d_weight)
        output_layer.weights -= LEARNING_RATE * d_cost_d_weight

        # Hidden layer weights
        d_pred_d_z = hidden_layer.dz
        d_z_d_weight = X
        # ...

    def train(self, X_train: np.ndarray, y_train: np.ndarray):
        X_train_batch, y_train_batch = self._get_training_batches(X_train, y_train)
        cost_over_epoch = []
        for epoch_number in range(NUMBER_OF_EPOCHS):
            X_train_batch, y_train_batch = self._get_training_batches(X_train, y_train)
            cost = 0
            for X_sample, y_observed in zip(X_train_batch, y_train_batch):
                self._feed_forward(X_sample)
                cost += self._calculate_cost(y_observed)
                self._propagate_back(X_sample, y_observed)
            cost_over_epoch.append(cost / BATCH_SIZE)
        plt.plot(cost_over_epoch)
        plt.ylabel('Cost')
        plt.xlabel('Epoch')
        plt.savefig('cost_over_epoch.png')


training_data, validation_data, test_data = mnist_loader.load_data()
X_train, y_train = training_data[0], training_data[1]

network = Network(NETWORK_LAYER_SIZES, training_data[0], training_data[1])
network.train(X_train, y_train)
Here's the code for mnist_loader, in case anyone wants to reproduce the example:
import pickle
import gzip


def load_data():
    f = gzip.open('data/mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding='latin-1')
    f.close()
    return training_data, validation_data, test_data
Once you have d(cost) / d(z), I think you should actually multiply it by the weight matrix: only that way can you move the error backward to the new layer (and get a meaningful matrix shape).
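A quick shape check of that idea (just a sketch with random placeholders, assuming the (10, 30) output weight matrix from the question):

import numpy as np

d_cost_d_z = np.atleast_2d(np.random.rand(10))  # output layer error, shape (1, 10)
weights = np.random.rand(10, 30)                # output layer weights

hidden_err = np.dot(d_cost_d_z, weights)        # (1, 10) x (10, 30) -> (1, 30)
assert hidden_err.shape == (1, 30)              # one error term per hidden neuron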
Here is how I would change your backward pass function:
def _propagate_back(self, X: np.ndarray, y_observed: int):
    """
    der(cost) / der(weight) = der(cost) / der(predicted) * der(predicted) / der(z) * der(z) / der(weight)
    """
    y_observed = self._normalize_y(y_observed)
    d_cost_d_pred = self.predicted - y_observed

    hidden_layer = self.layers[0]
    output_layer = self.layers[1]

    # Output layer weights
    d_pred_d_z = output_layer.dz
    d_z_d_weight = np.atleast_2d(hidden_layer.a)  # Input to the current layer, i.e. the output of the previous one
    d_cost_d_z = np.atleast_2d(d_cost_d_pred * d_pred_d_z)
    d_cost_d_weight = np.dot(d_cost_d_z.T, d_z_d_weight)
    output_layer.weights -= LEARNING_RATE * d_cost_d_weight

    # Hidden layer weights
    d_pred_d_z = hidden_layer.dz
    d_z_d_weight = np.atleast_2d(X)
    hidden_err = np.dot(d_cost_d_z, output_layer.weights)
    d_cost_d_z = np.atleast_2d(hidden_err * d_pred_d_z)
    d_cost_d_weight = np.dot(d_cost_d_z.T, d_z_d_weight)
    hidden_layer.weights -= LEARNING_RATE * d_cost_d_weight
Two remarks:
- the line hidden_err = np.dot(d_cost_d_z, output_layer.weights) is where I multiply d(cost) / d(z) by the weight matrix
- I've replaced the element-wise products with the np.dot function (matrix multiplication in NumPy)
I'm not an expert, so I hope I'm not making some terrible mistake... Anyway, my answer is mainly based on this chapter of Neural Networks and Deep Learning by Michael Nielsen.
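For what it's worth, a finite-difference test is a cheap way to double-check gradients like these. This is only a sketch, assuming the Network class from the question (finite_diff_check, layer_idx, i, j and eps are names I made up):

import numpy as np

def finite_diff_check(network, X, y, layer_idx=1, i=0, j=0, eps=1e-5):
    # Numerically estimate d(cost) / d(w[i, j]) by nudging a single weight.
    layer = network.layers[layer_idx]
    original = layer.weights[i, j]

    layer.weights[i, j] = original + eps
    network._feed_forward(X)
    cost_plus = network._calculate_cost(y)

    layer.weights[i, j] = original - eps
    network._feed_forward(X)
    cost_minus = network._calculate_cost(y)

    layer.weights[i, j] = original  # restore the weight
    return (cost_plus - cost_minus) / (2 * eps)

Note that _calculate_cost sums (p - o) ** 2 while the backward pass uses (p - o) without the factor of 2, so the numerical estimate should come out roughly twice the analytic d_cost_d_weight[i, j].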