Backpropagation algorithm giving bad results
I'm trying to solve the classic handwritten digit recognition problem on the MNIST dataset with a feed-forward neural network and backpropagation. I'm using Michael Nielsen's book to learn the essentials and 3Blue1Brown's youtube video for the backpropagation algorithm.
I finished writing it a while ago and have been debugging it ever since, because the results are quite bad. At its best, the network recognizes about 4000/10000 samples after 1 epoch, and that number only drops over the following epochs, which leads me to believe something is wrong with the backpropagation algorithm. For the past few days I've been drowning in index hell trying to debug it, but I can't figure out where the problem is; I'd appreciate it if someone could point it out.
Some background: 1) I don't use any matrix multiplication or external frameworks; everything is done with for loops, because that's how I learned it from the video. 2) Unlike the book, I store both weights and biases in the same array. The biases of each layer are a column at the end of that layer's weight matrix.
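To make that layout concrete, here is a tiny illustrative sketch (names and sizes assumed, not from the repo): a layer with nIn inputs and nOut neurons is stored as an nOut × (nIn + 1) matrix whose last column holds the biases, so z[i] = Σ_j w[i, j]·a[j] + w[i, nIn].

// Illustrative only: one layer's weights and biases packed into a single matrix.
int nIn = 784, nOut = 30;                       // e.g. MNIST input size, one hidden layer
var layer = new double[nOut, nIn + 1];          // column nIn is the bias column
var rng = new Random();
for (int i = 0; i < nOut; i++)
    for (int j = 0; j <= nIn; j++)
        layer[i, j] = rng.NextDouble() * 2 - 1; // random init in [-1, 1), bias included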
And finally the code. This is the Backpropagate method of the NeuralNetwork class, which is called in UpdateMiniBatch, which itself is called in SGD:
/// <summary>
/// Returns the partial derivative of the cost function on one sample with respect to every weight in the network.
/// </summary>
public List<double[,]> Backpropagate(ITrainingSample sample)
{
    // Forward pass
    var (weightedInputs, activations) = GetWeightedInputsAndActivations(sample.Input);
    // The derivative with respect to the activation of the last layer is simple to compute: activation - expectedActivation
    var errors = activations.Last().Select((a, i) => a - sample.Output[i]).ToArray();
    // Backward pass
    List<double[,]> delCostOverDelWeights = Weights.Select(x => new double[x.GetLength(0), x.GetLength(1)]).ToList();
    List<double[]> delCostOverDelActivations = Weights.Select(x => new double[x.GetLength(0)]).ToList();
    delCostOverDelActivations[delCostOverDelActivations.Count - 1] = errors;
    // Comment notation:
    // Cost function: C
    // Weight connecting the i-th neuron on the (l + 1)-th layer to the j-th neuron on the l-th layer: w[l][i, j]
    // Bias of the i-th neuron on the (l + 1)-th layer: b[l][i]
    // Activation of the i-th neuron on the l-th layer: a[l][i]
    // Weighted input of the i-th neuron on the l-th layer: z[l][i] // which doesn't make sense on layer 0, but is left for index convenience
    // Note that weights, biases, delCostOverDelWeights and delCostOverDelActivations all start at layer 1 (the 0-th layer is irrelevant to their meaning) while activations and weightedInputs start at the 0-th layer
    for (int l = Weights.Count - 1; l >= 0; l--)
    {
        // Calculate ∂C/∂w for the current layer:
        for (int i = 0; i < Weights[l].GetLength(0); i++)
            for (int j = 0; j < Weights[l].GetLength(1); j++)
                delCostOverDelWeights[l][i, j] =                               // ∂C/∂w[l][i, j]
                    delCostOverDelActivations[l][i] *                          // ∂C/∂a[l + 1][i]
                    SigmoidPrime(weightedInputs[l + 1][i]) *                   // ∂a[l + 1][i]/∂z[l + 1][i] = ∂(σ(z[l + 1][i]))/∂z[l + 1][i] = σ′(z[l + 1][i])
                    (j < Weights[l].GetLength(1) - 1 ? activations[l][j] : 1); // ∂z[l + 1][i]/∂w[l][i, j] = a[l][j] ||OR|| ∂z[l + 1][i]/∂b[l][i] = 1
        // Calculate ∂C/∂a for the previous layer (a[l]):
        if (l != 0)
            for (int i = 0; i < Weights[l - 1].GetLength(0); i++)
                for (int j = 0; j < Weights[l].GetLength(0); j++)
                    delCostOverDelActivations[l - 1][i] +=       // ∂C/∂a[l][i] = sum over j:
                        delCostOverDelActivations[l][j] *        // ∂C/∂a[l + 1][j]
                        SigmoidPrime(weightedInputs[l + 1][j]) * // ∂a[l + 1][j]/∂z[l + 1][j] = ∂(σ(z[l + 1][j]))/∂z[l + 1][j] = σ′(z[l + 1][j])
                        Weights[l][j, i];                        // ∂z[l + 1][j]/∂a[l][i] = w[l][j, i]
    }
    return delCostOverDelWeights;
}
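For context (UpdateMiniBatch isn't included in the question), here is a minimal sketch of how it could consume the gradients Backpropagate returns; the method shape and the learningRate parameter are assumptions, not the repo's actual code:

// Hypothetical sketch of the caller, consistent with Backpropagate's return type.
public void UpdateMiniBatch(IEnumerable<ITrainingSample> miniBatch, double learningRate)
{
    List<double[,]> gradientSum = Weights.Select(x => new double[x.GetLength(0), x.GetLength(1)]).ToList();
    int count = 0;
    foreach (var sample in miniBatch)
    {
        var gradient = Backpropagate(sample);
        for (int l = 0; l < Weights.Count; l++)
            for (int i = 0; i < Weights[l].GetLength(0); i++)
                for (int j = 0; j < Weights[l].GetLength(1); j++)
                    gradientSum[l][i, j] += gradient[l][i, j];
        count++;
    }
    // Step against the averaged gradient; biases live in the last column, so they are updated too.
    for (int l = 0; l < Weights.Count; l++)
        for (int i = 0; i < Weights[l].GetLength(0); i++)
            for (int j = 0; j < Weights[l].GetLength(1); j++)
                Weights[l][i, j] -= learningRate / count * gradientSum[l][i, j];
}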
GetWeightedInputsAndActivations:
public (List<double[]>, List<double[]>) GetWeightedInputsAndActivations(double[] input)
{
    List<double[]> activations = new List<double[]>() { input }.Concat(Weights.Select(x => new double[x.GetLength(0)])).ToList();
    List<double[]> weightedInputs = activations.Select(x => new double[x.Length]).ToList();
    for (int l = 0; l < Weights.Count; l++)
        for (int i = 0; i < Weights[l].GetLength(0); i++)
        {
            double value = 0;
            for (int j = 0; j < Weights[l].GetLength(1) - 1; j++)
                value += Weights[l][i, j] * activations[l][j]; // weights
            weightedInputs[l + 1][i] = value + Weights[l][i, Weights[l].GetLength(1) - 1]; // bias
            activations[l + 1][i] = Sigmoid(weightedInputs[l + 1][i]);
        }
    return (weightedInputs, activations);
}
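Sigmoid and SigmoidPrime aren't shown in the question; the standard logistic definitions, consistent with how they're used above, would be:

// Assumed definitions: the logistic sigmoid and its derivative σ′(z) = σ(z)(1 - σ(z)).
public static double Sigmoid(double z) => 1.0 / (1.0 + Math.Exp(-z));
public static double SigmoidPrime(double z)
{
    double s = Sigmoid(z);
    return s * (1 - s);
}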
The whole neural network, along with everything else, can be found here.
Edit: after many major changes to the repo, the link above may no longer work, but given the answer that should be irrelevant. For completeness, here is a functional link to the changed repository.
Fixed. The problem was that I wasn't dividing the pixel inputs by 255. Everything else seems to work correctly, and I now get 9000+/10000 on the first epoch.
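For anyone hitting the same wall: raw MNIST pixels are bytes in [0, 255], and inputs that large can saturate the sigmoid, making SigmoidPrime effectively 0 so the gradients die. The fix is a one-line normalization when building the input, roughly like this (helper and field names are assumptions about the loader):

// Illustrative helper (names assumed): scale raw MNIST bytes from [0, 255] into [0, 1].
public static double[] NormalizePixels(byte[] rawPixels)
    => rawPixels.Select(p => p / 255.0).ToArray();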