
What is wrong with my BPTT Implementation?

I tried to implement backpropagation through time by hand, but in the end the network does not converge. I looked around online for descriptions of and courses on BPTT, and the code does everything accordingly:

The way I understand the recursive derivative is that, in the case of a recurrent neural network, the input coming from the previous step cannot be treated as a constant. So, for example, the derivative of w1 at step 3 depends not only on the input of the current step but also on the previous steps. Therefore dw1[1] = net_inputs_train[first_sample_index + 1][0]; would be incorrect; it should be dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;
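To spell out the recurrence I am assuming here (the h_t and x_{1,t} notation is mine, not from the code): the pre-activation at step t is

\[ h_t = x_{1,t} w_1 + x_{2,t} w_2 + w_3 h_{t-1} + b, \]

so by the chain rule

\[ \frac{\partial h_t}{\partial w_1} = x_{1,t} + w_3 \frac{\partial h_{t-1}}{\partial w_1}, \]

and analogously for w2, w3 and b. This is what the dw1/dw2/dw3/derb arrays in the code below are meant to accumulate.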

Everything else in the unrolled network should be "plain" backpropagation only. Unfortunately this program simply does not work; the error just jumps around and the network never converges.

I don't know what else I could do to make this work; maybe I have completely misunderstood the concept...

#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* rand, srand */
#include <ctime>   /* time */

using namespace std;

int main(int argc, char *argv[]){

  srand(time(nullptr));

  /* Manual BPTT with one custom implemented Neuron */
  double number_of_samples = 3;  /* Binary addition dataset */
  vector<vector<double>> net_inputs_train = {  /* 2 inputs in each step */
      {1,1},    {0,0},  {0,0}, /* 100 + 100 = 110 */
      {1,0},    {0,1},  {1,0}, /* 101 + 010 = 111*/
      {1,0},    {1,1},  {0,0}, /* 110 + 010 = 111 */
  };

  vector<vector<double>> expected_output = { /* 1 output in each step */
      {1},      {1},    {0}, /* 110 */
      {1},      {1},    {1}, /* 111 */
      {1},      {1},    {1}, /* 111 */
  };

  double w1 = 0.5;
  double w2 = 0.5;
  double w3 = 0.5;
  double b = 0.0;

  vector<double> neuron_data(3,0);
  vector<double> neuron_deriv(3,0); /* Neuron error value ( partial based on the output )*/

  vector<double> dw1(3,0); /* derivatives for weights for each sequence */
  vector<double> dw2(3,0);
  vector<double> dw3(3,0);
  vector<double> derb(3,0);

  int first_sample_index;
  double manual_error = 1.0;
  double learning_rate = 1e-2;
  while(manual_error > learning_rate){
    for(int mbIter = 0; mbIter < 4; ++mbIter){
      first_sample_index = (rand()%(static_cast<int>(number_of_samples)));

      /* Fill in the data and derivatives */
      neuron_data[0] = (
        net_inputs_train[first_sample_index][0] * w1
        + net_inputs_train[first_sample_index][1] * w2
        + b
      );
      dw1[0] = net_inputs_train[first_sample_index][0];
      dw2[0] = net_inputs_train[first_sample_index][1];
      dw3[0] = 0;
      derb[0] = 1;

      neuron_data[1] = (
        net_inputs_train[first_sample_index + 1][0] * w1
        + net_inputs_train[first_sample_index + 1][1] * w2
        + neuron_data[0] * w3
        + b
      );
      dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;
      dw2[1] = net_inputs_train[first_sample_index + 1][1] + dw2[0] * w3;
      dw3[1] = neuron_data[0] + w3 * dw3[0];
      derb[1] = 1 + derb[0] * w3;

      neuron_data[2] = (
        net_inputs_train[first_sample_index + 2][0] * w1
        + net_inputs_train[first_sample_index + 2][1] * w2
        + neuron_data[1] * w3
        + b
      );
      dw1[2] = net_inputs_train[first_sample_index + 2][0] + dw1[1] * w3;
      dw2[2] = net_inputs_train[first_sample_index + 2][1] + dw2[1] * w3;
      dw3[2] = neuron_data[1] + w3 * dw3[1];
      derb[2] = 1 + derb[1] * w3;

      /* Calculate the error and the gradients */
      manual_error = (
        pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
        +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
        +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
      );

      neuron_deriv[2] = (
        (-(neuron_data[2] - expected_output[first_sample_index + 2][0])/2.0)
      );
      neuron_deriv[1] = (
        (-(neuron_data[1] - expected_output[first_sample_index + 1][0])/2.0)
        + (w3 * neuron_deriv[2])
      );
      neuron_deriv[0] = (
        (-(neuron_data[0] - expected_output[first_sample_index + 0][0])/2.0)
        + (w3 * neuron_deriv[1])
      );

      w1 += (learning_rate * (
        neuron_deriv[2] * dw1[2]
        + neuron_deriv[1] * dw1[1]
        + neuron_deriv[0] * dw1[0]
      ) / number_of_samples);

      w2 += (learning_rate * (
        neuron_deriv[2] * dw2[2]
        + neuron_deriv[1] * dw2[1]
        + neuron_deriv[0] * dw2[0]
      ) / number_of_samples);

      w3 += (learning_rate * (
        neuron_deriv[2] * dw3[2]
        + neuron_deriv[1] * dw3[1]
        + neuron_deriv[0] * dw3[0]
      ) / number_of_samples);

      b += (learning_rate * (
        neuron_deriv[2] * derb[2]
        + neuron_deriv[1] * derb[1]
        + neuron_deriv[0] * derb[0]
      ) / number_of_samples);
      std::cout << "\r Error: " << manual_error << "                    \n";
    }
  }

  return 0;
}

Edit: one interesting thing is that if w1 += (learning_rate * (...)/number_of_samples); is switched to w1 += ((...)/number_of_samples); the training converges.

I think this is a typo:

 w1 += ((
        neuron_deriv[2] * dw1[2]
        + neuron_deriv[1] * dw1[1]
        + neuron_deriv[0] * dw1[0]
      ) / 300.0);                   // why?

because you don't do the same for the other weights.

If you change it to be consistent with the way you calculate the other weights:

 w1 += ((
        neuron_deriv[2] * dw1[2]
        + neuron_deriv[1] * dw1[1]
        + neuron_deriv[0] * dw1[0]
      ) / number_of_samples);       // makes more sense

it converges.

Perhaps you intended to use the constant 300.0 in the calculation of b. In any case, don't use magic numbers like that; give them a name.

Other issues: don't use using namespace std;. Make all your constant values const, or better yet constexpr. Also, break the code inside the loop up into a few named functions. If you had done that for the weight updates, for example, the inconsistency in your code could never have happened.
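A minimal sketch of what that could look like (the constant and the helper function are my own naming, not taken from the question's code):

 constexpr double kLearningRate = 1e-2; /* named instead of a magic 1e-2 or 300.0 */

 /* One named update routine instead of four copy-pasted blocks. */
 double updated_weight(double weight,
                       const std::vector<double>& neuron_deriv,
                       const std::vector<double>& dw,
                       double number_of_samples){
   double gradient = 0.0;
   for(std::size_t step = 0; step < dw.size(); ++step){
     gradient += neuron_deriv[step] * dw[step];
   }
   return weight + kLearningRate * gradient / number_of_samples;
 }

 /* usage: w1 = updated_weight(w1, neuron_deriv, dw1, number_of_samples); and the same for w2, w3, b */

With one helper, w1, w2, w3 and b are guaranteed to be updated by the same formula.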

So where do I start? Apart from a couple of logical errors (for example where first_sample_index is set, which picks a sample but ignores the sequence length), the main problem was that the backpropagation mixed values between the sequences.

Meaning: each sequence had error values from the other sequences mixed into its gradients. Even though the input comes in through the hidden state, one sequence should not influence the gradients of the other sequences.

I realized this while crying over my pile of notes, which forced me to re-examine the BPTT technique (and my life choices) down to the bone and to cross-check it against plain backpropagation, because propagating different sequences through time is basically a special kind of backpropagation in which some of the coefficients of the formulas repeat.

With this in mind, I reworked the code to separate the gradient calculation per sequence.
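The indexing error mentioned above was fixed alongside this: the chosen sample is now scaled by the sequence length, as in the reworked code below:

 first_sample_index = sequence_size * (rand()%(static_cast<int>(number_of_samples)));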

Then there was the vanishing/exploding gradient problem. After the rework above the network still did not converge, so, after a third breakdown and some experimenting, I found that simply halving the gradient contribution of the bias coming from sequence 2 fixed it. The bias gradient was the one to target because, numerically, it was the largest of them all.
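Side note, not part of the fix I ended up with: the more standard way to keep one gradient from dominating is to clip it to a fixed range instead of hard-coding a halving. A rough sketch, with a made-up helper name and limit:

 /* needs <algorithm>; clamps a gradient into [-limit, limit] before the update */
 double clipped(double gradient, double limit){
   return std::max(-limit, std::min(limit, gradient));
 }
 /* e.g. b += learning_rate * clipped(gradb, 1.0) / (sequence_size * minibatch_size); */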

The program below works now, and the network converges successfully.

#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* rand, srand */
#include <ctime>   /* time */

using namespace std;

int main(int argc, char *argv[]){

  srand(time(nullptr));

  /* Manual BPTT with one custom implemented Neuron */
  double sequence_size = 3;
  double number_of_samples = 3;  /* Binary addition dataset */
  double minibatch_size = 4;
  vector<vector<double>> net_inputs_train = {  /* 2 inputs in each step */
      {1,1},    {0,0},  {0,0}, /* 100 + 100 = 110 */
      {1,0},    {0,1},  {1,0}, /* 101 + 010 = 111*/
      {1,0},    {1,1},  {0,0}, /* 110 + 010 = 111 */
  };

  vector<vector<double>> expected_output = { /* 1 output in each step */
      {1},      {1},    {0}, /* 110 */
      {1},      {1},    {1}, /* 111 */
      {1},      {1},    {1}, /* 111 */
  };

  double w1 = 0.5;
  double w2 = 0.5;
  double w3 = 0.5;
  double b = 0.0;

  double gradw1; /* gradients for the weights */
  double gradw2;
  double gradw3;
  double gradb;

  vector<double> neuron_data(3,0);
  double neuron_deriv = 0; /* Neuron error value ( partial based on the expected output and the error function )*/

  vector<double> dw1(3,0); /* derivatives for weights for each sequence */
  vector<double> dw2(3,0);
  vector<double> dw3(3,0);
  vector<double> derb(3,0);

  int first_sample_index;
  double manual_error = 1.0;
  double learning_rate = 1e-2;
  while(manual_error > learning_rate){
    for(int mbIter = 0; mbIter < minibatch_size; ++mbIter){ /* minibatches */
      first_sample_index = sequence_size * (rand()%(static_cast<int>(number_of_samples)));
      gradw1 = 0;
      gradw2 = 0;
      gradw3 = 0;
      gradb = 0;

      /* Fill in the data and derivatives */
      neuron_data[0] = (
        net_inputs_train[first_sample_index][0] * w1
        + net_inputs_train[first_sample_index][1] * w2
        + b
      );
      dw1[0] = net_inputs_train[first_sample_index][0];
      dw2[0] = net_inputs_train[first_sample_index][1];
      dw3[0] = 0;
      derb[0] = 1;

      neuron_data[1] = (
        net_inputs_train[first_sample_index + 1][0] * w1
        + net_inputs_train[first_sample_index + 1][1] * w2
        + neuron_data[0] * w3
        + b
      );
      dw1[1] = net_inputs_train[first_sample_index + 1][0] + w3 * dw1[0];
      dw2[1] = net_inputs_train[first_sample_index + 1][1] + w3 * dw2[0];
      dw3[1] = neuron_data[0] + w3 * dw3[0];
      derb[1] = 1 + derb[0] * w3;

      neuron_data[2] = (
        net_inputs_train[first_sample_index + 2][0] * w1
        + net_inputs_train[first_sample_index + 2][1] * w2
        + neuron_data[1] * w3
        + b
      );
      dw1[2] = net_inputs_train[first_sample_index + 2][0] + w3 * dw1[1];
      dw2[2] = net_inputs_train[first_sample_index + 2][1] + w3 * dw2[1];
      dw3[2] = neuron_data[1] + w3 * dw3[1];
      derb[2] = 1 + derb[1] * w3;

      /* Calculate the error and the gradients */
      manual_error = (
         pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
        +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
        +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
      );    

      /* Calculate gradients for sequence 2 */
      neuron_deriv = (
       -(neuron_data[2] - expected_output[first_sample_index + 2][0])
       -w3*(neuron_data[2] - expected_output[first_sample_index + 2][0])
       -w3*(neuron_data[2] - expected_output[first_sample_index + 2][0])
      );
      gradw1 += dw1[2] * neuron_deriv;
      gradw2 += dw2[2] * neuron_deriv;
      gradw3 += dw3[2] * neuron_deriv;
      gradb += derb[2] * neuron_deriv / 2.0;

      /* Calculate gradients for sequence 1 */
      neuron_deriv = (
        -(neuron_data[1] - expected_output[first_sample_index + 1][0])
        -w3*(neuron_data[1] - expected_output[first_sample_index + 1][0])
      );
      gradw1 += dw1[1] * neuron_deriv;
      gradw2 += dw2[1] * neuron_deriv;
      gradw3 += dw3[1] * neuron_deriv;
      gradb += derb[1] * neuron_deriv;

      /* Calculate gradients for sequence 0 */
      neuron_deriv = -(neuron_data[0] - expected_output[first_sample_index + 0][0]);
      gradw1 += dw1[0] * neuron_deriv;
      gradw2 += dw2[0] * neuron_deriv;
      gradw3 += dw3[0] * neuron_deriv;
      gradb += derb[0] * neuron_deriv;

      w1 += (learning_rate * (gradw1) / (sequence_size * minibatch_size));
      w2 += (learning_rate * (gradw2) / (sequence_size * minibatch_size));
      w3 += (learning_rate * (gradw3) / (sequence_size * minibatch_size));
      b += (learning_rate * (gradb) / (sequence_size * minibatch_size));
      std::cout << "\r Error: " << manual_error << "                     ";
    }
  }
  std::cout << std::endl;

  return 0;
}

Honestly, I find it hard to believe that this actually works. I really hope it can help anyone who tries this in the future.