What is wrong with my BPTT Implementation?
I tried to implement back-propagation through time manually, but in the end the network does not converge.
I looked around the web for descriptions and courses on BPTT, and the code does everything accordingly:
- forward propagation
- propagating the errors backwards
- calculating the gradients based on the expected values
- updating the weights based on the gradients and the learning rate
The way I understand recursive derivatives is that, in the case of a recurrent neural network, the input coming from the previous step cannot be treated as a constant. So, for example, the derivative of w1 at step 3 depends not only on the input of the current step, but also on the previous steps. This means dw1[1] = net_inputs_train[first_sample_index + 1][0]; is not correct; it should be dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;.
Everything else in the unfolded network should be back-propagation "only".
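Spelling the recursion out with the identifiers from the code below (neuron_data is the neuron's state; the two inputs per step are column 0 and column 1 of net_inputs_train), the forward step and the derivative I am assuming are:

    neuron_data[t]           = w1 * input0[t] + w2 * input1[t] + w3 * neuron_data[t-1] + b
    d(neuron_data[t]) / d(w1) = input0[t] + w3 * d(neuron_data[t-1]) / d(w1)

which is where the extra dw1[0] * w3 term comes from.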
Unfortunately the program simply does not work; the error just jumps around without the network ever converging.
I don't know what else I could do to make this work, so maybe I am misunderstanding the concept completely...
#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* rand, srand */
#include <ctime>   /* time */
using namespace std;
int main(int argc, char *argv[]){
    srand(time(nullptr));

    /* Manual BPTT with one custom implemented Neuron */
    double number_of_samples = 3; /* Binary addition dataset */
    vector<vector<double>> net_inputs_train = { /* 2 inputs in each step */
        {1,1}, {0,0}, {0,0}, /* 100 + 100 = 110 */
        {1,0}, {0,1}, {1,0}, /* 101 + 010 = 111 */
        {1,0}, {1,1}, {0,0}, /* 110 + 010 = 111 */
    };
    vector<vector<double>> expected_output = { /* 1 output in each step */
        {1}, {1}, {0}, /* 110 */
        {1}, {1}, {1}, /* 111 */
        {1}, {1}, {1}, /* 111 */
    };

    double w1 = 0.5;
    double w2 = 0.5;
    double w3 = 0.5;
    double b = 0.0;

    vector<double> neuron_data(3,0);
    vector<double> neuron_deriv(3,0); /* Neuron error value ( partial based on the output ) */
    vector<double> dw1(3,0); /* derivatives for weights for each sequence */
    vector<double> dw2(3,0);
    vector<double> dw3(3,0);
    vector<double> derb(3,0);

    int first_sample_index;
    double manual_error = 1.0;
    double learning_rate = 1e-2;

    while(manual_error > learning_rate){
        for(int mbIter = 0; mbIter < 4; ++mbIter){
            first_sample_index = (rand()%(static_cast<int>(number_of_samples)));

            /* Fill in the data and derivatives */
            neuron_data[0] = (
                net_inputs_train[first_sample_index][0] * w1
                + net_inputs_train[first_sample_index][1] * w2
                + b
            );
            dw1[0] = net_inputs_train[first_sample_index][0];
            dw2[0] = net_inputs_train[first_sample_index][1];
            dw3[0] = 0;
            derb[0] = 1;

            neuron_data[1] = (
                net_inputs_train[first_sample_index + 1][0] * w1
                + net_inputs_train[first_sample_index + 1][1] * w2
                + neuron_data[0] * w3
                + b
            );
            dw1[1] = net_inputs_train[first_sample_index + 1][0] + dw1[0] * w3;
            dw2[1] = net_inputs_train[first_sample_index + 1][1] + dw2[0] * w3;
            dw3[1] = neuron_data[0] + w3 * dw3[0];
            derb[1] = 1 + derb[0] * w3;

            neuron_data[2] = (
                net_inputs_train[first_sample_index + 2][0] * w1
                + net_inputs_train[first_sample_index + 2][1] * w2
                + neuron_data[1] * w3
                + b
            );
            dw1[2] = net_inputs_train[first_sample_index + 2][0] + dw1[1] * w3;
            dw2[2] = net_inputs_train[first_sample_index + 2][1] + dw2[1] * w3;
            dw3[2] = neuron_data[1] + w3 * dw3[1];
            derb[2] = 1 + derb[1] * w3;

            /* Calculate the error and the gradients */
            manual_error = (
                pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
                +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
                +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
            );

            neuron_deriv[2] = (
                (-(neuron_data[2] - expected_output[first_sample_index + 2][0])/2.0)
            );
            neuron_deriv[1] = (
                (-(neuron_data[1] - expected_output[first_sample_index + 1][0])/2.0)
                + (w3 * neuron_deriv[2])
            );
            neuron_deriv[0] = (
                (-(neuron_data[0] - expected_output[first_sample_index + 0][0])/2.0)
                + (w3 * neuron_deriv[1])
            );

            w1 += (learning_rate * (
                neuron_deriv[2] * dw1[2]
                + neuron_deriv[1] * dw1[1]
                + neuron_deriv[0] * dw1[0]
            ) / number_of_samples);
            w2 += (learning_rate * (
                neuron_deriv[2] * dw2[2]
                + neuron_deriv[1] * dw2[1]
                + neuron_deriv[0] * dw2[0]
            ) / number_of_samples);
            w3 += (learning_rate * (
                neuron_deriv[2] * dw3[2]
                + neuron_deriv[1] * dw3[1]
                + neuron_deriv[0] * dw3[0]
            ) / number_of_samples);
            b += (learning_rate * (
                neuron_deriv[2] * derb[2]
                + neuron_deriv[1] * derb[1]
                + neuron_deriv[0] * derb[0]
            ) / number_of_samples);

            std::cout << "\r Error: " << manual_error << " \n";
        }
    }
    return 0;
}
Edit: one interesting thing is that if w1 += (learning_rate * (...)/number_of_samples); is switched to w1 += ((...)/number_of_samples);, the training converges.
I think this is a typo:
w1 += ((
    neuron_deriv[2] * dw1[2]
    + neuron_deriv[1] * dw1[1]
    + neuron_deriv[0] * dw1[0]
) / 300.0); // why?
since you do not do the same for the other weights.
If you change it so it is consistent with the way you compute the other weights:
w1 += ((
    neuron_deriv[2] * dw1[2]
    + neuron_deriv[1] * dw1[1]
    + neuron_deriv[0] * dw1[0]
) / number_of_samples); // makes more sense
Maybe you meant the constant 300.0 to go into the computation of b. In any case, don't use magic numbers like that; give them a name.
Other issues: don't use using namespace std;. Make all your constant values const, or even better constexpr. Also, factor the code inside the loop into a few named functions. For example, if you had done that for the weight updates, the inconsistency in your code could never have happened in the first place.
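For instance, here is a minimal sketch of what one such named function could look like (the function name and signature are my own illustration, not taken from your program):

#include <vector>

/* Applies one gradient-descent step to a single weight, so the update rule
   exists in exactly one place. Names here are illustrative only. */
void apply_update(double &weight,
                  const std::vector<double> &neuron_deriv,
                  const std::vector<double> &dweight,
                  double learning_rate, double number_of_samples)
{
    weight += learning_rate * (
        neuron_deriv[2] * dweight[2]
        + neuron_deriv[1] * dweight[1]
        + neuron_deriv[0] * dweight[0]
    ) / number_of_samples;
}

With constexpr double learning_rate = 1e-2; and constexpr double number_of_samples = 3; declared once, all four updates in the loop then read the same way, e.g. apply_update(w1, neuron_deriv, dw1, learning_rate, number_of_samples);, and a stray 300.0 in one of them could not slip through unnoticed.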
So where do I start?
Apart from some logic errors (e.g. around line 43, where last_sample_index is set), the main problem is that the back-propagation mixes across sequences.
Meaning: each sequence had error values from the other sequences mixed into it. Even though the input arrives through the hidden state, it should not influence the gradients of the other sequences.
I realized this while crying over my pile of papers, which forced me to examine the BPTT technique (and my life choices) down to the bone and cross-check it against plain back-propagation, because unfolding the different sequences through time is basically a special kind of back-propagation where some of the coefficients of the formulas are repeated.
With that in mind, I reworked the code to separate the gradient calculations by sequence.
Then there was the problem of vanishing/exploding gradients: after the rework above, the network still did not converge. After a third breakdown and some experimenting, I found that simply halving the gradient of the bias coming from sequence 2 (the /2.0 on gradb in the code below) fixed the vanishing problem. The bias gradient was targeted because numerically it is the largest of all the weights.
The program below works now, and the network converges successfully.
#include <iostream>
#include <vector>
#include <cmath>
#include <cstdlib> /* rand, srand */
#include <ctime>   /* time */
using namespace std;
int main(int argc, char *argv[]){
    srand(time(nullptr));

    /* Manual BPTT with one custom implemented Neuron */
    double sequence_size = 3;
    double number_of_samples = 3; /* Binary addition dataset */
    double minibatch_size = 4;
    vector<vector<double>> net_inputs_train = { /* 2 inputs in each step */
        {1,1}, {0,0}, {0,0}, /* 100 + 100 = 110 */
        {1,0}, {0,1}, {1,0}, /* 101 + 010 = 111 */
        {1,0}, {1,1}, {0,0}, /* 110 + 010 = 111 */
    };
    vector<vector<double>> expected_output = { /* 1 output in each step */
        {1}, {1}, {0}, /* 110 */
        {1}, {1}, {1}, /* 111 */
        {1}, {1}, {1}, /* 111 */
    };

    double w1 = 0.5;
    double w2 = 0.5;
    double w3 = 0.5;
    double b = 0.0;

    double gradw1; /* gradients for the weights */
    double gradw2;
    double gradw3;
    double gradb;

    vector<double> neuron_data(3,0);
    double neuron_deriv = 0; /* Neuron error value ( partial based on the expected output and the error function ) */
    vector<double> dw1(3,0); /* derivatives for weights for each sequence */
    vector<double> dw2(3,0);
    vector<double> dw3(3,0);
    vector<double> derb(3,0);

    int first_sample_index;
    double manual_error = 1.0;
    double learning_rate = 1e-2;

    while(manual_error > learning_rate){
        for(int mbIter = 0; mbIter < minibatch_size; ++mbIter){ /* minibatches */
            first_sample_index = sequence_size * (rand()%(static_cast<int>(number_of_samples)));
            gradw1 = 0;
            gradw2 = 0;
            gradw3 = 0;
            gradb = 0;

            /* Fill in the data and derivatives */
            neuron_data[0] = (
                net_inputs_train[first_sample_index][0] * w1
                + net_inputs_train[first_sample_index][1] * w2
                + b
            );
            dw1[0] = net_inputs_train[first_sample_index][0];
            dw2[0] = net_inputs_train[first_sample_index][1];
            dw3[0] = 0;
            derb[0] = 1;

            neuron_data[1] = (
                net_inputs_train[first_sample_index + 1][0] * w1
                + net_inputs_train[first_sample_index + 1][1] * w2
                + neuron_data[0] * w3
                + b
            );
            dw1[1] = net_inputs_train[first_sample_index + 1][0] + w3 * dw1[0];
            dw2[1] = net_inputs_train[first_sample_index + 1][1] + w3 * dw2[0];
            dw3[1] = neuron_data[0] + w3 * dw3[0];
            derb[1] = 1 + derb[0] * w3;

            neuron_data[2] = (
                net_inputs_train[first_sample_index + 2][0] * w1
                + net_inputs_train[first_sample_index + 2][1] * w2
                + neuron_data[1] * w3
                + b
            );
            dw1[2] = net_inputs_train[first_sample_index + 2][0] + w3 * dw1[1];
            dw2[2] = net_inputs_train[first_sample_index + 2][1] + w3 * dw2[1];
            dw3[2] = neuron_data[1] + w3 * dw3[1];
            derb[2] = 1 + derb[1] * w3;

            /* Calculate the error and the gradients */
            manual_error = (
                pow((neuron_data[2] - expected_output[first_sample_index + 2][0]),2)/2.0
                +pow((neuron_data[1] - expected_output[first_sample_index + 1][0]),2)/2.0
                +pow((neuron_data[0] - expected_output[first_sample_index + 0][0]),2)/2.0
            );

            /* Calculate gradients for sequence 2 */
            neuron_deriv = (
                -(neuron_data[2] - expected_output[first_sample_index + 2][0])
                -w3*(neuron_data[2] - expected_output[first_sample_index + 2][0])
                -w3*(neuron_data[2] - expected_output[first_sample_index + 2][0])
            );
            gradw1 += dw1[2] * neuron_deriv;
            gradw2 += dw2[2] * neuron_deriv;
            gradw3 += dw3[2] * neuron_deriv;
            gradb += derb[2] * neuron_deriv / 2.0;

            /* Calculate gradients for sequence 1 */
            neuron_deriv = (
                -(neuron_data[1] - expected_output[first_sample_index + 1][0])
                -w3*(neuron_data[1] - expected_output[first_sample_index + 1][0])
            );
            gradw1 += dw1[1] * neuron_deriv;
            gradw2 += dw2[1] * neuron_deriv;
            gradw3 += dw3[1] * neuron_deriv;
            gradb += derb[1] * neuron_deriv;

            /* Calculate gradients for sequence 0 */
            neuron_deriv = -(neuron_data[0] - expected_output[first_sample_index + 0][0]);
            gradw1 += dw1[0] * neuron_deriv;
            gradw2 += dw2[0] * neuron_deriv;
            gradw3 += dw3[0] * neuron_deriv;
            gradb += derb[0] * neuron_deriv;

            w1 += (learning_rate * (gradw1) / (sequence_size * minibatch_size));
            w2 += (learning_rate * (gradw2) / (sequence_size * minibatch_size));
            w3 += (learning_rate * (gradw3) / (sequence_size * minibatch_size));
            b += (learning_rate * (gradb) / (sequence_size * minibatch_size));

            std::cout << "\r Error: " << manual_error << " ";
        }
    }
    std::cout << std::endl;
    return 0;
}
Honestly, I find it hard to believe this actually works. I really hope this helps anyone who tries this in the future.