MLP with vectors only and backprop issue

I'm interested in AI and have started learning about it. I tried to implement an MLP class based only on vectors, but it doesn't work properly.

The feed-forward part seems to work fine, but I apparently lack understanding of the backpropagation algorithm. The Train function is a dummy used to test the XOR case, and the network always returns roughly the same result (~0.625915 in my case), with the error going from 0.264518 when the inputs are 1 and 0 (expected output 1) to 0.442609 when the inputs are 1 and 1 (expected output 0).

I would like to know what I'm doing wrong with backpropagation and gradient descent. Below is the full code of the class and the main function. Thanks for your help and insights!

#include <iostream>
#include <vector>
#include <cassert>
#include <functional>
#include <cstdlib>   // rand, RAND_MAX
#include <cmath>     // sqrt, tanh, exp


using namespace std;
typedef function<double(double, bool)> func;
typedef vector < vector < vector<double> > > Matrix3d;


class Net {
public:
    Net(const vector<unsigned> &topology, vector<func> &fns) {
        learning_rate = 0.1;
        alpha = 0.5;
        global_error = 1.0;

        activationFns = fns;
        nbLayers = topology.size();
        lastLayerId = nbLayers - 1;

        gradients.resize(nbLayers);
        neuron_errors.resize(nbLayers);
        layers.resize(nbLayers);
        weights.resize(nbLayers);
        wdeltas.resize(nbLayers);

        for (unsigned layerNum = 0; layerNum < nbLayers; layerNum++) {
            bool isLastLayer = layerNum == lastLayerId;
            unsigned nbNeuronsInLayer = isLastLayer ? topology[layerNum] : topology[layerNum] + 1;
            unsigned nbWeights = isLastLayer ? 0 : topology[layerNum + 1] + 1;

            gradients[layerNum].resize(nbNeuronsInLayer, 0.0);
            layers[layerNum].resize(nbNeuronsInLayer);
            weights[layerNum].resize(nbNeuronsInLayer);
            wdeltas[layerNum].resize(nbNeuronsInLayer);
            neuron_errors[layerNum].resize(nbNeuronsInLayer, 0.0);

            if (! isLastLayer) {
                layers[layerNum][nbNeuronsInLayer-1] = 1.0; // initialise the bias neuron's output
            }

            for (unsigned n = 0; n < weights[layerNum].size(); n++) {
                weights[layerNum][n].resize(nbWeights); // number of neurons in the next layer = number of weights for this neuron
                wdeltas[layerNum][n].resize(nbWeights, 0.0);

                InitialiseWeights(weights[layerNum][n]); // randomise the weights of this layer's neurons
            }
        }
    };

    ~Net() {
        gradients.clear();
        layers.clear();
        weights.clear();
        wdeltas.clear();
        neuron_errors.clear();
    };



    // propagate through the network
    // during the feed-forward pass, a neuron's output = activationFn(sum of its inputs * their weights)
    // for each neuron of the previous layer:
    // take its output prevLayer[n] and multiply it by the weight towards neuron i of the current layer
    void FeedForward(const vector<double> &inputs) {
        assert(inputs.size() == layers[0].size() - 1);

        // assign the inputs to the outputs of the INPUT layer's neurons
        for (unsigned i = 0; i < inputs.size(); i++) {
            layers[0][i] = inputs[i];
        }

        for (unsigned layerNum = 1; layerNum < nbLayers; layerNum++) {
            vector<double> &prevLayer = layers[layerNum - 1];    
            
            const bool isLastLayer = layerNum == lastLayerId;
            const unsigned forcap = isLastLayer ? layers[layerNum].size() : layers[layerNum].size() - 1;


            for (unsigned i = 0; i < forcap; i++) {
                const double bias = prevLayer[prevLayer.size()-1] * weights[layerNum-1][weights[layerNum-1].size()-1][i];
                double output = 0.0; 

                for (unsigned n = 0; n < prevLayer.size() - 1; n++) {
                    output += prevLayer[n] * weights[layerNum - 1][n][i];
                }

                output += bias;
                layers[layerNum][i] = activationFns[layerNum - 1](output, false);
            }
        }

        //Print();
    };


    void BackPropagate(const vector<double> &targets) {
        vector<double> &guessed = layers[lastLayerId];
        func &outputActivationFn = activationFns[lastLayerId];

        assert(targets.size() == guessed.size());
        global_error = 0.0;

        // Compute the errors of the OUTPUT layer //
        for (unsigned t = 0; t < targets.size(); t++) {
            double diff_ =  targets[t] - guessed[t];
            global_error += (diff_ * diff_); 

            neuron_errors[lastLayerId][t] = targets[t] - guessed[t]; // the output neuron's error
            gradients[lastLayerId][t] = diff_ * outputActivationFn(guessed[t], true);
        }

        if (guessed.size() > 1)
            global_error /= guessed.size()-1;
        else
            global_error *= 0.5;
        global_error = sqrt(global_error);

        // Compute the errors of the neurons in the other layers
        for (unsigned l = nbLayers - 2; l < nbLayers; --l) {

            // fetch the weights linking this hidden layer to the layer above
            for (unsigned n = 0; n < layers[l].size(); n++) { // for each neuron of this layer
                neuron_errors[l][n] = 0.0;

                for (unsigned m = 0; m < layers[l+1].size(); m++) { // target neuron m of the layer above
                    double &weight = weights[l][n][m];

                    // now we can compute the error of neuron n
                    neuron_errors[l][n] += weight * gradients[l+1][m];
                }

                gradients[l][n] = neuron_errors[l][n] * activationFns[l](layers[l][n], true); // ?
            }
        }

        // Update the weights (?)
        for (unsigned l = nbLayers - 2; l < nbLayers; --l) {
            for (unsigned n = 0; n < layers[l].size(); n++) {
                for (unsigned m = 0; m < layers[l + 1].size(); m++) {
                    weights[l][n][m] -= (learning_rate * gradients[l][n] * layers[l][n]) + (wdeltas[l][n][m] * alpha);
                    wdeltas[l][n][m] = (learning_rate * gradients[l][n] * layers[l][n]) + (wdeltas[l][n][m] * alpha);
                }
            }
        }
    };


    void GetResults(vector<double> &results) {
        results.clear();
        for (unsigned i = 0; i < layers[lastLayerId].size(); i++) {
            results.push_back(layers[lastLayerId][i]); // append, since results was just cleared
        }
    };


    void Train() {
        vector < vector<double> > ins = {
            { 1.0, 0.0 },
            { 0.0, 1.0 },
            { 0.0, 0.0 },
            { 1.0, 1.0 }
        };

        vector < vector<double> > outs = {
            { 1.0 },
            { 1.0 },
            { 0.0 },
            { 0.0 }
        };

        for (unsigned i = 0; i < 1000; i++) {
            unsigned r = rand() % ins.size();

            vector<double> k = ins[r];
            vector<double> o = outs[r];

            FeedForward(k);
            BackPropagate(o);

            cout << "[" << i << "] " << k[0] << " & " << k[1] << " -> " << o[0] << "\tresult : " << layers[lastLayerId][0] << "\terror = " << global_error << endl;
        }


        cout << endl << "Test: [ 1 , 0 ]" << endl;
        FeedForward({ 1.0, 0.0 });
        BackPropagate({ 1.0 });
        cout << "Result : " << layers[lastLayerId][0] << "\t(error = " << global_error << endl;

        cout << "Test: [ 1 , 1 ]" << endl;
        FeedForward({ 0.85, 0.99 });
        BackPropagate({ 0.0 });
        cout << "Result : " << layers[lastLayerId][0] << "\t(error = " << global_error << endl;
    };



    double Getglobal_error(void) const {
        return global_error;
    };

    void Print(void) {
        for (unsigned l = 0; l < nbLayers; l++) {
            cout << "Layer " << l << " : " << endl;

            for (unsigned n = 0; n < layers[l].size(); n++) {
                cout << "\t" << "Neuron " << l << "-" << n << " : ";
                cout << "(" << layers[l][n] << ")" << endl;

                for (unsigned w = 0; w < weights[l][n].size(); w++) {
                    cout << "\t\t" << l << "-" << n << " -> " << (l+1) << "-" << w << " | weight=" << weights[l][n][w] << endl;
                }
            }
        }
    }

private:
    void InitialiseWeights(vector<double> &weights_) {
        for (unsigned w = 0; w < weights_.size(); w++) {
            weights_[w] = ((double) rand() / (RAND_MAX));
        }
    }

    double global_error;
    double learning_rate;
    double alpha;
    unsigned nbLayers;
    unsigned lastLayerId;
    vector<func> activationFns;
    vector< vector<double> > gradients; // [layerNum][neuronNum] error gradients of the neurons
    vector< vector<double> > layers; // [layerNum][neuronNum]
    vector< vector<double> > neuron_errors; // [layerNum][neuronNum] error of each neuron
    Matrix3d weights; // [layer][neuron][outputWeight]
    Matrix3d wdeltas; // [layer][neuron][outputWeight]
};




double transfer_tanh(double x, bool isDerivative) {
    if (isDerivative) {
        return 1.0 - (tanh(x) * tanh(x));
    }

    return tanh(x);
}

double transfer_sigmoid(double x, bool isDerivative) {
    if (isDerivative) {
        return x * (1.0 - x);
    }

    return 1.0 / (1.0 + exp(-x));
}


int main () {
    vector<unsigned> topo = { 2, 2, 1 };
    vector<func> funcs = { transfer_sigmoid, transfer_sigmoid, transfer_sigmoid };

    Net mynet(topo, funcs);
    
    /*
    mynet.FeedForward({ 1.0, 0.0 });
    mynet.BackPropagate({ 1.0 });
    mynet.Print();
    mynet.FeedForward({ 1.0, 0.0 });
    mynet.BackPropagate({ 1.0 });
    mynet.Print();
    */
    mynet.Train();
}

EDIT: I was lacking knowledge of the math behind backpropagation. Thanks to this resource: https://pabloinsente.github.io/the-multilayer-perceptron, I figured out this BackPropagate method:

void BackPropagate(const vector<double> &targets) {
    assert(targets.size() == layers[lastLayerId].size());
    global_error = 0.0;

    // walk the layers backwards (the unsigned counter wraps past 0, which ends the loop)
    for (unsigned l = lastLayerId; l < nbLayers; --l) {
        for (unsigned n = 0; n < layers[l].size(); n++) {
            neuron_errors[l][n] = 0.0;

            if (l == lastLayerId) { // output layer
                global_error += (targets[n] - layers[lastLayerId][n]) * (targets[n] - layers[lastLayerId][n]);
                neuron_errors[lastLayerId][n] = (targets[n] - layers[lastLayerId][n]) * activationFns[lastLayerId](layers[lastLayerId][n], true);
                continue;
            }

            for (unsigned m = 0; m < layers[l + 1].size(); m++) {
                // layers[0] already holds the raw inputs after FeedForward, so layers[l][n] works for every layer
                double neuron_output = layers[l][n];
                double delta = learning_rate * (neuron_errors[l + 1][m] * neuron_output);

                neuron_errors[l][n] += (neuron_errors[l + 1][m] * weights[l][n][m])
                    * activationFns[l](layers[l][n], true);

                weights[l][n][m] += delta + (wdeltas[l][n][m] * alpha);
                wdeltas[l][n][m] = delta;
            }
        }
    }
}
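
For reference, these are the update rules I believe the method above implements, with learning_rate as $\eta$, alpha as the momentum term $\alpha$, $f$ the activation function, $o$ the neuron outputs and $t$ the targets:

$$\delta_k = (t_k - o_k)\, f'(o_k) \qquad \text{(output layer)}$$
$$\delta_j = f'(o_j) \sum_k w_{jk}\, \delta_k \qquad \text{(hidden layers)}$$
$$\Delta w_{jk} = \eta\, \delta_k\, o_j, \qquad w_{jk} \leftarrow w_{jk} + \Delta w_{jk} + \alpha\, \Delta w_{jk}^{\text{prev}}$$

Note that $f'$ is applied to the already-activated output, which is consistent with the derivative form f'(o) = o * (1 - o) used in transfer_sigmoid.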