Backpropagation trouble: total cost keeps getting higher and higher until it reaches infinity

I made a fully connected neural network with numpy, following the Welch Labs videos, but when I try to train it I seem to get exploding gradients right from the start, which is strange. I will put the whole testable code below; it is Python 3+. Only costFunctionPrime seems to break the gradient descent, but I cannot figure out what is happening. Can someone smarter than me help?

EDIT: trng_input and trng_output are not the data I actually use; I use a much bigger dataset.

import numpy as np
import random

trng_input = [[random.random() for _ in range(7)] for _ in range(100)]
trng_output = [[random.random() for _ in range(2)] for _ in range(100)]

def relu(x):
    return x * (x > 0)

def reluprime(x):
    return (x>0).astype(x.dtype)


class Neural_Net():
    def __init__(self, data_input, data_output):
        self.data_input = data_input
        self.trng_output = trng_output
        self.bias = 0
        self.nodes = np.array([7, 2])
        self.LR = 0.01
        self.weightinit()
        self.training(1000, self.LR)

    def randomweight(self, n):
        output = []
        for i in range(n):
            output.append(random.uniform(-1,1))
        return output

    def weightinit(self):
        self.weights = []
        for n in range(len(self.nodes)-1):
            temp = []
            for _ in range(self.nodes[n]+self.bias):
                temp.append(self.randomweight(self.nodes[n+1]))
            self.weights.append(temp)
        self.weights = [np.array(tuple(self.weights[i])) for i in range(len(self.weights))]


    def forward(self, data):
        self.Z = []
        self.A = [np.array(data)]

        for layer in range(len(self.weights)):
            self.Z.append(np.dot(self.A[layer], self.weights[layer]))
            self.A.append(relu(self.Z[layer]))

        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
        return self.totalcost

    def costFunctionPrime(self):
        self.forward(self.data_input)
        self.delta = [[] for x in range(len(self.weights))]
        self.DcostDw = [[] for x in range(len(self.weights))]

        for layer in reversed(range(len(self.weights))):
            Zprime = reluprime(self.Z[layer])
            if layer == len(self.weights)-1:
                self.delta[layer] = np.multiply(-(self.trng_output-self.output), Zprime)
            else:
                self.delta[layer] = np.dot(self.delta[layer+1], self.weights[layer+1].T) * Zprime
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])

        return self.DcostDw

    def backprop(self, LR):
        self.DcostDw = (np.array(self.DcostDw)*LR).tolist()
        self.weights = (np.array(self.weights) - np.array(self.DcostDw)).tolist()

    def training(self, iteration, LR):
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            if i % 1000 == 0:  # the old (i/1000.0) == (i/1000) check is always True in Python 3
                print(self.costFunction())
        print(sum(self.costFunction())/len(self.costFunction()))

NN = Neural_Net(trng_input, trng_output)

As asked, here is the expected result (what I get when I use the sigmoid activation function):

As you can see, the numbers are going down, so the network is training.

And here is the result with the relu activation function:

Here the network gets stuck and does not train; it has never managed to train with the relu activation function, and I would like to understand why.

I think the problem lies in your cost function.

def costFunction(self):
    self.totalcost = 0.5*sum((self.trng_output-self.output)**2)
    return self.totalcost

In particular, this line:

self.totalcost = 0.5*sum((self.trng_output-self.output)**2)

You are computing the cost by summing over all the errors. Since you mention that you use a very large dataset, self.totalcost turns out to be very large. The gradients in costFunctionPrime are summed over all the examples in exactly the same way, so they turn out very large as well.

Try using stochastic gradient descent, or take the mean like this:

self.totalcost = 0.5 * np.mean((self.trng_output-self.output)**2)
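
The stochastic gradient descent suggestion can also be tried without reworking the class internals. The sketch below is only one way it might be wired in (the train_minibatch helper and its batch_size argument are made up, not part of your code): it swaps a random subset into data_input / trng_output before each backprop step, so the summed gradient stays bounded no matter how big the full dataset is.

def train_minibatch(net, iterations, LR, batch_size=32):
    # Hypothetical helper (not in the original code): one SGD step per iteration
    # on a random mini-batch, reusing the network's own costFunctionPrime/backprop.
    full_input = np.asarray(net.data_input)
    full_output = np.asarray(net.trng_output)
    for i in range(iterations):
        idx = np.random.choice(len(full_input), size=batch_size, replace=False)
        net.data_input = full_input[idx]     # backprop only sees this batch,
        net.trng_output = full_output[idx]   # so the summed gradient stays small
        net.costFunctionPrime()
        net.backprop(LR)
    # restore the full dataset and report the final cost on all of it
    net.data_input, net.trng_output = full_input, full_output
    net.forward(net.data_input)
    print(net.costFunction())

With the toy data above, train_minibatch(NN, 1000, 0.01) would take 1000 update steps on batches of 32 examples each.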

If your cost is not decreasing with the ReLU activation, your network seems to be stuck in the region where the ReLU input is negative, so its output is a constant zero and no gradient flows back through it: the neurons are dead.
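
You can see the mechanism directly with the relu / reluprime helpers from the question; the negative pre-activations below are just a made-up example of a unit that has been pushed into the dead region:

import numpy as np

def relu(x):                     # same plain ReLU as in the question
    return x * (x > 0)

def reluprime(x):
    return (x > 0).astype(x.dtype)

Z = np.array([-3.0, -0.5, -10.0])  # pre-activations stuck in the negative region
print(relu(Z))        # all zeros: this unit's output is constant
print(reluprime(Z))   # all zeros: delta * Zprime vanishes, so DcostDw is zero
                      # and the incoming weights never move again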

You can fix that by using a leaky ReLU instead of the plain ReLU. You should also start training the biases. With ReLU it is recommended to initialize the biases to small positive values, which helps avoid this dead-neuron problem.

For some problems it also helps to lower the learning rate and make the network deeper. You may also want to make the learning rate adaptive, e.g. multiply LR by 0.5 whenever the cost stops decreasing.

With a leaky ReLU, trainable biases, and some refactoring, your model could look like this:

import numpy as np
trng_input = np.random.uniform(size=(1000, 7))
trng_output = np.column_stack([np.sin(trng_input).sum(axis=1), np.cos(trng_input).sum(axis=1)])

LEAK = 0.0001

def relu(x):
    # leaky ReLU: keep a small slope (LEAK) for negative inputs instead of a hard zero
    return x * (x > 0) + LEAK * x * (x < 0)

def reluprime(x):
    # derivative of the leaky ReLU: 1 for positive inputs, LEAK for negative ones
    return (x>0).astype(x.dtype) + LEAK * (x<0).astype(x.dtype)


class Neural_Net():
    def __init__(self, data_input, data_output):
        self.data_input = data_input
        self.trng_output = data_output
        self.nodes = np.array([7, 10, 2])
        self.LR = 0.00001
        self.weightinit()
        self.training(2000, self.LR)

    def weightinit(self):
        # weights uniform in [-1, 1]; biases start at small positive values so the
        # leaky ReLU units begin in their active region (avoids dead neurons)
        self.weights = [np.random.uniform(-1, 1, size=self.nodes[i:(i+2)]) for i in range(len(self.nodes) - 1)]
        self.biases = [np.random.uniform(0, 1, size=self.nodes[i+1]) for i in range(len(self.nodes) - 1)]

    def forward(self, data):
        self.Z = []
        self.A = [np.array(data)]
        for layer in range(len(self.weights)):
            self.Z.append(np.dot(self.A[layer], self.weights[layer]) + self.biases[layer])
            self.A.append(relu(self.Z[layer]))
        self.output = self.A[-1]
        return self.output

    def costFunction(self):
        self.totalcost = 0.5*np.sum((self.trng_output-self.output)**2, axis=0)
        return self.totalcost

    def costFunctionPrime(self):
        self.forward(self.data_input)
        self.delta = [[] for x in range(len(self.weights))]
        self.DcostDw = [[] for x in range(len(self.weights))]
        self.DcostDb = [[] for x in range(len(self.weights))]
        for layer in reversed(range(len(self.weights))):
            Zprime = reluprime(self.Z[layer])
            if layer == len(self.weights)-1:
                # output layer: delta is the cost derivative times the activation derivative
                self.delta[layer] = np.multiply(-(self.trng_output-self.output), Zprime)
            else:
                # hidden layers: propagate the next layer's delta back through its weights
                self.delta[layer] = np.dot(self.delta[layer+1], self.weights[layer+1].T) * Zprime
            self.DcostDw[layer] = np.dot(self.A[layer].T, self.delta[layer])
            # bias gradient: sum the deltas over the batch dimension
            self.DcostDb[layer] = np.sum(self.delta[layer], axis=0)

    def backprop(self, LR):
        for layer in range(len(self.weights)):
            self.weights[layer] -= self.DcostDw[layer] * LR
            self.biases[layer] -= self.DcostDb[layer] * LR

    def training(self, iteration, LR):
        for i in range(iteration):
            self.costFunctionPrime()
            self.backprop(LR)
            if i % 100 == 0:  # print the cost every 100 iterations; the old check is always True in Python 3
                print(self.costFunction())
        print(sum(self.costFunction())/len(self.costFunction()))

NN = Neural_Net(trng_input, trng_output)
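
The adjustable learning rate mentioned above is not included in this version. A minimal sketch of the halve-on-stall rule, written as a drop-in replacement for the training method (previous_cost and the halving threshold are my own choices, not part of the original code), could look like this:

def training(self, iteration, LR):
    # sketch: halve the learning rate whenever the cost stops decreasing
    previous_cost = np.inf
    for i in range(iteration):
        self.costFunctionPrime()
        self.backprop(LR)
        current_cost = np.sum(self.costFunction())
        if current_cost >= previous_cost:
            LR *= 0.5            # no improvement: take smaller steps
        previous_cost = current_cost
        if i % 100 == 0:
            print(current_cost, "LR =", LR)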