Bug in Neural Network with cost function rising

I've been working on my first neural network, building it completely from scratch. However, when I print the cost function to track the model's progress, it only rises. The data I'm using is just 1s and 0s, since I wanted something simple for my first model. The network has one hidden layer with two tanh nodes, followed by a single sigmoid output unit.

The code is below, copied from a markdown export of the Jupyter notebook:

import numpy as np
import matplotlib.pyplot as plt
#creating our data
x = np.array([[0, 1, 0, 1], [0, 1, 0, 1], [1, 0, 1, 0], [1, 0, 1, 0], [0, 1, 0, 1]])
y = np.array([0, 1, 0, 1])
y = y.reshape(1, 4)
print(x)
[[0 1 0 1]
 [0 1 0 1]
 [1 0 1 0]
 [1 0 1 0]
 [0 1 0 1]]
print(y)
[[0 1 0 1]]
print(x.shape)
(5, 4)
print(y.shape)
(1, 4)
#initialize parameters
def rand_params():
    W1 = np.random.randn(2, 5)
    b1 = np.zeros([2, 1])

    W2 = np.random.randn(1, 2)
    b2 = np.zeros([1, 1])
    
    return W1, b1, W2, b2

W1, b1, W2, b2 = rand_params()
print(f"W1: {W1}, b1: {b1}")
print(W1.shape, b1.shape)
W1: [[ 0.60366603 -0.12225707 -0.44483219 -1.40200651 -3.02768333]
 [-0.98659326 -0.91009808  0.72461745  0.20677563  0.17493105]], b1: [[0.]
 [0.]]
(2, 5) (2, 1)
print(f"W2: {W2}, b2: {b2}")
print(W2.shape, b2.shape)
W2: [[0.05478931 0.99102802]], b2: [[0.]]
(1, 2) (1, 1)
#forward propagation
def tanh(z):
    a = (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z))
    return a 

def sigmoid(z):
    a = 1 / (1 + np.exp(z))
    return a 

def der_tanh(z):
    a = 1 - (tanh(z))**2
    return a 

def der_sigmoid(z):
    a = sigmoid(z) * (1 - sigmoid(z))
    # return a <-- MISSING?
#forward computation
def forward_prop(x, W1, b1, W2, b2):
    Z1 = np.dot(W1, x) + b1
    A1 = np.tanh(Z1)

    Z2 = np.dot(W2, A1) + b2
    y_hat = sigmoid(Z2)
    return Z1, A1, Z2, y_hat

Z1, A1, Z2, y_hat = forward_prop(x, W1, b1, W2, b2)
def cost_function(y, y_hat, x):
    m = x.shape[1]
    J = -1 / m * np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    
    return J, m

J, m = cost_function(y, y_hat, x)
#back propagation
def back_prop():
    dZ2 = y_hat - y
    dW2 = 1 / m * np.dot(dZ2, A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)

    dZ1 = np.dot(W2.T, dZ2) * der_tanh(Z1)
    dW1 = 1 / m * np.dot(dZ1, x.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)
    
    return dW2, db2, dW1, db1

dW2, db2, dW1, db1 = back_prop()
#optimizing weights + biases
def update(W1, b1, W2, b2):
    lr = 0.01
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1

    W2 = W2 - lr * dW2 
    b2 = b2 - lr * db2
    
    return W1, b1, W2, b2
    
W1, b1, W2, b2 = update(W1, b1, W2, b2)
# model 

costs = []
W1, b1, W2, b2 = rand_params()


for epoch in range(1500):
    Z1, A1, Z2, y_hat = forward_prop(x, W1, b1, W2, b2)
    
    J, m = cost_function(y, y_hat, x)
    
    if epoch % 100 == 0:
        print(J)
    costs.append(J)
    
    dW2, db2, dW1, db1 = back_prop()
    
    W1, b1, W2, b2 = update(W1, b1, W2, b2)
    
plt.plot(costs)
    
0.8188282199860928
1.1665507761146539
1.6868025884074527
2.3940967534280753
3.2473658397522387
4.183790888527539
5.158135855432985
6.147978715339146
7.143956636487831
8.142392777023431
9.141860280152706
10.141802197682296
11.142002210070622
12.142384342966537
13.142939005842882

Aside from any other possible errors, sigmoid(z) should be defined as:

def sigmoid(z):
    a = 1/(1 + np.exp(-z))
    #                 ^
    return a
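
The version in the question computes 1 / (1 + np.exp(z)), which is sigmoid(-z). The backprop step dZ2 = y_hat - y is the gradient for the standard sigmoid with cross-entropy, so with the flipped activation the computed gradient has the wrong sign everywhere and each update effectively performs gradient ascent on the cost, which matches the monotonically increasing values you printed. A quick sanity check (just a sketch, assuming the same NumPy setup as your notebook; sigmoid_broken and sigmoid_fixed are illustrative names, not from your code):

import numpy as np

def sigmoid_broken(z):
    # original version: this is really sigmoid(-z)
    return 1 / (1 + np.exp(z))

def sigmoid_fixed(z):
    # corrected version
    return 1 / (1 + np.exp(-z))

z = np.array([-5.0, 0.0, 5.0])
print(sigmoid_broken(z))  # approx [0.9933, 0.5, 0.0067] -- decreasing in z, wrong
print(sigmoid_fixed(z))   # approx [0.0067, 0.5, 0.9933] -- increasing in z, as expected

With sigmoid corrected (and a return a added to der_sigmoid, although that function isn't actually called anywhere), the cost should start decreasing over the 1500 epochs instead of rising.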