
python batch gradient descent does not converge

我尝试过提高和降低学习率,但算法似乎不收敛,或者需要很长时间才能收敛。如果将学习率设置为 0.0004,它会缓慢地趋于收敛,但需要的迭代次数非常多:我不得不设置 100 万次以上的迭代,而最小二乘误差也只是从 93 降到了 58。

我正在关注 Andrew Ng 的论坛。



import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import time

# Load the height/weight table and pull the two columns out as NumPy arrays:
# x holds the 'Height' column (model input), y the 'Weight' column (target).
data = pd.read_csv('weight-height.csv')
x, y = (np.array(data[column]) for column in ('Height', 'Weight'))

# Scatter the raw samples; the patch carries a label with the sample count.
plt.scatter(x, y, c='blue')
total = mpatches.Patch(
    color='blue',
    label='Total amount of data {}'.format(len(x)),
)

# Both model parameters start at zero; the step size and the number of
# gradient-descent iterations are fixed up front.
theta0 = theta1 = 0
learning_rate = 0.0004
epochs = 10000

# hypothesis: h(x) = theta0 + theta1 * x

def hypothesis(x):
    """Return the linear model's prediction ``theta0 + theta1 * x``.

    ``theta0`` and ``theta1`` are read from module scope at call time, so the
    result always reflects whatever values they currently hold.
    """
    intercept, slope = theta0, theta1
    return intercept + slope * x

def cost_function(x):
    """Return half the mean squared error of ``hypothesis(x)`` against ``y``.

    The targets ``y`` are read from module scope; ``x`` supplies both the
    model inputs and the sample count used for the averaging.
    """
    residuals = hypothesis(x) - y
    return 1 / (2 * len(x)) * sum(residuals ** 2)

# Wall-clock timestamp taken just before the training loop starts.
start = time.time()

# Run `epochs` steps of batch gradient descent: every step sums over all of x.
for i in range(epochs):
    # Progress indicator, printed on every iteration.
    print(f'{i}/ {epochs}')
    # Step theta0 by learning_rate times the mean residual (prediction - y).
    theta0 = theta0 - learning_rate * 1/len(x) * sum (hypothesis(x) - y)
    # NOTE: hypothesis(x) reads the global theta0, which was reassigned on the
    # line above, so this gradient is evaluated with the new theta0 and the
    # old theta1 -- the two parameters are not updated simultaneously.
    theta1 = theta1 - learning_rate * 1/len(x) * sum((hypothesis(x) - y) * x)
    # Report the cost and both parameters after this step.
    print('\ncost: {}\ntheta0: {},\ntheta1: {}'.format(cost_function(x), theta0, theta1))

# Wall-clock timestamp taken right after the loop finishes.
end = time.time()

# Draw the fitted line (model predictions for every x) in red.
plt.plot(x, hypothesis(x), c= 'red')

# Final cost and parameter values.
print('\ncost: {}\ntheta0: {},\ntheta1: {}'.format(cost_function(x), theta0, theta1))

# Elapsed time of the training loop, in seconds.
print('time finished at {} seconds'.format(end - start))

您的问题可能在于:您是逐个(而不是同时)更新 theta0 和 theta1 的。

# theta0 is reassigned first ...
theta0 = theta0 - learning_rate * 1/len(x) * sum (hypothesis(x) - y)
# ... so the update to theta1 is now using the updated version of theta0
# (hypothesis(x) reads the global theta0), not the value theta0 had at the
# start of this iteration
theta1 = theta1 - learning_rate * 1/len(x) * sum((hypothesis(x) - y) * x)

最好改写代码,使每次迭代只调用一次 `hypothesis` 函数,并显式地把 theta0 和 theta1 的值传给它,而不是使用全局变量。

# modify to explicitly pass theta0/1
def hypothesis(x, theta0, theta1):
    """Evaluate the linear model ``theta0 + theta1 * x``.

    Both parameters are passed in explicitly, so the result depends only on
    the arguments and not on any module-level state.
    """
    slope_term = theta1 * x
    return theta0 + slope_term

# explicitly pass y
def cost_function(x, y, theta0, theta1):
    """Return half the mean squared error of the linear model on ``(x, y)``.

    Predictions come from ``hypothesis(x, theta0, theta1)``; the squared
    residuals are summed and divided by twice the number of samples.
    """
    residuals = hypothesis(x, theta0, theta1) - y
    return 1 / (2 * len(x)) * sum(residuals ** 2)

# Batch gradient descent with a simultaneous update of both parameters.
for i in range(epochs):
    # Progress indicator, printed on every iteration.
    print(f'{i}/ {epochs}')
    # Evaluate the model once, before either parameter changes, so that both
    # updates below are computed from the same (old) theta0 and theta1.
    predictions = hypothesis(x, theta0, theta1)
    residuals = predictions - y
    theta0 = theta0 - learning_rate * 1/len(x) * sum(residuals)
    theta1 = theta1 - learning_rate * 1/len(x) * sum(residuals * x)
    # The format string has three placeholders, so the cost and both
    # parameters must all be passed to format().
    print('\ncost: {}\ntheta0: {},\ntheta1: {}'.format(
        cost_function(x, y, theta0, theta1), theta0, theta1))

