新元实施 Python

Question

我知道之前有人在 SO 上询问过 SGD，但我想对我的代码发表意见，如下所示：

import numpy as np
import matplotlib.pyplot as plt

# Generating data

m,n = 10000,4
x = np.random.normal(loc=0,scale=1,size=(m,4))
theta_0 = 2
theta = np.append([],[1,0.5,0.25,0.125]).reshape(n,1)
y = np.matmul(x,theta) + theta_0*np.ones(m).reshape((m,1)) + np.random.normal(loc=0,scale=0.25,size=(m,1))


# input features
x0 = np.ones([m,1])
X = np.append(x0,x,axis=1)

# defining the cost function
def compute_cost(X,y,theta_GD):
       return np.sum(np.power(y-np.matmul(np.transpose(theta_GD),X),2))/2


# initializations

theta_GD = np.append([theta_0],[theta]).reshape(n+1,1)
alp = 1e-5
num_iterations = 10000

# Batch Sum
def batch(i,j,theta_GD):
    batch_sum = 0
    for k in range(i,i+9):
        batch_sum += float((y[k]-np.transpose(theta_GD).dot(X[k]))*X[k][j])
    return batch_sum

# Gradient Step
def gradient_step(theta_current, X, y, alp,i):
    for j in range(0,n):
            theta_current[j]-= alp*batch(i,j,theta_current)/10

    theta_updated = theta_current

    return theta_updated

# gradient descent
cost_vec = []
for i in range(num_iterations):

    cost_vec.append(compute_cost(X[i], y[i], theta_GD))
    theta_GD = gradient_step(theta_GD, X, y, alp,i)


plt.plot(cost_vec)
plt.xlabel('iterations')
plt.ylabel('cost')

我正在尝试批量大小为 10 的小批量 GD。我得到了 MSE 的极度振荡行为。问题在哪里？谢谢。

P.S。我在关注 NG 的 https://www.coursera.org/learn/machine-learning/lecture/9zJUs/mini-batch-gradient-descent

Answer 1

这是对基本数学原理的描述，而不是基于代码的解决方案...

成本函数是高度非线性（np.power()）和递归并且递归和非线性系统可以振荡（自振荡https://en.wikipedia.org/wiki/Self-oscillation ). In mathematics this is subject to chaos theory / theory of nonlinear dynamical systems ( https://pdfs.semanticscholar.org/8e0d/ee3c433b1806bfa0d98286836096f8c2681d.pdf），cf 后勤地图 （https://en.wikipedia.org/wiki/Logistic_map）。如果 growth factor r 超过阈值，逻辑图就会振荡。 增长因子 衡量系统中的能量。

在您的代码中，关键部分是成本函数、成本向量，即系统的历史和 时间步 :

def compute_cost(X,y,theta_GD):
   return np.sum(np.power(y-np.matmul(np.transpose(theta_GD),X),2))/2

cost_vec = []
for i in range(num_iterations):

    cost_vec.append(compute_cost(X[i], y[i], theta_GD))
    theta_GD = gradient_step(theta_GD, X, y, alp,i)

# Gradient Step
def gradient_step(theta_current, X, y, alp,i):
    for j in range(0,n):
            theta_current[j]-= alp*batch(i,j,theta_current)/10

    theta_updated = theta_current
return theta_updated

如果将其与逻辑地图的实施进行比较，您会发现相似之处

from pylab import show, scatter, xlim, ylim
from random import randint

iter = 1000         # Number of iterations per point
seed = 0.5          # Seed value for x in (0, 1)
spacing = .0001     # Spacing between points on domain (r-axis)
res = 8             # Largest n-cycle visible

# Initialize r and x lists
rlist = []
xlist = []

def logisticmap(x, r):     <------------------ nonlinear function

    return x * r * (1 - x)

# Return nth iteration of logisticmap(x. r)
def iterate(n, x, r):

    for i in range(1,n):
        x = logisticmap(x, r)

    return x

# Generate list values -- iterate for each value of r
for r in [i * spacing for i in range(int(1/spacing),int(4/spacing))]:
   rlist.append(r) 
   xlist.append(iterate(randint(iter-res/2,iter+res/2), seed, r))   <--------- similar to cost_vector, the history of the system

scatter(rlist, xlist, s = .01)
xlim(0.9, 4.1)
ylim(-0.1,1.1)
show()

代码源：https://www.reddit.com/r/learnpython/comments/zzh28/a_simple_python_implementation_of_the_logistic_map/

在此基础上你可以尝试修改你的代价函数在逻辑图中引入一个类似于增长因子的因子来降低系统振荡的强度

def gradient_step(theta_current, X, y, alp,i):
    for j in range(0,n):
            theta_current[j]-= alp*batch(i,j,theta_current)/10   <--- introduce a factor somewhere to keep the system under the oscillation threshold

    theta_updated = theta_current

    return theta_updated

或

def compute_cost(X,y,theta_GD):
   return np.sum(np.power(y-np.matmul(np.transpose(theta_GD),X),2))/2  <--- introduce a factor somewhere to keep the system under the oscillation threshold

如果这不起作用，请遵循 https://www.reddit.com/r/MachineLearning/comments/3y9gkj/how_can_i_avoid_oscillations_in_gradient_descent/（时间步长，...）

中的建议

新元实施 Python

SGD implementation Python

python

batch-processing

stochastic

gradient-descent