为什么我的线性回归批量梯度下降不收敛?

Why my batch gradient descent for linear regression don't converge?

我想编写一个类似于 sklearn.linear_model.LinearRegression 的线性回归模型。首先,我使用 sklearn.linear_model.LinearRegression:

训练标准线性回归模型
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# training data
train_x = np.array([1,2,3,4,5,6], dtype=np.float64).reshape(6,1)
train_y = np.array([1,2,3,3.25,3.5,3.8], dtype=np.float64)
# test data
predict_x = np.arange(0, 7, 0.1)
predict_x = predict_x.reshape(predict_x.size, 1)

# Simple regression
model1 = linear_model.LinearRegression()
model1.fit(train_x, train_y);
print model1.coef_, model1.intercept_
# Quadratic regression
model2 = linear_model.LinearRegression()
model2.fit(np.concatenate((train_x, train_x**2), axis=1), train_y);
print model2.coef_, model2.intercept_
# Five-order polynomial regression
model5 = linear_model.LinearRegression()
model5.fit(np.concatenate((train_x, train_x**2, train_x**3, train_x**4, train_x**5), axis=1), train_y);
print model5.coef_, model5.intercept_
# Predict
predict_y1 = model1.predict(predict_x)
predict_y2 = model2.predict(np.concatenate((predict_x, predict_x**2), axis=1))
predict_y5 = model5.predict(np.concatenate((predict_x, predict_x**2, predict_x**3, predict_x**4, predict_x**5), axis=1))

# plot
plt.figure(figsize = (10,10))
plt.scatter(train_x, train_y, color='black')
plt.plot(predict_x, predict_y1, color='blue', label='underfitting')
plt.plot(predict_x, predict_y2, color='green', label='fair')
plt.plot(predict_x, predict_y5, color='red', label='overfitting')
plt.axis([0,7,0,5])
plt.legend(loc=2)
plt.show()

然后,我得到了好的结果:

[ 0.53571429] 0.883333333333

[ 1.34821429 -0.11607143] -0.2

[-8.52333333 7.0625 -2.30833333 0.3375 -0.01833333] 4.45

之后,我实现了我的模型MyLinearRegression。首先,我选择批量梯度下降和固定迭代次数来测试我的代码是否正确。

# center data
def center_matrix(X):
    assert(isinstance(X, np.ndarray))
    X_offset = np.average(X, axis=0)
    return X - X_offset, X_offset

class MyLinearRegression(object):
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None
        self.learning_rate = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        n_samples_, = y.shape
        assert(n_samples == n_samples_)
        X, X_offset = center_matrix(X)
        y, y_offset = center_matrix(y)
        self.coef_ = np.ones((n_features,), dtype=np.float64)

        self.learning_rate = -0.0001

        error = None
        # using fixed iteration number
        for epoch in np.arange(500000):
            y_hat = X.dot(self.coef_)
            error_ = y_hat-y
            if error is not None and sum(error_**2) > sum(error**2): # if square error is increasing, then half learning_rate. 
                self.learning_rate /= 2.
                continue
            error = error_
            coef = self.coef_ + self.learning_rate * (X.T.dot(error))
            if np.isfinite(coef).all(): # if overflow happen, half learning_rate. 
                self.coef_ = coef
            else:
                self.learning_rate /= 2.

        self.intercept_ = y_offset - self.coef_.dot(X_offset.T)
        return self

    def predict(self, X):
        n_samples, n_features = X.shape
        assert(n_features == self.coef_.size)
        return X.dot(self.coef_) + self.intercept_

# Simple regression
my_model1 = MyLinearRegression()
my_model1.fit(train_x, train_y)
print my_model1.coef_, my_model1.intercept_
# Quadratic regression
my_model2 = MyLinearRegression()
my_model2.fit(np.concatenate((train_x, train_x**2), axis=1), train_y);
print my_model2.coef_, my_model2.intercept_

# Five-order polynomial regression
my_model5 = MyLinearRegression()
my_model5.fit(np.concatenate((train_x, train_x**2, train_x**3, train_x**4, train_x**5), axis=1), train_y);
print my_model5.coef_, my_model5.intercept_

# predict
my_predict_y1 = my_model1.predict(predict_x)
my_predict_y2 = my_model2.predict(np.concatenate((predict_x, predict_x**2), axis=1))
my_predict_y5 = my_model5.predict(np.concatenate((predict_x, predict_x**2, predict_x**3, predict_x**4, predict_x**5), axis=1))

# plot
plt.figure(figsize = (10,10))
plt.scatter(train_x, train_y, color='black')
plt.plot(predict_x, my_predict_y1, color='blue', label='underfitting')
plt.plot(predict_x, my_predict_y2, color='green', label='fair')
plt.plot(predict_x, my_predict_y5, color='red', label='overfitting')
plt.axis([0,7,0,5])
plt.legend(loc=2)
plt.show()

然后,我得到了不好的结果:

[ 0.53571433] 0.883333191266

[ 1.34821275 -0.11607122] -0.199997815791

[ -1.95681250e+00 -2.20847875e+01 -1.48602362e+02 -9.20144807e+02 -5.56577136e+03] 11678151.1386

我可以在 my_model1my_model2 上取得好成绩 MyLinearRegression,他们在 sklearn.linear_model.LinearRegression 上对这些关闭。但是,无论我如何调整 learning_rate 和迭代次数, my_model5 就是不收敛。有人可以帮忙吗?

这里需要特征缩放。

参见: