Coordinate descent in Python

I want to implement coordinate descent in Python and compare its results with gradient descent. I wrote the code below, but it does not work well: GD seems fine, but CD does not.

Here is a reference for coordinate descent. --- LINK

This is the result I expected (plot: Gradient Descent vs Coordinate Descent).

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(10, 100, 1000)
y = np.linspace(10, 100, 1000)

def func(x, y, param):
    return param[0] * x + param[1] * y

def costf(X, y, param):
    return np.sum((X.dot(param) - y) ** 2)/(2 * len(y))

z = func(x, y, [5, 8]) + np.random.normal(0., 10.)
z = z.reshape(-1, 1)

interc = np.ones(1000)
X = np.concatenate([interc.reshape(-1, 1), x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)

#param = np.random.randn(3).T
param = np.array([2, 2, 2])

def gradient_descent(X, y, param, eta=0.01, iter=100):
    cost_history = [0] * iter

    for iteration in range(iter):
        h = X.dot(param)
        loss = h - y
        gradient = X.T.dot(loss)/(2 * len(y))
        param = param - eta * gradient
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration] = cost

    return param, cost_history


def coordinate_descent(X, y, param, iter=100):
    cost_history = [0] * iter

    for iteration in range(iter):
        for i in range(len(param)):
            dele = np.dot(np.delete(X, i, axis=1), np.delete(param, i, axis=0))
            param[i] = np.dot(X[:,i].T, (y - dele))/np.sum(np.square(X[:,i]))
            cost = costf(X, y, param)
            #print(cost)
            cost_history[iteration] = cost

    return param, cost_history


ret, xret = gradient_descent(X, z, param)
cret, cxret = coordinate_descent(X, z, param)

plt.plot(range(len(xret)), xret, label="GD")
plt.plot(range(len(cxret)), cxret, label="CD")
plt.legend()
plt.show()

Thanks in advance.

I am not an expert in optimization; here are just a few suggestions.

I think your code is mostly fine, except that:

1. The following two lines seem incorrect:

loss = h - y

param[i] = np.dot(X[:,i].T, (y - dele))/np.sum(np.square(X[:,i]))

You reshape y to (n_samples, 1) with z = z.reshape(-1, 1), while h and dele have shape (n_samples,). Broadcasting then produces an (n_samples, n_samples) matrix, which I believe is not what you want (see the short demo after this list).

2. Your data may not be well suited for testing purposes. That does not mean it cannot work, but you may need some extra effort to tune the learning rate / number of iterations.
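
To make the shape issue in point 1 concrete, here is a minimal sketch (the variable names are illustrative, not taken from your code) showing how NumPy broadcasting turns the subtraction into an (n_samples, n_samples) matrix:

import numpy as np

h = np.ones(5)                      # shape (5,), like X.dot(param)
y_col = np.ones(5).reshape(-1, 1)   # shape (5, 1), like z after z.reshape(-1, 1)

print((h - y_col).shape)            # (5, 5): broadcast to a matrix, not element-wise
print((h - y_col.ravel()).shape)    # (5,): the intended element-wise difference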

I tried your code on the Boston house-prices dataset, and the output is shown below.

Code

from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler

#  boston house-prices dataset
data = load_boston()
X, y = data.data, data.target

X = StandardScaler().fit_transform(X)  # for easy convergence
X = np.hstack([X, np.ones((X.shape[0], 1))])

param = np.zeros(X.shape[1])

def gradient_descent(X, y, param, eta=0.01, iter=300):
    cost_history = [0] * (iter+1)
    cost_history[0] = costf(X, y, param)  # you may want to save initial cost

    for iteration in range(iter):
        h = X.dot(param)
        loss = h - y.ravel()
        gradient = X.T.dot(loss)/(2 * len(y))
        param = param - eta * gradient
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration+1] = cost

    return param, cost_history

def coordinate_descent(X, y, param, iter=300):
    cost_history = [0] * (iter+1)
    cost_history[0] = costf(X, y, param)

    for iteration in range(iter):
        for i in range(len(param)):
            dele = np.dot(np.delete(X, i, axis=1), np.delete(param, i, axis=0))
            param[i] = np.dot(X[:,i].T, (y.ravel() - dele))/np.sum(np.square(X[:,i]))
            cost = costf(X, y, param)
            cost_history[iteration+1] = cost

    return param, cost_history

ret, xret = gradient_descent(X, y, param)
cret, cxret = coordinate_descent(X, y, param.copy())

plt.plot(range(len(xret)), xret, label="GD")
plt.plot(range(len(cxret)), cxret, label="CD")
plt.legend()

If this is what you want, you can try it on some other data as well. Note that even with a correct implementation, you still need to tune the hyperparameters to get good convergence.
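
As an extra sanity check, one option (just a sketch, assuming X, y, costf, ret and cret from the code above) is to compare the final cost of both solvers against NumPy's closed-form least-squares solution:

theta_exact, *_ = np.linalg.lstsq(X, y, rcond=None)  # closed-form least squares

print("closed-form cost:", costf(X, y, theta_exact))
print("GD cost:", costf(X, y, ret))
print("CD cost:", costf(X, y, cret))

Both costs should approach the closed-form value as the number of iterations grows.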