Coordinate descent in Python
I want to implement coordinate descent in Python and compare its results with gradient descent. I wrote the code below, but it does not work well: gradient descent seems fine, coordinate descent does not.
Here is a reference for coordinate descent. --- LINK
This is the result I was expecting:
[expected plot: Gradient Descent vs Coordinate Descent]
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

x = np.linspace(10, 100, 1000)
y = np.linspace(10, 100, 1000)

def func(x, y, param):
    return param[0] * x + param[1] * y

def costf(X, y, param):
    return np.sum((X.dot(param) - y) ** 2)/(2 * len(y))

z = func(x, y, [5, 8]) + np.random.normal(0., 10.)
z = z.reshape(-1, 1)
interc = np.ones(1000)
X = np.concatenate([interc.reshape(-1, 1), x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
#param = np.random.randn(3).T
param = np.array([2, 2, 2])

def gradient_descent(X, y, param, eta=0.01, iter=100):
    cost_history = [0] * iter
    for iteration in range(iter):
        h = X.dot(param)
        loss = h - y
        gradient = X.T.dot(loss)/(2 * len(y))
        param = param - eta * gradient
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration] = cost
    return param, cost_history

def coordinate_descent(X, y, param, iter=100):
    cost_history = [0] * iter
    for iteration in range(iter):
        for i in range(len(param)):
            dele = np.dot(np.delete(X, i, axis=1), np.delete(param, i, axis=0))
            param[i] = np.dot(X[:,i].T, (y - dele))/np.sum(np.square(X[:,i]))
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration] = cost
    return param, cost_history

ret, xret = gradient_descent(X, z, param)
cret, cxret = coordinate_descent(X, z, param)

plt.plot(range(len(xret)), xret, label="GD")
plt.plot(range(len(cxret)), cxret, label="CD")
plt.legend()
plt.show()
Thanks in advance.
I am not an expert in optimization, so these are just some suggestions.
I think your code is mostly fine, except that:
1. The following two lines look wrong:
loss = h - y
param[i] = np.dot(X[:,i].T, (y - dele))/np.sum(np.square(X[:,i]))
You reshape y into (n_samples, 1) with z = z.reshape(-1, 1), while h and dele have shape (n_samples,). Subtracting them therefore broadcasts into an (n_samples, n_samples) matrix, which I don't think is what you want.
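To see the shape problem concretely, here is a minimal sketch (the array length n is made up for illustration):

import numpy as np

n = 5
y_col = np.arange(n, dtype=float).reshape(-1, 1)  # shape (n, 1), like z after reshape(-1, 1)
h = np.arange(n, dtype=float)                     # shape (n,), like X.dot(param)

print((h - y_col).shape)          # (5, 5): broadcast outer difference, not a residual vector
print((h - y_col.ravel()).shape)  # (5,): the element-wise difference you actually want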
2. Your data may not be well suited for testing. That does not mean the code will not work on it, but you may need some extra effort to tune the learning rate and the number of iterations.
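If you would rather keep a synthetic dataset, one option (my own suggestion, not from the original answer) is to use independent features and noise drawn per sample, roughly like this:

# Hypothetical alternative test data: two independent features, one noise value per sample.
rng = np.random.default_rng(0)
x1 = rng.uniform(10, 100, 1000)
x2 = rng.uniform(10, 100, 1000)
z = 5 * x1 + 8 * x2 + rng.normal(0., 10., size=1000)  # kept as shape (1000,), no reshape

X = np.column_stack([np.ones(1000), x1, x2])
param = np.zeros(3)  # float dtype, so in-place coordinate updates keep fractional values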
I tried your code on the Boston house-prices dataset; the output is shown below.
Code
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler

# boston house-prices dataset
data = load_boston()
X, y = data.data, data.target
X = StandardScaler().fit_transform(X)  # for easy convergence
X = np.hstack([X, np.ones((X.shape[0], 1))])
param = np.zeros(X.shape[1])

def gradient_descent(X, y, param, eta=0.01, iter=300):
    cost_history = [0] * (iter+1)
    cost_history[0] = costf(X, y, param)  # you may want to save initial cost
    for iteration in range(iter):
        h = X.dot(param)
        loss = h - y.ravel()
        gradient = X.T.dot(loss)/(2 * len(y))
        param = param - eta * gradient
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration+1] = cost
    return param, cost_history

def coordinate_descent(X, y, param, iter=300):
    cost_history = [0] * (iter+1)
    cost_history[0] = costf(X, y, param)
    for iteration in range(iter):
        for i in range(len(param)):
            dele = np.dot(np.delete(X, i, axis=1), np.delete(param, i, axis=0))
            param[i] = np.dot(X[:,i].T, (y.ravel() - dele))/np.sum(np.square(X[:,i]))
        cost = costf(X, y, param)
        cost_history[iteration+1] = cost
    return param, cost_history

ret, xret = gradient_descent(X, y, param)
cret, cxret = coordinate_descent(X, y, param.copy())

plt.plot(range(len(xret)), xret, label="GD")
plt.plot(range(len(cxret)), cxret, label="CD")
plt.legend()
If this is what you want, you can try it on some other data as well. Note that even when the implementation is correct, you still need to tune the hyperparameters to get good convergence.
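As a quick sanity check that both routines converge to the right place, one option (not part of the original answer) is to compare their final cost against the closed-form least-squares solution on the same design matrix:

# Sanity check: closed-form least squares vs the two iterative solutions.
theta_ls, *_ = np.linalg.lstsq(X, y, rcond=None)
print("closed form:", costf(X, y, theta_ls))
print("GD final   :", xret[-1])
print("CD final   :", cxret[-1])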