SciPy minimize with gradient
This is an implementation of logistic regression on a toy dataset. Some feedback from @dermen helped me fix a basic problem with how I was calling scipy.optimize.minimize, but even after fixing that issue the optimization fails to converge, even when using only the first five rows of the test dataset. Here is a self-contained version of the code:
import numpy as np
from scipy.optimize import minimize
# `data` is a subset of a toy dataset. The full dataset is ~100 rows, linearly separable and located at
# https://github.com/liavkoren/ng-redux/blob/master/ex2/ex2data1.txt
data = np.array([
[ 34.62365962, 78.02469282],
[ 30.28671077, 43.89499752],
[ 35.84740877, 72.90219803],
[ 60.18259939, 86.3085521 ],
[ 79.03273605, 75.34437644],
[ 45.08327748, 56.31637178],
[ 61.10666454, 96.51142588],
[ 75.02474557, 46.55401354],
[ 76.0987867, 87.42056972],
[ 84.43281996, 43.53339331],
])
# Ground truth
y = np.array([0., 0., 0., 1., 1., 0., 1., 1., 1., 1.])
def sigmoid(z):
    return 1/(1 + np.power(np.e, -z))

h = lambda theta, x: sigmoid(x.dot(theta))

def cost(theta, X, y):
    m = X.shape[0]
    j = y.dot(np.log(h(theta, X))) + (1 - y).dot(np.log(1 - h(theta, X)))
    return (-j/m)

def grad(theta, X, y):
    m = X.shape[0]
    return ((h(theta, X) - y).dot(X))/m
# Add a column of ones for the intercept term:
m, features = np.shape(data)
features += 1
X = np.concatenate([np.ones((m, 1)), data], axis=1)
initial_theta = np.zeros(features)
def check_functions(grad_func, cost_func):
    '''
    Asserts that the cost and gradient functions return known correct values for a given theta, X, y.
    Test case from https://www.coursera.org/learn/machine-learning/discussions/weeks/3/threads/tA3ESpq0EeW70BJZtLVfGQ
    The expected cost is 4.6832.
    The expected gradient = [0.31722, 0.87232, 1.64812, 2.23787]
    '''
    test_X = np.array([[1, 8, 1, 6], [1, 3, 5, 7], [1, 4, 9, 2]])  # X
    test_y = np.array([1, 0, 1])  # y
    test_theta = np.array([-2, -1, 1, 2])
    grad_diff = grad_func(test_theta, test_X, test_y) - np.array([0.31722, 0.87232, 1.64812, 2.23787])
    assert grad_diff.dot(grad_diff.T) < 0.0001
    assert abs(cost_func(test_theta, test_X, test_y) - 4.6832) < 0.0001
check_functions(grad, cost)
# `cutoff` slices out a subset of rows.
cutoff = 2
print(minimize(fun=cost, x0=initial_theta, args=(X[0:cutoff, :], y[0:cutoff]), jac=grad))
此代码失败:
fun: nan
hess_inv: array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1]])
jac: array([ 0., 0., 0.])
message: 'Desired error not necessarily achieved due to precision loss.'
nfev: 32
nit: 1
njev: 32
status: 2
success: False
x: array([ -0.5 , -16.2275926 , -30.47992258])
/Users/liavkoren/Envs/data-sci/lib/python2.7/site-packages/ipykernel/__main__.py:25: RuntimeWarning: overflow encountered in power
/Users/liavkoren/Envs/data-sci/lib/python2.7/site-packages/ipykernel/__main__.py:38: RuntimeWarning: divide by zero encountered in log
/Users/liavkoren/Envs/data-sci/lib/python2.7/site-packages/ipykernel/__main__.py:42: RuntimeWarning: divide by zero encountered in log
The call to np.power inside the sigmoid function is overflowing. I added debug messages to the cost function and saw the following:
theta: [ 0. 0. 0.]
--
X: [[ 1. 34.62365962 78.02469282]
[ 1. 30.28671077 43.89499752]]
--
y=1: [ 0.5 0.5] y=0: [ 0.5 0.5]
log probabilities:
y=1: [-0.69314718 -0.69314718]
y=0: [-0.69314718 -0.69314718]
=======
theta: [ -0.5 -16.2275926 -30.47992258]
--
X: [[ 1. 34.62365962 78.02469282]
[ 1. 30.28671077 43.89499752]]
--
y=1: [ 0. 0.] y=0: [ 1. 1.]
log probabilities:
y=1: [-inf -inf]
y=0: [ 0. 0.]
This overflows by the second iteration!!
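To make the failure mode concrete, here is a quick sketch (not part of the original code) that reproduces the numbers above. For float64, np.exp and np.power(np.e, ...) overflow once the exponent exceeds roughly 709, and with these unscaled features the linear term X.dot(theta) is already around -2900 after one step, so the sigmoid saturates to exactly 0 and np.log of it returns -inf:

import numpy as np

# Second-iteration theta and the first data row, taken from the output above.
theta = np.array([-0.5, -16.2275926, -30.47992258])
x = np.array([1., 34.62365962, 78.02469282])
z = x.dot(theta)                     # roughly -2940, far past the float64 exp limit (~709)
print(np.power(np.e, -z))            # inf, with "overflow encountered in power"
print(1 / (1 + np.power(np.e, -z)))  # 0.0, so np.log(h) in the cost becomes -inf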
I quickly confirmed that this is indeed the problem by scaling the dataset by 1/10, after which the optimization converged. It looks like I'll have to look into feature scaling/normalization or some other strategy for avoiding the overflow.
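As a rough sketch of the feature-scaling idea (reusing data, y, cost, grad and initial_theta from the code above; the standardize helper is mine, not something from the original post), column-wise z-score normalization keeps X.dot(theta) small enough that the sigmoid never saturates to exactly 0 or 1:

import numpy as np
from scipy.optimize import minimize

def standardize(X_raw):
    # Column-wise z-score: subtract each feature's mean and divide by its std.
    return (X_raw - X_raw.mean(axis=0)) / X_raw.std(axis=0)

X_scaled = np.concatenate([np.ones((data.shape[0], 1)), standardize(data)], axis=1)
print(minimize(fun=cost, x0=initial_theta, args=(X_scaled, y), jac=grad))

Remember to apply the same mean/std transformation to any new data before predicting with the fitted theta. A numerically stable sigmoid such as scipy.special.expit would also silence the overflow warning, but it still saturates for very large |z|, so feature scaling (or rewriting the cost in terms of np.logaddexp) is probably the more robust route.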