theano函数在前馈神经网络的梯度优化过程中不更新参数
theano function not updating parameters during gradient optimization in feed forward neural net
为了上手 theano 和深度网络,我先实现了一个非常简单的三层前馈神经网络,并在 mnist 数据集上测试它。
我开始时使用的是随机梯度下降的基本实现,但网络训练不正确。网络参数未更新。
想知道是否有人可以指出我做错了什么。
下面的代码是我的 lstm 模块。之所以这样命名,是因为我计划将来在其中实现 lstm 网络。
import theano, theano.tensor as T
import numpy as np
from collections import OrderedDict
# Module-level random generator with a fixed seed (1234); create_shared draws
# every initial parameter value from it.
np_rng = np.random.RandomState(1234)
class FeedForwardLayer(object):
    """Fully connected layer computing ``activation(W x + b)``.

    ``W`` is requested from ``create_shared`` with dimensions
    ``(hidden_size, input_size)`` and ``b`` with dimension ``(hidden_size,)``.
    """

    def __init__(self, input_size, hidden_size, activation):
        """Store the layer dimensions and activation, then build W and b.

        input_size -- number of inputs the layer accepts
        hidden_size -- number of units (outputs) of the layer
        activation -- callable applied to the affine pre-activation
        """
        self.input_size, self.hidden_size = input_size, hidden_size
        self.activation = activation
        self.create_layer()

    def create_layer(self):
        """Allocate the weight matrix and the bias vector of the layer."""
        # W is created before b so values are drawn from the shared random
        # generator in the same order as before.
        self.W = create_shared(self.hidden_size, self.input_size, "weight")
        self.b = create_shared(self.hidden_size, name="bias")

    def activate(self, x):
        """Return the symbolic output of the layer for input ``x``.

        When ``x.ndim > 1`` the input is transposed before the product, the
        bias is broadcast over the second axis, and the result is transposed
        back. Otherwise ``x`` is used directly as a single vector.
        """
        if x.ndim <= 1:
            return self.activation(T.dot(self.W, x) + self.b)
        pre_activation = T.dot(self.W, x.T) + self.b[:, None]
        return self.activation(pre_activation).T

    @property
    def params(self):
        """List ``[W, b]`` of the layer's shared variables."""
        return [self.W, self.b]

    @params.setter
    def params(self, param_list):
        """Load new values: ``param_list[0]`` into W, ``param_list[1]`` into b."""
        self.W.set_value(param_list[0])
        self.b.set_value(param_list[1])
class Network(object):
    """Sequence of layers in which each layer feeds the next one."""

    def __init__(self, input_size, celltype=FeedForwardLayer, layer_sizes=None):
        """Build the layer stack.

        input_size -- size of the input of the first layer
        celltype -- layer class, called as
                    ``celltype(input_size, layer_size, activation=...)``
        layer_sizes -- iterable with the output size of every layer;
                       ``None`` (the default) builds a network with no layers
        """
        self.input_size = input_size
        self.celltype = celltype
        # The default None used to be iterated directly in create_layers and
        # raised TypeError; treat it as "no layers" instead.
        self.layer_sizes = [] if layer_sizes is None else layer_sizes
        self.create_layers()

    def create_layers(self):
        """Instantiate one sigmoid layer per entry of ``layer_sizes``."""
        self.layers = []
        input_size = self.input_size
        for layer_size in self.layer_sizes:
            self.layers.append(
                self.celltype(input_size, layer_size, activation=T.nnet.sigmoid)
            )
            # The output of this layer is the input of the next one.
            input_size = layer_size

    def forward(self, x):
        """Propagate ``x`` through every layer.

        Returns the list of outputs of all layers, in order, so the network
        output is the last element. The list is empty when there are no
        layers.
        """
        outputs = []
        current = x
        for layer in self.layers:
            current = layer.activate(current)
            outputs.append(current)
        return outputs

    @property
    def params(self):
        """Flat list of the parameters of all layers, in layer order."""
        return [param for layer in self.layers for param in layer.params]

    @params.setter
    def params(self, param_list):
        """Distribute a flat list of values over the layers.

        Each layer receives as many consecutive entries of ``param_list`` as
        it has parameters.
        """
        start = 0
        for layer in self.layers:
            end = start + len(layer.params)
            layer.params = param_list[start:end]
            start = end
def create_shared(m, n=None, name=None):
    """Return a theano shared variable filled with standard-normal samples.

    m -- size of the first dimension
    n -- size of the second dimension; when ``None`` the variable is a
         vector of length ``m``, otherwise a matrix of shape ``(m, n)``
    name -- name passed on to ``theano.shared``

    The samples are drawn from the module-level generator ``np_rng``.
    """
    shape = (m,) if n is None else (m, n)
    return theano.shared(np_rng.standard_normal(shape), name=name)
def optimization_updates(cost, params, lr=.01):
    """
    Build the stochastic gradient descent update rules.

    Inputs
    ---------------
    cost -- theano variable to minimize
    params -- network weights to take gradient with respect to
    lr -- learning rate

    Returns
    ---------------
    OrderedDict mapping every parameter to ``param - lr * gradient``
    """
    # The learning rate is stored as a shared scalar cast to floatX.
    step = theano.shared(np.float64(lr).astype(theano.config.floatX))
    gradients = T.grad(cost, params)
    return OrderedDict(
        (param, param - step * gradient)
        for gradient, param in zip(gradients, params)
    )
以下代码用于在 mnist 数据集上创建、训练并测试一个简单的三层前馈网络。
from lstm import Network
import theano, theano.tensor as T
import numpy as np
import lstm as L
from sklearn.datasets import load_digits
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer

# load and normalize dataset
digits = load_digits()
X = digits.data
y = digits.target
X -= X.min()
X /= X.max()

# create network: 64 inputs, a layer of 100 units, then a layer of 10 units
model = Network(64, layer_sizes=[100, 10])

# prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y)
labels_train = LabelBinarizer().fit_transform(y_train)
# NOTE: labels_test is not used by the code shown below.
labels_test = LabelBinarizer().fit_transform(y_test)

# symbolic graph: the cost is the L2 norm of (network output - target vector)
data = T.vector()
result = model.forward(data)[-1]
label = T.vector()
cost = (result - label).norm(L=2)
updates = L.optimization_updates(cost, model.params)
update = theano.function([data, label], cost, updates=updates, allow_input_downcast=True)
predict = theano.function([data], result, allow_input_downcast=True)

# A single pass over the training set, one sample per update step.
# Distinct loop names keep the dataset arrays X and y intact.
for sample, target in zip(X_train, labels_train):
    c = update(sample, target)

# The predicted class is the index of the largest output unit.
predictions = [np.argmax(predict(sample)) for sample in X_test]
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
我面临的问题是参数没有正确更新。我不确定那是因为我没有正确计算梯度,还是因为我没有正确使用 theano 函数。
使用随机梯度下降时,需要对数据集进行多次遍历。
在第一个 epoch(训练轮次)内,分类错误和混淆矩阵变化不大的情况并不罕见,尤其是在数据集很小的情况下。
我对您的代码做了如下修改,以训练 100 个 epoch:
# Repeat the pass over the training set 100 times (100 epochs).
# range works on both Python 2 and Python 3, unlike xrange.
for epoch in range(100):
    for sample, target in zip(X_train, labels_train):
        c = update(sample, target)
混淆矩阵似乎开始改善了:
[[ 0 0 18 0 13 4 5 0 5 0]
[ 0 42 0 2 0 0 0 0 2 0]
[ 0 0 51 0 0 0 0 1 0 0]
[ 0 0 0 45 0 1 0 1 2 0]
[ 0 0 0 0 33 0 0 0 0 0]
[ 0 0 0 0 0 47 0 0 0 0]
[ 0 0 0 0 0 0 45 0 0 0]
[ 0 0 0 0 1 0 0 48 0 0]
[ 0 2 1 0 0 0 0 0 34 0]
[ 0 1 0 25 0 3 0 2 16 0]]
为了上手 theano 和深度网络,我先实现了一个非常简单的三层前馈神经网络,并在 mnist 数据集上测试它。
我开始时使用的是随机梯度下降的基本实现,但网络训练不正确。网络参数未更新。
想知道是否有人可以指出我做错了什么。
下面的代码是我的 lstm 模块。之所以这样命名,是因为我计划将来在其中实现 lstm 网络。
import theano, theano.tensor as T
import numpy as np
from collections import OrderedDict
# Module-level random generator with a fixed seed (1234); create_shared draws
# every initial parameter value from it.
np_rng = np.random.RandomState(1234)
class FeedForwardLayer(object):
    """Fully connected layer computing ``activation(W x + b)``.

    ``W`` is requested from ``create_shared`` with dimensions
    ``(hidden_size, input_size)`` and ``b`` with dimension ``(hidden_size,)``.
    """

    def __init__(self, input_size, hidden_size, activation):
        """Store the layer dimensions and activation, then build W and b.

        input_size -- number of inputs the layer accepts
        hidden_size -- number of units (outputs) of the layer
        activation -- callable applied to the affine pre-activation
        """
        self.input_size, self.hidden_size = input_size, hidden_size
        self.activation = activation
        self.create_layer()

    def create_layer(self):
        """Allocate the weight matrix and the bias vector of the layer."""
        # W is created before b so values are drawn from the shared random
        # generator in the same order as before.
        self.W = create_shared(self.hidden_size, self.input_size, "weight")
        self.b = create_shared(self.hidden_size, name="bias")

    def activate(self, x):
        """Return the symbolic output of the layer for input ``x``.

        When ``x.ndim > 1`` the input is transposed before the product, the
        bias is broadcast over the second axis, and the result is transposed
        back. Otherwise ``x`` is used directly as a single vector.
        """
        if x.ndim <= 1:
            return self.activation(T.dot(self.W, x) + self.b)
        pre_activation = T.dot(self.W, x.T) + self.b[:, None]
        return self.activation(pre_activation).T

    @property
    def params(self):
        """List ``[W, b]`` of the layer's shared variables."""
        return [self.W, self.b]

    @params.setter
    def params(self, param_list):
        """Load new values: ``param_list[0]`` into W, ``param_list[1]`` into b."""
        self.W.set_value(param_list[0])
        self.b.set_value(param_list[1])
class Network(object):
    """Sequence of layers in which each layer feeds the next one."""

    def __init__(self, input_size, celltype=FeedForwardLayer, layer_sizes=None):
        """Build the layer stack.

        input_size -- size of the input of the first layer
        celltype -- layer class, called as
                    ``celltype(input_size, layer_size, activation=...)``
        layer_sizes -- iterable with the output size of every layer;
                       ``None`` (the default) builds a network with no layers
        """
        self.input_size = input_size
        self.celltype = celltype
        # The default None used to be iterated directly in create_layers and
        # raised TypeError; treat it as "no layers" instead.
        self.layer_sizes = [] if layer_sizes is None else layer_sizes
        self.create_layers()

    def create_layers(self):
        """Instantiate one sigmoid layer per entry of ``layer_sizes``."""
        self.layers = []
        input_size = self.input_size
        for layer_size in self.layer_sizes:
            self.layers.append(
                self.celltype(input_size, layer_size, activation=T.nnet.sigmoid)
            )
            # The output of this layer is the input of the next one.
            input_size = layer_size

    def forward(self, x):
        """Propagate ``x`` through every layer.

        Returns the list of outputs of all layers, in order, so the network
        output is the last element. The list is empty when there are no
        layers.
        """
        outputs = []
        current = x
        for layer in self.layers:
            current = layer.activate(current)
            outputs.append(current)
        return outputs

    @property
    def params(self):
        """Flat list of the parameters of all layers, in layer order."""
        return [param for layer in self.layers for param in layer.params]

    @params.setter
    def params(self, param_list):
        """Distribute a flat list of values over the layers.

        Each layer receives as many consecutive entries of ``param_list`` as
        it has parameters.
        """
        start = 0
        for layer in self.layers:
            end = start + len(layer.params)
            layer.params = param_list[start:end]
            start = end
def create_shared(m, n=None, name=None):
    """Return a theano shared variable filled with standard-normal samples.

    m -- size of the first dimension
    n -- size of the second dimension; when ``None`` the variable is a
         vector of length ``m``, otherwise a matrix of shape ``(m, n)``
    name -- name passed on to ``theano.shared``

    The samples are drawn from the module-level generator ``np_rng``.
    """
    shape = (m,) if n is None else (m, n)
    return theano.shared(np_rng.standard_normal(shape), name=name)
def optimization_updates(cost, params, lr=.01):
    """
    Build the stochastic gradient descent update rules.

    Inputs
    ---------------
    cost -- theano variable to minimize
    params -- network weights to take gradient with respect to
    lr -- learning rate

    Returns
    ---------------
    OrderedDict mapping every parameter to ``param - lr * gradient``
    """
    # The learning rate is stored as a shared scalar cast to floatX.
    step = theano.shared(np.float64(lr).astype(theano.config.floatX))
    gradients = T.grad(cost, params)
    return OrderedDict(
        (param, param - step * gradient)
        for gradient, param in zip(gradients, params)
    )
以下代码用于在 mnist 数据集上创建、训练并测试一个简单的三层前馈网络。
from lstm import Network
import theano, theano.tensor as T
import numpy as np
import lstm as L
from sklearn.datasets import load_digits
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer

# load and normalize dataset
digits = load_digits()
X = digits.data
y = digits.target
X -= X.min()
X /= X.max()

# create network: 64 inputs, a layer of 100 units, then a layer of 10 units
model = Network(64, layer_sizes=[100, 10])

# prepare training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y)
labels_train = LabelBinarizer().fit_transform(y_train)
# NOTE: labels_test is not used by the code shown below.
labels_test = LabelBinarizer().fit_transform(y_test)

# symbolic graph: the cost is the L2 norm of (network output - target vector)
data = T.vector()
result = model.forward(data)[-1]
label = T.vector()
cost = (result - label).norm(L=2)
updates = L.optimization_updates(cost, model.params)
update = theano.function([data, label], cost, updates=updates, allow_input_downcast=True)
predict = theano.function([data], result, allow_input_downcast=True)

# A single pass over the training set, one sample per update step.
# Distinct loop names keep the dataset arrays X and y intact.
for sample, target in zip(X_train, labels_train):
    c = update(sample, target)

# The predicted class is the index of the largest output unit.
predictions = [np.argmax(predict(sample)) for sample in X_test]
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
我面临的问题是参数没有正确更新。我不确定那是因为我没有正确计算梯度,还是因为我没有正确使用 theano 函数。
使用随机梯度下降时,需要对数据集进行多次遍历。 在第一个 epoch(训练轮次)内,分类错误和混淆矩阵变化不大的情况并不罕见,尤其是在数据集很小的情况下。
我对您的代码做了如下修改,以训练 100 个 epoch:
# Repeat the pass over the training set 100 times (100 epochs).
# range works on both Python 2 and Python 3, unlike xrange.
for epoch in range(100):
    for sample, target in zip(X_train, labels_train):
        c = update(sample, target)
混淆矩阵似乎开始改善了:
[[ 0 0 18 0 13 4 5 0 5 0]
[ 0 42 0 2 0 0 0 0 2 0]
[ 0 0 51 0 0 0 0 1 0 0]
[ 0 0 0 45 0 1 0 1 2 0]
[ 0 0 0 0 33 0 0 0 0 0]
[ 0 0 0 0 0 47 0 0 0 0]
[ 0 0 0 0 0 0 45 0 0 0]
[ 0 0 0 0 1 0 0 48 0 0]
[ 0 2 1 0 0 0 0 0 34 0]
[ 0 1 0 25 0 3 0 2 16 0]]