Lasagne LSTM regression produces zeros as output
Inspired by Andrej Karpathy's blog, I wanted to make my own version of a recurrent neural network that predicts the next word instead of the next character.
Because the number of distinct words in a text is very large, I use word2vec to represent the words as vectors (similar words are closer together in vector space). The NN should now train to predict the new vector from the pattern of the previous vectors.
One important note: where Karpathy uses a classifier, I am trying a regression approach (squared-error cost).
My problem is that no matter how much I train, my neural network predicts [0,0,0,....,0] as output. So my guess is that something is wrong in the way I train or predict (the average error does drop a little during training, so some learning must be happening).
My full code is below in case anyone wants to run it (it uses the Brown corpus, so nltk needs to be installed for it to work as-is).
This is my "Hello World" project in Lasagne, so any pointers are welcome if I'm doing something stupid.
Thanks in advance :)
from gensim.models import Word2Vec
import gensim
import sys
from datetime import timedelta
import matplotlib.pyplot as plt
from nltk.corpus import brown
import theano.tensor as T
import theano
import time
import numpy as np
from lasagne import layers
import lasagne
from lasagne.updates import nesterov_momentum
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
def modelExcept(input, model, size):
    # look up the word2vec vector for a word; fall back to zeros if it is missing
    try:
        out = model[input]
        return out
    except Exception:
        out = np.zeros((size))
        print 'exception ' + str(input)
        return out
def plot_TSNE(model, nr_words=None):
    tsne = TSNE(n_components=2)
    if nr_words == None:
        X_tsne = tsne.fit_transform(model[model.wv.vocab][:])
    else:
        X_tsne = tsne.fit_transform(model[model.wv.vocab][0:nr_words])
    X_names = [key for key in model.wv.vocab]

    plt.figure()
    ax = plt.subplot(111)
    for i in range(X_tsne.shape[0]):
        plt.text(X_tsne[i, 0], X_tsne[i, 1], str(X_names[i]),
                 #color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    plt.xticks([]), plt.yticks([])
    plt.draw()
    #plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
    #plt.show()
def getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE):
    BatchIndexes = np.random.randint(0, totalwords-windowSize, size=BATCHSIZE)
    input = np.empty((BATCHSIZE, windowSize, wordSize), dtype=np.float32)
    target = np.empty((BATCHSIZE, wordSize), dtype=np.float32)

    for i in range(BATCHSIZE):
        k = BatchIndexes[i]
        input[i, :, :] = words_as_vecs[k:k+windowSize, :]
        target[i, :] = words_as_vecs[k+windowSize, :]

    return input, target
wordSize = 30
windowSize = 5
BATCHSIZE = 128
LEARNING_RATE = .1
Nr_EPOCHS = 100
NR_Predictions = 15
model_raw = Word2Vec(brown.sents(),workers=4,window=10,iter=15,size=wordSize, min_count=10)
#plot_TSNE(model_raw,None)
model = model_raw.wv #trim model after training to save RAM
del model_raw
words_filtered = filter(lambda x: x in model.vocab, brown.words())#filter away words that are not in vocabulary
words_as_vecs = np.asarray([modelExcept(word, model,wordSize) for word in words_filtered],dtype = np.float32) #create all vector representations beforehand to save time!!
scaler = MinMaxScaler(feature_range=(0,1))
words_as_vecs = scaler.fit_transform(words_as_vecs)
print 'creating neural net...'
Num_units_per_layer = 512
GRAD_CLIP = 100
l_in = lasagne.layers.InputLayer(shape=(None,None,wordSize))
l_LSTM1 = lasagne.layers.LSTMLayer(l_in,Num_units_per_layer,grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.rectify)
l_drop1 = lasagne.layers.DropoutLayer(l_LSTM1,p=0.5)
l_LSTM2 = lasagne.layers.LSTMLayer(l_drop1,Num_units_per_layer,grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.rectify, only_return_final=True)
l_drop2 = lasagne.layers.DropoutLayer(l_LSTM2,p=0.5)
l_shp = lasagne.layers.ReshapeLayer(l_drop2,(-1,Num_units_per_layer))
l_out = lasagne.layers.DenseLayer(l_shp,num_units=wordSize,W=lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.rectify)
target_vals = T.imatrix('target values')
net_out = lasagne.layers.get_output(l_out)
net_out_predict = lasagne.layers.get_output(l_out,deterministic = True)
#use squared error because the problem is now a regression problem
cost = T.sum(lasagne.objectives.squared_error(net_out,target_vals))
all_params = lasagne.layers.get_all_params(l_out, trainable = True)
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
net_train = theano.function([l_in.input_var, target_vals], cost, updates=updates, allow_input_downcast=True)
compute_cost = theano.function([l_in.input_var, target_vals], cost, allow_input_downcast=True)
net_predict = theano.function([l_in.input_var],net_out_predict,allow_input_downcast=True)
print 'creating testphrase...'
testphrase_vectors = np.empty((1,5,wordSize),dtype=np.float32)
testphrase_vectors[0,:,:] = words_as_vecs[1:6,:]
testphrase_words = words_filtered[0:6]
#testphrase_words = brown.words()[0:6]
print 'training...'
avg_cost = 0
totalwords = len(words_filtered)
#totalwords = len(brown.words())
print_freq = totalwords/BATCHSIZE #print example every epoch
nrItterations = Nr_EPOCHS*totalwords/BATCHSIZE
for i in range(nrItterations):
    inTrain, target = getBatch(words_as_vecs, wordSize, totalwords, windowSize, BATCHSIZE)
    avg_cost += net_train(inTrain, target)

    #generate text sample
    if (i % print_freq == 0) and (i != 0):
        print 'prediction of train'
        print 'average cost is {0}'.format(avg_cost/(BATCHSIZE*print_freq))
        avg_cost = 0

        generated_example = ' '.join(testphrase_words)
        testphrase_vectors_copy = testphrase_vectors
        for k in range(NR_Predictions):
            prediction = np.asarray(net_predict(testphrase_vectors_copy))
            prediction_unscaled = scaler.inverse_transform(prediction.reshape(1, -1)).reshape(-1)
            current_word = model.most_similar(positive=[prediction_unscaled], topn=1)
            generated_example = ' '.join((generated_example, current_word[0][0]))
            #insert new word in testphrase (and delete first)
            testphrase_vectors_copy[0, 0:-1, :] = testphrase_vectors_copy[0, 1:, :]
            testphrase_vectors_copy[0, -1, :] = model[current_word[0][0]]
            #print testphrase_vectors_copy

        print 'example nr. {}'.format(i/print_freq + 1)
        print generated_example
        print '\n \n'
Finally found the error.
The problem was in this line:
target_vals = T.imatrix('target values')
It should be:
target_vals = T.fmatrix('target values')
because my targets are floats, not integers.
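For anyone hitting the same symptom, here is a minimal standalone sketch (assuming the same Python 2 / Theano setup as above; it is not part of the original script) of why the imatrix declaration leads to all-zero targets: with allow_input_downcast=True, Theano silently casts the float32 targets to integers, and after MinMax scaling to [0, 1] nearly every value truncates to 0, so the squared-error loss pushes the network toward an all-zero output.

import numpy as np
import theano
import theano.tensor as T

targets_int = T.imatrix('int targets')    # the buggy declaration
targets_flt = T.fmatrix('float targets')  # the fix

# identity functions that just show what the inputs become after casting
f_int = theano.function([targets_int], targets_int, allow_input_downcast=True)
f_flt = theano.function([targets_flt], targets_flt, allow_input_downcast=True)

y = np.array([[0.12, 0.87, 0.45]], dtype=np.float32)  # MinMax-scaled targets
print f_int(y)  # floats in [0, 1) are truncated to 0
print f_flt(y)  # float targets come through unchanged

A slightly more portable variant of the fix is target_vals = T.matrix('target values'), which uses theano.config.floatX and therefore matches the float32 targets when floatX is set to float32.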