Unable to make sense of how Theano works in RNN NLP for classification
import os
import theano, numpy
from theano import tensor as T
from collections import OrderedDict

class RNNSLU(object):
    """ Elman neural net """

    def __init__(self, nh, nc, ne, de, cs):
        """
        Hyperparameters used for initialization
        nh : dimension of the hidden layer
        nc : number of classes (labels)
        ne : size of vocabulary
        de : dimension of embedding
        cs : word context window size
        """
        # Parameter to be learnt : word embeddings
        self.embeddings = theano.shared(name='embeddings',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de))
                        .astype(theano.config.floatX))
        # Parameter to be learnt : weight matrix mapping input to the hidden layer (de*cs x nh)
        self.wx = theano.shared(name='wx',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (de * cs, nh))
                        .astype(theano.config.floatX))
        # Parameter to be learnt : weight matrix mapping the hidden layer from the
        # previous time step to the current one (nh x nh)
        self.wh = theano.shared(name='wh',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nh))
                        .astype(theano.config.floatX))
        # Parameter to be learnt : weight matrix mapping hidden to output layer (nh x nc)
        self.w = theano.shared(name='w',
            value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nc))
                        .astype(theano.config.floatX))
        # Parameter to be learnt : bias at the hidden layer
        self.bh = theano.shared(name='bh',
            value=numpy.zeros(nh, dtype=theano.config.floatX))
        # Parameter to be learnt : bias of the output layer
        self.b = theano.shared(name='b',
            value=numpy.zeros(nc, dtype=theano.config.floatX))
        # Parameter to be learnt : the hidden layer at time t=0
        self.h0 = theano.shared(name='h0',
            value=numpy.zeros(nh, dtype=theano.config.floatX))
        # Bundle the parameters
        self.params = [self.embeddings, self.wx, self.wh, self.w, self.bh, self.b, self.h0]
        self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 'h0']
        # Compile the training and classification functions
        self.prepare_train(de, cs)

    def prepare_train(self, de, cs):
        """
        Compiles the training and classification functions of the recurrent neural net
        """
        idxs = T.imatrix()  # columns = no of words in window, rows = len of sentence
        # Prepare to receive input and output labels
        x = self.embeddings[idxs].reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')

        def recurrence(x_t, h_tm1):
            """
            x_t : input at time t
            h_tm1 : hidden state at time t-1
            """
            # Compute the hidden state at time t
            # h_t = g(x_t . w_x + h_tm1 . w_h + b_h)
            h_t = T.nnet.sigmoid(T.dot(x_t, self.wx) + T.dot(h_tm1, self.wh) + self.bh)
            # Compute the output layer
            # s_t = g(h_t . w + b)
            s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x,
                                outputs_info=[self.h0, None],
                                n_steps=x.shape[0])
        # print h.ndim
        # print s.ndim
        # TODO: What is the structure of s? What does the selection of axis do?
        p_y_given_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_sentence, axis=1)
        # Learning rate
        lr = T.scalar('lr')
        # Sentence negative log-likelihood (the objective function)
        sentence_nll = -T.mean(T.log(p_y_given_sentence)[T.arange(x.shape[0]), y])
        # Compute parameter-wise gradients
        sentence_gradients = T.grad(sentence_nll, self.params)
        # Compute updates
        sentence_updates = OrderedDict((p, p - lr * g)
                                       for p, g in zip(self.params, sentence_gradients))
        # Compile functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)
        self.sentence_train = theano.function(inputs=[idxs, y, lr],
                                              outputs=sentence_nll,
                                              updates=sentence_updates)


#### Main function from which we are calling the class
# s (hyperparameter dict), nClasses, vocSize, words and labels come from the
# surrounding script, which is not shown here
rnn = RNNSLU(nh=s['nhidden'], nc=nClasses, ne=vocSize, de=s['emb_dimension'], cs=s['win'])
for word_batch, label_last_word in zip(words, labels):
    rnn.sentence_train(word_batch, label_last_word, s['clr'])
    rnn.normalize()  # normalize() is not defined in the snippet above
Code explanation:
I know this is not a good thing to ask on Stack Overflow, but I have spent more than a week trying to decode this code for training a recurrent neural network. To start with, I am new to Theano.
word_batch = array([[ -1, -1, -1, 194, 358, 463, 208]], dtype=int32)
label_last_word = 126
The word_batch holds the indices for the following sentence:
'I am going to USA from England'
Here, word_batch is the context window associated with one particular word, say USA. So if the context window is 7, the middle entry of word_batch (194) is the index of that word in the dataset. What I want to understand is how training happens inside the RNNSLU class when I pass this as an argument to rnn.sentence_train. I am confused by the use of variables such as idxs and x in the class. I understand how this works in theory, but cannot clearly decode the Theano part. Please let me know if my question does not make sense.
Thanks.
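To make the context window described above concrete, here is a small sketch of how such a window can be built. The helper context_window is a hypothetical illustration, not part of the code in the question; it only reproduces the pad-with--1 convention and the indices 194, 358, 463, 208 visible in word_batch.

import numpy

def context_window(sentence_idxs, win):
    """Hypothetical helper: for each word of the sentence, build a window of
    `win` indices centred on that word, padding with -1 at the boundaries."""
    assert win % 2 == 1
    pad = [-1] * (win // 2)
    padded = pad + list(sentence_idxs) + pad
    windows = [padded[i:i + win] for i in range(len(sentence_idxs))]
    return numpy.asarray(windows, dtype='int32')

# The window around the first word of the sentence reproduces word_batch:
print(context_window([194, 358, 463, 208], 7)[0])
# [ -1  -1  -1 194 358 463 208]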
rnn.sentence_train is a Theano function compiled with updates=sentence_updates. This means that every time rnn.sentence_train is called, every shared variable that appears as a key of the sentence_updates dictionary is updated according to the symbolic update expression stored as the corresponding sentence_updates value. Those expressions are all classic gradient descent: the current parameter value minus the learning rate times the gradient of the cost with respect to that parameter.
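As a self-contained toy illustration of that mechanism (not part of the RNNSLU code), a Theano function compiled with updates modifies its shared variables on every call:

import numpy
import theano
import theano.tensor as T

# A toy shared "parameter" and a toy cost, just to show how `updates` works
w = theano.shared(numpy.asarray(5.0, dtype=theano.config.floatX), name='w')
lr = T.scalar('lr')
cost = (w - 3.0) ** 2                 # minimised at w = 3
grad = T.grad(cost, w)                # symbolic gradient, like sentence_gradients
step = theano.function(inputs=[lr],
                       outputs=cost,
                       updates=[(w, w - lr * grad)])  # same pattern as sentence_updates

for _ in range(5):
    step(0.1)      # each call moves w towards 3, just as sentence_train moves self.params
print(w.get_value())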
idxs is a symbolic placeholder for the input of the training function. In your example, word_batch fills that placeholder when the training function is called.
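Because Theano's advanced indexing mirrors numpy's, the effect of x = self.embeddings[idxs].reshape((idxs.shape[0], de*cs)) on your word_batch can be previewed with plain numpy. The sizes de=4, cs=7, ne=500 below are made up for illustration:

import numpy

de, cs, ne = 4, 7, 500                       # hypothetical small sizes
embeddings = numpy.random.uniform(-0.2, 0.2, (ne + 1, de))

word_batch = numpy.array([[-1, -1, -1, 194, 358, 463, 208]], dtype='int32')

# Indexing looks up one de-dimensional vector per index; the -1 padding index
# picks the last row (row ne), presumably why the matrix has ne + 1 rows.
looked_up = embeddings[word_batch]
print(looked_up.shape)                       # (1, 7, 4): (sentence len, window size, emb dim)

# The reshape concatenates the 7 window vectors into one input row per word,
# which is what x feeds to theano.scan (one row = one time step x_t).
x = looked_up.reshape((word_batch.shape[0], de * cs))
print(x.shape)                               # (1, 28)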