Theano:使用 ifelse 来避免对 scan 求值
Theano: use ifelse to suppress evaluation of scan
我正在尝试实现这篇论文:Deep Learning for Answer Sentence Selection,更准确地说,是其中的 Bigram 模型。我使用 theano 的 scan 函数,写法如下:
# Bigram composition: sum tanh(Tl . s[t] + Tr . s[t+1] + b) over adjacent
# rows of s. The `...` arguments are placeholders for the initial values.
Tl = theano.shared(...)
Tr = theano.shared(...)
b = theano.shared(...)
s = T.matrix('s')
results, updates = theano.scan(
    # t_0 / t_p1 are the rows selected by taps 0 and +1 of the sequence.
    lambda t_0, t_p1, prior_result, Tl, Tr, b: prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + b),
    sequences = dict(input = s, taps = [0, 1]),
    outputs_info = T.zeros_like(b, dtype = 'float64'),
    non_sequences = [Tl, Tr, b],
    strict = True
)
# For a one-row s, use that row directly instead of the scan result.
final_result = ifelse(T.eq(s.shape[0], 1), s[0], results[-1])
我的问题是有些答案只有一个词,所以 s 是一个向量而不是矩阵。这会导致 scan 函数出现问题,因为序列缺少 +1 抽头(tap)对应的元素。为了处理这个问题,我想使用 ifelse 语句。
现在我的第一个问题:
这可行吗?还是说 scan 总是会被求值,然后 ifelse 才决定使用哪个值?
第二个问题:
如何使 tensor.eq(a,b) 返回一个标量?因为我收到以下错误消息:
TypeError: Condition given to the op has to be a scalar with 0
standing for False, anything else for True
编辑:下面是一些会触发索引错误的代码,因为当 theano 计算梯度时,ifelse 不会被惰性求值。
import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse
def trainBigram(q, a, y, seed = 8024, lRate = 0.1, maxEpochs = 1,
                modelSize = 3):
    """Train the bigram question/answer model with per-sample SGD.

    Trains on (q[i], a[i][j], y[i][j]) triplets rather than applying a
    softmax over all candidate answers of a question. The probability
    returned by each training step is printed; nothing is returned.

    Args:
        q: list (all questions) of lists (words per question) of lists
            (values of the word embedding).
        a: list (all questions) of lists (possible answers) of lists
            (words per possible answer) of lists (values of the word
            embedding).
        y: list (all questions) of lists (correctness of each answer).
        seed: seed passed to ``np.random.seed`` before weight init.
        lRate: SGD learning rate.
        maxEpochs: number of full passes over the data.
        modelSize: size of the weight matrices and of the bigram bias;
            assumed to equal the word-embedding length -- TODO confirm.
    """
    np.random.seed(seed)

    # Symbolic inputs: one question, one candidate answer, one label.
    q_t = T.matrix('q_t')
    a_t = T.matrix('a_t')
    y_t = T.scalar('y_t')

    # Initialize weights (original note: distribution from the paper).
    # The draw order is kept so a given seed yields the same weights.
    M = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    bm = theano.shared(np.random.normal(0, 0.01, 1))
    Tl = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    Tr = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    bt = theano.shared(np.random.normal(0, 0.01, modelSize))

    def bigram_step(t_0, t_p1, prior_result, Tl, Tr, bt):
        # t_0 / t_p1 are the rows selected by taps 0 and +1; the running
        # sum of the tanh terms is carried in prior_result.
        return prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + bt)

    # Bigram embedding of the question.
    q_em_scan, scanUpdatesQ = theano.scan(
        bigram_step,
        sequences = dict(input = q_t, taps = [0, 1]),
        outputs_info = T.zeros_like(bt, dtype = 'float64'),
        non_sequences = [Tl, Tr, bt],
        strict = True
    )
    q_em = q_em_scan[-1]

    # Bigram embedding of the answer.
    a_em_scan, scanUpdatesA = theano.scan(
        bigram_step,
        sequences = dict(input = a_t, taps = [0, 1]),
        outputs_info = T.zeros_like(bt, dtype = 'float64'),
        non_sequences = [Tl, Tr, bt],
        strict = True
    )

    # Print ops show which ifelse branch actually gets computed.
    MultipleWords = theano.printing.Print('multiple Words answer')(a_em_scan[-1])
    OneWord = theano.printing.Print('1 Word answer')(a_t[0])
    # NOTE(review): reported to raise an index error for one-row answers
    # once gradients are computed -- ifelse laziness does not appear to
    # extend to the gradient graph; confirm before relying on this.
    a_em = ifelse(T.eq(a_t.shape[0], 1), OneWord, MultipleWords)

    # Question/answer model. bm is added inside the exponent, so this is
    # sigmoid(dot(dot(M, a_em), q_em) - bm).
    prob = 1 / (1 + T.exp( - T.dot(T.dot(M, a_em), q_em) + bm))
    xent = - y_t * T.log(prob) - (1 - y_t) * T.log((1 - prob))
    loss = xent.sum()
    g_M, g_bm, g_Tl, g_Tr, g_bt = T.grad(loss, [M, bm, Tl, Tr, bt])
    updates = (
        (M, M - lRate * g_M),
        (bm, bm - lRate * g_bm),
        (Tl, Tl - lRate * g_Tl),
        (Tr, Tr - lRate * g_Tr),
        (bt, bt - lRate * g_bt)
    )

    # Compile one SGD step that also returns the predicted probability.
    train = theano.function(
        inputs = [q_t, a_t, y_t],
        outputs = prob,
        updates = updates
    )

    # Training: maxEpochs passes, one update per (question, answer, label).
    for _ in range(maxEpochs):
        for question, answers, labels in zip(q, a, y):
            for answer, label in zip(answers, labels):
                answer = np.asarray(answer)
                if answer.shape[0] == 1:
                    print("!!! One-Word-Answer !!!")
                    print("shape: %s" % (answer.shape,))
                # Separate name so the symbolic `prob` is not rebound.
                predicted = train(question, answer, label)
                print(predicted)
def main():
    """Run trainBigram on a two-question toy dataset.

    The first question has two two-word answers; the second has two
    one-word answers, which exercises the single-row answer case.
    """
    # Question one: every answer has two words.
    questionOne = [[1, 2, 3], [1, 2, 3]]
    answerOne_One = [[1, 2, 3], [1, 2, 3]]
    answerOne_Two = [[1, 2, 3], [1, 2, 3]]
    answersOne = [answerOne_One, answerOne_Two]
    correctnessOne = [0, 1]

    # Question two: every answer has a single word.
    questionTwo = [[4, 5, 6], [4, 5, 6]]
    answerTwo_One = [[4, 5, 6]]
    answerTwo_Two = [[4, 5, 6]]
    answersTwo = [answerTwo_One, answerTwo_Two]
    correctnessTwo = [1, 0]

    q = [questionOne, questionTwo]
    a = [answersOne, answersTwo]
    y = [correctnessOne, correctnessTwo]
    trainBigram(q, a, y)


# Only train when run as a script, not when imported.
if __name__ == "__main__":
    main()
我对您的代码做了一些表面上的修改以使其可以执行,它确实可以正常工作,并且 ifelse 是惰性的(即只有在需要时才会计算 scan)。为了演示这一点,我添加了一些符号打印语句来显示哪些值被计算了。
# Imports this snippet needs in order to run on its own.
import numpy
import theano
import theano.ifelse
import theano.tensor as tt

Tl = theano.shared(numpy.random.randn(2, 3).astype(theano.config.floatX))
Tr = theano.shared(numpy.random.randn(2, 3).astype(theano.config.floatX))
b = theano.shared(numpy.zeros((2,), dtype=theano.config.floatX))
s = tt.matrix('s')
results, updates = theano.scan(
    # t_0 / t_p1 are the rows selected by taps 0 and +1 of the sequence.
    lambda t_0, t_p1, prior_result, Tl, Tr, b: prior_result + tt.tanh(tt.dot(Tl, t_0) + tt.dot(Tr, t_p1) + b),
    sequences=dict(input=s, taps=[0, 1]),
    outputs_info=tt.zeros_like(b, dtype='float64'),
    non_sequences=[Tl, Tr, b],
    strict=True
)
# The Print ops report which branch of the ifelse gets computed.
true_value = theano.printing.Print('true')(s[0])
false_value = theano.printing.Print('false')(results[-1])
final_result = theano.ifelse.ifelse(tt.eq(s.shape[0], 1), true_value, false_value)
f = theano.function(inputs=[s], outputs=final_result)

# Call f first so the Print op output is not interleaved with the label.
two_rows = f([[1, 2, 3], [4, 5, 6]])
print('a  %s' % (two_rows,))
one_row = f([[1, 2, 3]])
print('b  %s' % (one_row,))
我正在尝试实现这篇论文:Deep Learning for Answer Sentence Selection,更准确地说,是其中的 Bigram 模型。我使用 theano 的 scan 函数,写法如下:
# Bigram composition: sum tanh(Tl . s[t] + Tr . s[t+1] + b) over adjacent
# rows of s. The `...` arguments are placeholders for the initial values.
Tl = theano.shared(...)
Tr = theano.shared(...)
b = theano.shared(...)
s = T.matrix('s')
results, updates = theano.scan(
    # t_0 / t_p1 are the rows selected by taps 0 and +1 of the sequence.
    lambda t_0, t_p1, prior_result, Tl, Tr, b: prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + b),
    sequences = dict(input = s, taps = [0, 1]),
    outputs_info = T.zeros_like(b, dtype = 'float64'),
    non_sequences = [Tl, Tr, b],
    strict = True
)
# For a one-row s, use that row directly instead of the scan result.
final_result = ifelse(T.eq(s.shape[0], 1), s[0], results[-1])
我的问题是有些答案只有一个词,所以 s 是一个向量而不是矩阵。这会导致 scan 函数出现问题,因为序列缺少 +1 抽头(tap)对应的元素。为了处理这个问题,我想使用 ifelse 语句。
现在我的第一个问题:
这可行吗?还是说 scan 总是会被求值,然后 ifelse 才决定使用哪个值?
第二个问题:
如何使 tensor.eq(a,b) 返回一个标量?因为我收到以下错误消息:
TypeError: Condition given to the op has to be a scalar with 0 standing for False, anything else for True
编辑:下面是一些会触发索引错误的代码,因为当 theano 计算梯度时,ifelse 不会被惰性求值。
import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse
def trainBigram(q, a, y, seed = 8024, lRate = 0.1, maxEpochs = 1,
                modelSize = 3):
    """Train the bigram question/answer model with per-sample SGD.

    Trains on (q[i], a[i][j], y[i][j]) triplets rather than applying a
    softmax over all candidate answers of a question. The probability
    returned by each training step is printed; nothing is returned.

    Args:
        q: list (all questions) of lists (words per question) of lists
            (values of the word embedding).
        a: list (all questions) of lists (possible answers) of lists
            (words per possible answer) of lists (values of the word
            embedding).
        y: list (all questions) of lists (correctness of each answer).
        seed: seed passed to ``np.random.seed`` before weight init.
        lRate: SGD learning rate.
        maxEpochs: number of full passes over the data.
        modelSize: size of the weight matrices and of the bigram bias;
            assumed to equal the word-embedding length -- TODO confirm.
    """
    np.random.seed(seed)

    # Symbolic inputs: one question, one candidate answer, one label.
    q_t = T.matrix('q_t')
    a_t = T.matrix('a_t')
    y_t = T.scalar('y_t')

    # Initialize weights (original note: distribution from the paper).
    # The draw order is kept so a given seed yields the same weights.
    M = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    bm = theano.shared(np.random.normal(0, 0.01, 1))
    Tl = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    Tr = theano.shared(np.random.normal(0, 0.01, (modelSize, modelSize)))
    bt = theano.shared(np.random.normal(0, 0.01, modelSize))

    def bigram_step(t_0, t_p1, prior_result, Tl, Tr, bt):
        # t_0 / t_p1 are the rows selected by taps 0 and +1; the running
        # sum of the tanh terms is carried in prior_result.
        return prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + bt)

    # Bigram embedding of the question.
    q_em_scan, scanUpdatesQ = theano.scan(
        bigram_step,
        sequences = dict(input = q_t, taps = [0, 1]),
        outputs_info = T.zeros_like(bt, dtype = 'float64'),
        non_sequences = [Tl, Tr, bt],
        strict = True
    )
    q_em = q_em_scan[-1]

    # Bigram embedding of the answer.
    a_em_scan, scanUpdatesA = theano.scan(
        bigram_step,
        sequences = dict(input = a_t, taps = [0, 1]),
        outputs_info = T.zeros_like(bt, dtype = 'float64'),
        non_sequences = [Tl, Tr, bt],
        strict = True
    )

    # Print ops show which ifelse branch actually gets computed.
    MultipleWords = theano.printing.Print('multiple Words answer')(a_em_scan[-1])
    OneWord = theano.printing.Print('1 Word answer')(a_t[0])
    # NOTE(review): reported to raise an index error for one-row answers
    # once gradients are computed -- ifelse laziness does not appear to
    # extend to the gradient graph; confirm before relying on this.
    a_em = ifelse(T.eq(a_t.shape[0], 1), OneWord, MultipleWords)

    # Question/answer model. bm is added inside the exponent, so this is
    # sigmoid(dot(dot(M, a_em), q_em) - bm).
    prob = 1 / (1 + T.exp( - T.dot(T.dot(M, a_em), q_em) + bm))
    xent = - y_t * T.log(prob) - (1 - y_t) * T.log((1 - prob))
    loss = xent.sum()
    g_M, g_bm, g_Tl, g_Tr, g_bt = T.grad(loss, [M, bm, Tl, Tr, bt])
    updates = (
        (M, M - lRate * g_M),
        (bm, bm - lRate * g_bm),
        (Tl, Tl - lRate * g_Tl),
        (Tr, Tr - lRate * g_Tr),
        (bt, bt - lRate * g_bt)
    )

    # Compile one SGD step that also returns the predicted probability.
    train = theano.function(
        inputs = [q_t, a_t, y_t],
        outputs = prob,
        updates = updates
    )

    # Training: maxEpochs passes, one update per (question, answer, label).
    for _ in range(maxEpochs):
        for question, answers, labels in zip(q, a, y):
            for answer, label in zip(answers, labels):
                answer = np.asarray(answer)
                if answer.shape[0] == 1:
                    print("!!! One-Word-Answer !!!")
                    print("shape: %s" % (answer.shape,))
                # Separate name so the symbolic `prob` is not rebound.
                predicted = train(question, answer, label)
                print(predicted)
def main():
    """Run trainBigram on a two-question toy dataset.

    The first question has two two-word answers; the second has two
    one-word answers, which exercises the single-row answer case.
    """
    # Question one: every answer has two words.
    questionOne = [[1, 2, 3], [1, 2, 3]]
    answerOne_One = [[1, 2, 3], [1, 2, 3]]
    answerOne_Two = [[1, 2, 3], [1, 2, 3]]
    answersOne = [answerOne_One, answerOne_Two]
    correctnessOne = [0, 1]

    # Question two: every answer has a single word.
    questionTwo = [[4, 5, 6], [4, 5, 6]]
    answerTwo_One = [[4, 5, 6]]
    answerTwo_Two = [[4, 5, 6]]
    answersTwo = [answerTwo_One, answerTwo_Two]
    correctnessTwo = [1, 0]

    q = [questionOne, questionTwo]
    a = [answersOne, answersTwo]
    y = [correctnessOne, correctnessTwo]
    trainBigram(q, a, y)


# Only train when run as a script, not when imported.
if __name__ == "__main__":
    main()
我对您的代码做了一些表面上的修改以使其可以执行,它确实可以正常工作,并且 ifelse 是惰性的(即只有在需要时才会计算 scan)。为了演示这一点,我添加了一些符号打印语句来显示哪些值被计算了。
# Imports this snippet needs in order to run on its own.
import numpy
import theano
import theano.ifelse
import theano.tensor as tt

Tl = theano.shared(numpy.random.randn(2, 3).astype(theano.config.floatX))
Tr = theano.shared(numpy.random.randn(2, 3).astype(theano.config.floatX))
b = theano.shared(numpy.zeros((2,), dtype=theano.config.floatX))
s = tt.matrix('s')
results, updates = theano.scan(
    # t_0 / t_p1 are the rows selected by taps 0 and +1 of the sequence.
    lambda t_0, t_p1, prior_result, Tl, Tr, b: prior_result + tt.tanh(tt.dot(Tl, t_0) + tt.dot(Tr, t_p1) + b),
    sequences=dict(input=s, taps=[0, 1]),
    outputs_info=tt.zeros_like(b, dtype='float64'),
    non_sequences=[Tl, Tr, b],
    strict=True
)
# The Print ops report which branch of the ifelse gets computed.
true_value = theano.printing.Print('true')(s[0])
false_value = theano.printing.Print('false')(results[-1])
final_result = theano.ifelse.ifelse(tt.eq(s.shape[0], 1), true_value, false_value)
f = theano.function(inputs=[s], outputs=final_result)

# Call f first so the Print op output is not interleaved with the label.
two_rows = f([[1, 2, 3], [4, 5, 6]])
print('a  %s' % (two_rows,))
one_row = f([[1, 2, 3]])
print('b  %s' % (one_row,))