Theano:使用 ifelse 来避免对 scan 求值

Theano: use ifelse to suppress evaluation of scan

我正在尝试实现论文《Deep Learning for Answer Sentence Selection》,更准确地说,是其中的 Bigram 模型。我使用 Theano 的 scan 函数实现如下:

# Shared model parameters (initial values elided in the original post).
Tl = theano.shared(...)
Tr = theano.shared(...)
b = theano.shared(...)

# Input sequence; the scan below iterates over its rows.
s = T.matrix('s')

# Running sum of tanh(Tl . row_t + Tr . row_{t+1} + b) over consecutive rows.
# taps = [0, 1] hands the step function the current row (t_0) and the
# following row (t_p1); prior_result is the value from the previous step.
results, updates = theano.scan(
        lambda t_0, t_p1, prior_result, Tl, Tr, b: prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + b),
        sequences = dict(input = s, taps = [0, 1]),
        outputs_info = T.zeros_like(b, dtype = 'float64'),
        non_sequences = [Tl, Tr, b],
        strict = True
        )
# For a one-row input, use that row directly instead of the scan output.
final_result = ifelse(T.eq(s.shape[0],1), s[0], results[-1])

我的问题是:有些答案只有一个词,所以 s 是一个向量而不是矩阵。这会导致 scan 函数出错,因为序列缺少 +1 的 tap(即下一个元素)。为了处理这种情况,我想使用 ifelse 语句。

现在我的第一个问题:

这可行吗?还是说 scan 总会先被求值,然后 ifelse 才决定使用哪个值?


第二个问题:

如何让 tensor.eq(a, b) 返回一个标量?因为我收到了以下错误消息:

TypeError: Condition given to the op has to be a scalar with 0 standing for False, anything else for True


编辑:下面是一段会触发索引错误的代码,因为当 Theano 计算梯度时,ifelse 不会被惰性求值。

import numpy as np 

import theano
import theano.tensor as T 

from theano.ifelse import ifelse

def trainBigram(q, a, y, seed = 8024, lRate = 0.1, maxEpochs = 1,
    modelSize = 3):

    # q: list (all questions) of lists(words per question) of list (values of word embedding)
    # a: list (all questions) of lists(1 possible answers) of lists(words per possible answer) of list (values of word embedding)
    # y: list (all questions) of list (correctness of answer)

    # trains on triplets of(q(i), a(i,j), y(i,j)) instead of softmaxing all 4 answer possibilities
    # uses SGD

    np.random.seed(seed)

    # theano variables
    q_t = T.matrix('q_t')
    a_t = T.matrix('a_t')
    y_t = T.scalar('y_t')

    # initialize weights (distribution from paper)
    M = theano.shared(np.random.normal(0,0.01,(modelSize,modelSize)))
    bm = theano.shared(np.random.normal(0,0.01,1))

    Tl = theano.shared(np.random.normal(0,0.01,(modelSize,modelSize)))
    Tr = theano.shared(np.random.normal(0,0.01,(modelSize,modelSize)))
    bt = theano.shared(np.random.normal(0,0.01,modelSize))


    # define graph for Bigram Model
    q_em_scan, scanUpdatesQ = theano.scan(
                    lambda t_0, t_p1, prior_result, Tl, Tr, bt: prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + bt),
                    sequences = dict(input = q_t, taps = [0, 1]),
                    outputs_info = T.zeros_like(bt, dtype = 'float64'),
                    non_sequences = [Tl, Tr, bt],
                    strict = True
    )
    q_em = q_em_scan[-1]

    a_em_scan, scanUpdatesA = theano.scan(
                    lambda t_0, t_p1, prior_result, Tl, Tr, bt: prior_result + T.tanh(T.dot(Tl, t_0) + T.dot(Tr, t_p1) + bt),
                    sequences = dict(input = a_t, taps = [0, 1]),
                    outputs_info = T.zeros_like(bt, dtype = 'float64'), 
                    non_sequences = [Tl, Tr, bt],
                    strict = True
    )

    # printing calculated values to check for lazy evaluation of ifelse
    MultipleWords= theano.printing.Print('multiple Words answer')(a_em_scan[-1])
    OneWord = theano.printing.Print('1 Word answer')(a_t[0])
    a_em = ifelse(T.eq(a_t.shape[0], 1), OneWord, MultipleWords)

    # define graph for Question/Answer model
    prob = 1 / (1 + T.exp( - T.dot(T.dot(M, a_em), q_em) + bm))
    xent = - y_t * T.log(prob) - (1 - y_t) * T.log((1 - prob))
    loss = xent.sum()

    g_M,g_bm, g_Tl, g_Tr, g_bt = T.grad(loss, [M, bm, Tl, Tr, bt])


    updates = (
            (M, M - lRate * g_M), 
            (bm, bm - lRate * g_bm), 
            (Tl, Tl - lRate * g_Tl),
            (Tr, Tr - lRate * g_Tr),
            (bt, bt - lRate * g_bt)
    )

    # compile function
    train = theano.function(
        inputs = [q_t, a_t, y_t],
        outputs = prob,
        updates = updates
    )

    # training
    for question, answers, labels in zip(q,a,y):


        # for triplets instead of softmax
        for answer, label in zip(answers, labels):

            answer = np.asarray(answer)

            if (answer.shape[0] == 1):
                print "!!! One-Word-Answer !!!"
                print "shape:", answer.shape

            prob = train(question, answer, label)
            print prob




def main():
    """Build a two-question toy data set and run trainBigram on it.

    Question one comes with two two-word answers, question two with two
    one-word answers.
    """
    def sentence(embedding, wordCount):
        # a fresh copy of `embedding` for every word, so no lists are shared
        return [list(embedding) for _ in range(wordCount)]

    firstEmbedding = [1, 2, 3]
    secondEmbedding = [4, 5, 6]

    questions = [sentence(firstEmbedding, 2), sentence(secondEmbedding, 2)]
    answers = [
        [sentence(firstEmbedding, 2), sentence(firstEmbedding, 2)],
        [sentence(secondEmbedding, 1), sentence(secondEmbedding, 1)],
    ]
    correctness = [[0, 1], [1, 0]]

    trainBigram(questions, answers, correctness)

# Run the toy example; there is no __main__ guard, so this also runs on import.
main()

我对您的代码做了一些表面上的修改以使其可以运行;它确实有效,而且 ifelse 是惰性的(即仅在需要时才计算 scan)。为了演示,我添加了一些符号打印语句来显示实际计算了哪些值。

Tl = theano.shared(numpy.random.randn(2, 3).astype(theano.config.floatX))
Tr = theano.shared(numpy.random.randn(2, 3).astype(theano.config.floatX))
b = theano.shared(numpy.zeros((2,), dtype=theano.config.floatX))

s = tt.matrix('s')

results, updates = theano.scan(
    lambda t_0, t_p1, prior_result, Tl, Tr, b: prior_result + tt.tanh(tt.dot(Tl, t_0) + tt.dot(Tr, t_p1) + b),
    sequences=dict(input=s, taps=[0, 1]),
    outputs_info=tt.zeros_like(b, dtype='float64'),
    non_sequences=[Tl, Tr, b],
    strict=True
)

true_value = theano.printing.Print('true')(s[0])
false_value = theano.printing.Print('false')(results[-1])
final_result = theano.ifelse.ifelse(tt.eq(s.shape[0], 1), true_value, false_value)
f = theano.function(inputs=[s], outputs=final_result)
print 'a ', f([[1, 2, 3], [4, 5, 6]])
print 'b ', f([[1, 2, 3]])