Hierarchical Attention Network - model.fit generates error 'ValueError: Input dimension mis-match'
For context, I am referring to the Hierarchical Attention Network for sentiment classification.
For the code: my complete code is posted below, but it is just a simple modification of the original code posted by the author at the link above. I explain my changes below.
For the training data: here
For the word embeddings: these are the GloVe embeddings here
Key configuration: Keras 2.0.9, Scikit-Learn 0.19.1, Theano 0.9.0
The original code posted at the link above takes input of 3D shape, i.e., (reviews, sentences, words). The attention mechanism is applied at the sentence level and also at the word level, so it has two attention components, as you can see in the fourth code block on that page.
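For reference, here is a sketch of that two-level structure (reconstructed from the description above; the identifiers and MAX_SENTS are illustrative, not copied from the linked post):
word_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
word_seq = embedding_layer(word_input)
word_lstm = Bidirectional(GRU(100, return_sequences=True))(word_seq)
word_att = AttLayer(100)(TimeDistributed(Dense(200))(word_lstm))
sent_encoder = Model(word_input, word_att)                 # word-level attention

review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_seq = TimeDistributed(sent_encoder)(review_input)   # encode each sentence
sent_lstm = Bidirectional(GRU(100, return_sequences=True))(review_seq)
sent_att = AttLayer(100)(TimeDistributed(Dense(200))(sent_lstm))  # sentence-level attention
preds = Dense(2, activation='softmax')(sent_att)
model = Model(review_input, preds)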
I wanted to change it to take only 2D-shaped input instead. I did this by
- changing the input shape and the input embedding matrix (see the inline comments in my code below)
- changing the model-building part by removing the sentence attention component and keeping only the word attention component (see the inline comments in my code below)
However, the code produces an error when 'model.fit' is called. I post the complete code and the error below.
Code:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.models import Model, clone_model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
MAX_SENT_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
def clean_str(string):
"""
Tokenization/string cleaning for dataset
Every dataset is lower cased except
"""
string = re.sub(r"\", "", string)
string = re.sub(r"\'", "", string)
string = re.sub(r"\"", "", string)
return string.strip().lower()
#replace this with your own file path
data_train = pd.read_csv('/home/zz/Work/wop/data/sentiment/labeledTrainData_small.tsv', sep='\t')
print(data_train.shape)
labels = []
texts = []
for idx in range(data_train.review.shape[0]):
text = BeautifulSoup(data_train.review[idx])
text = clean_str(text.get_text().encode('ascii', 'ignore').decode('ascii'))
texts.append(text)
labels.append(data_train.sentiment[idx])
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
##################################
# Change 1. The input shape is now 2D (sentence, words) instead of 3D
##################################
data = np.zeros((len(texts), MAX_SENT_LENGTH), dtype='int32')
for i, content in enumerate(texts):
wordTokens = text_to_word_sequence(content)
k = 0
for _, word in enumerate(wordTokens):
if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
data[i, k] = tokenizer.word_index[word]
k = k + 1
##################################
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))
#replace with your own embedding file path
GLOVE_DIR = "/home/zz/Work/data/glove.6B"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(embeddings_index))
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
# words not found in embedding index will be all-zeros.
embedding_matrix[i] = embedding_vector
# building Hierarchical Attention network
embedding_layer = Embedding(len(word_index) + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=MAX_SENT_LENGTH,
trainable=True,
mask_zero=True)
class AttLayer(Layer):
def __init__(self, attention_dim,**kwargs):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
    def compute_mask(self, inputs, mask=None):
        return mask
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
#################################################
# Change 2. The model contains only one attention block now
#################################################
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer(100)(l_dense)
############################################
preds = Dense(2, activation='softmax')(l_att)
model = Model(sentence_input, preds)
#### clone the model #### Line X
model_copy = clone_model(model)
plot_model(model, to_file="model.png")
model.summary()
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
nb_epoch=10, batch_size=50,verbose=2)
ERROR: the last line of the code generates this error trace:
Epoch 1/10
Traceback (most recent call last):
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zz/Work/wop/code/python/src/3rdparty/han/textClassfierHATT2D.py", line 187, in <module>
nb_epoch=10, batch_size=50,verbose=2)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1631, in fit
validation_steps=validation_steps)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1213, in _fit_loop
outs = f(ins_batch)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/backend/theano_backend.py", line 1223, in __call__
return self.function(*inputs)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 898, in __call__
storage_map=getattr(self.fn, 'storage_map', None))
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/gof/link.py", line 325, in raise_with_op
reraise(exc_type, exc_value, exc_trace)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/six.py", line 692, in reraise
raise value.with_traceback(tb)
File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
Apply node that caused the error: Elemwise{mul,no_inplace}(InplaceDimShuffle{x,0}.0, Elemwise{Cast{float32}}.0)
Toposort index: 459
Inputs types: [TensorType(float32, row), TensorType(float32, matrix)]
Inputs shapes: [(1, 50), (50, 100)]
Inputs strides: [(200, 4), (400, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
I would greatly appreciate any advice on this, thank you very much!
The referenced tutorial chose to use theano rather than tensorflow because dot behaves differently in tensorflow than it does in numpy. But I am not familiar with theano, so I had a hard time getting this to work on the theano backend. I would rather mimic the behavior of numpy's dot with a series of operations, so I changed K.dot to a series of operations:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed, Lambda
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
class AttLayer(Layer):
def __init__(self, attention_dim, **kwargs):
self.init = initializers.get('normal')
self.supports_masking = True
self.attention_dim = attention_dim
super(AttLayer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
self.b = K.variable(self.init((self.attention_dim,)))
self.u = K.variable(self.init((self.attention_dim, 1)))
self.trainable_weights = [self.W, self.b, self.u]
super(AttLayer, self).build(input_shape)
def compute_mask(self, inputs, mask=None):
return None
def call(self, x, mask=None):
# size of x :[batch_size, sel_len, attention_dim]
# size of u :[batch_size, attention_dim]
# uit = tanh(xW+b)
uit = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
uit = tf.matmul(x, uit)
uit = K.tanh(K.bias_add(uit, self.b))
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = x * ait
output = K.sum(weighted_input, axis=1)
return output
def compute_output_shape(self, input_shape):
return (input_shape[0], input_shape[-1])
# https://github.com/keras-team/keras/issues/5401
# solve the problem of keras.models.clone_model
# and model.save_weights, model.load_weights
def get_config(self):
config = {'attention_dim': self.attention_dim}
base_config = super(AttLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
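As a usage sketch (the file names here are hypothetical, not from the original answer), get_config is what lets a model containing this custom layer survive cloning and weight round-trips:
from keras.models import clone_model

model_copy = clone_model(model)         # works once AttLayer implements get_config
model.save_weights('han_weights.h5')    # hypothetical path
model_copy.load_weights('han_weights.h5')
# Full-model serialization additionally needs the custom class registered:
# from keras.models import load_model
# model = load_model('han.h5', custom_objects={'AttLayer': AttLayer})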
Also, compute_mask now returns None, because there is no longer a sel_len axis in the output of AttLayer.
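To see why propagating the old mask would break things, here is a minimal numpy illustration of the broadcasting clash (the shapes echo this post's batch size of 50, MAX_SENT_LENGTH of 100, and the 200-dimensional encoder output; it is a sketch of the shape mismatch, not of Keras internals):
import numpy as np

pooled = np.zeros((50, 200))  # AttLayer output: the sel_len axis is summed away
mask = np.ones((50, 100))     # a word-level mask still carries the sel_len axis
try:
    pooled * mask             # (50, 200) vs (50, 100): the axes no longer line up
except ValueError as e:
    print(e)                  # operands could not be broadcast together ...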
Here is a script verifying that the two operations are equivalent:
B = 8
S = 100
E = 200
A = 50
X = np.random.randn(B, S, E)
W = np.random.randn(E, A)
np_result = np.dot(X, W) #shape correct
X_ph = tf.placeholder(tf.float64)
W_ph = tf.placeholder(tf.float64)
tf_dot = tf.matmul(X_ph,
tf.tile(
tf.expand_dims(W_ph, axis=0),
(K.shape(X_ph)[0], 1, 1)))
with tf.Session() as sess:
tf_result = sess.run(tf_dot,
feed_dict = {X_ph:X, W_ph:W})
print(np.allclose(np_result, tf_result)) #True
Training history (I set batch_size to 8):
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 1247s 62ms/step - loss: 0.4203 - acc: 0.8044 - val_loss: 0.3520 - val_acc: 0.8468
Epoch 2/10
20000/20000 [==============================] - 985s 49ms/step - loss: 0.2344 - acc: 0.9070 - val_loss: 0.3411 - val_acc: 0.8586
Epoch 3/10
20000/20000 [==============================] - 996s 50ms/step - loss: 0.0982 - acc: 0.9628 - val_loss: 0.4474 - val_acc: 0.8512
Epoch 4/10
20000/20000 [==============================] - 966s 48ms/step - loss: 0.0285 - acc: 0.9904 - val_loss: 0.7837 - val_acc: 0.8408
Epoch 5/10
20000/20000 [==============================] - 912s 46ms/step - loss: 0.0179 - acc: 0.9936 - val_loss: 1.0177 - val_acc: 0.8440
Epoch 6/10
20000/20000 [==============================] - 910s 45ms/step - loss: 0.0105 - acc: 0.9963 - val_loss: 1.0635 - val_acc: 0.8418
Epoch 7/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0101 - acc: 0.9964 - val_loss: 1.0966 - val_acc: 0.8372
Epoch 8/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0057 - acc: 0.9981 - val_loss: 1.2678 - val_acc: 0.8392
Epoch 9/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0077 - acc: 0.9974 - val_loss: 1.2166 - val_acc: 0.8258
Epoch 10/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0056 - acc: 0.9985 - val_loss: 1.4640 - val_acc: 0.8204
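As a hedged follow-up (not part of the original answer): the validation loss bottoms out around epoch 2 and climbs afterwards, so when reproducing this run it is natural to stop on val_loss and keep the best weights; the checkpoint filename below is hypothetical.
from keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    # save_weights_only avoids serializing the custom layer itself
    ModelCheckpoint('han_best.h5', monitor='val_loss',
                    save_best_only=True, save_weights_only=True),
]
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=8, verbose=2, callbacks=callbacks)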