Hierarchical Attention Network - model.fit generates error 'ValueError: Input dimension mis-match'

Hierarchical Attention Network - model.fit generates error 'ValueError: Input dimension mis-match'

关于背景,我指的是用于情感分类的Hierarchical Attention Network

关于代码:我的完整代码贴在下面,但它只是对上面链接中作者发布的原始代码所做的简单修改,我会在下面解释我的改动。训练数据见此处(here)。词嵌入使用的是 GloVe 嵌入,见此处(here)。关键配置:Keras 2.0.9,Scikit-Learn 0.19.1,Theano 0.9.0。

上面链接的帖子中发布的原始代码采用 3D 形状的输入,即(评论、句子、单词)。注意力机制既应用于句子,也应用于单词,所以它有两个注意力组件,正如您在该网页的第四个代码块中看到的那样。

我想将其改为只需要 2D 形状输入的版本。我的做法是:

  1. 更改输入形状和输入嵌入矩阵(请参阅下面我的代码中的内联注释)
  2. 通过删除句子注意力组件来更改模型构建部分,仅保留单词注意力组件(请参阅下面我的代码中的内联注释)

但是,调用 'model.fit' 时代码会产生错误。下面贴出完整的代码和错误信息。

代码:

import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup

import os

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical

from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers

# Number of word ids kept per review: width of the `data` matrix and the
# `input_length` of the embedding layer.
MAX_SENT_LENGTH = 100
# Vocabulary cutoff: passed to the Tokenizer, and word ids at or above this
# value are skipped when `data` is filled.
MAX_NB_WORDS = 20000
# Width of each embedding row; the script loads the glove.6B.100d vectors.
EMBEDDING_DIM = 100
# Fraction of the samples held out for validation.
VALIDATION_SPLIT = 0.2


def clean_str(string):
    """Normalise a raw review string.

    Removes every backslash, single quote and double quote, strips leading
    and trailing whitespace and lower-cases the result.

    Args:
        string: Text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # The backslash must be doubled even in a raw string: r"\" is not a
    # complete literal, r"\\" is the regex for one literal backslash.
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


# Replace this with your own file path.
data_train = pd.read_csv('/home/zz/Work/wop/data/sentiment/labeledTrainData_small.tsv', sep='\t')
print(data_train.shape)

# Cleaned review texts and their sentiment labels, in file order.
labels = []
texts = []

for review, sentiment in zip(data_train.review, data_train.sentiment):
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so the output can vary between machines.
    soup = BeautifulSoup(review, 'html.parser')
    # Drop the markup, discard non-ASCII characters, then normalise.
    text = clean_str(soup.get_text().encode('ascii', 'ignore').decode('ascii'))
    texts.append(text)
    labels.append(sentiment)

# `num_words` is the Keras 2 name of the former `nb_words` argument.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

##################################
# Change 1. The input shape is now 2D (sentence, words) instead of 3D
##################################
# One row per review. Each row receives at most MAX_SENT_LENGTH word ids;
# unused trailing positions keep the initial value 0.
data = np.zeros((len(texts), MAX_SENT_LENGTH), dtype='int32')
for i, content in enumerate(texts):
    k = 0
    for word in text_to_word_sequence(content):
        if k >= MAX_SENT_LENGTH:
            # The row is full; remaining words would be ignored anyway.
            break
        word_id = tokenizer.word_index[word]
        # Skip words whose id falls outside the vocabulary cutoff.
        if word_id < MAX_NB_WORDS:
            data[i, k] = word_id
            k += 1
##################################

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Apply one random permutation to samples and labels so they stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split at an explicit non-negative boundary. With negative indices a
# validation size of 0 would turn `data[:-0]` into an empty training set and
# `data[-0:]` into a validation set containing every sample.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

# Replace with your own embedding file path.
GLOVE_DIR = "/home/zz/Work/data/glove.6B"
# Maps each word to its float32 vector; every line of the file is
# "<word> <v1> <v2> ...".
embeddings_index = {}
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps decoding independent of the machine's locale.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')

print('Total %s word vectors.' % len(embeddings_index))

# building Hierachical Attention network
# Row i holds the vector of the word whose tokenizer id is i. The extra row
# accounts for id 0, which `data` uses for unfilled positions.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the GloVe index keep their random initial row.
        embedding_matrix[i] = embedding_vector

# Pre-trained weights are used as the starting point and fine-tuned
# (trainable=True); id 0 is masked out.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True)


class AttLayer(Layer):
    """Attention pooling over axis 1 of a 3D input.

    Every step of the input is projected with ``tanh(x.W + b)``, scored
    against the context vector ``u``, normalised over axis 1 (ignoring masked
    steps) and used to weight the input steps, which are then summed.

    Args:
        attention_dim: Size of the hidden attention projection.
    """

    def __init__(self, attention_dim, **kwargs):
        super(AttLayer, self).__init__(**kwargs)
        # Assigned after the base initialiser, as Keras' built-in layers do,
        # so base-class defaults cannot overwrite these attributes.
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create W, b and u for an input of rank 3."""
        assert len(input_shape) == 3
        # add_weight registers the variables with the layer, so they are
        # trained, saved and loaded without touching trainable_weights.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # Axis 1 is summed away in call(), so no mask is passed downstream.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # x is rank 3 (see build); assumed (batch, steps, features).
        # uit = tanh(x.W + b) -> (..., attention_dim). K.dot contracts the
        # last axis of x with W on every backend, so no TensorFlow-specific
        # call is needed here.
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # Score against u, then drop the trailing axis of size 1.
        ait = K.squeeze(K.dot(uit, self.u), -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # Normalise over axis 1; epsilon guards against an all-masked row.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        return K.sum(x * ait, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        # Expose the constructor argument so the layer can be rebuilt from
        # its config (clone_model, model saving/loading).
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config

#################################################
# Change 2. The model contains only one attention block now
#################################################
# word ids -> embeddings -> bidirectional GRU -> per-step dense -> attention
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer(100)(l_dense)
############################################

preds = Dense(2, activation='softmax')(l_att)
model = Model(sentence_input, preds)

#### clone the model #### Line X
# clone_model is not among the names imported at the top of the script, so
# bring it into scope here. It rebuilds every layer from its get_config(),
# so custom layers must include their constructor arguments there.
from keras.models import clone_model
model_copy = clone_model(model)

plot_model(model, to_file="model.png")
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print("model fitting - Hierachical attention network")
# `epochs` is the Keras 2 name of the former `nb_epoch` argument.
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=50, verbose=2)

ERROR:代码的最后一行生成错误跟踪:

Epoch 1/10
Traceback (most recent call last):
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
    self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/zz/Work/wop/code/python/src/3rdparty/han/textClassfierHATT2D.py", line 187, in <module>
    nb_epoch=10, batch_size=50,verbose=2)
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1631, in fit
    validation_steps=validation_steps)
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1213, in _fit_loop
    outs = f(ins_batch)
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/keras/backend/theano_backend.py", line 1223, in __call__
    return self.function(*inputs)
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 898, in __call__
    storage_map=getattr(self.fn, 'storage_map', None))
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/gof/link.py", line 325, in raise_with_op
    reraise(exc_type, exc_value, exc_trace)
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/six.py", line 692, in reraise
    raise value.with_traceback(tb)
  File "/home/zz/Programs/anaconda3/lib/python3.6/site-packages/theano/compile/function_module.py", line 884, in __call__
    self.fn() if output_subset is None else\
ValueError: Input dimension mis-match. (input[0].shape[1] = 50, input[1].shape[1] = 100)
Apply node that caused the error: Elemwise{mul,no_inplace}(InplaceDimShuffle{x,0}.0, Elemwise{Cast{float32}}.0)
Toposort index: 459
Inputs types: [TensorType(float32, row), TensorType(float32, matrix)]
Inputs shapes: [(1, 50), (50, 100)]
Inputs strides: [(200, 4), (400, 4)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

我非常感谢对此的一些建议,非常感谢!

在参考教程中,作者选择使用 theano 而不是 tensorflow,因为 tensorflow 中 dot 的行为与 numpy 中的不同。但是我不熟悉 theano,所以很难让它在 theano 后端下正常工作。我宁愿用一系列操作来模仿 numpy 中 dot 的行为,因此下面我把 K.dot 改成了一系列操作。

import tensorflow as tf
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import os
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, GRU, Bidirectional, TimeDistributed, Lambda
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers

class AttLayer(Layer):
    """Attention pooling over axis 1 of a 3D input (TensorFlow backend).

    Every step of the input is projected with ``tanh(x.W + b)``, scored
    against the context vector ``u``, normalised over axis 1 (ignoring masked
    steps) and used to weight the input steps, which are then summed.

    Args:
        attention_dim: Size of the hidden attention projection.
    """

    def __init__(self, attention_dim, **kwargs):
        super(AttLayer, self).__init__(**kwargs)
        # Assigned after the base initialiser, as Keras' built-in layers do,
        # so base-class defaults cannot overwrite these attributes.
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create W, b and u for an input of rank 3."""
        assert len(input_shape) == 3
        # add_weight registers the variables with the layer, so they are
        # trained, saved and loaded without touching trainable_weights.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # Axis 1 is summed away in call(), so no mask is passed downstream.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # x is rank 3 (see build); assumed (batch, steps, features).
        # uit = tanh(x.W + b): W is repeated once per batch element so that
        # tf.matmul can do a batched (steps, features) x (features,
        # attention_dim) product.
        tiled_w = K.tile(K.expand_dims(self.W, axis=0), (K.shape(x)[0], 1, 1))
        uit = tf.matmul(x, tiled_w)
        uit = K.tanh(K.bias_add(uit, self.b))
        # Score against u, then drop the trailing axis of size 1.
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        # Normalise over axis 1; epsilon guards against an all-masked row.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    # https://github.com/keras-team/keras/issues/5401
    # solve the problem of keras.models.clone_model
    # and model.save_weights, model.load_weights
    def get_config(self):
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config

另外,compute_mask 现在返回 None,因为 AttLayer 的输出中没有 sel_len 轴。

以下是验证这两个操作等效的脚本:

# Check that tile + tf.matmul reproduces NumPy's dot of a 3D array with a
# 2D matrix. Dimension names are presumably batch, sequence, embedding
# (feature) and attention sizes.
B = 8
S = 100
E = 200
A = 50
X = np.random.randn(B, S, E)
W = np.random.randn(E, A)
np_result = np.dot(X, W)  # shape (B, S, A)

# Static shapes let TensorFlow validate the matmul when the graph is built;
# the leading dimension of X stays dynamic.
X_ph = tf.placeholder(tf.float64, shape=(None, S, E))
W_ph = tf.placeholder(tf.float64, shape=(E, A))

# Repeat W once per batch element, then do a batched matrix product.
tiled_W = tf.tile(tf.expand_dims(W_ph, axis=0),
                  (tf.shape(X_ph)[0], 1, 1))
tf_dot = tf.matmul(X_ph, tiled_W)

with tf.Session() as sess:
    tf_result = sess.run(tf_dot,
                         feed_dict={X_ph: X, W_ph: W})

print(np.allclose(np_result, tf_result))  # True

训练历史(我设置batch_size为8):

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 1247s 62ms/step - loss: 0.4203 - acc: 0.8044 - val_loss: 0.3520 - val_acc: 0.8468
Epoch 2/10
20000/20000 [==============================] - 985s 49ms/step - loss: 0.2344 - acc: 0.9070 - val_loss: 0.3411 - val_acc: 0.8586
Epoch 3/10
20000/20000 [==============================] - 996s 50ms/step - loss: 0.0982 - acc: 0.9628 - val_loss: 0.4474 - val_acc: 0.8512
Epoch 4/10
20000/20000 [==============================] - 966s 48ms/step - loss: 0.0285 - acc: 0.9904 - val_loss: 0.7837 - val_acc: 0.8408
Epoch 5/10
20000/20000 [==============================] - 912s 46ms/step - loss: 0.0179 - acc: 0.9936 - val_loss: 1.0177 - val_acc: 0.8440
Epoch 6/10
20000/20000 [==============================] - 910s 45ms/step - loss: 0.0105 - acc: 0.9963 - val_loss: 1.0635 - val_acc: 0.8418
Epoch 7/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0101 - acc: 0.9964 - val_loss: 1.0966 - val_acc: 0.8372
Epoch 8/10
20000/20000 [==============================] - 909s 45ms/step - loss: 0.0057 - acc: 0.9981 - val_loss: 1.2678 - val_acc: 0.8392
Epoch 9/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0077 - acc: 0.9974 - val_loss: 1.2166 - val_acc: 0.8258
Epoch 10/10
20000/20000 [==============================] - 910s 46ms/step - loss: 0.0056 - acc: 0.9985 - val_loss: 1.4640 - val_acc: 0.8204