Output shapes of Keras AdditiveAttention Layer
I am trying to use the AdditiveAttention layer in Keras. Here is the manual implementation of the layer from the TensorFlow tutorial https://www.tensorflow.org/tutorials/text/nmt_with_attention:
import tensorflow as tf

class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
The shape of context_vector is (batch_size, hidden_size). But when I use the same built-in AdditiveAttention layer from Keras,

from tensorflow.keras.layers import AdditiveAttention

the shape of context_vector is [batch_size, Tq, dim]. Any suggestions on what causes this difference in output shape would be helpful.
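For reference, a minimal sketch of the kind of call that produces that [batch_size, Tq, dim] output (the sizes here are illustrative, not from my actual model):

import tensorflow as tf
from tensorflow.keras.layers import AdditiveAttention

query = tf.random.uniform((4, 10, 32))    # [batch_size, Tq, dim]
values = tf.random.uniform((4, 15, 32))   # [batch_size, Tv, dim]
context_vector = AdditiveAttention()([query, values])
print(context_vector.shape)               # (4, 10, 32) == [batch_size, Tq, dim]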
Apart from a few differences, the two implementations are similar to each other. The BahdanauAttention implementation in that tutorial is a simplified, adapted version that adds some linear transformations. The return shape of context_vector that you are asking about is simply a matter of the shape of the input data. Here are some demos; let's look at the tutorial implementation first:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
Now, let's pass some inputs to it, both 3D and 2D:
attention_layer = BahdanauAttention(10)
y = tf.random.uniform((2, 60, 512))
out, attn = attention_layer(y, y)
out.shape , attn.shape
(TensorShape([2, 60, 512]), TensorShape([2, 2, 60, 1]))
y = tf.random.uniform((2, 512))
out, attn = attention_layer(y, y)
out.shape , attn.shape
(TensorShape([2, 512]), TensorShape([2, 2, 1]))
Now, let's pass the same inputs to the built-in AdditiveAttention and see what we get:
buit_attn = tf.keras.layers.AdditiveAttention()
y = tf.random.uniform((2, 60, 512))
out, attn = buit_attn([y, y], return_attention_scores=True)
out.shape , attn.shape
(TensorShape([2, 60, 512]), TensorShape([2, 60, 60]))
y = tf.random.uniform((2, 512))
out, attn = buit_attn([y, y], return_attention_scores=True)
out.shape , attn.shape
(TensorShape([2, 512]), TensorShape([2, 2]))
So the shapes of context_vector are comparable here, but the shapes of attention_weights are not. The reason, as mentioned, is that I believe the tutorial's implementation is slightly modified and adapted. If we look at how BahdanauAttention or AdditiveAttention is computed, we get:
1. Reshape query and value into shapes [batch_size, Tq, 1, dim] and [batch_size, 1, Tv, dim] respectively.
2. Calculate scores with shape [batch_size, Tq, Tv] as a non-linear sum: scores = tf.reduce_sum(tf.tanh(query + value), axis=-1)
3. Use scores to calculate a distribution with shape [batch_size, Tq, Tv]: distribution = tf.nn.softmax(scores)
4. Use distribution to create a linear combination of value with shape [batch_size, Tq, dim]: return tf.matmul(distribution, value)
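As a quick sanity check of the four steps above, here is a standalone shape trace with raw TensorFlow ops (the sizes are illustrative; the built-in layer also applies a learnable scale when use_scale=True, so the values, not the shapes, would differ):

import tensorflow as tf

batch_size, Tq, Tv, dim = 2, 5, 60, 512            # illustrative sizes
query = tf.random.uniform((batch_size, Tq, dim))
value = tf.random.uniform((batch_size, Tv, dim))

q = tf.expand_dims(query, 2)                       # [batch_size, Tq, 1, dim]
v = tf.expand_dims(value, 1)                       # [batch_size, 1, Tv, dim]
scores = tf.reduce_sum(tf.tanh(q + v), axis=-1)    # [batch_size, Tq, Tv]
distribution = tf.nn.softmax(scores)               # [batch_size, Tq, Tv]
context = tf.matmul(distribution, value)           # [batch_size, Tq, dim]
print(context.shape, distribution.shape)           # (2, 5, 512) (2, 5, 60)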
And I think the implementation in that tutorial differs slightly in how it computes the attention weights. If we follow the approach above (steps 1 to 4), we get the same output shape for attention_weights as well. Here is how (note: this is for demonstration purposes only and is not general):
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 2)   # [batch_size, Tq, 1, dim]
        value_with_time_axis = tf.expand_dims(values, 1)  # [batch_size, 1, Tv, dim]
        scores = tf.reduce_sum(tf.tanh(query_with_time_axis +
                                       value_with_time_axis), axis=-1)
        distribution = tf.nn.softmax(scores)
        return tf.matmul(distribution, values), distribution
Now, if we pass the same inputs, we get the same output shapes from both implementations. For general use cases, though, you should prefer the built-in implementation.
attention_layer = BahdanauAttention(10)
y = tf.random.uniform((2, 60, 512))
out, attn = attention_layer(y, y)
out.shape , attn.shape
(TensorShape([2, 60, 512]), TensorShape([2, 60, 60]))
buit_attn = tf.keras.layers.AdditiveAttention()
y = tf.random.uniform((2, 60, 512))
out, attn = buit_attn([y, y], return_attention_scores=True)
out.shape , attn.shape
(TensorShape([2, 60, 512]), TensorShape([2, 60, 60]))
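One practical follow-up (not part of the comparison above): if what you actually want from the built-in layer is the tutorial's (batch_size, hidden_size) context vector for a single decoder state, one option is to add a time axis of length 1 to the query before the call and squeeze it out afterwards. A minimal sketch with assumed sizes:

import tensorflow as tf

batch_size, Tv, hidden_size = 2, 60, 512                    # illustrative sizes
query = tf.random.uniform((batch_size, hidden_size))        # single decoder state
values = tf.random.uniform((batch_size, Tv, hidden_size))   # encoder outputs

attn = tf.keras.layers.AdditiveAttention()
query_3d = tf.expand_dims(query, 1)                         # [batch_size, 1, hidden_size]
context, weights = attn([query_3d, values], return_attention_scores=True)
context = tf.squeeze(context, axis=1)                       # [batch_size, hidden_size]
print(context.shape, weights.shape)                         # (2, 512) (2, 1, 60)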