如何使用 keras-self-attention 包可视化注意力 LSTM?
How visualize attention LSTM using keras-self-attention package?
我正在使用 (keras-self-attention) 在 KERAS 中实现注意力 LSTM。训练模型后如何可视化注意力部分?这是一个时间序列预测案例。
from keras.models import Sequential
# Bug fix: the original imported SeqWeightedAttention but then instantiated
# SeqSelfAttention below, which raises NameError; import the layer actually used.
from keras_self_attention import SeqSelfAttention
from keras.layers import LSTM, Dense, Flatten

# Simple time-series regression model: LSTM -> self-attention -> flatten -> scalar output.
# NOTE(review): TrainD must be defined by the caller before this snippet runs.
model = Sequential()
model.add(LSTM(activation='tanh', units=200, return_sequences=True,
               input_shape=(TrainD[0].shape[1], TrainD[0].shape[2])))
model.add(SeqSelfAttention())
model.add(Flatten())
model.add(Dense(1, activation='relu'))
model.compile(optimizer='adam', loss='mse')
一种方法是获取给定输入的 SeqSelfAttention
的输出,并组织它们以显示每个通道 的预测(见下文)。对于更高级的内容,请查看 iNNvestigate library(包括使用示例)。
更新:也推荐 See RNN,这是我编写的一个包。
解释:
show_features_1D
获取 layer_name
(可以是子字符串)层输出并显示每个通道的预测(标记),时间步长沿 x 轴,输出值沿 y 轴。
input_data
= 单批次 形状 (1, input_shape)
的数据
prefetched_outputs
= 已经获取的图层输出;覆盖 input_data
max_timesteps
= 要显示的最大时间步数
max_col_subplots
= 沿水平方向的最大子图数
equate_axes
= 强制所有 x 轴和 y 轴相等(推荐用于公平比较)
show_y_zero
= y=0是否显示为红线
channel_axis
= 层特征维度(例如,units
对于 LSTM,这是最后一个)
scale_width, scale_height
= 缩放显示的图像宽度和高度
dpi
= 图像质量(每英寸点数)
视觉效果(下)解释:
- First 对于查看所提取特征的 形状 很有用,无论大小如何 - 提供有关信息,例如频率 内容
- 第二个有助于查看 特征关系 - 例如相对大小、偏差和频率。下面的结果与上面的图像形成鲜明对比,因为 运行
print(outs_1)
表明所有量级都非常小并且变化不大,因此包含 y=0 红线并统一坐标轴后,图像会呈现为一条近似直线,可以解释为自注意力以偏置为主导(bias-oriented)。
- Third 用于可视化太多无法像上面那样可视化的特征;使用
batch_shape
而不是 input_shape
定义模型会删除打印形状中的所有 ?
,我们可以看到第一个输出的形状是 (10, 60, 240)
,第二个是 (10, 240, 240)
。也就是说,第一个输出返回 LSTM 通道注意力,第二个输出返回"时间步注意力"。下面的热图结果可以解释为:注意力随时间步逐渐"冷却"。
SeqWeightedAttention可视化容易很多,但可视化的东西不多;您需要删除上面的 Flatten
才能使其正常工作。注意力的输出形状然后变成 (10, 60)
和 (10, 240)
- 你可以使用一个简单的直方图 plt.hist
(只要确保你排除了批次维度 - 即 feed (60,)
或(240,)
).
from keras.layers import Input, Dense, LSTM, Flatten, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras_self_attention import SeqSelfAttention
import numpy as np

# Toy binary-classification model: LSTM -> self-attention -> dense head.
ipt = Input(shape=(240,4))
x = LSTM(60, activation='tanh', return_sequences=True)(ipt)
# return_attention=True: the layer presumably returns both the attended
# features and the attention weights -- hence two tensors downstream.
x = SeqSelfAttention(return_attention=True)(x)
x = concatenate(x)  # merge the pair into a single tensor for the rest of the graph
x = Flatten()(x)
out = Dense(1, activation='sigmoid')(x)
model = Model(ipt,out)
model.compile(Adam(lr=1e-2), loss='binary_crossentropy')

X = np.random.rand(10,240,4) # dummy data
Y = np.random.randint(0,2,(10,1)) # dummy labels
model.train_on_batch(X, Y)

# Fetch outputs of all layers whose name contains 'seq', for a single sample.
outs = get_layer_outputs(model, 'seq', X[0:1], 1)
outs_1 = outs[0]
outs_2 = outs[1]

# Per-channel 1D plots of the LSTM output, then a 2D attention heatmap.
show_features_1D(model,'lstm',X[0:1],max_timesteps=100,equate_axes=False,show_y_zero=False)
show_features_1D(model,'lstm',X[0:1],max_timesteps=100,equate_axes=True, show_y_zero=True)
show_features_2D(outs_2[0]) # [0] for 2D since 'outs_2' is 3D
def show_features_1D(model=None, layer_name=None, input_data=None,
                     prefetched_outputs=None, max_timesteps=100,
                     max_col_subplots=10, equate_axes=False,
                     show_y_zero=True, channel_axis=-1,
                     scale_width=1, scale_height=1, dpi=76):
    """Plot each channel of a layer's output as its own 1D subplot.

    Timesteps run along x, output values along y (one subplot per channel).

    model              -- model to probe (unused if prefetched_outputs given)
    layer_name         -- (sub)string matched against layer names
    input_data         -- single batch of data, shape (1, *input_shape)
    prefetched_outputs -- already-fetched layer outputs; overrides input_data
    max_timesteps      -- maximum number of timesteps to show
    max_col_subplots   -- maximum subplots along the horizontal direction
    equate_axes        -- force identical x & y axes (recommended for fair comparison)
    show_y_zero        -- whether to draw y=0 as a red line
    channel_axis       -- feature dimension of the layer (last axis for LSTM units)
    scale_width, scale_height -- scale the displayed figure size
    dpi                -- image quality (dots per inch)
    """
    # Fetch the layer outputs unless the caller already supplied them.
    if prefetched_outputs is None:
        layer_outputs = get_layer_outputs(model, layer_name, input_data, 1)[0]
    else:
        layer_outputs = prefetched_outputs
    n_features = layer_outputs.shape[channel_axis]

    # Grid layout: the loop keeps the LAST divisor found, i.e. the largest
    # divisor of n_features that is <= max_col_subplots; n_cols is the
    # corresponding cofactor and n_rows = n_features / n_cols.
    for _int in range(1, max_col_subplots+1):
        if (n_features/_int).is_integer():
            n_cols = int(n_features/_int)
    n_rows = int(n_features/n_cols)

    # NOTE(review): assumes n_rows > 1 and n_cols > 1 so that `axes` is a
    # 2D ndarray (plt.subplots squeezes single rows/cols) -- confirm for
    # layer widths that are prime or <= max_col_subplots.
    fig, axes = plt.subplots(n_rows,n_cols,sharey=equate_axes,dpi=dpi)
    fig.set_size_inches(24*scale_width,16*scale_height)

    # One subplot per channel, annotated with its 1-based channel index.
    subplot_idx = 0
    for row_idx in range(axes.shape[0]):
        for col_idx in range(axes.shape[1]):
            subplot_idx += 1
            feature_output = layer_outputs[:,subplot_idx-1]
            feature_output = feature_output[:max_timesteps]
            ax = axes[row_idx,col_idx]

            if show_y_zero:
                ax.axhline(0,color='red')
            ax.plot(feature_output)

            ax.axis(xmin=0,xmax=len(feature_output))
            ax.axis('off')
            ax.annotate(str(subplot_idx),xy=(0,.99),xycoords='axes fraction',
                        weight='bold',fontsize=14,color='g')

    if equate_axes:
        # Symmetric common y-limits: the largest |ylim| over all subplots.
        y_new = []
        for row_axis in axes:
            y_new += [np.max(np.abs([col_axis.get_ylim() for
                                     col_axis in row_axis]))]
        y_new = np.max(y_new)
        for row_axis in axes:
            [col_axis.set_ylim(-y_new,y_new) for col_axis in row_axis]
    plt.show()
def show_features_2D(data, cmap='bwr', norm=None,
                     scale_width=1, scale_height=1):
    """Render a 2D array as a heatmap with labeled axes and a colorbar.

    data  -- 2D array (attention features x timesteps)
    cmap  -- matplotlib colormap name
    norm  -- optional (vmin, vmax) pair of explicit color limits;
             when None, imshow autoscales to data's min/max
    scale_width, scale_height -- scale the displayed figure size
    """
    # Unpack explicit color limits if provided, else let imshow autoscale.
    vmin, vmax = norm if norm is not None else (None, None)

    plt.imshow(data, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xlabel('Timesteps', weight='bold', fontsize=14)
    plt.ylabel('Attention features', weight='bold', fontsize=14)
    plt.colorbar(fraction=0.046, pad=0.04)  # works for any size plot
    plt.gcf().set_size_inches(8 * scale_width, 8 * scale_height)
    plt.show()
def get_layer_outputs(model, layer_name, input_data, learning_phase=1):
    """Return the outputs of every layer whose name contains `layer_name`.

    learning_phase is forwarded to the backend function (presumably
    1 = training phase, 0 = inference -- verify against the Keras backend).
    """
    matched_outputs = [layer.output for layer in model.layers
                       if layer_name in layer.name]
    fetch_fn = K.function([model.input, K.learning_phase()], matched_outputs)
    return fetch_fn([input_data, learning_phase])
应请求补充的 SeqWeightedAttention 示例:
# SeqWeightedAttention model: the attention output is already per-sample 1D
# here, so no Flatten is needed before the Dense head.
ipt = Input(batch_shape=(10,240,4))  # fixed batch size removes '?' from printed shapes
x = LSTM(60, activation='tanh', return_sequences=True)(ipt)
# return_attention=True: presumably yields (weighted features, attention weights).
x = SeqWeightedAttention(return_attention=True)(x)
x = concatenate(x)  # merge the pair into a single tensor for the dense head
out = Dense(1, activation='sigmoid')(x)
model = Model(ipt,out)
model.compile(Adam(lr=1e-2), loss='binary_crossentropy')

X = np.random.rand(10,240,4) # dummy data
Y = np.random.randint(0,2,(10,1)) # dummy labels
model.train_on_batch(X, Y)

# Fetch outputs of layers whose name contains 'seq' (the attention layer).
outs = get_layer_outputs(model, 'seq', X, 1)
outs_1 = outs[0][0] # additional index since using batch_shape
outs_2 = outs[1][0]

# Outputs are 1D per sample, so simple histograms are enough to visualize them.
plt.hist(outs_1, bins=500); plt.show()
plt.hist(outs_2, bins=500); plt.show()
我正在使用 (keras-self-attention) 在 KERAS 中实现注意力 LSTM。训练模型后如何可视化注意力部分?这是一个时间序列预测案例。
from keras.models import Sequential
# Bug fix: the original imported SeqWeightedAttention but then instantiated
# SeqSelfAttention below, which raises NameError; import the layer actually used.
from keras_self_attention import SeqSelfAttention
from keras.layers import LSTM, Dense, Flatten

# Simple time-series regression model: LSTM -> self-attention -> flatten -> scalar output.
# NOTE(review): TrainD must be defined by the caller before this snippet runs.
model = Sequential()
model.add(LSTM(activation='tanh', units=200, return_sequences=True,
               input_shape=(TrainD[0].shape[1], TrainD[0].shape[2])))
model.add(SeqSelfAttention())
model.add(Flatten())
model.add(Dense(1, activation='relu'))
model.compile(optimizer='adam', loss='mse')
一种方法是获取给定输入的 SeqSelfAttention
的输出,并组织它们以显示每个通道 的预测(见下文)。对于更高级的内容,请查看 iNNvestigate library(包括使用示例)。
更新:也推荐 See RNN,这是我编写的一个包。
解释:
show_features_1D
获取 layer_name
(可以是子字符串)层输出并显示每个通道的预测(标记),时间步长沿 x 轴,输出值沿 y 轴。
input_data
= 单批次 形状(1, input_shape)
的数据
prefetched_outputs
= 已经获取的图层输出;覆盖input_data
max_timesteps
= 要显示的最大时间步数
max_col_subplots
= 沿水平方向的最大子图数
equate_axes
= 强制所有 x 轴和 y 轴相等(推荐用于公平比较)
show_y_zero
= y=0 是否显示为红线
channel_axis
= 层特征维度(例如,units
对于 LSTM,这是最后一个)
scale_width, scale_height
= 缩放显示的图像宽度和高度
dpi
= 图像质量(每英寸点数)
视觉效果(下)解释:
- First 对于查看所提取特征的 形状 很有用,无论大小如何 - 提供有关信息,例如频率 内容
- 第二个有助于查看 特征关系 - 例如相对大小、偏差和频率。下面的结果与上面的图像形成鲜明对比,因为 运行
print(outs_1)
表明所有量级都非常小并且变化不大,因此包含 y=0 红线并统一坐标轴后,图像会呈现为一条近似直线,可以解释为自注意力以偏置为主导(bias-oriented)。
- Third 用于可视化太多无法像上面那样可视化的特征;使用
batch_shape
而不是input_shape
定义模型会删除打印形状中的所有?
,我们可以看到第一个输出的形状是(10, 60, 240)
,第二个是(10, 240, 240)
。也就是说,第一个输出返回 LSTM 通道注意力,第二个输出返回"时间步注意力"。下面的热图结果可以解释为:注意力随时间步逐渐"冷却"。
SeqWeightedAttention可视化容易很多,但可视化的东西不多;您需要删除上面的 Flatten
才能使其正常工作。注意力的输出形状然后变成 (10, 60)
和 (10, 240)
- 你可以使用一个简单的直方图 plt.hist
(只要确保你排除了批次维度 - 即 feed (60,)
或(240,)
).
from keras.layers import Input, Dense, LSTM, Flatten, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras_self_attention import SeqSelfAttention
import numpy as np

# Toy binary-classification model: LSTM -> self-attention -> dense head.
ipt = Input(shape=(240,4))
x = LSTM(60, activation='tanh', return_sequences=True)(ipt)
# return_attention=True: the layer presumably returns both the attended
# features and the attention weights -- hence two tensors downstream.
x = SeqSelfAttention(return_attention=True)(x)
x = concatenate(x)  # merge the pair into a single tensor for the rest of the graph
x = Flatten()(x)
out = Dense(1, activation='sigmoid')(x)
model = Model(ipt,out)
model.compile(Adam(lr=1e-2), loss='binary_crossentropy')

X = np.random.rand(10,240,4) # dummy data
Y = np.random.randint(0,2,(10,1)) # dummy labels
model.train_on_batch(X, Y)

# Fetch outputs of all layers whose name contains 'seq', for a single sample.
outs = get_layer_outputs(model, 'seq', X[0:1], 1)
outs_1 = outs[0]
outs_2 = outs[1]

# Per-channel 1D plots of the LSTM output, then a 2D attention heatmap.
show_features_1D(model,'lstm',X[0:1],max_timesteps=100,equate_axes=False,show_y_zero=False)
show_features_1D(model,'lstm',X[0:1],max_timesteps=100,equate_axes=True, show_y_zero=True)
show_features_2D(outs_2[0]) # [0] for 2D since 'outs_2' is 3D
def show_features_1D(model=None, layer_name=None, input_data=None,
                     prefetched_outputs=None, max_timesteps=100,
                     max_col_subplots=10, equate_axes=False,
                     show_y_zero=True, channel_axis=-1,
                     scale_width=1, scale_height=1, dpi=76):
    """Plot each channel of a layer's output as its own 1D subplot.

    Timesteps run along x, output values along y (one subplot per channel).

    model              -- model to probe (unused if prefetched_outputs given)
    layer_name         -- (sub)string matched against layer names
    input_data         -- single batch of data, shape (1, *input_shape)
    prefetched_outputs -- already-fetched layer outputs; overrides input_data
    max_timesteps      -- maximum number of timesteps to show
    max_col_subplots   -- maximum subplots along the horizontal direction
    equate_axes        -- force identical x & y axes (recommended for fair comparison)
    show_y_zero        -- whether to draw y=0 as a red line
    channel_axis       -- feature dimension of the layer (last axis for LSTM units)
    scale_width, scale_height -- scale the displayed figure size
    dpi                -- image quality (dots per inch)
    """
    # Fetch the layer outputs unless the caller already supplied them.
    if prefetched_outputs is None:
        layer_outputs = get_layer_outputs(model, layer_name, input_data, 1)[0]
    else:
        layer_outputs = prefetched_outputs
    n_features = layer_outputs.shape[channel_axis]

    # Grid layout: the loop keeps the LAST divisor found, i.e. the largest
    # divisor of n_features that is <= max_col_subplots; n_cols is the
    # corresponding cofactor and n_rows = n_features / n_cols.
    for _int in range(1, max_col_subplots+1):
        if (n_features/_int).is_integer():
            n_cols = int(n_features/_int)
    n_rows = int(n_features/n_cols)

    # NOTE(review): assumes n_rows > 1 and n_cols > 1 so that `axes` is a
    # 2D ndarray (plt.subplots squeezes single rows/cols) -- confirm for
    # layer widths that are prime or <= max_col_subplots.
    fig, axes = plt.subplots(n_rows,n_cols,sharey=equate_axes,dpi=dpi)
    fig.set_size_inches(24*scale_width,16*scale_height)

    # One subplot per channel, annotated with its 1-based channel index.
    subplot_idx = 0
    for row_idx in range(axes.shape[0]):
        for col_idx in range(axes.shape[1]):
            subplot_idx += 1
            feature_output = layer_outputs[:,subplot_idx-1]
            feature_output = feature_output[:max_timesteps]
            ax = axes[row_idx,col_idx]

            if show_y_zero:
                ax.axhline(0,color='red')
            ax.plot(feature_output)

            ax.axis(xmin=0,xmax=len(feature_output))
            ax.axis('off')
            ax.annotate(str(subplot_idx),xy=(0,.99),xycoords='axes fraction',
                        weight='bold',fontsize=14,color='g')

    if equate_axes:
        # Symmetric common y-limits: the largest |ylim| over all subplots.
        y_new = []
        for row_axis in axes:
            y_new += [np.max(np.abs([col_axis.get_ylim() for
                                     col_axis in row_axis]))]
        y_new = np.max(y_new)
        for row_axis in axes:
            [col_axis.set_ylim(-y_new,y_new) for col_axis in row_axis]
    plt.show()
def show_features_2D(data, cmap='bwr', norm=None,
                     scale_width=1, scale_height=1):
    """Render a 2D array as a heatmap with labeled axes and a colorbar.

    data  -- 2D array (attention features x timesteps)
    cmap  -- matplotlib colormap name
    norm  -- optional (vmin, vmax) pair of explicit color limits;
             when None, imshow autoscales to data's min/max
    scale_width, scale_height -- scale the displayed figure size
    """
    # Unpack explicit color limits if provided, else let imshow autoscale.
    vmin, vmax = norm if norm is not None else (None, None)

    plt.imshow(data, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xlabel('Timesteps', weight='bold', fontsize=14)
    plt.ylabel('Attention features', weight='bold', fontsize=14)
    plt.colorbar(fraction=0.046, pad=0.04)  # works for any size plot
    plt.gcf().set_size_inches(8 * scale_width, 8 * scale_height)
    plt.show()
def get_layer_outputs(model, layer_name, input_data, learning_phase=1):
    """Return the outputs of every layer whose name contains `layer_name`.

    learning_phase is forwarded to the backend function (presumably
    1 = training phase, 0 = inference -- verify against the Keras backend).
    """
    matched_outputs = [layer.output for layer in model.layers
                       if layer_name in layer.name]
    fetch_fn = K.function([model.input, K.learning_phase()], matched_outputs)
    return fetch_fn([input_data, learning_phase])
应请求补充的 SeqWeightedAttention 示例:
# SeqWeightedAttention model: the attention output is already per-sample 1D
# here, so no Flatten is needed before the Dense head.
ipt = Input(batch_shape=(10,240,4))  # fixed batch size removes '?' from printed shapes
x = LSTM(60, activation='tanh', return_sequences=True)(ipt)
# return_attention=True: presumably yields (weighted features, attention weights).
x = SeqWeightedAttention(return_attention=True)(x)
x = concatenate(x)  # merge the pair into a single tensor for the dense head
out = Dense(1, activation='sigmoid')(x)
model = Model(ipt,out)
model.compile(Adam(lr=1e-2), loss='binary_crossentropy')

X = np.random.rand(10,240,4) # dummy data
Y = np.random.randint(0,2,(10,1)) # dummy labels
model.train_on_batch(X, Y)

# Fetch outputs of layers whose name contains 'seq' (the attention layer).
outs = get_layer_outputs(model, 'seq', X, 1)
outs_1 = outs[0][0] # additional index since using batch_shape
outs_2 = outs[1][0]

# Outputs are 1D per sample, so simple histograms are enough to visualize them.
plt.hist(outs_1, bins=500); plt.show()
plt.hist(outs_2, bins=500); plt.show()